From 27501bda49cf850a9a99d0bcadfc9616edc3789e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AE=97=E6=8C=AF?= <lizz556@163.com>
Date: Tue, 2 Aug 2022 16:02:49 +0800
Subject: [PATCH] =?UTF-8?q?=E7=95=99=E5=AD=98=E5=88=86=E6=9E=90=E5=88=86?=
 =?UTF-8?q?=E7=BB=84=E8=87=AA=E5=AE=9A=E4=B9=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api/api_v1/endpoints/query.py | 182 ++++++++++++++++++++++------------
 1 file changed, 121 insertions(+), 61 deletions(-)

diff --git a/api/api_v1/endpoints/query.py b/api/api_v1/endpoints/query.py
index c2e431e..be120aa 100644
--- a/api/api_v1/endpoints/query.py
+++ b/api/api_v1/endpoints/query.py
@@ -576,8 +576,51 @@ async def retention_model(request: Request,
     filter_item_type = res['filter_item_type']  # all
     filter_item = res['filter_item']  # 列表  0,1,3,7,14,21,30
     # 映射对应中文返回给前端展示
-    groupby_list=analysis.event_view.get('groupBy')
+    groupby_list = analysis.event_view.get('groupBy')
     groupby = [i['columnName'] for i in groupby_list if i['tableType'] != 'user_label']
+    true_group = []     # 定义分组实际选择
+    for g_data in groupby_list:
+        data_type = g_data['data_type']
+
+        # 不是int类型
+        if data_type != "int":
+            true_group.append("str")
+            continue
+
+        # 自定义区间
+        if g_data['intervalType'] == 'user_defined':
+            int_range = analysis.event_view.get('groupBy')[0]['quotaIntervalArr']
+            chk_range = []
+            for index, value in enumerate(int_range):
+                # 开头
+                if index == 0:
+                    chk_range.append(['-', value])
+                    # 只有两个数
+                    if len(int_range) >= 2:
+                        chk_range.append([value, int_range[index + 1]])
+                    continue
+                # 结尾
+                if index + 1 >= len(int_range):
+                    chk_range.append([value, '+'])
+                    continue
+                # 中间
+                chk_range.append([value, int_range[index + 1]])
+            true_group.append(chk_range)
+
+        # 默认区间
+        elif g_data['intervalType'] == 'def':
+            zidai = []
+            max_v = int(df[g_data['columnName']].max())
+            min_v = int(df[g_data['columnName']].min())
+            interval = (max_v - min_v) // 10 or 1
+            for i in range(min_v, max_v, interval):
+                zidai.append([i, i + interval])
+            true_group.append(zidai)
+
+        # 离散数字
+        else:
+            true_group.append('discrete')
+
     if len(groupby_list) == 1:
         max_v = int(df[groupby_list[0]['columnName']].max())
         min_v = int(df[groupby_list[0]['columnName']].min())
@@ -592,7 +635,7 @@ async def retention_model(request: Request,
             for k, v in chinese.items():
                 # 开始映射
                 df.loc[df['svrindex'] == k, 'svrindex'] = v
-    times=df['reg_date'][0]
+    times = df['reg_date'][0]
     df.set_index(groupby, inplace=True)
     # for d in set(res['date_range']) - set(df.index):
     #     df.loc[d] = 0
@@ -676,65 +719,82 @@ async def retention_model(request: Request,
         tmp['p_outflow'].append(n)
         tmp['n_outflow'].append(rd['o_cntn'])
     #  如果分组项是int类型按选择的分组
-
-    # 默认区间
-    if analysis.event_view.get('groupBy')[0]['intervalType'] == 'def':
+    if '均值' in summary_valuess:
         summary_valuess.pop('均值')
-        interval = (max_v - min_v) // 10 or 1
-        lens = len(summary_valuess[max_v]['n'])
-        ress = {}
-        for i in range(min_v, max_v, interval):
-            d0 = 0
-            n1 = []
-            n_outflow1 = []
-            for k, v in summary_valuess.items():
-                if k >= i and k < i + interval:
-                    d0 += v['d0']
-                    n1.append(v['n'])
-                    n_outflow1.append(v['n_outflow'])
-            if len(n1) > 0:
-                re_dict = {}
-                n = np.sum([ii for ii in n1], axis=0).tolist()
-                n_outflow = np.sum([iii for iii in n_outflow1], axis=0).tolist()
-                p = [round(nu*100 / d0, 2) for nu in n]
-                p_outflow = [round(num*100 / d0, 2) for num in n_outflow]
-                re_dict['d0'] = d0
-                re_dict['n'] = n
-                re_dict['n_outflow'] = n_outflow
-                re_dict['p'] = p
-                re_dict['p_outflow'] = p_outflow
-                ress[f"[{i},{i + interval})"] = re_dict
-            else:
-                re_dict = {'d0': 0}
-                n = []
-                n_outflow = []
-                p = []
-                p_outflow = []
-                for cishu in range(0, lens):
-                    n.append(0)
-                    n_outflow.append(0)
-                    p.append(0)
-                    p_outflow.append(0)
-                re_dict['n'] = n
-                re_dict['n_outflow'] = n_outflow
-                re_dict['p'] = p
-                re_dict['p_outflow'] = p_outflow
-                ress[f"[{i},{i + interval})"] = re_dict
-        summary_valuess=ress
-    # 自定义区间
-    elif analysis.event_view.get('groupBy')[0]['intervalType'] == 'user_defined':
-        pass
-    #  次留数
+    if "['均值']" in summary_valuess:
+        summary_valuess.pop("['均值']")
+    new_summary_valuess = {}
+    for group_key, group_data in summary_valuess.items():
+        key_list = eval(group_key)
+        true_key = []  # 重新定义后的分组
+        for index, value in enumerate(key_list):
+
+            true_group_index = true_group[index]
+            # 默认区间或者自定义区间
+            if isinstance(true_group_index, list):
+                for defined_list in true_group_index:
+                    defined_list_max = defined_list[1]
+                    defined_list_min = defined_list[0]
+                    if defined_list_min == '-':
+                        if value < defined_list_max:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+                    if defined_list_max == '+':
+                        if value >= defined_list_min:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+
+                    if defined_list_min <= value < defined_list_max:
+                        true_key.append(defined_list)
+                        break
+                    continue
+                continue
+
+            # 分组是字符串或者离散直接取这个值得str类型
+            if true_group_index in ['str', 'discrete']:
+                true_key.append(str(value))
+                continue
+
+        # 这个分组不存在:
+        if str(true_key) not in new_summary_valuess:
+            new_summary_valuess[str(true_key)] = group_data
+            continue
+
+        # 这个分组已存在
+        # d0相加
+        new_summary_valuess[str(true_key)]['d0'] += group_data['d0']
+
+        # n相加
+        n_list = new_summary_valuess[str(true_key)]['n']
+        n_list1 = group_data['n']
+        sum_n_lst = [x + y for x, y in zip(n_list, n_list1)]
+        new_summary_valuess[str(true_key)]['n'] = sum_n_lst
+
+        # n_outflow相加
+        n_outflow_list = new_summary_valuess[str(true_key)]['n_outflow']
+        n_outflow_list1 = group_data['n_outflow']
+        sum_n_ourflow_lst = [x + y for x, y in zip(n_outflow_list, n_outflow_list1)]
+        new_summary_valuess[str(true_key)]['n_outflow'] = sum_n_ourflow_lst
+
+    # 计算概率
+    for key1, value1 in new_summary_valuess.items():
+        new_summary_valuess[key1]['p'] = [round(i / value1['d0'], 2) for i in value1['n']]
+        new_summary_valuess[key1]['p_outflow'] = [round(i1 / value1['d0'], 2) for i1 in value1['n_outflow']]
+
     title = ['分组项', '用户数', '次留', *[f'{i + 1}留' for i in retention_n[1:]]]
 
     # 未到达的日期需要补齐-
     retention_length = len(retention_n)
-    for _, items in summary_valuess.items():
+    for _, items in new_summary_valuess.items():
         for key in ['p', 'n', 'p_outflow', 'n_outflow']:
             items[key].extend(['-'] * (retention_length - len(items[key])))
 
     resp = {
-        'summary_values': summary_valuess,
+        'summary_values': new_summary_valuess,
         # 'values': values,
         'date_range': [d.strftime('%Y-%m-%d') for d in date_range],
         'title': title,
@@ -1324,7 +1384,7 @@ async def scatter_model(
         # 这是整体的
         for key, tmp_df in df.groupby('date'):
             bins_s = pd.cut(tmp_df['values'], bins=bins,
-                            right=False,include_lowest=True).value_counts()
+                            right=False, include_lowest=True).value_counts()
             bins_s.sort_index(inplace=True)
             total = int(bins_s.sum())
             if res['time_particle'] == 'total':
@@ -1583,8 +1643,8 @@ async def scatter_model(
                     # if 'time' not in groupby:
                     resp['list'][str(key)] = dict()
                     resp['list'][str(key)] = {'n': bins_s.to_list(), 'total': total,
-                                         'p': [str(i) + '%' for i in p],
-                                         'title': '总体'}
+                                              'p': [str(i) + '%' for i in p],
+                                              'title': '总体'}
                     # else:
                     #     resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = dict()
                     #     resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = {'n': bins_s.to_list(), 'total': total,
@@ -1675,7 +1735,7 @@ async def scatter_model(
                         list_p.append(number_str)
 
                     resp['list'][str(dt)] = {'n': [labels_dict01.get(i, 0) for i in labels], 'total': total,
-                                        'p': list_p}
+                                             'p': list_p}
                 else:
                     list_p = []
                     for i in labels:
@@ -1683,7 +1743,7 @@ async def scatter_model(
                         number_str = str(number_int) + '%'
                         list_p.append(number_str)
                     resp['list'][str(dt)] = {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
-                                        'p': list_p}
+                                             'p': list_p}
                     # resp['list'][dt] = {'总体': {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
                     #                          'p': [round(labels_dict.get(i, 0) * 100 / total, 2) for i in labels]}}
             if where == "step_id" and event_type == "guide":
@@ -1985,17 +2045,17 @@ async def user_property_model(
     if data_in.user_arrt_type == 'datetime':
         sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE addHours(`{ziduan}`, 8) >= '{data_in.start_time}' 
                     and addHours(`{ziduan}`, 8) <= '{data_in.end_time}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                                                                                                                       data_in.pages - 1) * 10}"""
+                                                                                                                                                                                                                                                   data_in.pages - 1) * 10}"""
     # 如果查询'#account_id'，则不多余返回一个account_id
     elif ziduan == '#account_id':
         sql = f"""select `{ziduan}`,name from {game}.`user` WHERE `{ziduan}` {tiaojian} '{data_in.condition}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                                data_in.pages - 1) * 10} """
+                                                                                                                                                            data_in.pages - 1) * 10} """
     elif data_in.user_arrt_type == 'int':
         sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE `{ziduan}` {tiaojian} {data_in.condition} ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                                       data_in.pages - 1) * 10}"""
+                                                                                                                                                                   data_in.pages - 1) * 10}"""
     else:
         sql = f"""select `#account_id`,`{ziduan}` from `{game}`.`user` WHERE `{ziduan}` {tiaojian} '{data}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                              data_in.pages - 1) * 10}"""
+                                                                                                                                                          data_in.pages - 1) * 10}"""
     # 查询数据
     try:
         df = await ckdb.query_dataframe(sql)