留存分析分组自定义

2022-08-02 16:02:49 +08:00 · 2022-08-02 16:02:49 +08:00 · 27501bda49
commit 27501bda49
parent f2be717f3e
1 changed files with 121 additions and 61 deletions
--- a/api/api_v1/endpoints/query.py
+++ b/api/api_v1/endpoints/query.py
@@ -578,6 +578,49 @@ async def retention_model(request: Request,
    # 映射对应中文返回给前端展示
    groupby_list = analysis.event_view.get('groupBy')
    groupby = [i['columnName'] for i in groupby_list if i['tableType'] != 'user_label']
+    true_group = []     # 定义分组实际选择
+    for g_data in groupby_list:
+        data_type = g_data['data_type']
+
+        # 不是int类型
+        if data_type != "int":
+            true_group.append("str")
+            continue
+
+        # 自定义区间
+        if g_data['intervalType'] == 'user_defined':
+            int_range = analysis.event_view.get('groupBy')[0]['quotaIntervalArr']
+            chk_range = []
+            for index, value in enumerate(int_range):
+                # 开头
+                if index == 0:
+                    chk_range.append(['-', value])
+                    # At least two boundaries: also add the first bounded interval [value, next boundary]
+                    if len(int_range) >= 2:
+                        chk_range.append([value, int_range[index + 1]])
+                    continue
+                # 结尾
+                if index + 1 >= len(int_range):
+                    chk_range.append([value, '+'])
+                    continue
+                # 中间
+                chk_range.append([value, int_range[index + 1]])
+            true_group.append(chk_range)
+
+        # 默认区间
+        elif g_data['intervalType'] == 'def':
+            zidai = []
+            max_v = int(df[g_data['columnName']].max())
+            min_v = int(df[g_data['columnName']].min())
+            interval = (max_v - min_v) // 10 or 1
+            for i in range(min_v, max_v, interval):
+                zidai.append([i, i + interval])
+            true_group.append(zidai)
+
+        # 离散数字
+        else:
+            true_group.append('discrete')
+
    if len(groupby_list) == 1:
        max_v = int(df[groupby_list[0]['columnName']].max())
        min_v = int(df[groupby_list[0]['columnName']].min())
@@ -676,65 +719,82 @@ async def retention_model(request: Request,
        tmp['p_outflow'].append(n)
        tmp['n_outflow'].append(rd['o_cntn'])
    #  如果分组项是int类型按选择的分组
-
-    # 默认区间
-    if analysis.event_view.get('groupBy')[0]['intervalType'] == 'def':
+    if '均值' in summary_valuess:
        summary_valuess.pop('均值')
-        interval = (max_v - min_v) // 10 or 1
-        lens = len(summary_valuess[max_v]['n'])
-        ress = {}
-        for i in range(min_v, max_v, interval):
-            d0 = 0
-            n1 = []
-            n_outflow1 = []
-            for k, v in summary_valuess.items():
-                if k >= i and k < i + interval:
-                    d0 += v['d0']
-                    n1.append(v['n'])
-                    n_outflow1.append(v['n_outflow'])
-            if len(n1) > 0:
-                re_dict = {}
-                n = np.sum([ii for ii in n1], axis=0).tolist()
-                n_outflow = np.sum([iii for iii in n_outflow1], axis=0).tolist()
-                p = [round(nu*100 / d0, 2) for nu in n]
-                p_outflow = [round(num*100 / d0, 2) for num in n_outflow]
-                re_dict['d0'] = d0
-                re_dict['n'] = n
-                re_dict['n_outflow'] = n_outflow
-                re_dict['p'] = p
-                re_dict['p_outflow'] = p_outflow
-                ress[f"[{i},{i + interval})"] = re_dict
+    if "['均值']" in summary_valuess:
+        summary_valuess.pop("['均值']")
+    new_summary_valuess = {}
+    for group_key, group_data in summary_valuess.items():
+        key_list = eval(group_key)
+        true_key = []  # 重新定义后的分组
+        for index, value in enumerate(key_list):
+
+            true_group_index = true_group[index]
+            # 默认区间或者自定义区间
+            if isinstance(true_group_index, list):
+                for defined_list in true_group_index:
+                    defined_list_max = defined_list[1]
+                    defined_list_min = defined_list[0]
+                    if defined_list_min == '-':
+                        if value < defined_list_max:
+                            true_key.append(defined_list)
+                            break
                        else:
-                re_dict = {'d0': 0}
-                n = []
-                n_outflow = []
-                p = []
-                p_outflow = []
-                for cishu in range(0, lens):
-                    n.append(0)
-                    n_outflow.append(0)
-                    p.append(0)
-                    p_outflow.append(0)
-                re_dict['n'] = n
-                re_dict['n_outflow'] = n_outflow
-                re_dict['p'] = p
-                re_dict['p_outflow'] = p_outflow
-                ress[f"[{i},{i + interval})"] = re_dict
-        summary_valuess=ress
-    # 自定义区间
-    elif analysis.event_view.get('groupBy')[0]['intervalType'] == 'user_defined':
-        pass
-    #  次留数
+                            continue
+                    if defined_list_max == '+':
+                        if value >= defined_list_min:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+
+                    if defined_list_min <= value < defined_list_max:
+                        true_key.append(defined_list)
+                        break
+                    continue
+                continue
+
+            # Group is a string or a discrete number: use the str form of the value directly
+            if true_group_index in ['str', 'discrete']:
+                true_key.append(str(value))
+                continue
+
+        # 这个分组不存在:
+        if str(true_key) not in new_summary_valuess:
+            new_summary_valuess[str(true_key)] = group_data
+            continue
+
+        # 这个分组已存在
+        # d0相加
+        new_summary_valuess[str(true_key)]['d0'] += group_data['d0']
+
+        # n相加
+        n_list = new_summary_valuess[str(true_key)]['n']
+        n_list1 = group_data['n']
+        sum_n_lst = [x + y for x, y in zip(n_list, n_list1)]
+        new_summary_valuess[str(true_key)]['n'] = sum_n_lst
+
+        # n_outflow相加
+        n_outflow_list = new_summary_valuess[str(true_key)]['n_outflow']
+        n_outflow_list1 = group_data['n_outflow']
+        sum_n_ourflow_lst = [x + y for x, y in zip(n_outflow_list, n_outflow_list1)]
+        new_summary_valuess[str(true_key)]['n_outflow'] = sum_n_ourflow_lst
+
+    # 计算概率
+    for key1, value1 in new_summary_valuess.items():
+        new_summary_valuess[key1]['p'] = [round(i / value1['d0'], 2) for i in value1['n']]
+        new_summary_valuess[key1]['p_outflow'] = [round(i1 / value1['d0'], 2) for i1 in value1['n_outflow']]
+
    title = ['分组项', '用户数', '次留', *[f'{i + 1}留' for i in retention_n[1:]]]

    # 未到达的日期需要补齐-
    retention_length = len(retention_n)
-    for _, items in summary_valuess.items():
+    for _, items in new_summary_valuess.items():
        for key in ['p', 'n', 'p_outflow', 'n_outflow']:
            items[key].extend(['-'] * (retention_length - len(items[key])))

    resp = {
-        'summary_values': summary_valuess,
+        'summary_values': new_summary_valuess,
        # 'values': values,
        'date_range': [d.strftime('%Y-%m-%d') for d in date_range],
        'title': title,