留存分析分组自定义

2022-08-02 16:02:49 +08:00 · 2022-08-02 16:02:49 +08:00 · 27501bda49
commit 27501bda49
parent f2be717f3e
1 changed file with 121 additions and 61 deletions
--- a/api/api_v1/endpoints/query.py
+++ b/api/api_v1/endpoints/query.py
@ -576,8 +576,51 @@ async def retention_model(request: Request,
    filter_item_type = res['filter_item_type']  # all
    filter_item = res['filter_item']  # 列表  0,1,3,7,14,21,30
    # 映射对应中文返回给前端展示
-    groupby_list=analysis.event_view.get('groupBy')
+    groupby_list = analysis.event_view.get('groupBy')
    groupby = [i['columnName'] for i in groupby_list if i['tableType'] != 'user_label']
+    true_group = []     # 定义分组实际选择
+    for g_data in groupby_list:
+        data_type = g_data['data_type']
+
+        # 不是int类型
+        if data_type != "int":
+            true_group.append("str")
+            continue
+
+        # 自定义区间
+        if g_data['intervalType'] == 'user_defined':
+            int_range = analysis.event_view.get('groupBy')[0]['quotaIntervalArr']
+            chk_range = []
+            for index, value in enumerate(int_range):
+                # 开头
+                if index == 0:
+                    chk_range.append(['-', value])
+                    # 至少有两个数时，补上第一个闭合区间
+                    if len(int_range) >= 2:
+                        chk_range.append([value, int_range[index + 1]])
+                    continue
+                # 结尾
+                if index + 1 >= len(int_range):
+                    chk_range.append([value, '+'])
+                    continue
+                # 中间
+                chk_range.append([value, int_range[index + 1]])
+            true_group.append(chk_range)
+
+        # 默认区间
+        elif g_data['intervalType'] == 'def':
+            zidai = []
+            max_v = int(df[g_data['columnName']].max())
+            min_v = int(df[g_data['columnName']].min())
+            interval = (max_v - min_v) // 10 or 1
+            for i in range(min_v, max_v, interval):
+                zidai.append([i, i + interval])
+            true_group.append(zidai)
+
+        # 离散数字
+        else:
+            true_group.append('discrete')
+
    if len(groupby_list) == 1:
        max_v = int(df[groupby_list[0]['columnName']].max())
        min_v = int(df[groupby_list[0]['columnName']].min())
@ -592,7 +635,7 @@ async def retention_model(request: Request,
            for k, v in chinese.items():
                # 开始映射
                df.loc[df['svrindex'] == k, 'svrindex'] = v
-    times=df['reg_date'][0]
+    times = df['reg_date'][0]
    df.set_index(groupby, inplace=True)
    # for d in set(res['date_range']) - set(df.index):
    #     df.loc[d] = 0
@ -676,65 +719,82 @@ async def retention_model(request: Request,
        tmp['p_outflow'].append(n)
        tmp['n_outflow'].append(rd['o_cntn'])
    #  如果分组项是int类型按选择的分组
-
-    # 默认区间
-    if analysis.event_view.get('groupBy')[0]['intervalType'] == 'def':
+    if '均值' in summary_valuess:
        summary_valuess.pop('均值')
-        interval = (max_v - min_v) // 10 or 1
-        lens = len(summary_valuess[max_v]['n'])
-        ress = {}
-        for i in range(min_v, max_v, interval):
-            d0 = 0
-            n1 = []
-            n_outflow1 = []
-            for k, v in summary_valuess.items():
-                if k >= i and k < i + interval:
-                    d0 += v['d0']
-                    n1.append(v['n'])
-                    n_outflow1.append(v['n_outflow'])
-            if len(n1) > 0:
-                re_dict = {}
-                n = np.sum([ii for ii in n1], axis=0).tolist()
-                n_outflow = np.sum([iii for iii in n_outflow1], axis=0).tolist()
-                p = [round(nu*100 / d0, 2) for nu in n]
-                p_outflow = [round(num*100 / d0, 2) for num in n_outflow]
-                re_dict['d0'] = d0
-                re_dict['n'] = n
-                re_dict['n_outflow'] = n_outflow
-                re_dict['p'] = p
-                re_dict['p_outflow'] = p_outflow
-                ress[f"[{i},{i + interval})"] = re_dict
-            else:
-                re_dict = {'d0': 0}
-                n = []
-                n_outflow = []
-                p = []
-                p_outflow = []
-                for cishu in range(0, lens):
-                    n.append(0)
-                    n_outflow.append(0)
-                    p.append(0)
-                    p_outflow.append(0)
-                re_dict['n'] = n
-                re_dict['n_outflow'] = n_outflow
-                re_dict['p'] = p
-                re_dict['p_outflow'] = p_outflow
-                ress[f"[{i},{i + interval})"] = re_dict
-        summary_valuess=ress
-    # 自定义区间
-    elif analysis.event_view.get('groupBy')[0]['intervalType'] == 'user_defined':
-        pass
-    #  次留数
+    if "['均值']" in summary_valuess:
+        summary_valuess.pop("['均值']")
+    new_summary_valuess = {}
+    for group_key, group_data in summary_valuess.items():
+        key_list = eval(group_key)
+        true_key = []  # 重新定义后的分组
+        for index, value in enumerate(key_list):
+
+            true_group_index = true_group[index]
+            # 默认区间或者自定义区间
+            if isinstance(true_group_index, list):
+                for defined_list in true_group_index:
+                    defined_list_max = defined_list[1]
+                    defined_list_min = defined_list[0]
+                    if defined_list_min == '-':
+                        if value < defined_list_max:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+                    if defined_list_max == '+':
+                        if value >= defined_list_min:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+
+                    if defined_list_min <= value < defined_list_max:
+                        true_key.append(defined_list)
+                        break
+                    continue
+                continue
+
+            # 分组是字符串或者离散数字时，直接取这个值的str类型
+            if true_group_index in ['str', 'discrete']:
+                true_key.append(str(value))
+                continue
+
+        # 这个分组不存在:
+        if str(true_key) not in new_summary_valuess:
+            new_summary_valuess[str(true_key)] = group_data
+            continue
+
+        # 这个分组已存在
+        # d0相加
+        new_summary_valuess[str(true_key)]['d0'] += group_data['d0']
+
+        # n相加
+        n_list = new_summary_valuess[str(true_key)]['n']
+        n_list1 = group_data['n']
+        sum_n_lst = [x + y for x, y in zip(n_list, n_list1)]
+        new_summary_valuess[str(true_key)]['n'] = sum_n_lst
+
+        # n_outflow相加
+        n_outflow_list = new_summary_valuess[str(true_key)]['n_outflow']
+        n_outflow_list1 = group_data['n_outflow']
+        sum_n_ourflow_lst = [x + y for x, y in zip(n_outflow_list, n_outflow_list1)]
+        new_summary_valuess[str(true_key)]['n_outflow'] = sum_n_ourflow_lst
+
+    # 计算概率
+    for key1, value1 in new_summary_valuess.items():
+        new_summary_valuess[key1]['p'] = [round(i / value1['d0'], 2) for i in value1['n']]
+        new_summary_valuess[key1]['p_outflow'] = [round(i1 / value1['d0'], 2) for i1 in value1['n_outflow']]
+
    title = ['分组项', '用户数', '次留', *[f'{i + 1}留' for i in retention_n[1:]]]

    # 未到达的日期需要补齐-
    retention_length = len(retention_n)
-    for _, items in summary_valuess.items():
+    for _, items in new_summary_valuess.items():
        for key in ['p', 'n', 'p_outflow', 'n_outflow']:
            items[key].extend(['-'] * (retention_length - len(items[key])))

    resp = {
-        'summary_values': summary_valuess,
+        'summary_values': new_summary_valuess,
        # 'values': values,
        'date_range': [d.strftime('%Y-%m-%d') for d in date_range],
        'title': title,
@ -1324,7 +1384,7 @@ async def scatter_model(
        # 这是整体的
        for key, tmp_df in df.groupby('date'):
            bins_s = pd.cut(tmp_df['values'], bins=bins,
-                            right=False,include_lowest=True).value_counts()
+                            right=False, include_lowest=True).value_counts()
            bins_s.sort_index(inplace=True)
            total = int(bins_s.sum())
            if res['time_particle'] == 'total':
@ -1583,8 +1643,8 @@ async def scatter_model(
                    # if 'time' not in groupby:
                    resp['list'][str(key)] = dict()
                    resp['list'][str(key)] = {'n': bins_s.to_list(), 'total': total,
-                                         'p': [str(i) + '%' for i in p],
-                                         'title': '总体'}
+                                              'p': [str(i) + '%' for i in p],
+                                              'title': '总体'}
                    # else:
                    #     resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = dict()
                    #     resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = {'n': bins_s.to_list(), 'total': total,
@ -1675,7 +1735,7 @@ async def scatter_model(
                        list_p.append(number_str)

                    resp['list'][str(dt)] = {'n': [labels_dict01.get(i, 0) for i in labels], 'total': total,
-                                        'p': list_p}
+                                             'p': list_p}
                else:
                    list_p = []
                    for i in labels:
@ -1683,7 +1743,7 @@ async def scatter_model(
                        number_str = str(number_int) + '%'
                        list_p.append(number_str)
                    resp['list'][str(dt)] = {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
-                                        'p': list_p}
+                                             'p': list_p}
                    # resp['list'][dt] = {'总体': {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
                    #                          'p': [round(labels_dict.get(i, 0) * 100 / total, 2) for i in labels]}}
            if where == "step_id" and event_type == "guide":
@ -1985,17 +2045,17 @@ async def user_property_model(
    if data_in.user_arrt_type == 'datetime':
        sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE addHours(`{ziduan}`, 8) >= '{data_in.start_time}' 
                    and addHours(`{ziduan}`, 8) <= '{data_in.end_time}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                                                                                                                       data_in.pages - 1) * 10}"""
+                                                                                                                                                                                                                                                   data_in.pages - 1) * 10}"""
    # 如果查询'#account_id'，则不多余返回一个account_id
    elif ziduan == '#account_id':
        sql = f"""select `{ziduan}`,name from {game}.`user` WHERE `{ziduan}` {tiaojian} '{data_in.condition}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                                data_in.pages - 1) * 10} """
+                                                                                                                                                            data_in.pages - 1) * 10} """
    elif data_in.user_arrt_type == 'int':
        sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE `{ziduan}` {tiaojian} {data_in.condition} ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                                       data_in.pages - 1) * 10}"""
+                                                                                                                                                                   data_in.pages - 1) * 10}"""
    else:
        sql = f"""select `#account_id`,`{ziduan}` from `{game}`.`user` WHERE `{ziduan}` {tiaojian} '{data}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
-                                                                                                                                                              data_in.pages - 1) * 10}"""
+                                                                                                                                                          data_in.pages - 1) * 10}"""
    # 查询数据
    try:
        df = await ckdb.query_dataframe(sql)