From 27501bda49cf850a9a99d0bcadfc9616edc3789e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=80=C3=AE=C3=97=C3=9A=C3=95=C3=B1?= Date: Tue, 2 Aug 2022 16:02:49 +0800 Subject: [PATCH] =?UTF-8?q?=E7=95=99=E5=AD=98=E5=88=86=E6=9E=90=E5=88=86?= =?UTF-8?q?=E7=BB=84=E8=87=AA=E5=AE=9A=E4=B9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/api_v1/endpoints/query.py | 182 ++++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 61 deletions(-) diff --git a/api/api_v1/endpoints/query.py b/api/api_v1/endpoints/query.py index c2e431e..be120aa 100644 --- a/api/api_v1/endpoints/query.py +++ b/api/api_v1/endpoints/query.py @@ -576,8 +576,51 @@ async def retention_model(request: Request, filter_item_type = res['filter_item_type'] # all filter_item = res['filter_item'] # 列表 0,1,3,7,14,21,30 # 映射对应中文返回给前端展示 - groupby_list=analysis.event_view.get('groupBy') + groupby_list = analysis.event_view.get('groupBy') groupby = [i['columnName'] for i in groupby_list if i['tableType'] != 'user_label'] + true_group = [] # 定义分组实际选择 + for g_data in groupby_list: + data_type = g_data['data_type'] + + # 不是int类型 + if data_type != "int": + true_group.append("str") + continue + + # 自定义区间 + if g_data['intervalType'] == 'user_defined': + int_range = analysis.event_view.get('groupBy')[0]['quotaIntervalArr'] + chk_range = [] + for index, value in enumerate(int_range): + # 开头 + if index == 0: + chk_range.append(['-', value]) + # 只有两个数 + if len(int_range) >= 2: + chk_range.append([value, int_range[index + 1]]) + continue + # 结尾 + if index + 1 >= len(int_range): + chk_range.append([value, '+']) + continue + # 中间 + chk_range.append([value, int_range[index + 1]]) + true_group.append(chk_range) + + # 默认区间 + elif g_data['intervalType'] == 'def': + zidai = [] + max_v = int(df[g_data['columnName']].max()) + min_v = int(df[g_data['columnName']].min()) + interval = (max_v - min_v) // 10 or 1 + for i in range(min_v, max_v, interval): + zidai.append([i, i + interval]) + true_group.append(zidai) + + # 离散数字 + else: + true_group.append('discrete') + if len(groupby_list) == 1: max_v = int(df[groupby_list[0]['columnName']].max()) min_v = int(df[groupby_list[0]['columnName']].min()) @@ -592,7 +635,7 @@ async def retention_model(request: Request, for k, v in chinese.items(): # 开始映射 df.loc[df['svrindex'] == k, 'svrindex'] = v - times=df['reg_date'][0] + times = df['reg_date'][0] df.set_index(groupby, inplace=True) # for d in set(res['date_range']) - set(df.index): # df.loc[d] = 0 @@ -676,65 +719,82 @@ async def retention_model(request: Request, tmp['p_outflow'].append(n) tmp['n_outflow'].append(rd['o_cntn']) # 如果分组项是int类型按选择的分组 - - # 默认区间 - if analysis.event_view.get('groupBy')[0]['intervalType'] == 'def': + if '均值' in summary_valuess: summary_valuess.pop('均值') - interval = (max_v - min_v) // 10 or 1 - lens = len(summary_valuess[max_v]['n']) - ress = {} - for i in range(min_v, max_v, interval): - d0 = 0 - n1 = [] - n_outflow1 = [] - for k, v in summary_valuess.items(): - if k >= i and k < i + interval: - d0 += v['d0'] - n1.append(v['n']) - n_outflow1.append(v['n_outflow']) - if len(n1) > 0: - re_dict = {} - n = np.sum([ii for ii in n1], axis=0).tolist() - n_outflow = np.sum([iii for iii in n_outflow1], axis=0).tolist() - p = [round(nu*100 / d0, 2) for nu in n] - p_outflow = [round(num*100 / d0, 2) for num in n_outflow] - re_dict['d0'] = d0 - re_dict['n'] = n - re_dict['n_outflow'] = n_outflow - re_dict['p'] = p - re_dict['p_outflow'] = p_outflow - ress[f"[{i},{i + interval})"] = re_dict - else: - re_dict = {'d0': 0} - n = [] - n_outflow = [] - p = [] - p_outflow = [] - for cishu in range(0, lens): - n.append(0) - n_outflow.append(0) - p.append(0) - p_outflow.append(0) - re_dict['n'] = n - re_dict['n_outflow'] = n_outflow - re_dict['p'] = p - re_dict['p_outflow'] = p_outflow - ress[f"[{i},{i + interval})"] = re_dict - summary_valuess=ress - # 自定义区间 - elif analysis.event_view.get('groupBy')[0]['intervalType'] == 'user_defined': - pass - # 次留数 + if "['均值']" in summary_valuess: + summary_valuess.pop("['均值']") + new_summary_valuess = {} + for group_key, group_data in summary_valuess.items(): + key_list = eval(group_key) + true_key = [] # 重新定义后的分组 + for index, value in enumerate(key_list): + + true_group_index = true_group[index] + # 默认区间或者自定义区间 + if isinstance(true_group_index, list): + for defined_list in true_group_index: + defined_list_max = defined_list[1] + defined_list_min = defined_list[0] + if defined_list_min == '-': + if value < defined_list_max: + true_key.append(defined_list) + break + else: + continue + if defined_list_max == '+': + if value >= defined_list_min: + true_key.append(defined_list) + break + else: + continue + + if defined_list_min <= value < defined_list_max: + true_key.append(defined_list) + break + continue + continue + + # 分组是字符串或者离散直接取这个值得str类型 + if true_group_index in ['str', 'discrete']: + true_key.append(str(value)) + continue + + # 这个分组不存在: + if str(true_key) not in new_summary_valuess: + new_summary_valuess[str(true_key)] = group_data + continue + + # 这个分组已存在 + # d0相加 + new_summary_valuess[str(true_key)]['d0'] += group_data['d0'] + + # n相加 + n_list = new_summary_valuess[str(true_key)]['n'] + n_list1 = group_data['n'] + sum_n_lst = [x + y for x, y in zip(n_list, n_list1)] + new_summary_valuess[str(true_key)]['n'] = sum_n_lst + + # n_outflow相加 + n_outflow_list = new_summary_valuess[str(true_key)]['n_outflow'] + n_outflow_list1 = group_data['n_outflow'] + sum_n_ourflow_lst = [x + y for x, y in zip(n_outflow_list, n_outflow_list1)] + new_summary_valuess[str(true_key)]['n_outflow'] = sum_n_ourflow_lst + + # 计算概率 + for key1, value1 in new_summary_valuess.items(): + new_summary_valuess[key1]['p'] = [round(i / value1['d0'], 2) for i in value1['n']] + new_summary_valuess[key1]['p_outflow'] = [round(i1 / value1['d0'], 2) for i1 in value1['n_outflow']] + title = ['分组项', '用户数', '次留', *[f'{i + 1}留' for i in retention_n[1:]]] # 未到达的日期需要补齐- retention_length = len(retention_n) - for _, items in summary_valuess.items(): + for _, items in new_summary_valuess.items(): for key in ['p', 'n', 'p_outflow', 'n_outflow']: items[key].extend(['-'] * (retention_length - len(items[key]))) resp = { - 'summary_values': summary_valuess, + 'summary_values': new_summary_valuess, # 'values': values, 'date_range': [d.strftime('%Y-%m-%d') for d in date_range], 'title': title, @@ -1324,7 +1384,7 @@ async def scatter_model( # 这是整体的 for key, tmp_df in df.groupby('date'): bins_s = pd.cut(tmp_df['values'], bins=bins, - right=False,include_lowest=True).value_counts() + right=False, include_lowest=True).value_counts() bins_s.sort_index(inplace=True) total = int(bins_s.sum()) if res['time_particle'] == 'total': @@ -1583,8 +1643,8 @@ async def scatter_model( # if 'time' not in groupby: resp['list'][str(key)] = dict() resp['list'][str(key)] = {'n': bins_s.to_list(), 'total': total, - 'p': [str(i) + '%' for i in p], - 'title': '总体'} + 'p': [str(i) + '%' for i in p], + 'title': '总体'} # else: # resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = dict() # resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = {'n': bins_s.to_list(), 'total': total, @@ -1675,7 +1735,7 @@ async def scatter_model( list_p.append(number_str) resp['list'][str(dt)] = {'n': [labels_dict01.get(i, 0) for i in labels], 'total': total, - 'p': list_p} + 'p': list_p} else: list_p = [] for i in labels: @@ -1683,7 +1743,7 @@ async def scatter_model( number_str = str(number_int) + '%' list_p.append(number_str) resp['list'][str(dt)] = {'n': [labels_dict.get(i, 0) for i in labels], 'total': total, - 'p': list_p} + 'p': list_p} # resp['list'][dt] = {'总体': {'n': [labels_dict.get(i, 0) for i in labels], 'total': total, # 'p': [round(labels_dict.get(i, 0) * 100 / total, 2) for i in labels]}} if where == "step_id" and event_type == "guide": @@ -1985,17 +2045,17 @@ async def user_property_model( if data_in.user_arrt_type == 'datetime': sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE addHours(`{ziduan}`, 8) >= '{data_in.start_time}' and addHours(`{ziduan}`, 8) <= '{data_in.end_time}' ORDER BY `#reg_time` LIMIT 10 OFFSET {( - data_in.pages - 1) * 10}""" + data_in.pages - 1) * 10}""" # 如果查询'#account_id',则不多余返回一个account_id elif ziduan == '#account_id': sql = f"""select `{ziduan}`,name from {game}.`user` WHERE `{ziduan}` {tiaojian} '{data_in.condition}' ORDER BY `#reg_time` LIMIT 10 OFFSET {( - data_in.pages - 1) * 10} """ + data_in.pages - 1) * 10} """ elif data_in.user_arrt_type == 'int': sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE `{ziduan}` {tiaojian} {data_in.condition} ORDER BY `#reg_time` LIMIT 10 OFFSET {( - data_in.pages - 1) * 10}""" + data_in.pages - 1) * 10}""" else: sql = f"""select `#account_id`,`{ziduan}` from `{game}`.`user` WHERE `{ziduan}` {tiaojian} '{data}' ORDER BY `#reg_time` LIMIT 10 OFFSET {( - data_in.pages - 1) * 10}""" + data_in.pages - 1) * 10}""" # 查询数据 try: df = await ckdb.query_dataframe(sql)