Retention analysis: user-defined grouping

This commit is contained in:
李宗振 2022-08-02 16:02:49 +08:00
parent f2be717f3e
commit 27501bda49


@ -576,8 +576,51 @@ async def retention_model(request: Request,
     filter_item_type = res['filter_item_type']  # all
     filter_item = res['filter_item']  # list: 0,1,3,7,14,21,30
     # map values to their Chinese labels for frontend display
-    groupby_list=analysis.event_view.get('groupBy')
+    groupby_list = analysis.event_view.get('groupBy')
     groupby = [i['columnName'] for i in groupby_list if i['tableType'] != 'user_label']
+    true_group = []  # the actual grouping selection for each column
+    for g_data in groupby_list:
+        data_type = g_data['data_type']
+        # not an int type
+        if data_type != "int":
+            true_group.append("str")
+            continue
+        # user-defined intervals
+        if g_data['intervalType'] == 'user_defined':
+            int_range = analysis.event_view.get('groupBy')[0]['quotaIntervalArr']
+            chk_range = []
+            for index, value in enumerate(int_range):
+                # first boundary
+                if index == 0:
+                    chk_range.append(['-', value])
+                    # at least two boundaries
+                    if len(int_range) >= 2:
+                        chk_range.append([value, int_range[index + 1]])
+                    continue
+                # last boundary
+                if index + 1 >= len(int_range):
+                    chk_range.append([value, '+'])
+                    continue
+                # middle boundaries
+                chk_range.append([value, int_range[index + 1]])
+            true_group.append(chk_range)
+        # default intervals
+        elif g_data['intervalType'] == 'def':
+            zidai = []
+            max_v = int(df[g_data['columnName']].max())
+            min_v = int(df[g_data['columnName']].min())
+            interval = (max_v - min_v) // 10 or 1
+            for i in range(min_v, max_v, interval):
+                zidai.append([i, i + interval])
+            true_group.append(zidai)
+        # discrete numbers
+        else:
+            true_group.append('discrete')
     if len(groupby_list) == 1:
         max_v = int(df[groupby_list[0]['columnName']].max())
         min_v = int(df[groupby_list[0]['columnName']].min())
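For reference, a minimal sketch of the structure the new loop builds: one entry per group-by column, either the literal "str" / "discrete" or a list of half-open buckets. The boundaries below are made up for illustration.

# Hypothetical true_group for two columns: a string column, and an int
# column with user-defined boundaries [10, 50, 100].
true_group = [
    "str",
    [['-', 10], [10, 50], [50, 100], [100, '+']],  # half-open buckets [min, max)
]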
@ -592,7 +635,7 @@ async def retention_model(request: Request,
     for k, v in chinese.items():
         # start mapping
         df.loc[df['svrindex'] == k, 'svrindex'] = v
-    times=df['reg_date'][0]
+    times = df['reg_date'][0]
     df.set_index(groupby, inplace=True)
     # for d in set(res['date_range']) - set(df.index):
     #     df.loc[d] = 0
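As a standalone illustration of the df.loc mapping pattern used above (the data and names here are hypothetical):

import pandas as pd

# Replace raw server indexes with display names, as the loop above does.
df = pd.DataFrame({'svrindex': [1, 2, 1]})
chinese = {1: '一服', 2: '二服'}
for k, v in chinese.items():
    df.loc[df['svrindex'] == k, 'svrindex'] = v
print(df['svrindex'].tolist())  # ['一服', '二服', '一服']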
@ -676,65 +719,82 @@ async def retention_model(request: Request,
             tmp['p_outflow'].append(n)
             tmp['n_outflow'].append(rd['o_cntn'])
     # if the group-by item is an int type, group by the selected intervals
-    if '均值' in summary_valuess:
-        # default intervals
-        if analysis.event_view.get('groupBy')[0]['intervalType'] == 'def':
-            summary_valuess.pop('均值')
-            interval = (max_v - min_v) // 10 or 1
-            lens = len(summary_valuess[max_v]['n'])
-            ress = {}
-            for i in range(min_v, max_v, interval):
-                d0 = 0
-                n1 = []
-                n_outflow1 = []
-                for k, v in summary_valuess.items():
-                    if k >= i and k < i + interval:
-                        d0 += v['d0']
-                        n1.append(v['n'])
-                        n_outflow1.append(v['n_outflow'])
-                if len(n1) > 0:
-                    re_dict = {}
-                    n = np.sum([ii for ii in n1], axis=0).tolist()
-                    n_outflow = np.sum([iii for iii in n_outflow1], axis=0).tolist()
-                    p = [round(nu*100 / d0, 2) for nu in n]
-                    p_outflow = [round(num*100 / d0, 2) for num in n_outflow]
-                    re_dict['d0'] = d0
-                    re_dict['n'] = n
-                    re_dict['n_outflow'] = n_outflow
-                    re_dict['p'] = p
-                    re_dict['p_outflow'] = p_outflow
-                    ress[f"[{i},{i + interval})"] = re_dict
-                else:
-                    re_dict = {'d0': 0}
-                    n = []
-                    n_outflow = []
-                    p = []
-                    p_outflow = []
-                    for cishu in range(0, lens):
-                        n.append(0)
-                        n_outflow.append(0)
-                        p.append(0)
-                        p_outflow.append(0)
-                    re_dict['n'] = n
-                    re_dict['n_outflow'] = n_outflow
-                    re_dict['p'] = p
-                    re_dict['p_outflow'] = p_outflow
-                    ress[f"[{i},{i + interval})"] = re_dict
-            summary_valuess=ress
-        # user-defined intervals
-        elif analysis.event_view.get('groupBy')[0]['intervalType'] == 'user_defined':
-            pass
-    # next-day retention count
+    summary_valuess.pop('均值')
+    if "['均值']" in summary_valuess:
+        summary_valuess.pop("['均值']")
+    new_summary_valuess = {}
+    for group_key, group_data in summary_valuess.items():
+        key_list = eval(group_key)
+        true_key = []  # the regrouped key
+        for index, value in enumerate(key_list):
+            true_group_index = true_group[index]
+            # default or user-defined intervals
+            if isinstance(true_group_index, list):
+                for defined_list in true_group_index:
+                    defined_list_max = defined_list[1]
+                    defined_list_min = defined_list[0]
+                    if defined_list_min == '-':
+                        if value < defined_list_max:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+                    if defined_list_max == '+':
+                        if value >= defined_list_min:
+                            true_key.append(defined_list)
+                            break
+                        else:
+                            continue
+                    if defined_list_min <= value < defined_list_max:
+                        true_key.append(defined_list)
+                        break
+                    continue
+                continue
+            # for string or discrete groups, take the value's str form directly
+            if true_group_index in ['str', 'discrete']:
+                true_key.append(str(value))
+                continue
+        # this group does not exist yet:
+        if str(true_key) not in new_summary_valuess:
+            new_summary_valuess[str(true_key)] = group_data
+            continue
+        # this group already exists
+        # sum d0
+        new_summary_valuess[str(true_key)]['d0'] += group_data['d0']
+        # sum n
+        n_list = new_summary_valuess[str(true_key)]['n']
+        n_list1 = group_data['n']
+        sum_n_lst = [x + y for x, y in zip(n_list, n_list1)]
+        new_summary_valuess[str(true_key)]['n'] = sum_n_lst
+        # sum n_outflow
+        n_outflow_list = new_summary_valuess[str(true_key)]['n_outflow']
+        n_outflow_list1 = group_data['n_outflow']
+        sum_n_ourflow_lst = [x + y for x, y in zip(n_outflow_list, n_outflow_list1)]
+        new_summary_valuess[str(true_key)]['n_outflow'] = sum_n_ourflow_lst
+    # compute the ratios
+    for key1, value1 in new_summary_valuess.items():
+        new_summary_valuess[key1]['p'] = [round(i / value1['d0'], 2) for i in value1['n']]
+        new_summary_valuess[key1]['p_outflow'] = [round(i1 / value1['d0'], 2) for i1 in value1['n_outflow']]
     title = ['分组项', '用户数', '次留', *[f'{i + 1}' for i in retention_n[1:]]]
     # pad dates that have not arrived yet with '-'
     retention_length = len(retention_n)
-    for _, items in summary_valuess.items():
+    for _, items in new_summary_valuess.items():
         for key in ['p', 'n', 'p_outflow', 'n_outflow']:
             items[key].extend(['-'] * (retention_length - len(items[key])))
     resp = {
-        'summary_values': summary_valuess,
+        'summary_values': new_summary_valuess,
         # 'values': values,
         'date_range': [d.strftime('%Y-%m-%d') for d in date_range],
         'title': title,
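To see what the remapping does, a small sketch with hypothetical summary data: two raw int keys that fall into the same bucket of true_group are merged by summing d0, n, and n_outflow.

# Hypothetical input; keys are str(list) as produced upstream.
true_group = [[['-', 10], [10, 50], [50, '+']]]
summary_valuess = {
    '[3]': {'d0': 5, 'n': [2, 1], 'n_outflow': [1, 0]},
    '[7]': {'d0': 3, 'n': [1, 1], 'n_outflow': [0, 1]},
}
# 3 and 7 both fall into ['-', 10], so the merged result is:
# {"[['-', 10]]": {'d0': 8, 'n': [3, 2], 'n_outflow': [1, 1]}}
# and the ratio pass then yields p = [0.38, 0.25].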
@ -1324,7 +1384,7 @@ async def scatter_model(
     # this is the overall series
     for key, tmp_df in df.groupby('date'):
-        bins_s = pd.cut(tmp_df['values'], bins=bins,
-                        right=False,include_lowest=True).value_counts()
+        bins_s = pd.cut(tmp_df['values'], bins=bins,
+                        right=False, include_lowest=True).value_counts()
         bins_s.sort_index(inplace=True)
         total = int(bins_s.sum())
         if res['time_particle'] == 'total':
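The pd.cut call above buckets values into left-closed bins; a minimal standalone example with made-up data:

import pandas as pd

values = pd.Series([1, 5, 5, 12])
bins = [0, 5, 10, 15]
# right=False makes the bins half-open: [0, 5), [5, 10), [10, 15)
counts = pd.cut(values, bins=bins, right=False, include_lowest=True).value_counts()
counts.sort_index(inplace=True)
print(counts.tolist())  # [1, 2, 1]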
@ -1583,8 +1643,8 @@ async def scatter_model(
         # if 'time' not in groupby:
         resp['list'][str(key)] = dict()
         resp['list'][str(key)] = {'n': bins_s.to_list(), 'total': total,
                                   'p': [str(i) + '%' for i in p],
                                   'title': '总体'}
         # else:
         #     resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = dict()
         #     resp['list'][key.strftime('%Y-%m-%d %H:%M:%S')] = {'n': bins_s.to_list(), 'total': total,
@ -1675,7 +1735,7 @@ async def scatter_model(
             list_p.append(number_str)
         resp['list'][str(dt)] = {'n': [labels_dict01.get(i, 0) for i in labels], 'total': total,
                                  'p': list_p}
     else:
         list_p = []
         for i in labels:
@ -1683,7 +1743,7 @@ async def scatter_model(
             number_str = str(number_int) + '%'
             list_p.append(number_str)
         resp['list'][str(dt)] = {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
                                  'p': list_p}
         # resp['list'][dt] = {'总体': {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
         #                              'p': [round(labels_dict.get(i, 0) * 100 / total, 2) for i in labels]}}
     if where == "step_id" and event_type == "guide":
@ -1985,17 +2045,17 @@ async def user_property_model(
     if data_in.user_arrt_type == 'datetime':
         sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE addHours(`{ziduan}`, 8) >= '{data_in.start_time}'
                   and addHours(`{ziduan}`, 8) <= '{data_in.end_time}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
                 data_in.pages - 1) * 10}"""
     # when querying '#account_id', do not return an extra account_id column
     elif ziduan == '#account_id':
         sql = f"""select `{ziduan}`,name from {game}.`user` WHERE `{ziduan}` {tiaojian} '{data_in.condition}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
                 data_in.pages - 1) * 10} """
     elif data_in.user_arrt_type == 'int':
         sql = f"""select `#account_id`,`{ziduan}` from {game}.`user` WHERE `{ziduan}` {tiaojian} {data_in.condition} ORDER BY `#reg_time` LIMIT 10 OFFSET {(
                 data_in.pages - 1) * 10}"""
     else:
         sql = f"""select `#account_id`,`{ziduan}` from `{game}`.`user` WHERE `{ziduan}` {tiaojian} '{data}' ORDER BY `#reg_time` LIMIT 10 OFFSET {(
                 data_in.pages - 1) * 10}"""
     # query the data
     try:
         df = await ckdb.query_dataframe(sql)
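A small sketch of the pagination arithmetic shared by the queries above; page_size is an assumption matching the hard-coded LIMIT 10:

# LIMIT 10 OFFSET (pages - 1) * 10 -> page 1 reads rows 0-9, page 2 rows 10-19, ...
def page_offset(pages: int, page_size: int = 10) -> int:
    return (pages - 1) * page_size

assert page_offset(1) == 0
assert page_offset(3) == 20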