From 7ae11c4b90854aee72590778336b399ea1de62a9 Mon Sep 17 00:00:00 2001
From: wuaho
Date: Wed, 29 Sep 2021 14:28:42 +0800
Subject: [PATCH] Export distribution analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api/api_v1/endpoints/query.py | 101 +++++++++++++++++++++++++++++++---
 utils/export.py               |   2 +-
 2 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/api/api_v1/endpoints/query.py b/api/api_v1/endpoints/query.py
index 84b6b7b..4fc1ab5 100644
--- a/api/api_v1/endpoints/query.py
+++ b/api/api_v1/endpoints/query.py
@@ -587,6 +587,7 @@ async def scatter_model_sql(
     data = analysis.scatter_model_sql()
     return schemas.Msg(code=0, msg='ok', data=[data])
 
+
 @router.post("/scatter_model_export")
 async def retention_model_export(request: Request,
                                  game: str,
@@ -596,17 +597,101 @@ async def retention_model_export(request: Request,
                                  ):
     """
     Distribution analysis data export"""
     await analysis.init(data_where=current_user.data_where)
-    data = analysis.scatter_model_sql()
+    res = analysis.scatter_model_sql()
     file_name = quote(f'分布分析.xlsx')
     mime = mimetypes.guess_type(file_name)[0]
-
-    sql = data['sql']
+    sql = res['sql']
     df = await ckdb.query_dataframe(sql)
-    df_to_stream = DfToStream((df, '分布分析'))
-    with df_to_stream as d:
-        export = d.to_stream()
-    return StreamingResponse(export, media_type=mime, headers={'Content-Disposition': f'filename="{file_name}"'})
+    interval_type = res['interval_type']
+    analysis = res['analysis']
+    groupby = res['groupby']
+    quota_interval_arr = res['quota_interval_arr']
+    if analysis != 'number_of_days' and interval_type != 'discrete':
+        max_v = int(df['values'].max())
+        min_v = int(df['values'].min())
+        interval = (max_v - min_v) // 10 or 1
+        resp = {'list': dict(),
+                'start_date': res['start_date'],
+                'end_date': res['end_date'],
+                'time_particle': res['time_particle']
+                }
+        if not quota_interval_arr:
+            resp['label'] = [f'[{i},{i + interval})' for i in range(min_v, max_v, interval)]
+            bins = [i for i in range(min_v, max_v + interval, interval)]
+        else:
+            quota_interval_arr = [-float('inf')] + quota_interval_arr + [float('inf')]
+            resp['label'] = []
+            bins = [quota_interval_arr[0]]
+            for i, v in enumerate(quota_interval_arr[1:]):
+                resp['label'].append(f'[{quota_interval_arr[i]},{v})')
+                bins.append(v)
+
+        # overall distribution: one row of bin counts per date
+        for key, tmp_df in df.groupby('date'):
+            bins_s = pd.cut(tmp_df['values'], bins=bins,
+                            right=False).value_counts()
+            bins_s.sort_index(inplace=True)
+            total = int(bins_s.sum())
+            resp['list'][key.strftime('%Y-%m-%d')] = dict()
+            resp['list'][key.strftime('%Y-%m-%d')]['总体'] = {'n': bins_s.to_list(), 'total': total,
+                                                            'p': round(bins_s * 100 / total, 2).to_list(),
+                                                            'title': '总体'}
+        # grouped distribution; export_df is created first so it exists even when groupby is empty
+        export_df = pd.DataFrame(columns=resp['label'])
+
+        if groupby:
+            for key, tmp_df in df.groupby(['date', *groupby]):
+                bins_s = pd.cut(tmp_df['values'], bins=bins,
+                                right=False).value_counts()
+                bins_s.sort_index(inplace=True)
+                total = int(bins_s.sum())
+                title = '.'.join(key[1:])
+                date = key[0]
+                resp['list'][date.strftime('%Y-%m-%d')][title] = {'n': bins_s.to_list(), 'total': total,
+                                                                  'p': round(bins_s * 100 / total, 2).to_list(),
+                                                                  'title': title
+                                                                  }
+
+                export_df.loc[f"{date.strftime('%Y-%m-%d')} {title}"] = bins_s.to_list()
+
+        df_to_stream = DfToStream((export_df, '分布分析'), (df, '分布分析原始数据'), index=True)
+        with df_to_stream as d:
+            export = d.to_stream()
+        return StreamingResponse(export, media_type=mime,
+                                 headers={'Content-Disposition': f'filename="{file_name}"'})
+
+
+    # elif analysis == 'number_of_days':
+    else:
+        resp = {'list': {}, 'label': [],
+                'start_date': res['start_date'],
+                'end_date': res['end_date'],
+                'time_particle': res['time_particle']
+                }
+        total_dict = {}
+        labels = [str(i) for i in sorted(df['values'].unique())]
+        resp['label'] = labels
+        for key, tmp_df in df.groupby('date'):
+            total = len(tmp_df)
+            dt = key.strftime('%Y-%m-%d')
+            labels_dict = {}
+            for key2, tmp_df2 in tmp_df.groupby('values'):
+                label = str(key2)
+                n = len(tmp_df2)
+                labels_dict[label] = n
+
+            resp['list'][dt] = {'总体': {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
+                                       'p': [round(labels_dict.get(i, 0) * 100 / total, 2) for i in labels]}}
+
+        export_df = pd.DataFrame(columns=resp['label'])
+        for d, v in resp['list'].items():
+            export_df.loc[d] = v['总体']['n']
+
+        df_to_stream = DfToStream((export_df, '分布分析'), (df, '分布分析原始数据'), index=True)
+        with df_to_stream as d:
+            export = d.to_stream()
+        return StreamingResponse(export, media_type=mime, headers={'Content-Disposition': f'filename="{file_name}"'})
 
 
 @router.post("/scatter_model")
@@ -694,7 +779,7 @@ async def scatter_model(
                 labels_dict[label] = n
 
             resp['list'][dt] = {'总体': {'n': [labels_dict.get(i, 0) for i in labels], 'total': total,
-                                       'p': [round(labels_dict.get(i, 0)*100 / total, 2) for i in labels]}}
+                                       'p': [round(labels_dict.get(i, 0) * 100 / total, 2) for i in labels]}}
     return schemas.Msg(code=0, msg='ok', data=resp)
 
     # bins_s = pd.cut(tmp_df['values'], bins=bins,
diff --git a/utils/export.py b/utils/export.py
index c2a3938..f41c0c9 100644
--- a/utils/export.py
+++ b/utils/export.py
@@ -22,5 +22,5 @@ class DfToStream:
         for item in self.dfs:
             df = item[0]
             sheet_name = item[1]
-            df.to_excel(self.writer, encoding='utf-8', sheet_name=sheet_name, index=False)
+            df.to_excel(self.writer, encoding='utf-8', sheet_name=sheet_name, index=self.index)
         return self.output
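
The continuous branch above derives up to ten equal-width, left-closed bins from the value range (or uses the caller-supplied quota_interval_arr) and counts each date's values with pd.cut. A minimal standalone sketch of that binning step, with made-up data and the same date/values column names (an illustration, not the endpoint code itself):

    import pandas as pd

    # made-up frame shaped like the ckdb.query_dataframe(sql) result
    df = pd.DataFrame({
        'date': pd.to_datetime(['2021-09-01'] * 5 + ['2021-09-02'] * 5),
        'values': [0, 9, 23, 42, 95, 4, 15, 38, 57, 61],
    })

    min_v, max_v = int(df['values'].min()), int(df['values'].max())
    interval = (max_v - min_v) // 10 or 1          # 'or 1' guards against zero-width bins
    bins = list(range(min_v, max_v + interval, interval))
    labels = [f'[{i},{i + interval})' for i in range(min_v, max_v, interval)]

    for day, tmp_df in df.groupby('date'):
        # right=False makes the bins left-closed, matching the labels above
        bins_s = pd.cut(tmp_df['values'], bins=bins, right=False).value_counts().sort_index()
        total = int(bins_s.sum())
        print(day.strftime('%Y-%m-%d'), total, dict(zip(labels, bins_s.to_list())))

One edge case worth knowing: when interval divides max_v - min_v exactly, the last bin edge equals max_v, and with right=False a value equal to max_v falls outside every bin; the endpoint inherits the same behaviour.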
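
The else branch handles discrete analyses (number_of_days, or a discrete interval type): the labels are the distinct values themselves and each date's export row is a per-value count. The same logic as a standalone sketch, again with made-up data; value_counts stands in for the inner groupby('values') loop, which is equivalent here:

    import pandas as pd

    df = pd.DataFrame({
        'date': pd.to_datetime(['2021-09-01'] * 4 + ['2021-09-02'] * 3),
        'values': [1, 1, 2, 5, 1, 3, 3],
    })

    labels = [str(v) for v in sorted(df['values'].unique())]   # ['1', '2', '3', '5']
    export_df = pd.DataFrame(columns=labels)

    for day, tmp_df in df.groupby('date'):
        counts = tmp_df['values'].value_counts()               # count of each exact value
        total = len(tmp_df)
        row = [int(counts.get(int(lbl), 0)) for lbl in labels]
        export_df.loc[day.strftime('%Y-%m-%d')] = row          # one export row per date
        print(day.strftime('%Y-%m-%d'),
              [round(n * 100 / total, 2) for n in row])        # the 'p' percentages

    print(export_df)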
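
The endpoint builds DfToStream from (DataFrame, sheet name) pairs plus index=True, and utils/export.py now writes index=self.index, but the __init__ that stores that flag is not part of this diff. A sketch of the constructor shape this implies — an assumption, since only to_stream appears above (encoding='utf-8' is omitted here because newer pandas no longer accepts it):

    from io import BytesIO

    import pandas as pd


    class DfToStream:
        """Write (DataFrame, sheet_name) pairs to an in-memory xlsx stream (sketch only)."""

        def __init__(self, *dfs, index=False):
            self.dfs = dfs        # (DataFrame, sheet name) tuples
            self.index = index    # forwarded to DataFrame.to_excel
            self.output = BytesIO()
            self.writer = None

        def __enter__(self):
            self.writer = pd.ExcelWriter(self.output, engine='xlsxwriter')
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.writer.close()   # flush the workbook into self.output
            self.output.seek(0)

        def to_stream(self):
            for df, sheet_name in self.dfs:
                df.to_excel(self.writer, sheet_name=sheet_name, index=self.index)
            return self.output

Usage then mirrors the endpoint: with DfToStream((export_df, 'dist'), (raw_df, 'raw'), index=True) as d: stream = d.to_stream(), after which the rewound BytesIO can be handed to StreamingResponse.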