Compare commits

...

4 Commits

Author SHA1 Message Date
0db503885b 添加首次付费 2020-12-23 13:35:59 +08:00
c0b829dfb4 优化 2020-12-23 13:35:42 +08:00
a6e9845062 更新任务调用 2020-12-23 13:35:20 +08:00
9912567004 更新基础模型 2020-12-23 13:32:56 +08:00
8 changed files with 144 additions and 82 deletions

12
config.json Normal file
View File

@ -0,0 +1,12 @@
{
"summary_func": {
"source_coll": "event",
"dest_coll": "summary_func",
"task_name": "summary_func"
},
"first_recharge": {
"source_coll": "paylist",
"dest_coll": "user",
"task_name": "first_recharge"
}
}

View File

@ -1,5 +1,3 @@
from typing import Union, Any
from pydantic import BaseModel, Field
from bson.objectid import ObjectId
@ -11,79 +9,36 @@ class MdbObjectId(ObjectId):
@classmethod
def validate(cls, v):
if not isinstance(v, ObjectId):
raise TypeError('ObjectId required')
return v
try:
res = ObjectId(v)
except:
raise TypeError('不能装换为 ObjectId')
else:
return res
class GBaseModel(BaseModel):
"""
字段名与保护变量命名冲突 _ 前缀变为后缀,读出再还原
"""
id_: MdbObjectId = Field(..., title="平台")
platform_: str = Field(None, title="平台")
channel_name_: str = Field(None, title="channel")
owner_name_: str = Field(None, title="owner")
channel_uid_: str = Field(None, title="channel_uid")
device_id_: str = Field(None, title='device_id')
district_server_id_: int = Field(None, title="区服id")
game_role_id_: str = Field(None, title="角色id")
event_time_: int = Field(..., title="事件时间")
id: MdbObjectId = Field(..., title="平台", alias='_id')
platform: str = Field(None, title="平台", alias='_platform')
channel_name: str = Field(None, title="channel", alias='_channel_name')
owner_name: str = Field(None, title="owner", alias='_owner_name')
channel_uid: str = Field(None, title="channel_uid", alias='_channel_uid')
device_id: str = Field(None, title='device_id', alias='_device_id')
district_server_id: int = Field(None, title="区服id", alias='_district_server_id')
game_role_id: str = Field(None, title="角色id", alias='_game_role_id')
event_time: int = Field(..., title="事件时间", alias='_event_time')
role_create_time: int = Field(None, title="角色创建时间")
role_level: int = Field(None, title="角色等级")
role_vip: int = Field(None, title="角色vip等级")
def __init__(self, **data: Any):
if isinstance(data.get('_id'), str) and len(data['_id']) == 24:
data['_id'] = ObjectId(data['_id'])
new_data = {}
for k, v in data.items(): # type:str,Any
if k.startswith('_'):
new_k = k[1:] + k[0]
new_data[new_k] = v
else:
new_data[k] = v
super().__init__(**new_data)
def dict(
self,
*,
include: Union['AbstractSetIntStr', 'MappingIntStrAny'] = None,
exclude: Union['AbstractSetIntStr', 'MappingIntStrAny'] = None,
by_alias: bool = False,
skip_defaults: bool = None,
exclude_unset: bool = False,
exclude_defaults: bool = False,
exclude_none: bool = False,
) -> 'DictStrAny':
data = super().dict()
new_data = {}
for k, v in data.items(): # type:str,Any
if k.endswith('_'):
new_k = k[-1] + k[:-1]
new_data[new_k] = v
else:
new_data[k] = v
return new_data
class Config:
arbitrary_types_allowed = True
role_stage: int = Field(None, title="关卡")
@classmethod
def get_fields(cls):
fields = []
for k in cls.__fields__:
if k.endswith('_'):
fields.append(k[-1] + k[:-1])
else:
fields.append(k)
return fields
return [v.alias for v in cls.__fields__.values()]
if __name__ == '__main__':
obj = GBaseModel(_id="5fd0f4812de17aeba6c1a374", role_level='2', aaa=123, _platform=13566)
print(obj.dict())
print(obj.role_level)
obj = GBaseModel(_id="5fd0f4812de17aeba6c1a373", role_level='2', aaa=123, _platform=13566, _event_time=123456789)
print(GBaseModel.get_fields())
print(obj.dict(by_alias=True))

15
main.py
View File

@ -1,3 +1,4 @@
import json
import sys
from multiprocessing import Pool
@ -14,7 +15,7 @@ def get_game() -> list:
def run_task(kwargs):
module_name = kwargs.get('task_name')
class_name = ''.join([s.capitalize() for s in task_name.split('_')])
class_name = ''.join([s.capitalize() for s in module_name.split('_')])
module = import_module(f'.{module_name}', package='task')
c_obj = getattr(module, class_name)
obj = c_obj(**kwargs)
@ -23,15 +24,21 @@ def run_task(kwargs):
if __name__ == '__main__':
# eg: summary_func 0 0
# eg: first_recharge 0 0
task_name, st, et = sys.argv[1:]
st, et = int(st), int(et)
game_list = get_game()
params = [{'game_name': item['id_name'],
'task_name': task_name,
with open('config.json', 'r', encoding='utf8') as f:
task_conf = json.load(f)
params = []
for item in game_list:
p = {'game_name': item['id_name'],
'timezone': item.get('timezone', 'Asia/Shanghai'),
'st': st,
'et': et
}
for item in game_list]
p.update(task_conf[task_name])
params.append(p)
with Pool(len(game_list)) as p:
p.map(run_task, params)

59
task/first_recharge.py Normal file
View File

@ -0,0 +1,59 @@
from pymongo import UpdateOne
from pydantic import BaseModel, Field, validator
import pandas as pd
from .task import Task
from utils import *
class FirstRecharge(Task):
    """
    Record each role's first recharge.

    For every cursor window the task reads rows from the source collection,
    validates them with ``Model`` and stores the validated row under the
    ``is_recharge`` key of the destination document for that role, but only
    when the document does not have that key yet.
    """

    class Model(BaseModel):
        """Validated shape of one source row."""
        role_level: int = Field(None, title='角色等级')
        role_vip: int = Field(None, title='vip等级')
        role_stage: IntStr = Field(None, title='关卡')
        money: IntFloat = Field(..., title='金额')
        game_role_id: str = Field(..., title='角色id', alias='_game_role_id')
        orderid: str = Field(..., title='订单号')
        proid: str = Field(..., title='计费点')
        cdate: int = Field(..., title='当天0点')

        @classmethod
        def get_fields(cls):
            """Return the alias of every declared field, in declaration order."""
            return [field.alias for field in cls.__fields__.values()]

    def cleaning(self, cursor_list):
        """
        Process every cursor window in ``cursor_list``.

        :param cursor_list: list of single-key dicts mapping a collection name
            to ``{'cursor_st': ..., 'cursor_et': ...}``
        """
        for cursor in cursor_list:  # type:dict
            for event_coll, ts in cursor.items():  # type:str,dict
                # An empty window has nothing to read.
                if ts['cursor_st'] == ts['cursor_et']:
                    continue

                logger.info(f'开始处理{self.game_name} 处理 {event_coll} 游标 {ts}')
                time_range = {'$gte': ts['cursor_st'], '$lt': ts['cursor_et']}
                where = {'_event_time': time_range}
                projection = self.Model.get_fields()

                bulk_data = []
                source = self.local_db[event_coll]
                for item in source.find(where, projection):
                    try:
                        # Midnight (in the task timezone) of the window start.
                        window_start = pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone)
                        item['cdate'] = int(window_start.normalize().timestamp())
                        data = self.Model(**item).dict(by_alias=True)
                        _game_role_id = data.pop('_game_role_id')
                        # Only match documents that have no recharge recorded yet.
                        match = {'_game_role_id': _game_role_id, 'is_recharge': {'$exists': False}}
                        change = {'$set': {'is_recharge': data}}
                        bulk_data.append(UpdateOne(match, change))
                    except Exception as e:
                        # A bad row is logged and skipped; the rest of the window continues.
                        logger.error(f'msg:{e}')

                if bulk_data:
                    dest = self.dest_coll
                    self.remote_db[dest].bulk_write(bulk_data, ordered=False)
                    self.local_db[dest].bulk_write(bulk_data, ordered=False)
                # NOTE(review): indentation was lost in the reviewed source; the
                # cursor is assumed to advance even when the window had no rows.
                self.set_cursor(cursor_st=ts['cursor_st'], cursor_et=ts['cursor_et'])

View File

@ -40,10 +40,11 @@ class SummaryFunc(Task):
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
.normalize().timestamp())
model = self.Model(**item)
data = model.dict()
data = model.dict(by_alias=True)
bulk_data.append(UpdateOne({'_id': data['_id']}, {'$set': data}, upsert=True))
except Exception as e:
logger.error(f'ftype {item["ftype"]} msg:{e}')
# pass
self.remote_db[self.task_name].bulk_write(bulk_data, ordered=False)
if bulk_data:
self.remote_db[self.dest_coll].bulk_write(bulk_data, ordered=False)
self.set_cursor(cursor_st=ts['cursor_st'], cursor_et=ts['cursor_et'])

View File

@ -17,6 +17,8 @@ class Task(metaclass=abc.ABCMeta):
def __init__(self, *args, **kwargs):
self.game_name = kwargs.get('game_name')
self.game_db = f'game_{self.game_name}'
self.source_coll = kwargs.get('source_coll')
self.dest_coll = kwargs.get('dest_coll')
self.cursor_st = kwargs.get('st')
self.cursor_et = kwargs.get('et')
self.timezone = kwargs.get('timezone')
@ -83,16 +85,31 @@ class Task(metaclass=abc.ABCMeta):
self.task_coll.update_one(self.task_where, {
'$set': kwargs}, upsert=True)
def get_event_coll(self) -> list:
"""
根据游标时间戳 返回要处理的集合
:return: [{'event_2020-12-10': {'cursor_st': 1607608848, 'cursor_et': 1607610648}}, {'event_2020-12-10': {'cursor_st': 1607610648, 'cursor_et': 1607610791}}]
"""
def generate_cursor_time(self):
    """
    Split the span from ``cursor_st`` to ``cursor_et`` into 30-minute windows.

    Both cursors are read as epoch seconds and expressed in ``self.timezone``.

    :return: DataFrame indexed by window start, with columns ``st`` (window
        start) and ``et`` (window end). Each window ends where the next one
        starts; the last window ends at ``cursor_et``.
    """
    start = pd.Timestamp(self.cursor_st, unit='s', tz=self.timezone)
    end = pd.Timestamp(self.cursor_et, unit='s', tz=self.timezone)
    # '30min' instead of the '30T' alias: 'T' is deprecated in recent pandas,
    # while 'min' is accepted by old and new versions alike.
    date_index = pd.date_range(start, end, freq='30min')
    df = pd.DataFrame(index=date_index)
    df['st'] = df.index
    # Shift the starts by one and close the final window with the end cursor.
    df['et'] = np.append(df.index[1:], [end])
    return df
def get_single_coll(self) -> list:
    """
    Build the cursor list for a task that reads one fixed source collection.

    :return: one single-key dict per time window, keyed by ``self.source_coll``,
        e.g. [{source_coll: {'cursor_st': 1607608848, 'cursor_et': 1607610648}}]
    """
    df = self.generate_cursor_time()
    # Walk the two columns in lockstep rather than transposing the whole frame
    # just to iterate its rows.
    return [
        {self.source_coll: {
            'cursor_st': int(st.timestamp()),
            'cursor_et': int(et.timestamp()),
        }}
        for st, et in zip(df['st'], df['et'])
    ]
def get_event_coll(self) -> list:
"""
根据游标时间戳 返回要处理的集合
:return: [{'event_2020-12-10': {'cursor_st': 1607608848, 'cursor_et': 1607610648}}, {'event_2020-12-10': {'cursor_st': 1607610648, 'cursor_et': 1607610791}}]
"""
df = self.generate_cursor_time()
df['event_coll_s'] = df['st'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
df['event_coll_e'] = df['et'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
cursor_list = []
@ -116,6 +133,12 @@ class Task(metaclass=abc.ABCMeta):
cursor_list.append(data)
return cursor_list
def get_source_coll(self) -> list:
    """
    Return the cursor list that matches the configured source collection.

    The ``'event'`` source goes through ``get_event_coll``; any other source
    goes through ``get_single_coll``.
    """
    builder = self.get_event_coll if self.source_coll == 'event' else self.get_single_coll
    return builder()
def set_run_status(self, status: bool):
"""
设置运行状态
@ -134,6 +157,6 @@ class Task(metaclass=abc.ABCMeta):
self.set_run_ts()
self.set_run_status(True)
self.get_cursor()
cursor_list = self.get_event_coll()
cursor_list = self.get_source_coll()
self.cleaning(cursor_list)
self.set_run_status(False)

View File

@ -1,4 +1,5 @@
from loguru import logger
from .field_type import *
logger.add('/data/log/data_cleaning/log.log', format="{time} {level} {name}:{line} {message}", level="INFO",
rotation="100 MB", retention='7 days',

4
utils/field_type.py Normal file
View File

@ -0,0 +1,4 @@
from typing import TypeVar
# Constrained type variable: a value that is either an int or a str.
IntStr = TypeVar('IntStr', int, str)
# Constrained type variable: a value that is either an int or a float.
IntFloat = TypeVar('IntFloat', int, float)