Compare commits
4 Commits
ff0dffb36f
...
0db503885b
Author | SHA1 | Date | |
---|---|---|---|
0db503885b | |||
c0b829dfb4 | |||
a6e9845062 | |||
9912567004 |
12
config.json
Normal file
12
config.json
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"summary_func": {
|
||||||
|
"source_coll": "event",
|
||||||
|
"dest_coll": "summary_func",
|
||||||
|
"task_name": "summary_func"
|
||||||
|
},
|
||||||
|
"first_recharge": {
|
||||||
|
"source_coll": "paylist",
|
||||||
|
"dest_coll": "user",
|
||||||
|
"task_name": "first_recharge"
|
||||||
|
}
|
||||||
|
}
|
87
db/model.py
87
db/model.py
@ -1,5 +1,3 @@
|
|||||||
from typing import Union, Any
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from bson.objectid import ObjectId
|
from bson.objectid import ObjectId
|
||||||
|
|
||||||
@ -11,79 +9,36 @@ class MdbObjectId(ObjectId):
|
|||||||
|
|
||||||
@classmethod
def validate(cls, v):
    """Coerce ``v`` into a ``bson`` ``ObjectId``.

    :param v: the raw value; anything the ``ObjectId`` constructor accepts.
    :return: the ``ObjectId`` built from ``v`` (a plain ``ObjectId``, not an
        instance of ``cls``).
    :raises TypeError: when ``ObjectId(v)`` fails for any reason. The original
        exception is attached as ``__cause__``.
    """
    try:
        res = ObjectId(v)
    except Exception as err:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not turned into a TypeError.
        # The message roughly means "cannot be converted to ObjectId".
        raise TypeError('不能装换为 ObjectId') from err
    return res
|
||||||
|
|
||||||
|
|
||||||
class GBaseModel(BaseModel):
    """Pydantic model for documents whose common keys start with ``_``.

    Each underscore-prefixed document key is mapped onto a plain attribute
    name through ``alias``, so a document is loaded as ``GBaseModel(**doc)``
    and written back with ``obj.dict(by_alias=True)``.
    """
    # NOTE(review): the title of ``id`` is the same string as ``platform``'s
    # ("platform"); this looks like a copy-paste leftover - confirm.
    id: MdbObjectId = Field(..., title="平台", alias='_id')
    # platform
    platform: str = Field(None, title="平台", alias='_platform')
    channel_name: str = Field(None, title="channel", alias='_channel_name')
    owner_name: str = Field(None, title="owner", alias='_owner_name')
    channel_uid: str = Field(None, title="channel_uid", alias='_channel_uid')
    device_id: str = Field(None, title='device_id', alias='_device_id')
    # district / server id
    district_server_id: int = Field(None, title="区服id", alias='_district_server_id')
    # role id
    game_role_id: str = Field(None, title="角色id", alias='_game_role_id')
    # event time (required)
    event_time: int = Field(..., title="事件时间", alias='_event_time')

    # role creation time
    role_create_time: int = Field(None, title="角色创建时间")
    # role level
    role_level: int = Field(None, title="角色等级")
    # role VIP level
    role_vip: int = Field(None, title="角色vip等级")
    # stage
    role_stage: int = Field(None, title="关卡")

    @classmethod
    def get_fields(cls):
        """Return the alias of every declared field, in declaration order.

        Fields declared without an explicit ``alias`` are expected to report
        their own name here (pydantic v1 behaviour - confirm against the
        installed version).
        """
        return [v.alias for v in cls.__fields__.values()]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Manual smoke test: build a model from a document-shaped dict (including
    # one key, ``aaa``, that the model does not declare) and print the field
    # aliases followed by the alias-keyed dump.
    sample_doc = {
        '_id': "5fd0f4812de17aeba6c1a373",
        'role_level': '2',
        'aaa': 123,
        '_platform': 13566,
        '_event_time': 123456789,
    }
    obj = GBaseModel(**sample_doc)
    print(GBaseModel.get_fields())
    print(obj.dict(by_alias=True))
|
||||||
|
23
main.py
23
main.py
@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool
|
||||||
|
|
||||||
@ -14,7 +15,7 @@ def get_game() -> list:
|
|||||||
|
|
||||||
def run_task(kwargs):
|
def run_task(kwargs):
|
||||||
module_name = kwargs.get('task_name')
|
module_name = kwargs.get('task_name')
|
||||||
class_name = ''.join([s.capitalize() for s in task_name.split('_')])
|
class_name = ''.join([s.capitalize() for s in module_name.split('_')])
|
||||||
module = import_module(f'.{module_name}', package='task')
|
module = import_module(f'.{module_name}', package='task')
|
||||||
c_obj = getattr(module, class_name)
|
c_obj = getattr(module, class_name)
|
||||||
obj = c_obj(**kwargs)
|
obj = c_obj(**kwargs)
|
||||||
@ -23,15 +24,21 @@ def run_task(kwargs):
|
|||||||
|
|
||||||
if __name__ == '__main__':
    # Usage: main.py <task_name> <st> <et>
    # eg: summary_func 0 0
    # eg: first_recharge 0 0
    if len(sys.argv) != 4:
        sys.exit('usage: main.py <task_name> <st> <et>')
    task_name, st, et = sys.argv[1:]
    st, et = int(st), int(et)
    game_list = get_game()

    with open('config.json', 'r', encoding='utf8') as f:
        task_conf = json.load(f)
    # Resolve the task's settings once, and fail with a readable message
    # instead of a KeyError raised from inside the per-game loop.
    if task_name not in task_conf:
        sys.exit(f'unknown task {task_name!r}; expected one of {sorted(task_conf)}')
    task_kwargs = task_conf[task_name]

    # One kwargs dict per game; the task's config entries are applied last, so
    # they win on a key clash exactly as ``dict.update`` did.
    params = [
        {
            'game_name': item['id_name'],
            'timezone': item.get('timezone', 'Asia/Shanghai'),
            'st': st,
            'et': et,
            **task_kwargs,
        }
        for item in game_list
    ]

    # Pool(0) raises ValueError, so only start workers when there is work.
    if params:
        with Pool(len(params)) as pool:
            pool.map(run_task, params)
|
||||||
|
59
task/first_recharge.py
Normal file
59
task/first_recharge.py
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
from pymongo import UpdateOne
|
||||||
|
from pydantic import BaseModel, Field, validator
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from .task import Task
|
||||||
|
from utils import *
|
||||||
|
|
||||||
|
|
||||||
|
class FirstRecharge(Task):
    """
    First-recharge record.

    Reads payment documents from the source collection in cursor windows and
    stores a snapshot of the payment under ``is_recharge`` on the matching
    document of the destination collection, but only where ``is_recharge``
    does not exist yet.
    """

    class Model(BaseModel):
        """Fields copied from a source document into ``is_recharge``."""
        role_level: int = Field(None, title='角色等级')  # role level
        role_vip: int = Field(None, title='vip等级')  # VIP level
        role_stage: IntStr = Field(None, title='关卡')  # stage
        money: IntFloat = Field(..., title='金额')  # amount
        game_role_id: str = Field(..., title='角色id', alias='_game_role_id')  # role id
        orderid: str = Field(..., title='订单号')  # order number
        proid: str = Field(..., title='计费点')  # billing point
        cdate: int = Field(..., title='当天0点')  # midnight of the day

        @classmethod
        def get_fields(cls):
            """Return the alias of every declared field (used as the query projection)."""
            return [v.alias for v in cls.__fields__.values()]

    def cleaning(self, cursor_list):
        """Process every cursor window and advance the stored cursor.

        :param cursor_list: list of ``{collection_name: {'cursor_st': int,
            'cursor_et': int}}`` dicts; windows whose start equals their end
            are skipped.
        """
        for cursor in cursor_list:  # type:dict
            for event_coll, ts in cursor.items():  # type:str,dict
                if ts['cursor_st'] == ts['cursor_et']:
                    continue
                logger.info(f'开始处理{self.game_name} 处理 {event_coll} 游标 {ts}')
                where = {
                    '_event_time': {
                        '$gte': ts['cursor_st'],
                        '$lt': ts['cursor_et'],
                    }
                }

                projection = self.Model.get_fields()
                # Midnight (in the task's timezone) of the day containing the
                # window start. It is the same for every document of the
                # window, so compute it once; an invalid timezone now fails
                # here instead of being logged per row while the cursor
                # still moves on.
                # NOTE(review): derived from the window start, not from each
                # payment's own time - confirm that is intended.
                window_start = pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone)
                cdate = int(window_start.normalize().timestamp())

                bulk_data = []
                for item in self.local_db[event_coll].find(where, projection):
                    try:
                        item['cdate'] = cdate
                        model = self.Model(**item)
                        data = model.dict(by_alias=True)
                        _game_role_id = data.pop('_game_role_id')
                        # The ``$exists: False`` filter keeps an already
                        # recorded first recharge from being overwritten.
                        # NOTE(review): with ordered=False below, which of
                        # several payments by one role in the same batch wins
                        # is not fixed by this code - confirm acceptable.
                        bulk_data.append(
                            UpdateOne({'_game_role_id': _game_role_id, 'is_recharge': {'$exists': False}},
                                      {'$set': {'is_recharge': data}}))
                    except Exception as e:
                        # A bad row is logged and skipped; the batch continues.
                        logger.error(f'msg:{e}')
                if bulk_data:
                    self.remote_db[self.dest_coll].bulk_write(bulk_data, ordered=False)
                    self.local_db[self.dest_coll].bulk_write(bulk_data, ordered=False)
                self.set_cursor(cursor_st=ts['cursor_st'], cursor_et=ts['cursor_et'])
|
@ -40,10 +40,11 @@ class SummaryFunc(Task):
|
|||||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||||
.normalize().timestamp())
|
.normalize().timestamp())
|
||||||
model = self.Model(**item)
|
model = self.Model(**item)
|
||||||
data = model.dict()
|
data = model.dict(by_alias=True)
|
||||||
bulk_data.append(UpdateOne({'_id': data['_id']}, {'$set': data}, upsert=True))
|
bulk_data.append(UpdateOne({'_id': data['_id']}, {'$set': data}, upsert=True))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f'ftype {item["ftype"]} msg:{e}')
|
logger.error(f'ftype {item["ftype"]} msg:{e}')
|
||||||
# pass
|
# pass
|
||||||
self.remote_db[self.task_name].bulk_write(bulk_data, ordered=False)
|
if bulk_data:
|
||||||
|
self.remote_db[self.dest_coll].bulk_write(bulk_data, ordered=False)
|
||||||
self.set_cursor(cursor_st=ts['cursor_st'], cursor_et=ts['cursor_et'])
|
self.set_cursor(cursor_st=ts['cursor_st'], cursor_et=ts['cursor_et'])
|
||||||
|
35
task/task.py
35
task/task.py
@ -17,6 +17,8 @@ class Task(metaclass=abc.ABCMeta):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.game_name = kwargs.get('game_name')
|
self.game_name = kwargs.get('game_name')
|
||||||
self.game_db = f'game_{self.game_name}'
|
self.game_db = f'game_{self.game_name}'
|
||||||
|
self.source_coll = kwargs.get('source_coll')
|
||||||
|
self.dest_coll = kwargs.get('dest_coll')
|
||||||
self.cursor_st = kwargs.get('st')
|
self.cursor_st = kwargs.get('st')
|
||||||
self.cursor_et = kwargs.get('et')
|
self.cursor_et = kwargs.get('et')
|
||||||
self.timezone = kwargs.get('timezone')
|
self.timezone = kwargs.get('timezone')
|
||||||
@ -83,16 +85,31 @@ class Task(metaclass=abc.ABCMeta):
|
|||||||
self.task_coll.update_one(self.task_where, {
|
self.task_coll.update_one(self.task_where, {
|
||||||
'$set': kwargs}, upsert=True)
|
'$set': kwargs}, upsert=True)
|
||||||
|
|
||||||
def get_event_coll(self) -> list:
|
def generate_cursor_time(self):
    """Split ``[cursor_st, cursor_et]`` into consecutive 30-minute windows.

    ``cursor_st`` / ``cursor_et`` are read as Unix seconds and localised to
    ``self.timezone``.

    :return: a DataFrame indexed by window start with two columns: ``st``
        (window start) and ``et`` (window end). The last row's ``et`` is
        ``cursor_et`` itself.
    """
    start = pd.Timestamp(self.cursor_st, unit='s', tz=self.timezone)
    end = pd.Timestamp(self.cursor_et, unit='s', tz=self.timezone)
    # '30min' replaces the deprecated 'T' spelling of the minute alias.
    date_index = pd.date_range(start, end, freq='30min')
    df = pd.DataFrame(index=date_index)
    df['st'] = df.index
    # Each window ends where the next one starts; the final one ends at ``end``.
    df['et'] = np.append(df.index[1:], [end])
    return df
|
||||||
|
|
||||||
|
def get_single_coll(self) -> list:
    """Build the cursor list for a task that reads one fixed collection.

    :return: one ``{self.source_coll: {'cursor_st': int, 'cursor_et': int}}``
        dict per row of ``generate_cursor_time()``, with both bounds as
        whole Unix seconds.
    """
    windows = self.generate_cursor_time()
    return [
        {self.source_coll: {
            'cursor_st': int(begin.timestamp()),
            'cursor_et': int(finish.timestamp()),
        }}
        for begin, finish in zip(windows['st'], windows['et'])
    ]
|
||||||
|
|
||||||
|
def get_event_coll(self) -> list:
|
||||||
|
"""
|
||||||
|
根据游标时间戳 返回要处理的集合
|
||||||
|
:return: [{'event_2020-12-10': {'cursor_st': 1607608848, 'cursor_et': 1607610648}}, {'event_2020-12-10': {'cursor_st': 1607610648, 'cursor_et': 1607610791}}]
|
||||||
|
"""
|
||||||
|
df = self.generate_cursor_time()
|
||||||
df['event_coll_s'] = df['st'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
|
df['event_coll_s'] = df['st'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
|
||||||
df['event_coll_e'] = df['et'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
|
df['event_coll_e'] = df['et'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
|
||||||
cursor_list = []
|
cursor_list = []
|
||||||
@ -116,6 +133,12 @@ class Task(metaclass=abc.ABCMeta):
|
|||||||
cursor_list.append(data)
|
cursor_list.append(data)
|
||||||
return cursor_list
|
return cursor_list
|
||||||
|
|
||||||
|
def get_source_coll(self) -> list:
    """Return the cursor list for this task's source.

    A ``source_coll`` of ``'event'`` uses ``get_event_coll()``; any other
    value uses ``get_single_coll()``.
    """
    if self.source_coll == 'event':
        builder = self.get_event_coll
    else:
        builder = self.get_single_coll
    return builder()
|
||||||
|
|
||||||
def set_run_status(self, status: bool):
|
def set_run_status(self, status: bool):
|
||||||
"""
|
"""
|
||||||
设置运行状态
|
设置运行状态
|
||||||
@ -134,6 +157,6 @@ class Task(metaclass=abc.ABCMeta):
|
|||||||
self.set_run_ts()
|
self.set_run_ts()
|
||||||
self.set_run_status(True)
|
self.set_run_status(True)
|
||||||
self.get_cursor()
|
self.get_cursor()
|
||||||
cursor_list = self.get_event_coll()
|
cursor_list = self.get_source_coll()
|
||||||
self.cleaning(cursor_list)
|
self.cleaning(cursor_list)
|
||||||
self.set_run_status(False)
|
self.set_run_status(False)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from .field_type import *
|
||||||
|
|
||||||
logger.add('/data/log/data_cleaning/log.log', format="{time} {level} {name}:{line} {message}", level="INFO",
|
logger.add('/data/log/data_cleaning/log.log', format="{time} {level} {name}:{line} {message}", level="INFO",
|
||||||
rotation="100 MB", retention='7 days',
|
rotation="100 MB", retention='7 days',
|
||||||
|
4
utils/field_type.py
Normal file
4
utils/field_type.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
from typing import TypeVar
|
||||||
|
|
||||||
|
# Constrained type variable: either ``int`` or ``str``.
IntStr = TypeVar('IntStr', int, str)
# Constrained type variable: either ``int`` or ``float``.
# NOTE(review): these are TypeVars used as plain field annotations, not
# ``Union`` aliases - confirm the validation library treats them as intended.
IntFloat = TypeVar('IntFloat', int, float)
|
Loading…
Reference in New Issue
Block a user