This commit is contained in:
wuhao 2020-12-11 10:51:07 +08:00
parent a37ea1e69b
commit 4934372b42
9 changed files with 342 additions and 1 deletions

2
.gitignore vendored
View File

@ -128,4 +128,4 @@ dmypy.json
# Pyre type checker
.pyre/
.idea/

14
db/__init__.py Normal file
View File

@ -0,0 +1,14 @@
import pymongo
from .model import GBaseModel
from settings import settings
def get_local_db(db_name):
    """Return the database named ``db_name`` from the local MongoDB server.

    A new ``MongoClient`` is created from ``settings.local_mongo_uri`` on
    every call.

    :param db_name: name of the database to select
    :return: the ``db_name`` database handle of the new client
    """
    return pymongo.MongoClient(settings.local_mongo_uri)[db_name]
def get_remote_db(db_name):
    """Return the database named ``db_name`` from the remote MongoDB server.

    A new ``MongoClient`` is created from ``settings.remote_mongo_uri`` on
    every call.

    :param db_name: name of the database to select
    :return: the ``db_name`` database handle of the new client
    """
    return pymongo.MongoClient(settings.remote_mongo_uri)[db_name]

89
db/model.py Normal file
View File

@ -0,0 +1,89 @@
from typing import Union, Any
from pydantic import BaseModel, Field
from bson.objectid import ObjectId
class MdbObjectId(ObjectId):
    """``ObjectId`` subtype usable as a pydantic field type."""

    @classmethod
    def __get_validators__(cls):
        # pydantic hook: yields the validators to run for this type.
        yield cls.validate

    @classmethod
    def validate(cls, v):
        """Return ``v`` unchanged when it is an ``ObjectId``.

        :raises TypeError: if ``v`` is not an ``ObjectId`` instance
        """
        if isinstance(v, ObjectId):
            return v
        raise TypeError('ObjectId required')
class GBaseModel(BaseModel):
    """Base model for event documents.

    Field names that start with ``_`` clash with protected-attribute naming,
    so the leading underscore is moved to the end of the name when data is
    loaded (``_id`` -> ``id_``) and moved back to the front when the model is
    exported with :meth:`dict` or listed with :meth:`get_fields`.
    """
    # NOTE(review): the title of ``id_`` duplicates the one of ``platform_``;
    # looks like a copy-paste leftover - confirm before changing.
    id_: MdbObjectId = Field(..., title="平台")
    platform_: str = Field(None, title="平台")  # platform
    channel_name_: str = Field(None, title="channel")
    owner_name_: str = Field(None, title="owner")
    channel_uid_: str = Field(None, title="channel_uid")
    device_id_: str = Field(None, title='device_id')
    district_server_id_: int = Field(None, title="区服id")  # district/server id
    game_role_id_: str = Field(None, title="角色id")  # role id
    event_time_: int = Field(..., title="事件时间")  # event time
    role_create_time: int = Field(None, title="角色创建时间")  # role creation time
    role_level: int = Field(None, title="角色等级")  # role level
    role_vip: int = Field(None, title="角色vip等级")  # role VIP level

    def __init__(self, **data: Any):
        """Build the model from raw keys, moving a leading ``_`` to the end.

        A 24-character string ``_id`` is converted to ``ObjectId`` first.
        """
        raw_id = data.get('_id')
        if isinstance(raw_id, str) and len(raw_id) == 24:
            data['_id'] = ObjectId(raw_id)
        renamed = {
            (key[1:] + '_' if key.startswith('_') else key): value
            for key, value in data.items()
        }
        super().__init__(**renamed)

    def dict(
            self,
            *,
            include: Union['AbstractSetIntStr', 'MappingIntStrAny'] = None,
            exclude: Union['AbstractSetIntStr', 'MappingIntStrAny'] = None,
            by_alias: bool = False,
            skip_defaults: bool = None,
            exclude_unset: bool = False,
            exclude_defaults: bool = False,
            exclude_none: bool = False,
    ) -> 'DictStrAny':
        """Export the model as a dict with the original ``_``-prefixed keys.

        All keyword options are forwarded to ``BaseModel.dict``; previously
        they were accepted but ignored. ``include``/``exclude`` refer to the
        model's own field names (with the trailing underscore).

        :return: mapping whose keys ending in ``_`` have the underscore moved
            back to the front (``id_`` -> ``_id``)
        """
        options = {
            'include': include,
            'exclude': exclude,
            'by_alias': by_alias,
            'exclude_unset': exclude_unset,
            'exclude_defaults': exclude_defaults,
            'exclude_none': exclude_none,
        }
        # Only forward the legacy flag when the caller actually set it.
        if skip_defaults is not None:
            options['skip_defaults'] = skip_defaults
        exported = super().dict(**options)
        return {
            ('_' + key[:-1] if key.endswith('_') else key): value
            for key, value in exported.items()
        }

    class Config:
        arbitrary_types_allowed = True

    @classmethod
    def get_fields(cls):
        """Return the declared field names in their ``_``-prefixed form."""
        return [
            '_' + name[:-1] if name.endswith('_') else name
            for name in cls.__fields__
        ]
if __name__ == '__main__':
    # Manual smoke check: build a model from raw keys and print the exported
    # dict and one attribute.
    payload = {
        '_id': "5fd0f4812de17aeba6c1a374",
        'role_level': '2',
        'aaa': 123,
        '_platform': 13566,
    }
    sample = GBaseModel(**payload)
    print(sample.dict())
    print(sample.role_level)

37
main.py Normal file
View File

@ -0,0 +1,37 @@
import sys
from multiprocessing import Pool
from importlib import import_module
from db import *
def get_game() -> list:
    """Return every document of the ``game`` collection in the local ``admin_game`` database."""
    game_coll = get_local_db('admin_game')['game']
    return list(game_coll.find())
def run_task(kwargs):
    """Import the task module named by ``kwargs['task_name']`` and run it.

    The module ``task.<task_name>`` is imported, the class whose name is the
    CamelCase form of ``task_name`` (``summary_func`` -> ``SummaryFunc``) is
    instantiated with ``**kwargs``, and its ``run()`` method is called.

    :param kwargs: task parameters; must contain ``task_name``
    :raises KeyError: if ``task_name`` is missing from ``kwargs``
    """
    # Use the value from kwargs rather than a module-level global, so the
    # function also works in worker processes that did not run the
    # ``__main__`` block and when it is imported from elsewhere.
    module_name = kwargs['task_name']
    class_name = ''.join(part.capitalize() for part in module_name.split('_'))
    module = import_module(f'.{module_name}', package='task')
    task_cls = getattr(module, class_name)
    task_cls(**kwargs).run()
if __name__ == '__main__':
    # Usage: <task_name> <st> <et>, e.g.: summary_func 0 0
    task_name, st, et = sys.argv[1:]
    st, et = int(st), int(et)
    game_list = get_game()
    # One parameter dict (and one worker process) per game.
    params = [
        {
            'game_name': item['id_name'],
            'task_name': task_name,
            'timezone': item.get('timezone', 'Asia/Shanghai'),
            'st': st,
            'et': et,
        }
        for item in game_list
    ]
    # Pool(0) raises ValueError, so skip the pool when there are no games.
    if params:
        with Pool(len(params)) as p:
            p.map(run_task, params)

15
settings.py Normal file
View File

@ -0,0 +1,15 @@
import os


class Config:
    """Connection settings shared by every environment.

    Each URI can be overridden with an environment variable
    (``LOCAL_MONGO_URI`` / ``REMOTE_MONGO_URI``); the committed values are
    kept only as backward-compatible fallbacks.
    """
    # NOTE(review): the fallback URIs embed credentials in source control;
    # rotate them and drop the defaults once deployments set the variables.
    local_mongo_uri = os.environ.get(
        'LOCAL_MONGO_URI',
        'mongodb://root:iamciniao@127.0.0.1:27017/?authSource=admin&readPreference=primary&ssl=false',
    )
    remote_mongo_uri = os.environ.get(
        'REMOTE_MONGO_URI',
        'mongodb://root:Legu2020@dds-hp35c58764c35aa41188-pub.mongodb.huhehaote.rds.aliyuncs.com:3717,dds-hp35c58764c35aa42908-pub.mongodb.huhehaote.rds.aliyuncs.com:3717/admin?replicaSet=mgset-421510732',
    )


class Production(Config):
    """Settings for the production environment."""
    DB_PREFIX = 'game'


class Debug(Config):
    """Settings for the debug environment."""
    DB_PREFIX = 'debug'


# Active settings class used by the rest of the project.
settings = Production

0
task/__init__.py Normal file
View File

42
task/summary_func.py Normal file
View File

@ -0,0 +1,42 @@
from pymongo import UpdateOne
from pydantic import Field
from .task import Task
from utils import *
from db import GBaseModel
class SummaryFunc(Task):
    """Feature ("Func" event) analysis task.

    Reads ``Func`` events from the local per-day event collections, validates
    them with :class:`Model`, and upserts them into the remote collection
    named after the task.
    """

    class Model(GBaseModel):
        prize: list = Field(None, title='奖励')  # rewards
        need: list = Field(None, title='消耗')  # costs
        ftype: str = Field(..., title='功能')  # feature type
        data: dict = Field(None, title='功能数据')  # feature payload

    def cleaning(self, cursor_list):
        """Copy the ``Func`` events of every cursor window to the remote database.

        :param cursor_list: list of ``{collection_name: {'cursor_st': int,
            'cursor_et': int}}`` mappings, as consumed by the loops below
        """
        projection = self.Model.get_fields()
        for cursor in cursor_list:  # type:dict
            for event_coll, ts in cursor.items():  # type:str,dict
                logger.info(f'开始处理{self.game_name} 处理 {event_coll} ...')
                where = {
                    '_event_name': 'Func',
                    '_event_time': {
                        '$gte': ts['cursor_st'],
                        '$lt': ts['cursor_et'],
                    }
                }
                bulk_data = []
                for item in self.local_db[event_coll].find(where, projection):
                    try:
                        data = self.Model(**item).dict()
                        bulk_data.append(UpdateOne({'_id': data['_id']}, {'$set': data}, upsert=True))
                    except Exception as e:
                        # Best effort: a document that fails validation is
                        # logged and skipped so the rest of the window is kept.
                        logger.error(e)
                # pymongo rejects an empty request list, so only write when
                # the window actually produced operations.
                if bulk_data:
                    self.remote_db[self.task_name].bulk_write(bulk_data, ordered=False)
                self.set_cursor(cursor_st=ts['cursor_st'], cursor_et=ts['cursor_et'])

139
task/task.py Normal file
View File

@ -0,0 +1,139 @@
import abc
import time
import pandas as pd
import numpy as np
from db import *
from utils import *
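"""
If the cursor range is too large, split it into 30-minute chunks before writing.
Use unordered bulk operations (bulk_write) to improve write performance.
"""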
class Task(metaclass=abc.ABCMeta):
    """Base class for cursor-driven cleaning tasks.

    A task keeps its state (run flag, last run timestamp, cursor) in the
    ``task2`` collection of the game's local database, splits the time range
    to process into 30-minute windows, and hands those windows to
    :meth:`cleaning`, which subclasses implement.
    """

    def __init__(self, *args, **kwargs):
        """Read task parameters from ``kwargs`` and open the databases.

        Keys read: ``game_name``, ``st``, ``et``, ``timezone``, ``task_name``.
        """
        self.game_name = kwargs.get('game_name')
        self.game_db = f'game_{self.game_name}'
        self.cursor_st = kwargs.get('st')
        self.cursor_et = kwargs.get('et')
        self.timezone = kwargs.get('timezone')
        self.task_name = kwargs.get("task_name")
        self.local_db = get_local_db(self.game_db)
        self.remote_db = get_remote_db(self.game_db)
        self.task_coll = self.local_db['task2']
        self.task_where = {
            'name': self.task_name
        }
        self.task_info = self.get_task_info()

    def get_task_info(self):
        """Return the stored state document of this task, or ``{}`` if none exists."""
        return self.task_coll.find_one(self.task_where) or {}

    def check_run(self) -> bool:
        """Return True when the task is not flagged as running.

        When the task is flagged as running, False is returned both when the
        previous run has exceeded ``time_out`` seconds and when it has not;
        only the logged message differs.
        """
        is_run = self.task_info.get('is_run')
        last_ts = self.task_info.get('run_ts', 0)
        time_out = self.task_info.get('time_out', 86400)
        if not is_run:
            # First run, or the previous run finished: allowed to run.
            return True
        if int(time.time()) - last_ts > time_out:
            # The previous run timed out.
            # todo: send a DingTalk notification
            logger.info('钉钉通知')
            return False
        # Still running and not timed out.
        logger.info('正在运行没超时')
        return False

    def set_run_ts(self):
        """Store the current time as the task's ``run_ts``."""
        self.task_coll.update_one(self.task_where, {
            '$set': {'run_ts': int(time.time())}
        }, upsert=True)

    def get_cursor(self):
        """Fill in the cursor bounds that were not set manually.

        Without a manual start, continue from the ``cursor_et`` stored in the
        task info; on the very first run use midnight of the current day in
        the task's timezone. Without a manual end, use the current time.

        :return: None
        """
        if not self.cursor_st:
            self.cursor_st = self.task_info.get('cursor_et')
        if not self.cursor_st:
            self.cursor_st = int(pd.Timestamp(time.time(), unit='s', tz=self.timezone).normalize().timestamp())
        if not self.cursor_et:
            self.cursor_et = int(time.time())

    def set_cursor(self, **kwargs):
        """Persist the cursor once a window has been processed.

        :param kwargs: only ``cursor_st`` and/or ``cursor_et`` are accepted
        :raises ValueError: if any other key is passed
        :return: None
        """
        # Reject any key outside the allowed pair. The previous strict-superset
        # test only fired when both valid keys were present as well.
        unexpected = set(kwargs) - {'cursor_st', 'cursor_et'}
        if unexpected:
            raise ValueError('设置游标不合理')
        self.task_coll.update_one(self.task_where, {
            '$set': kwargs}, upsert=True)

    def get_event_coll(self) -> list:
        """Split the cursor range into 30-minute windows per event collection.

        A window that crosses midnight is split into two entries, one for
        each day's collection.

        :return: e.g. [{'event_2020-12-10': {'cursor_st': 1607608848, 'cursor_et': 1607610648}}, {'event_2020-12-10': {'cursor_st': 1607610648, 'cursor_et': 1607610791}}]
        """
        end_ts = pd.Timestamp(self.cursor_et, unit='s', tz=self.timezone)
        # '30min' is the non-deprecated spelling of the old '30T' alias.
        date_index = pd.date_range(pd.Timestamp(self.cursor_st, unit='s', tz=self.timezone),
                                   end_ts, freq='30min')
        df = pd.DataFrame(index=date_index)
        df['st'] = df.index
        # Each window ends where the next one starts; the last ends at cursor_et.
        df['et'] = np.append(df.index[1:], [end_ts])
        df['event_coll_s'] = df['st'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
        df['event_coll_e'] = df['et'].apply(lambda x: f'event_{x.date().strftime("%Y-%m-%d")}')
        cursor_list = []
        for k, item in df.T.items():
            data = {}
            if item['event_coll_s'] != item['event_coll_e']:
                # The window crosses midnight: split it at 00:00 of the end day.
                midnight = int(item['et'].normalize().timestamp())
                data[item['event_coll_s']] = {
                    'cursor_st': int(item['st'].timestamp()),
                    'cursor_et': midnight,
                }
                data[item['event_coll_e']] = {
                    'cursor_st': midnight,
                    'cursor_et': int(item['et'].timestamp()),
                }
            else:
                data[item['event_coll_s']] = {
                    'cursor_st': int(item['st'].timestamp()),
                    'cursor_et': int(item['et'].timestamp()),
                }
            cursor_list.append(data)
        return cursor_list

    def set_run_status(self, status: bool):
        """Store the running flag of the task.

        :param status: value written to ``is_run``
        :return: None
        """
        self.task_coll.update_one(self.task_where, {'$set': {'is_run': status}}, upsert=True)

    @abc.abstractmethod
    def cleaning(self, cursor_list):
        """Process the windows produced by :meth:`get_event_coll`."""
        pass

    def run(self):
        """Run the task unless :meth:`check_run` refuses.

        :return: the string '运行中...' when the task is not allowed to run,
            otherwise None
        """
        if not self.check_run():
            return '运行中...'
        self.set_run_ts()
        self.set_run_status(True)
        try:
            self.get_cursor()
            cursor_list = self.get_event_coll()
            self.cleaning(cursor_list)
        finally:
            # Always clear the flag, otherwise a failed run would leave the
            # task marked as running and block every later run.
            self.set_run_status(False)

5
utils/__init__.py Normal file
View File

@ -0,0 +1,5 @@
from loguru import logger
# Register a file sink on the shared logger when the package is imported.
logger.add(
    '/data/log/data_cleaning/log.log',
    format="{time} {level} {name}:{line} {message}",
    level="INFO",
    rotation="100 MB",
    retention='7 days',
    enqueue=True,
)