update
This commit is contained in:
parent
35739a1a92
commit
c05e827970
14
README.md
14
README.md
@ -11,3 +11,17 @@ xdata 数据清洗
|
||||
5. 清洗数据入库。
|
||||
6. 设置本次操作游标。
|
||||
7. 标记运行结束。run=0
|
||||
|
||||
|
||||
|
||||
## 注意事项
|
||||
|
||||
事件时间和入库时间存在偏差
|
||||
|
||||
eg:
|
||||
|
||||
事件A时间在 39分57秒发生;入库时间在 40分32秒;
|
||||
|
||||
任务在 40分0秒 执行,清洗 30分0秒~40分0秒 的数据;此时事件A尚未入库,因此被遗漏。
|
||||
|
||||
解决办法:以入库时间为游标。
|
@ -18,6 +18,7 @@ class FirstRecharge(Task):
|
||||
role_level: int = Field(None, title='角色等级')
|
||||
role_vip: int = Field(None, title='vip等级')
|
||||
role_stage: IntStr = Field(None, title='关卡')
|
||||
event_time: int = Field(..., title="事件时间", alias='_event_time')
|
||||
money: IntFloat = Field(..., title='金额')
|
||||
game_role_id: str = Field(..., min_length=1, title='角色id', alias='_game_role_id')
|
||||
orderid: str = Field(..., min_length=1, title='订单号')
|
||||
@ -35,7 +36,7 @@ class FirstRecharge(Task):
|
||||
continue
|
||||
logger.info(f'开始处理{self.game_name} 处理 {source_coll} 游标 {ts}')
|
||||
where = {
|
||||
'_event_time': {
|
||||
'_ut': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
}
|
||||
@ -45,7 +46,7 @@ class FirstRecharge(Task):
|
||||
bulk_data = []
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
try:
|
||||
item['cdate'] = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone) \
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
model = self.Model(**item)
|
||||
data = model.dict(by_alias=True)
|
||||
|
@ -57,7 +57,7 @@ class SummaryAssets(Task):
|
||||
'$or': [{'prize.a': {'$in': list(a)}, 'prize.t': {'$in': list(t)}},
|
||||
{'need.a': {'$in': list(a)}, 'need.t': {'$in': list(t)}}],
|
||||
'_event_name': 'res',
|
||||
'_event_time': {
|
||||
'_ut': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
}
|
||||
@ -65,11 +65,11 @@ class SummaryAssets(Task):
|
||||
|
||||
projection = self.Model.get_fields()
|
||||
bulk_data = []
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone).normalize().timestamp())
|
||||
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
try:
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
model = self.Model(**item)
|
||||
data = model.dict(by_alias=True)
|
||||
bulk_data.append(UpdateOne({'_id': data['_id']}, {'$set': data}, upsert=True))
|
||||
|
@ -30,7 +30,7 @@ class SummaryFunc(Task):
|
||||
continue
|
||||
logger.info(f'开始处理{self.game_name} 处理 {source_coll} 游标 {ts}')
|
||||
where = {
|
||||
'_event_name': 'Func',
|
||||
'_ut': 'Func',
|
||||
'_event_time': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
@ -39,10 +39,10 @@ class SummaryFunc(Task):
|
||||
|
||||
projection = self.Model.get_fields()
|
||||
bulk_data = []
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone).normalize().timestamp())
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
try:
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
model = self.Model(**item)
|
||||
data = model.dict(by_alias=True)
|
||||
bulk_data.append(UpdateOne({'_id': data['_id']}, {'$set': data}, upsert=True))
|
||||
|
@ -33,17 +33,17 @@ class SummaryFunnel(Task):
|
||||
where = {
|
||||
'_event_name': 'Guide',
|
||||
'step': {'$in': step_list},
|
||||
'_event_time': {
|
||||
'_ut': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
}
|
||||
}
|
||||
|
||||
bulk_data = []
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone).normalize().timestamp())
|
||||
for item in self.local_db[source_coll].find(where): # 所有字段
|
||||
try:
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
model = self.Model(**item)
|
||||
data = model.dict(by_alias=True)
|
||||
data.update(item)
|
||||
|
@ -41,20 +41,20 @@ class SummaryJoinHd(Task):
|
||||
continue
|
||||
logger.info(f'开始处理{self.game_name} 处理 {source_coll} 游标 {ts}')
|
||||
where = {
|
||||
'_event_name': 'res',
|
||||
'_ut': 'res',
|
||||
'function': 'hdgetprize',
|
||||
'_event_time': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
}
|
||||
}
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone).normalize().timestamp())
|
||||
projection = self.Model.get_fields()
|
||||
projection.extend(['function_data', 'function_detail'])
|
||||
bulk_data = []
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
try:
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
item['htype'] = str(item['function_detail'])
|
||||
item['hd_idx'] = 0
|
||||
if isinstance(item['function_data'], dict):
|
||||
|
@ -29,7 +29,7 @@ class SummaryOpenHd(Task):
|
||||
'_event_name': 'Activity',
|
||||
'act': 'click',
|
||||
'htype': {"$exists": 1},
|
||||
'_event_time': {
|
||||
'_ut': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
}
|
||||
@ -37,11 +37,11 @@ class SummaryOpenHd(Task):
|
||||
|
||||
projection = self.Model.get_fields()
|
||||
bulk_data = []
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone).normalize().timestamp())
|
||||
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
try:
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
item['htype'] = str(item['htype'])
|
||||
model = self.Model(**item)
|
||||
data = model.dict(by_alias=True)
|
||||
|
@ -39,13 +39,11 @@ class SummaryPay(Task):
|
||||
continue
|
||||
logger.info(f'开始处理{self.game_name} 处理 {source_coll} 游标 {ts}')
|
||||
where = {
|
||||
'_event_time': {
|
||||
'_ut': {
|
||||
'$gte': ts['cursor_st'],
|
||||
'$lt': ts['cursor_et'],
|
||||
}
|
||||
}
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
projection = self.Model.get_fields()
|
||||
bulk_data = []
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
@ -54,7 +52,8 @@ class SummaryPay(Task):
|
||||
if orderid.startswith('GM_') or \
|
||||
orderid.startswith('debugPay'):
|
||||
continue
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
user_info = self.local_db['user'].find_one({'_game_role_id': item['_game_role_id']}, projection)
|
||||
for k, v in user_info.items():
|
||||
item[k] = item.get(k) or user_info[k]
|
||||
|
@ -36,7 +36,7 @@ class SummaryShopbuy(Task):
|
||||
continue
|
||||
logger.info(f'开始处理{self.game_name} 处理 {source_coll} 游标 {ts}')
|
||||
where = {
|
||||
'_event_name': 'Shop',
|
||||
'_ut': 'Shop',
|
||||
"act": "buy",
|
||||
'_event_time': {
|
||||
'$gte': ts['cursor_st'],
|
||||
@ -46,10 +46,10 @@ class SummaryShopbuy(Task):
|
||||
|
||||
projection = self.Model.get_fields()
|
||||
bulk_data = []
|
||||
cdate = int(pd.Timestamp(ts['cursor_st'], unit='s', tz=self.timezone).normalize().timestamp())
|
||||
for item in self.local_db[source_coll].find(where, projection):
|
||||
try:
|
||||
item['cdate'] = cdate
|
||||
item['cdate'] = int(pd.Timestamp(item['_event_time'], unit='s', tz=self.timezone) \
|
||||
.normalize().timestamp())
|
||||
item['needa'] = item['need'][0]['a']
|
||||
item['needt'] = item['need'][0]['t']
|
||||
item['needn'] = item['need'][0]['n']
|
||||
|
Loading…
Reference in New Issue
Block a user