diff --git a/utils/re_to_jianli.py b/utils/re_to_jianli.py index 5283cc0..ef4a52b 100644 --- a/utils/re_to_jianli.py +++ b/utils/re_to_jianli.py @@ -283,7 +283,7 @@ def fmt_txt(chk_str): if age_str.split(':')[-1].isdigit(): dict_chk['age'] = int(age_str.split(':')[-1]) else: - dict_chk['age'] = int("".join(re.findall("\d+",age_str))) + dict_chk['age'] = int("".join(re.findall("\d+", age_str))) else: age1 = re.findall(r'[0-9]{2}.*?岁', true_chkStr, re.M) if age1: @@ -322,6 +322,14 @@ def fmt_txt(chk_str): # work_str2 = work_str2.split('项目简介')[0] project_undergo2 = project_undergo.strip().strip('\n') str_2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', work_str2) + if len(str_2) <= 1: + work_str2 = work_str2.replace(' 年', '/').replace('年', '/').replace('月', '').replace(' 月', '') + # find_str2_list = re.findall('[0-9]{4}年[0-9]{1,2} 月', work_str2) + # for i in find_str2_list: + # new_i_list = i.split('年') + # new_str = new_i_list[0] + ' 年' + new_i_list[1].replace(' ', '') + # work_str2 = work_str2.replace(i, new_str) + str_2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', work_str2) project_list = re.split('([0-9]{4}[/|.][0-9]{1,2})', project_undergo2) if len(project_list) <= 1: if ':' in project_undergo2: @@ -367,7 +375,8 @@ def fmt_txt(chk_str): if not p_str: continue if p_str[0].isdigit(): - dict_project['time'] = (p_str + project_name_time_str2[index + 1] + project_name_time_str2[ + dict_project['time'] = ( + p_str + project_name_time_str2[index + 1] + project_name_time_str2[ index + 2]).replace('.', '/') name_str = project_name_time_str2[index + 3].strip() if ' ' in name_str: @@ -442,7 +451,8 @@ def fmt_txt(chk_str): if len(project_list) < index2 + 1: break else: - new_str = i + project_list[index + 1] + project_list[index + 2] + project_list[index + 3] + new_str = i + project_list[index + 1] + project_list[index + 2] + project_list[ + index + 3] new_str_list1.append(new_str) if new_str_list1: for project_chk_str2 in new_str_list1: @@ -485,135 +495,185 @@ def fmt_txt(chk_str): project_str_list = project_str_i.split('\n') if project_str_list: dict_project1 = copy.deepcopy(dict_project) - dict_project1['name'] = project_str_list[0].split(':')[-1] + if ':' in project_str_list[0]: + dict_project1['name'] = project_str_list[0].split(':')[-1].strip() + else: + dict_project1['name'] = project_str_list[0].split(':')[-1].strip() chk_key = '' - for index, i in enumerate(project_str_list[1:]): + for i in project_str_list[1:]: if not i or i.isdigit(): continue - if ':' not in i and chk_key: + if ':' not in i and ':' not in i and chk_key: dict_project1[chk_key] += i continue - if i.startswith('开发周期'): - dict_project1['time'] = i.split(':')[-1] + if '开发周期' in i and (':' in i or ':' in i): + if ':' in i: + dict_project1['time'] = i.split(':')[-1] + else: + dict_project1['time'] = i.split(':')[-1] continue - if i.startswith('开发环境'): - dict_project1['comment'] += i.split(':')[-1] + if ('开发环境' in i or '项目描述' in i) and (':' in i or ':' in i): + if ':' in i: + dict_project1['comment'] += i.split(':')[-1] + else: + dict_project1['comment'] += i.split(':')[-1] chk_key = 'comment' continue - if i.startswith('功能模块'): - dict_project1['duty'] = i.split(':')[-1] - chk_key = 'duty' - continue - if i.startswith('项目描述'): - dict_project1['comment'] += i.split(':')[-1] - chk_key = 'comment' - continue - if i.startswith('技术要点'): - dict_project1['duty'] += i.split(':')[-1] + if ('模块' in i or '框架' in i or '技术要点' in i or '职责' in i) and (':' in i or ':' in i): + if ':' in i: + dict_project1['duty'] = i.split(':')[-1] + else: + dict_project1['duty'] = i.split(':')[-1] chk_key = 'duty' continue project_undergo_list.append(dict_project1) else: - # if re.findall('([0-9]{4}[/|.][0-9]{1,2})', project_undergo2): - index2 = -1 - name2 = '' - new_str_list1 = [] - for index, i in enumerate(project_list): - if index <= index2: - continue - if not i: - continue - if i.strip() not in ['-', '–', '―']: - index2 = index + 2 - if not name2: - name3 = project_list[index + 2].split('\n')[-1] - new_str = project_list[index - 2] + project_list[index - 1] + i + project_list[index + 1] + \ - project_list[index + 2].split(name3)[0] - name2 = name3 - else: - name3 = project_list[index + 2].split('\n')[-1] - if name2: - new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \ + if re.findall('([0-9]{4}[/|.][0-9]{1,2})', project_undergo2): + index2 = -1 + name2 = '' + new_str_list1 = [] + for index, i in enumerate(project_list): + if index <= index2: + continue + if not i: + continue + if i.strip() not in ['-', '–', '―']: + index2 = index + 2 + if not name2: + name3 = project_list[index + 2].split('\n')[-1] + new_str = project_list[index - 2] + project_list[index - 1] + i + project_list[ + index + 1] + \ project_list[index + 2].split(name3)[0] + name2 = name3 else: - new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \ - project_list[index + 2] - name2 = name3 - new_str_list1.append(new_str) - continue - if i.strip() not in ['-', '–', '―'] and ('-' in i or '–' in i or '―' in i): - index2 = index - if not name2: - name3 = i.split('\n')[-1] - new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0] - name2 = name3 - else: - name3 = i.split('\n')[-1] - new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0] - name2 = name3 - new_str_list1.append(new_str) - continue - if new_str_list1: - for project_str in new_str_list1: - project_name_time_str = project_str.split('\n')[0] - dict_project = { - 'name': '', - 'time': '', - 'comment': '', - 'work': '', - 'duty': '', - } - project_name_time_str2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', project_name_time_str) - if project_name_time_str2: - for index, p_str in enumerate(project_name_time_str2): - if not p_str: - continue - if p_str[0].isdigit(): - if u'\u4e00' <= project_name_time_str2[index + 1].strip()[0] <= u'\u9fff': - dict_project['time'] = p_str.replace('.', '/') - dict_project['name'] = project_name_time_str2[index + 1].split(' ')[-2] - dict_project['work'] = project_name_time_str2[index + 1].split(' ')[-1] - break - else: - dict_project['time'] = ( - p_str + project_name_time_str2[index + 1] + project_name_time_str2[ - index + 2]).replace('.', '/') - dict_project['name'] = project_name_time_str2[index - 1] + name3 = project_list[index + 2].split('\n')[-1] + if name2: + new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \ + project_list[index + 2].split(name3)[0] + else: + new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \ + project_list[index + 2] + name2 = name3 + new_str_list1.append(new_str) + continue + if i.strip() not in ['-', '–', '―'] and ('-' in i or '–' in i or '―' in i): + index2 = index + if not name2: + name3 = i.split('\n')[-1] + new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0] + name2 = name3 + else: + name3 = i.split('\n')[-1] + new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0] + name2 = name3 + new_str_list1.append(new_str) + continue + if new_str_list1: + for project_str in new_str_list1: + project_name_time_str = project_str.split('\n')[0] + dict_project = { + 'name': '', + 'time': '', + 'comment': '', + 'work': '', + 'duty': '', + } + project_name_time_str2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', project_name_time_str) + if project_name_time_str2: + for index, p_str in enumerate(project_name_time_str2): + if not p_str: + continue + if p_str[0].isdigit(): + if u'\u4e00' <= project_name_time_str2[index + 1].strip()[0] <= u'\u9fff': + dict_project['time'] = p_str.replace('.', '/') + dict_project['name'] = project_name_time_str2[index + 1].split(' ')[-2] + dict_project['work'] = project_name_time_str2[index + 1].split(' ')[-1] + break + else: + dict_project['time'] = ( + p_str + project_name_time_str2[index + 1] + project_name_time_str2[ + index + 2]).replace('.', '/') + dict_project['name'] = project_name_time_str2[index - 1] - break - project_chk_str2 = project_str.split(project_name_time_str)[-1] - project_chk_str2_list = re.split('(:|:)', project_chk_str2) - if project_chk_str2_list: - index3 = -1 - start_name = '' - new_p_chk_list = [] - for index, p_str3 in enumerate(project_chk_str2_list): - if index <= index3: - continue - if p_str3 in [':', ':']: - if not re.split('[\n|\t]', project_chk_str2_list[index + 1])[0].strip() and len( - project_chk_str2_list[index + 1]) <= 10: + break + project_chk_str2 = project_str.split(project_name_time_str)[-1] + project_chk_str2_list = re.split('(:|:)', project_chk_str2) + if project_chk_str2_list: + index3 = -1 + start_name = '' + new_p_chk_list = [] + for index, p_str3 in enumerate(project_chk_str2_list): + if index <= index3: continue - start_name = project_chk_str2_list[index + 1].split('\n')[-1] - if start_name: - new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \ - project_chk_str2_list[index + 1].split(start_name)[0] - else: - new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \ - project_chk_str2_list[index + 1] - new_p_chk_list.append(new_p_str) - if new_p_chk_list: - for p_str_true in new_p_chk_list: - if '开发环境' in p_str_true or '开发工具' in p_str_true or '开发技术' in p_str_true or '模块' in p_str_true: - dict_project['work'] += re.split('[:|:]', p_str_true)[-1] + if p_str3 in [':', ':']: + if not re.split('[\n|\t]', project_chk_str2_list[index + 1])[0].strip() and len( + project_chk_str2_list[index + 1]) <= 10: + continue + start_name = project_chk_str2_list[index + 1].split('\n')[-1] + if start_name: + new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \ + project_chk_str2_list[index + 1].split(start_name)[0] + else: + new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \ + project_chk_str2_list[index + 1] + new_p_chk_list.append(new_p_str) + if new_p_chk_list: + for p_str_true in new_p_chk_list: + if '开发环境' in p_str_true or '开发工具' in p_str_true or '开发技术' in p_str_true or '模块' in p_str_true: + dict_project['work'] += re.split('[:|:]', p_str_true)[-1] + continue + if '项目描述' in p_str_true or '功能介绍' in p_str_true: + dict_project['comment'] += re.split('[:|:]', p_str_true)[-1] + continue + if '职责' in p_str_true: + dict_project['duty'] += re.split('[:|:]', p_str_true)[-1] + continue + project_undergo_list.append(dict_project) + else: + dict_project = { + 'name': '', + 'time': '', + 'comment': '', + 'work': '', + 'duty': '', + } + for project_str_i in project_list[1:]: + if project_str_i: + project_str_list = project_str_i.split('\n') + if project_str_list: + dict_project1 = copy.deepcopy(dict_project) + if ':' in project_str_list[0]: + dict_project1['name'] = project_str_list[0].split(':')[-1].strip() + else: + dict_project1['name'] = project_str_list[0].split(':')[-1].strip() + chk_key = '' + for i in project_str_list[1:]: + if not i or i.isdigit(): continue - if '项目描述' in p_str_true or '功能介绍' in p_str_true: - dict_project['comment'] += re.split('[:|:]', p_str_true)[-1] + if ':' not in i and ':' not in i and chk_key: + dict_project1[chk_key] += i continue - if '职责' in p_str_true: - dict_project['duty'] += re.split('[:|:]', p_str_true)[-1] + if '开发周期' in i and (':' in i or ':' in i): + if ':' in i: + dict_project1['time'] = i.split(':')[-1] + else: + dict_project1['time'] = i.split(':')[-1] continue - project_undergo_list.append(dict_project) + if ('开发环境' in i or '项目描述' in i) and (':' in i or ':' in i): + if ':' in i: + dict_project1['comment'] += i.split(':')[-1] + else: + dict_project1['comment'] += i.split(':')[-1] + chk_key = 'comment' + continue + if ('模块' in i or '框架' in i or '技术要点' in i or '职责' in i) and (':' in i or ':' in i): + if ':' in i: + dict_project1['duty'] = i.split(':')[-1] + else: + dict_project1['duty'] = i.split(':')[-1] + chk_key = 'duty' + continue + project_undergo_list.append(dict_project1) dict_chk['project_undergo'] = project_undergo_list # 数字开头 @@ -720,12 +780,18 @@ def fmt_txt(chk_str): 'position_name': '', 'duty': '', } - new_str_list2 = new_str_list1[0].split(' ', 1) + if '(' in new_str_list1[0]: + new_str_list2 = new_str_list1[0].split('(', 1) + else: + new_str_list2 = new_str_list1[0].split(' ', 1) work_dict['company_name'] = new_str_list2[0] if ':' in new_str_list2[1]: work_dict['time'] = new_str_list2[1].split(':')[-1].replace('.', '/').strip() elif ':' in new_str_list2[1]: work_dict['time'] = new_str_list2[1].split(':')[-1].replace('.', '/').strip() + elif ')' in new_str_list2[1]: + date_list = re.findall('[0-9]{4}[/|.][0-9]{1,2}', new_str_list2[1]) + work_dict['time'] = date_list[0] + '-' + date_list[1] else: work_dict['time'] = new_str_list2[1].replace('.', '/').strip() if len(new_str_list1) > 1: @@ -736,6 +802,10 @@ def fmt_txt(chk_str): duty1 = new_str_list1[2].split('职责')[-1] duty = duty1.join((x for x in work_duty)) work_dict['duty'] = duty + if '负责' in new_str_list1[1]: + duty1 = new_str_list1[1].split('负责')[-1] + duty = duty1.join((x for x in new_str_list1[2:])) + work_dict['duty'] = duty work_list.append(work_dict) dict_chk['work_list'] = work_list review = '' @@ -2309,4 +2379,4 @@ egreat,海尔,MeleA20,MeleA31,LG1154,极米,杰科,亿典等机顶 """ - fmt_txt(chk_str9) + fmt_txt(chk_str10)