自制简历匹配

2022-09-22 18:45:39 +08:00 · 2022-09-22 18:45:39 +08:00 · 4de04a83ab
commit 4de04a83ab
parent 9b943ec14b
1 changed file with 185 additions and 115 deletions
--- a/utils/re_to_jianli.py
+++ b/utils/re_to_jianli.py
@ -283,7 +283,7 @@ def fmt_txt(chk_str):
        if age_str.split('：')[-1].isdigit():
            dict_chk['age'] = int(age_str.split('：')[-1])
        else:
-            dict_chk['age'] = int("".join(re.findall("\d+",age_str)))
+            dict_chk['age'] = int("".join(re.findall("\d+", age_str)))
    else:
        age1 = re.findall(r'[0-9]{2}.*?岁', true_chkStr, re.M)
        if age1:
@ -322,6 +322,14 @@ def fmt_txt(chk_str):
                # work_str2 = work_str2.split('项目简介')[0]
    project_undergo2 = project_undergo.strip().strip('\n')
    str_2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', work_str2)
+    if len(str_2) <= 1:
+        work_str2 = work_str2.replace(' 年', '/').replace('年', '/').replace('月', '').replace(' 月', '')
+        # find_str2_list = re.findall('[0-9]{4}年[0-9]{1,2} 月', work_str2)
+        # for i in find_str2_list:
+        #     new_i_list = i.split('年')
+        #     new_str = new_i_list[0] + ' 年' + new_i_list[1].replace(' ', '')
+        #     work_str2 = work_str2.replace(i, new_str)
+    str_2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', work_str2)
    project_list = re.split('([0-9]{4}[/|.][0-9]{1,2})', project_undergo2)
    if len(project_list) <= 1:
        if '：' in project_undergo2:
@ -367,7 +375,8 @@ def fmt_txt(chk_str):
                            if not p_str:
                                continue
                            if p_str[0].isdigit():
-                                dict_project['time'] = (p_str + project_name_time_str2[index + 1] + project_name_time_str2[
+                                dict_project['time'] = (
+                                        p_str + project_name_time_str2[index + 1] + project_name_time_str2[
                                    index + 2]).replace('.', '/')
                                name_str = project_name_time_str2[index + 3].strip()
                                if '  ' in name_str:
@ -442,7 +451,8 @@ def fmt_txt(chk_str):
                            if len(project_list) < index2 + 1:
                                break
                            else:
-                                new_str = i + project_list[index + 1] + project_list[index + 2] + project_list[index + 3]
+                                new_str = i + project_list[index + 1] + project_list[index + 2] + project_list[
+                                    index + 3]
                                new_str_list1.append(new_str)
                    if new_str_list1:
                        for project_chk_str2 in new_str_list1:
@ -485,135 +495,185 @@ def fmt_txt(chk_str):
                            project_str_list = project_str_i.split('\n')
                            if project_str_list:
                                dict_project1 = copy.deepcopy(dict_project)
-                                dict_project1['name'] = project_str_list[0].split('：')[-1]
+                                if '：' in project_str_list[0]:
+                                    dict_project1['name'] = project_str_list[0].split('：')[-1].strip()
+                                else:
+                                    dict_project1['name'] = project_str_list[0].split(':')[-1].strip()
                                chk_key = ''
-                                for index, i in enumerate(project_str_list[1:]):
+                                for i in project_str_list[1:]:
                                    if not i or i.isdigit():
                                        continue
-                                    if '：' not in i and chk_key:
+                                    if '：' not in i and ':' not in i and chk_key:
                                        dict_project1[chk_key] += i
                                        continue
-                                    if i.startswith('开发周期'):
-                                        dict_project1['time'] = i.split('：')[-1]
+                                    if '开发周期' in i and ('：' in i or ':' in i):
+                                        if '：' in i:
+                                            dict_project1['time'] = i.split('：')[-1]
+                                        else:
+                                            dict_project1['time'] = i.split(':')[-1]
                                        continue
-                                    if i.startswith('开发环境'):
-                                        dict_project1['comment'] += i.split('：')[-1]
+                                    if ('开发环境' in i or '项目描述' in i) and ('：' in i or ':' in i):
+                                        if '：' in i:
+                                            dict_project1['comment'] += i.split('：')[-1]
+                                        else:
+                                            dict_project1['comment'] += i.split(':')[-1]
                                        chk_key = 'comment'
                                        continue
-                                    if i.startswith('功能模块'):
-                                        dict_project1['duty'] = i.split('：')[-1]
-                                        chk_key = 'duty'
-                                        continue
-                                    if i.startswith('项目描述'):
-                                        dict_project1['comment'] += i.split('：')[-1]
-                                        chk_key = 'comment'
-                                        continue
-                                    if i.startswith('技术要点'):
-                                        dict_project1['duty'] += i.split('：')[-1]
+                                    if ('模块' in i or '框架' in i or '技术要点' in i or '职责' in i) and ('：' in i or ':' in i):
+                                        if '：' in i:
+                                            dict_project1['duty'] = i.split('：')[-1]
+                                        else:
+                                            dict_project1['duty'] = i.split(':')[-1]
                                        chk_key = 'duty'
                                        continue
                                project_undergo_list.append(dict_project1)
            else:
-                # if re.findall('([0-9]{4}[/|.][0-9]{1,2})', project_undergo2):
-                index2 = -1
-                name2 = ''
-                new_str_list1 = []
-                for index, i in enumerate(project_list):
-                    if index <= index2:
-                        continue
-                    if not i:
-                        continue
-                    if i.strip() not in ['-', '–', '―']:
-                        index2 = index + 2
-                        if not name2:
-                            name3 = project_list[index + 2].split('\n')[-1]
-                            new_str = project_list[index - 2] + project_list[index - 1] + i + project_list[index + 1] + \
-                                      project_list[index + 2].split(name3)[0]
-                            name2 = name3
-                        else:
-                            name3 = project_list[index + 2].split('\n')[-1]
-                            if name2:
-                                new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \
+                if re.findall('([0-9]{4}[/|.][0-9]{1,2})', project_undergo2):
+                    index2 = -1
+                    name2 = ''
+                    new_str_list1 = []
+                    for index, i in enumerate(project_list):
+                        if index <= index2:
+                            continue
+                        if not i:
+                            continue
+                        if i.strip() not in ['-', '–', '―']:
+                            index2 = index + 2
+                            if not name2:
+                                name3 = project_list[index + 2].split('\n')[-1]
+                                new_str = project_list[index - 2] + project_list[index - 1] + i + project_list[
+                                    index + 1] + \
                                          project_list[index + 2].split(name3)[0]
+                                name2 = name3
                            else:
-                                new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \
-                                          project_list[index + 2]
-                            name2 = name3
-                        new_str_list1.append(new_str)
-                        continue
-                    if i.strip() not in ['-', '–', '―'] and ('-' in i or '–' in i or '―' in i):
-                        index2 = index
-                        if not name2:
-                            name3 = i.split('\n')[-1]
-                            new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0]
-                            name2 = name3
-                        else:
-                            name3 = i.split('\n')[-1]
-                            new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0]
-                            name2 = name3
-                        new_str_list1.append(new_str)
-                        continue
-                if new_str_list1:
-                    for project_str in new_str_list1:
-                        project_name_time_str = project_str.split('\n')[0]
-                        dict_project = {
-                            'name': '',
-                            'time': '',
-                            'comment': '',
-                            'work': '',
-                            'duty': '',
-                        }
-                        project_name_time_str2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', project_name_time_str)
-                        if project_name_time_str2:
-                            for index, p_str in enumerate(project_name_time_str2):
-                                if not p_str:
-                                    continue
-                                if p_str[0].isdigit():
-                                    if u'\u4e00' <= project_name_time_str2[index + 1].strip()[0] <= u'\u9fff':
-                                        dict_project['time'] = p_str.replace('.', '/')
-                                        dict_project['name'] = project_name_time_str2[index + 1].split(' ')[-2]
-                                        dict_project['work'] = project_name_time_str2[index + 1].split(' ')[-1]
-                                        break
-                                    else:
-                                        dict_project['time'] = (
-                                                p_str + project_name_time_str2[index + 1] + project_name_time_str2[
-                                            index + 2]).replace('.', '/')
-                                        dict_project['name'] = project_name_time_str2[index - 1]
+                                name3 = project_list[index + 2].split('\n')[-1]
+                                if name2:
+                                    new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \
+                                              project_list[index + 2].split(name3)[0]
+                                else:
+                                    new_str = name2 + project_list[index - 1] + i + project_list[index + 1] + \
+                                              project_list[index + 2]
+                                name2 = name3
+                            new_str_list1.append(new_str)
+                            continue
+                        if i.strip() not in ['-', '–', '―'] and ('-' in i or '–' in i or '―' in i):
+                            index2 = index
+                            if not name2:
+                                name3 = i.split('\n')[-1]
+                                new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0]
+                                name2 = name3
+                            else:
+                                name3 = i.split('\n')[-1]
+                                new_str = str_2[index - 2] + str_2[index - 1] + i.split(name3)[0]
+                                name2 = name3
+                            new_str_list1.append(new_str)
+                            continue
+                    if new_str_list1:
+                        for project_str in new_str_list1:
+                            project_name_time_str = project_str.split('\n')[0]
+                            dict_project = {
+                                'name': '',
+                                'time': '',
+                                'comment': '',
+                                'work': '',
+                                'duty': '',
+                            }
+                            project_name_time_str2 = re.split('([0-9]{4}[/|.][0-9]{1,2})', project_name_time_str)
+                            if project_name_time_str2:
+                                for index, p_str in enumerate(project_name_time_str2):
+                                    if not p_str:
+                                        continue
+                                    if p_str[0].isdigit():
+                                        if u'\u4e00' <= project_name_time_str2[index + 1].strip()[0] <= u'\u9fff':
+                                            dict_project['time'] = p_str.replace('.', '/')
+                                            dict_project['name'] = project_name_time_str2[index + 1].split(' ')[-2]
+                                            dict_project['work'] = project_name_time_str2[index + 1].split(' ')[-1]
+                                            break
+                                        else:
+                                            dict_project['time'] = (
+                                                    p_str + project_name_time_str2[index + 1] + project_name_time_str2[
+                                                index + 2]).replace('.', '/')
+                                            dict_project['name'] = project_name_time_str2[index - 1]

-                                        break
-                        project_chk_str2 = project_str.split(project_name_time_str)[-1]
-                        project_chk_str2_list = re.split('(:|：)', project_chk_str2)
-                        if project_chk_str2_list:
-                            index3 = -1
-                            start_name = ''
-                            new_p_chk_list = []
-                            for index, p_str3 in enumerate(project_chk_str2_list):
-                                if index <= index3:
-                                    continue
-                                if p_str3 in [':', '：']:
-                                    if not re.split('[\n|\t]', project_chk_str2_list[index + 1])[0].strip() and len(
-                                            project_chk_str2_list[index + 1]) <= 10:
+                                            break
+                            project_chk_str2 = project_str.split(project_name_time_str)[-1]
+                            project_chk_str2_list = re.split('(:|：)', project_chk_str2)
+                            if project_chk_str2_list:
+                                index3 = -1
+                                start_name = ''
+                                new_p_chk_list = []
+                                for index, p_str3 in enumerate(project_chk_str2_list):
+                                    if index <= index3:
                                        continue
-                                    start_name = project_chk_str2_list[index + 1].split('\n')[-1]
-                                    if start_name:
-                                        new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \
-                                                    project_chk_str2_list[index + 1].split(start_name)[0]
-                                    else:
-                                        new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \
-                                                    project_chk_str2_list[index + 1]
-                                    new_p_chk_list.append(new_p_str)
-                            if new_p_chk_list:
-                                for p_str_true in new_p_chk_list:
-                                    if '开发环境' in p_str_true or '开发工具' in p_str_true or '开发技术' in p_str_true or '模块' in p_str_true:
-                                        dict_project['work'] += re.split('[:|：]', p_str_true)[-1]
+                                    if p_str3 in [':', '：']:
+                                        if not re.split('[\n|\t]', project_chk_str2_list[index + 1])[0].strip() and len(
+                                                project_chk_str2_list[index + 1]) <= 10:
+                                            continue
+                                        start_name = project_chk_str2_list[index + 1].split('\n')[-1]
+                                        if start_name:
+                                            new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \
+                                                        project_chk_str2_list[index + 1].split(start_name)[0]
+                                        else:
+                                            new_p_str = project_chk_str2_list[index - 1].split('\n')[-1] + p_str3 + \
+                                                        project_chk_str2_list[index + 1]
+                                        new_p_chk_list.append(new_p_str)
+                                if new_p_chk_list:
+                                    for p_str_true in new_p_chk_list:
+                                        if '开发环境' in p_str_true or '开发工具' in p_str_true or '开发技术' in p_str_true or '模块' in p_str_true:
+                                            dict_project['work'] += re.split('[:|：]', p_str_true)[-1]
+                                            continue
+                                        if '项目描述' in p_str_true or '功能介绍' in p_str_true:
+                                            dict_project['comment'] += re.split('[:|：]', p_str_true)[-1]
+                                            continue
+                                        if '职责' in p_str_true:
+                                            dict_project['duty'] += re.split('[:|：]', p_str_true)[-1]
+                                            continue
+                            project_undergo_list.append(dict_project)
+                else:
+                    dict_project = {
+                        'name': '',
+                        'time': '',
+                        'comment': '',
+                        'work': '',
+                        'duty': '',
+                    }
+                    for project_str_i in project_list[1:]:
+                        if project_str_i:
+                            project_str_list = project_str_i.split('\n')
+                            if project_str_list:
+                                dict_project1 = copy.deepcopy(dict_project)
+                                if '：' in project_str_list[0]:
+                                    dict_project1['name'] = project_str_list[0].split('：')[-1].strip()
+                                else:
+                                    dict_project1['name'] = project_str_list[0].split(':')[-1].strip()
+                                chk_key = ''
+                                for i in project_str_list[1:]:
+                                    if not i or i.isdigit():
                                        continue
-                                    if '项目描述' in p_str_true or '功能介绍' in p_str_true:
-                                        dict_project['comment'] += re.split('[:|：]', p_str_true)[-1]
+                                    if '：' not in i and ':' not in i and chk_key:
+                                        dict_project1[chk_key] += i
                                        continue
-                                    if '职责' in p_str_true:
-                                        dict_project['duty'] += re.split('[:|：]', p_str_true)[-1]
+                                    if '开发周期' in i and ('：' in i or ':' in i):
+                                        if '：' in i:
+                                            dict_project1['time'] = i.split('：')[-1]
+                                        else:
+                                            dict_project1['time'] = i.split(':')[-1]
                                        continue
-                        project_undergo_list.append(dict_project)
+                                    if ('开发环境' in i or '项目描述' in i) and ('：' in i or ':' in i):
+                                        if '：' in i:
+                                            dict_project1['comment'] += i.split('：')[-1]
+                                        else:
+                                            dict_project1['comment'] += i.split(':')[-1]
+                                        chk_key = 'comment'
+                                        continue
+                                    if ('模块' in i or '框架' in i or '技术要点' in i or '职责' in i) and ('：' in i or ':' in i):
+                                        if '：' in i:
+                                            dict_project1['duty'] = i.split('：')[-1]
+                                        else:
+                                            dict_project1['duty'] = i.split(':')[-1]
+                                        chk_key = 'duty'
+                                        continue
+                                project_undergo_list.append(dict_project1)
    dict_chk['project_undergo'] = project_undergo_list

    # 数字开头
@ -720,12 +780,18 @@ def fmt_txt(chk_str):
                    'position_name': '',
                    'duty': '',
                }
-                new_str_list2 = new_str_list1[0].split(' ', 1)
+                if '（' in new_str_list1[0]:
+                    new_str_list2 = new_str_list1[0].split('（', 1)
+                else:
+                    new_str_list2 = new_str_list1[0].split(' ', 1)
                work_dict['company_name'] = new_str_list2[0]
                if ':' in new_str_list2[1]:
                    work_dict['time'] = new_str_list2[1].split(':')[-1].replace('.', '/').strip()
                elif '：' in new_str_list2[1]:
                    work_dict['time'] = new_str_list2[1].split('：')[-1].replace('.', '/').strip()
+                elif '）' in new_str_list2[1]:
+                    date_list = re.findall('[0-9]{4}[/|.][0-9]{1,2}', new_str_list2[1])
+                    work_dict['time'] = date_list[0] + '-' + date_list[1]
                else:
                    work_dict['time'] = new_str_list2[1].replace('.', '/').strip()
                if len(new_str_list1) > 1:
@ -736,6 +802,10 @@ def fmt_txt(chk_str):
                        duty1 = new_str_list1[2].split('职责')[-1]
                        duty = duty1.join((x for x in work_duty))
                        work_dict['duty'] = duty
+                    if '负责' in new_str_list1[1]:
+                        duty1 = new_str_list1[1].split('负责')[-1]
+                        duty = duty1.join((x for x in new_str_list1[2:]))
+                        work_dict['duty'] = duty
                work_list.append(work_dict)
    dict_chk['work_list'] = work_list
    review = ''
@ -2309,4 +2379,4 @@ egreat，海尔，MeleA20，MeleA31，LG1154，极米，杰科，亿典等机顶

    """

-    fmt_txt(chk_str9)
+    fmt_txt(chk_str10)