1.识别简历信息

2022-07-11 17:33:04 +08:00 · 2022-07-11 17:33:04 +08:00 · e934b45e77
commit e934b45e77
parent 1d518a83c5
5 changed files with 475 additions and 1 deletions
--- a/api/api_v1/endpoints/interview.py
+++ b/api/api_v1/endpoints/interview.py
@ -79,3 +79,16 @@ async def interview_insert(
    return schemas.Msg(code=200, msg='ok', data=data)


+@router.post("/interview_insert")
+async def interview_insert(
+        request: Request,
+        data_in: schemas.Interview,
+        ckdb: CKDrive = Depends(get_ck_db),
+) -> schemas.Msg:
+    """ Interview status """
+    # NOTE(review): a handler named `interview_insert` already appears earlier
+    # in this module (it is the enclosing function in the hunk context) --
+    # confirm this is not a duplicate definition / route registration.
+    # NOTE(review): `data_in` and `ckdb` are not referenced in this body, and
+    # `interview` / `db` are defined outside the visible hunk -- confirm the
+    # request payload is really meant to be ignored here.
+    await interview.init()
+    # insert_interview_sql() is read as a mapping with 'sql' and 'insert_data'
+    res = interview.insert_interview_sql()
+    sql = res['sql']
+    insert_data = res['insert_data']
+    # Execute the statement and hand the driver's result back to the client
+    data = await db.execute_dict(sql, insert_data)
+    return schemas.Msg(code=200, msg='ok', data=data)
--- a/schemas/init.py
+++ b/schemas/init.py
@ -26,3 +26,4 @@ from .user_url import *
 from .api_module import *
 from .event_list import *
 from .interview import *
+from .interview_plan import *
--- a/schemas/interview_plan.py
+++ b/schemas/interview_plan.py
@ -0,0 +1,16 @@
+from typing import List, Union, Dict
+
+from pydantic import BaseModel
+from typing import Optional
+
+
+class Interview(BaseModel):
+    # Request schema for interview queries.
+    # NOTE(review): the schemas package star-imports `.interview` right before
+    # this module; if that module also exports a class named `Interview`, this
+    # definition replaces it as `schemas.Interview` -- confirm that is intended.
+    # The None default needs an Optional annotation: a plain `str` annotation
+    # does not describe None as a valid value.
+    job_name: Optional[str] = None  # position applied for (optional)
+    hr_name: str  # person in charge of the interview
+    interview_name: str  # interviewer
+    interview_type: str  # interview type
+    interview_sign: int  # interview sign-in
+    feedback: int  # interview feedback
+    interview_round: int  # interview round
+    pages: int = 1  # current page for pagination
+    time_type: str  # type of the time range to query
--- a/utils/func.py
+++ b/utils/func.py
@ -150,3 +150,13 @@ def create_neidf(resp,columnName):
    columns.insert(0, columnName)
    df = pd.DataFrame(data=date, columns=columns)
    return df
+
+def random_hex():
+    """
+    Generate a random 16-character upper-case hexadecimal string.
+
+    :return: exactly 16 hex digits (0-9, A-F), left-padded with zeros
+    """
+    # randrange() excludes its upper bound. The previous randint(0, 16**16)
+    # was inclusive and could return 16**16 itself, which renders as 17 digits.
+    # NOTE(review): `random` is not a cryptographic source; if these values
+    # must be unguessable, switch to the `secrets` module.
+    return '{:016X}'.format(random.randrange(16 ** 16))
--- a/utils/jianli.py
+++ b/utils/jianli.py
@ -0,0 +1,434 @@
+import re
+import docx
+import os
+import copy
+from pprint import pprint
+from paddlenlp import Taskflow
+import pdfplumber
+from win32com import client as wc
+from pdf2docx import Converter
+
+# Directory that get_resume() walks looking for resume files
+PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")
+# Labels handed to the 'information_extraction' Taskflow as its schema.
+# Some labels are synonyms for the same field ('电子邮箱'/'Email',
+# '户口所在地'/'籍贯', '电话'/'tel'); get_date() merges each pair via clash().
+schema = ['姓名', '所在地', '户口所在地', '籍贯', '婚姻状况', '民族', '身高', '电话', 'tel', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
+          '期望薪资', '在校时间', '电子邮箱', '工作经验', 'Email', '性别', '年龄'
+          ]
+# Maps every schema label to the key used in the dict built by get_date().
+# The synonym labels map to the temporary keys 'mails', 'accounts' and 'tels',
+# which clash() removes again after merging.
+schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height',
+               '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career',
+               '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp', 'Email': 'mails',
+               '性别': 'gender', '年龄': 'age', '籍贯': 'accounts', 'tel': 'tels'}
+
+
+def chkworlkandtime(listdata):
+    """
+    Pair the company names found in the work-experience text with time spans.
+
+    :param listdata: list of dicts mapping a label ('公司名' / '时间') to a list
+        of candidate dicts carrying 'text', 'start', 'end' and 'probability'
+    :return: list of {'name': <company text>, 'time': <time text>} dicts; empty
+        when either label is missing
+    """
+    res = {}
+    for result in listdata:
+        for label, candidates in result.items():
+            # Keep one candidate per distinct text: the most probable one
+            # (on equal probability the earliest candidate is kept).
+            best = {}
+            for cand in candidates:
+                known = best.get(cand['text'])
+                if known is not None and cand['probability'] <= known['probability']:
+                    continue
+                best[cand['text']] = {
+                    'end': cand['end'],
+                    'probability': cand['probability'],
+                    'start': cand['start'],
+                    'text': cand['text'],
+                }
+            # Order the surviving candidates by where they start in the text
+            res[label] = sorted(best.values(), key=lambda item: item['start'])
+    # Companies and time spans are matched purely by position. .get() plus
+    # zip() means a missing label or a shorter list yields fewer pairs instead
+    # of the KeyError / IndexError the index-based pairing used to raise.
+    companies = res.get('公司名', [])
+    periods = res.get('时间', [])
+    return [
+        {'name': company['text'], 'time': period['text']}
+        for company, period in zip(companies, periods)
+    ]
+
+
+def getText_docx(filename):  # docx -> text
+    """Read a .docx file and return its paragraph and table-cell text, newline-joined."""
+    document = docx.Document(filename)
+    # Paragraph text first, in document order
+    pieces = [paragraph.text for paragraph in document.paragraphs]
+    # Then every table, visited cell by cell in row-major order
+    for table in document.tables:
+        n_rows = len(table.rows)
+        n_cols = len(table.columns)
+        pieces.extend(
+            table.cell(r, c).text
+            for r in range(n_rows)
+            for c in range(n_cols)
+        )
+    return '\n'.join(pieces)
+
+
+def pdf_docx(url, filename):
+    """
+    Convert a PDF file into a .docx file written to the same directory.
+
+    :param url: directory that holds the PDF; the .docx is written there too
+    :param filename: name of the PDF file inside ``url``
+    :return: None
+    """
+    # Output base name is everything before the FIRST dot; get_resume()
+    # derives the generated file name the same way, so keep them in sync.
+    file_name = filename.split('.')[0]
+    pdf_name = url + f"/{filename}"
+    docx_name = url + f"/{file_name}.docx"
+    # Load the PDF document
+    cv = Converter(pdf_name)
+    try:
+        # start/end restrict the conversion to the leading pages of the PDF
+        cv.convert(docx_name, start=0, end=12)
+    finally:
+        # Release the converter even when the conversion raises
+        cv.close()
+
+
+def getText_pdf(filename):
+    """Read a PDF and return the text of all its pages, each page followed by a newline."""
+    chunks = []
+    with pdfplumber.open(filename) as pdf_file:
+        for page in pdf_file.pages:
+            extracted = page.extract_text()
+            # Pages that yield no text are skipped entirely
+            if extracted:
+                chunks.append(extracted + "\n")
+    return ''.join(chunks)
+
+
+def doc_docx(url, filename):
+    """
+    Convert a .doc file into a .docx file in the same directory, using Word via COM.
+
+    :param url: directory that holds the .doc file; the .docx is written there too
+    :param filename: name of the .doc file inside ``url``
+    :return: None
+    """
+    word = wc.Dispatch("Word.Application")
+    try:
+        doc = word.Documents.Open(url + f"/{filename}")
+        try:
+            # Output base name is everything before the FIRST dot; get_resume()
+            # derives the generated file name the same way.
+            name = filename.split('.')[0]
+            doc.SaveAs(url + f'/{name}.docx', 12)  # 12 is the docx format code
+        finally:
+            # Close the document even when saving fails
+            doc.Close()
+    finally:
+        # Always shut the Word instance down so a failure does not leave it running
+        word.Quit()
+
+
+def clash(date, retain, pop):
+    """
+    Merge two synonymous fields of ``date`` in place, keeping a single key.
+
+    Example: '户口所在地' and '籍贯' are extracted separately, but only one
+    field is returned; when the retained field is empty it takes over the
+    value of the field that is about to be removed.
+
+    :param date: dict to modify in place
+    :param retain: key that stays in the dict
+    :param pop: key that is removed from the dict
+    :return: None
+    """
+    # Fill the retained key from its synonym only when it is empty
+    if date[retain] == '':
+        date[retain] = date[pop]
+    # The synonym key is dropped in either case
+    date.pop(pop)
+
+
+def get_date(schema, dates, schema_dict):
+    """
+    Reduce the raw extraction result to one value per basic-information field.
+
+    :param schema: labels that were requested from the extractor
+    :param dates: raw extraction result; only its first element is read, as a
+        dict mapping a label to a list of candidates with 'text' and 'probability'
+    :param schema_dict: maps every label in ``schema`` to its output key
+    :return: dict of output key -> text of the most probable candidate, or ''
+        when the label has no candidates
+    """
+    # Tolerate an empty result list instead of failing on dates[0]
+    extracted = dates[0] if dates else {}
+    date = {}
+    for label in schema:
+        candidates = extracted.get(label)
+        if not candidates:
+            # Label missing (or present with no candidates): empty string
+            date[schema_dict[label]] = ''
+        elif len(candidates) == 1:
+            date[schema_dict[label]] = candidates[0]['text']
+        else:
+            # Highest probability wins; reversed() makes the LAST candidate win
+            # when probabilities are equal, as before.
+            best = max(reversed(candidates), key=lambda cand: cand['probability'])
+            date[schema_dict[label]] = best['text']
+    # Resolve the e-mail synonyms
+    clash(date, 'mail', 'mails')
+    # Resolve the registered-residence synonyms
+    clash(date, 'account', 'accounts')
+    # Resolve the phone synonyms
+    clash(date, 'phone', 'tels')
+    return date
+
+
+def fmtTxt(txt, istable=0):
+    """
+    Cut the first recognisable resume section out of ``txt``.
+
+    Scans for the first entry that opens a section, then concatenates the
+    following entries until one mentions a keyword of a different section.
+
+    :param txt: sliceable sequence of strings (``istable`` truthy) or of
+        objects exposing ``.text`` (``istable`` falsy)
+    :param istable: truthy when the entries are already plain strings
+    :return: tuple ``(sections, rest, stop_int)``: ``sections`` holds zero or
+        one concatenated section string, ``rest`` is the part of ``txt`` still
+        to be scanned, and ``stop_int`` is 1 while the caller should keep
+        calling with ``rest``, 0 once ``txt`` has been scanned to the end
+    """
+    # Every keyword of every section
+    all_keywords = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历',
+                    '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能', '个人评价']
+    # (keywords, must_start): when must_start is True the entry has to BEGIN
+    # with the keyword, otherwise containing it anywhere is enough.
+    categories = (
+        # self description
+        (['自我评价', '自我描述', '个人优势', '个人评价'], True),
+        # project experience
+        (['项目经历', '项目经验', '项目描述'], False),
+        # education
+        (['教育经历', '学习经历'], False),
+        # work experience
+        (['工作经历', '工作经验', '实习经历'], True),
+        # skills
+        (['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'], False),
+    )
+    sections = []
+    current = ''  # text of the section being collected
+    terminators = []  # keywords that end the current section
+    stop_int = 0  # becomes 1 as soon as a section header has been seen
+    for index, item in enumerate(txt):
+        text = item if istable else item.text
+
+        if current:
+            # Collecting: a keyword of another section ends the current one
+            if any(word in text for word in terminators):
+                sections.append(current)
+                # Hand back everything from the terminating entry onwards
+                return sections, txt[index:], stop_int
+            current += text
+            continue
+
+        # Searching: try the categories in their fixed order
+        for keywords, must_start in categories:
+            if must_start:
+                hit = any(text.startswith(word, 0) for word in keywords)
+            else:
+                hit = any(word in text for word in keywords)
+            if hit:
+                current = text
+                terminators = [word for word in all_keywords if word not in keywords]
+                stop_int = 1
+                break
+
+    # Reached the end of txt: flush the open section and tell the caller to stop
+    if current:
+        sections.append(current)
+        stop_int = 0
+    return sections, txt[0:], stop_int
+
+
+def fmtList(txtlist, dates):
+    """
+    Sort section strings into resume categories and store them in ``dates``.
+
+    :param txtlist: section strings; each is filed under the first category
+        whose keyword it contains, and strings matching none are ignored
+    :param dates: dict updated in place with the keys 'review', 'project',
+        'work', 'work_list', 'upgrade' and 'specialty'
+    :return: the same ``dates`` dict
+    """
+    # (result key, keywords), listed in the order they are tested; first match
+    # wins. '个人评价' is included under review because fmtTxt() treats it as a
+    # self-description header; without it those sections were silently dropped.
+    categories = [
+        # self evaluation
+        ('review', ['自我评价', '自我描述', '个人优势', '个人评价']),
+        # project experience
+        ('project', ['项目经历', '项目经验', '项目描述']),
+        # work experience
+        ('work', ['工作经历', '工作经验', '实习经历']),
+        # education
+        ('upgrade', ['教育经历', '学习经历']),
+        # skills and specialties
+        ('specialty', ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']),
+    ]
+    buckets = {key: [] for key, _ in categories}
+    for text in txtlist:
+        for key, keywords in categories:
+            if any(word in text for word in keywords):
+                buckets[key].append(text)
+                break
+    # Extract company names and time spans from the work-experience text
+    work_list = []
+    if buckets['work']:
+        ie = Taskflow('information_extraction', schema=['公司名', '时间'])
+        work_list = chkworlkandtime(ie(''.join(buckets['work'])))
+    # review: self evaluation, project: project experience, work: work
+    # experience, work_list: companies with their time spans,
+    # upgrade: education, specialty: skills
+    dates.update({
+        'review': buckets['review'],
+        'project': buckets['project'],
+        'work': buckets['work'],
+        'work_list': work_list,
+        'upgrade': buckets['upgrade'],
+        'specialty': buckets['specialty'],
+    })
+
+    return dates
+
+
+def get_resume():
+    """
+    Parse the first supported resume (.pdf, .docx or .doc) found under PATH_DATA.
+
+    The file is converted to text, the basic fields are extracted with the
+    'information_extraction' Taskflow, and the body paragraphs and table cells
+    are split into sections that fmtList() adds to the result.
+
+    :return: dict with the basic fields plus the section lists, or None when
+        no supported file exists
+    """
+    for root, dirs, files in os.walk(PATH_DATA):
+        for file in files:  # each file is one resume
+            ext = os.path.splitext(file)[1].lower()
+            if ext not in ('.pdf', '.docx', '.doc'):
+                # Unsupported files used to fall through with open_txt / txt
+                # unbound and crash; skip them instead.
+                continue
+            # Build paths from the directory being walked so that files in
+            # sub-directories resolve correctly.
+            url = root + f"/{file}"
+            if ext == '.docx':
+                open_txt = docx.Document(url)  # used to read paragraphs and tables
+                txt = getText_docx(url)
+            else:
+                # .pdf / .doc are first converted to a temporary .docx that is
+                # named after the part of the file name before the first dot.
+                # NOTE(review): a real '<name>.docx' sitting next to the source
+                # file is overwritten and then deleted -- confirm acceptable.
+                name = file.split('.')[0]
+                tmp_docx = root + f"/{name}.docx"
+                if ext == '.pdf':
+                    pdf_docx(root, file)
+                else:
+                    doc_docx(root, file)
+                try:
+                    open_txt = docx.Document(tmp_docx)
+                    if ext == '.pdf':
+                        # Plain text is taken from the PDF itself
+                        txt = getText_pdf(url)
+                    else:
+                        txt = getText_docx(tmp_docx)
+                finally:
+                    # Remove the generated file even if reading it failed
+                    os.remove(tmp_docx)
+            # Per the original note, creating the Taskflow takes time and
+            # installs files.
+            ie = Taskflow('information_extraction', schema=schema)
+            # Raw extraction, then one value per basic field
+            text_lists = ie(txt)
+            dates = get_date(schema, text_lists, schema_dict)
+            # Split the body paragraphs into sections, one fmtTxt() call each
+            txt_list1 = []
+            remaining = open_txt.paragraphs
+            stop_int = 1
+            while stop_int:
+                found, remaining, stop_int = fmtTxt(remaining)
+                txt_list1 += found
+            # Collect the distinct, non-empty table cell texts
+            table_list = []
+            for table in open_txt.tables:
+                row_count = len(table.rows)
+                col_count = len(table.columns)
+                for i in range(row_count):
+                    for j in range(col_count):
+                        texts = table.cell(i, j).text
+                        if not texts or texts in table_list:
+                            continue
+                        table_list.append(texts)
+            if table_list:
+                # Split the table text into sections the same way
+                stop_table = 1
+                remaining_cells = table_list
+                while stop_table:
+                    found, remaining_cells, stop_table = fmtTxt(remaining_cells, istable=1)
+                    txt_list1 += found
+            # Merge the sections (review, project, work, upgrade, specialty)
+            # into dates, which is what the caller receives.
+            fmtList(txt_list1, dates)
+
+            # Only the first supported resume is processed
+            return dates
+    return None
+
+
+# Script entry point: parse a resume from PATH_DATA when run directly
+if __name__ == '__main__':
+    get_resume()