# prs_server/utils/chkmail/chkmail.py

import re
import docx
import os
from pprint import pprint
from paddlenlp import Taskflow
from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx


def get_date(schema, dates, schema_dict):
    """
    Reduce the raw extraction output to one value per basic-info field.

    :param schema: labels (Chinese) that were requested from the extractor
    :param dates: raw extractor output; only the first element is read, and it
        is expected to map each label to a list of
        ``{'text': ..., 'probability': ...}`` candidates
    :param schema_dict: maps every label in ``schema`` to its English field name
    :return: dict keyed by English field name holding the most probable text
        for that field, or ``''`` when nothing was found for it
    """
    # An empty extractor result is treated as "nothing found" for every label.
    found = dates[0] if dates else {}
    date = {}
    for label in schema:
        field = schema_dict[label]
        candidates = found.get(label)
        if not candidates:
            # Several labels may share one field (e.g. two spellings of
            # "e-mail"); a miss must not wipe out a hit stored by an alias.
            date.setdefault(field, '')
            continue
        best = candidates[0]
        for candidate in candidates[1:]:
            # ">=" means ties go to the later candidate.
            if candidate['probability'] >= best['probability']:
                best = candidate
        date[field] = best['text']
    return date


_WHITESPACE_RE = re.compile(r'\s+')

# Section heading keywords, in the order they are tried.  The flag says whether
# the keyword has to open the line (True) or may appear anywhere in it (False).
_SECTION_GROUPS = (
    # self evaluation
    (('自我评价', '自我描述', '个人优势', '个人评价'), True),
    # project experience
    (('项目经历', '项目经验', '项目描述'), True),
    # education
    (('教育经历', '学习经历'), False),
    # work experience
    (('工作经历', '工作经验', '实习经历'), True),
    # skills
    (('技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'), False),
)


def _match_section(text):
    """Return the keyword group whose heading ``text`` opens, or None."""
    for keywords, must_start in _SECTION_GROUPS:
        if must_start:
            hit = text.startswith(keywords)
        else:
            hit = any(keyword in text for keyword in keywords)
        if hit:
            return keywords
    return None


def fmtTxt(txt, istable=0):
    """
    Pull the first resume section out of ``txt``.

    Items are scanned in order with all whitespace removed.  The first item
    that matches a section heading starts the section; following items are
    concatenated onto it until an item contains a keyword that belongs to a
    different section group.

    :param txt: sliceable sequence of strings (``istable`` truthy) or of
        objects exposing a ``.text`` string (``istable`` falsy)
    :param istable: truthy when the items of ``txt`` are already plain strings
    :return: ``(sections, remaining, more)`` where ``sections`` is a list with
        zero or one concatenated section string, ``remaining`` is the slice of
        ``txt`` starting at the item that ended the section (or a copy of all
        of ``txt`` when the end of the input was reached), and ``more`` is 1
        if scanning should continue on ``remaining``, else 0
    """
    section = ''
    terminators = ()
    for index, item in enumerate(txt):
        raw = item if istable else item.text
        text = _WHITESPACE_RE.sub('', raw)

        if not section:
            group = _match_section(text)
            if group is not None:
                section = text
                # Any keyword of another group ends the section just opened.
                terminators = tuple(
                    keyword
                    for keywords, _ in _SECTION_GROUPS
                    if keywords is not group
                    for keyword in keywords
                )
            continue

        if any(keyword in text for keyword in terminators):
            # Hand back everything from the terminating item onwards so the
            # caller can rescan it as a possible new heading.
            return [section], txt[index:], 1
        section += text

    # Input exhausted: report the open section (if any) and stop the caller.
    return ([section] if section else []), txt[0:], 0


def fmtList(txtlist, dates):
    """
    Sort extracted section strings into named resume categories.

    Each string goes into the first category (in the order review, project,
    work, upgrade, specialty) one of whose keywords it contains; strings
    matching no keyword are dropped.

    :param txtlist: iterable of section strings
    :param dates: dict of already extracted fields; updated in place with the
        keys ``review``, ``project``, ``work``, ``upgrade`` and ``specialty``,
        each holding a list of strings
    :return: the same ``dates`` dict
    """
    categories = (
        # self evaluation ('个人评价' included so every self-evaluation
        # heading recognised by fmtTxt is also classified here)
        ('review', ('自我评价', '自我描述', '个人优势', '个人评价')),
        # project experience
        ('project', ('项目经历', '项目经验', '项目描述')),
        # work experience
        ('work', ('工作经历', '工作经验', '实习经历')),
        # education
        ('upgrade', ('教育经历', '学习经历')),
        # skills
        ('specialty', ('技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能')),
    )
    buckets = {field: [] for field, _ in categories}
    for text in txtlist:
        for field, keywords in categories:
            if any(keyword in text for keyword in keywords):
                buckets[field].append(text)
                break  # first matching category wins
    dates.update(buckets)
    return dates


# Directory that holds the resumes to parse.
PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")
# Labels requested from the information-extraction model.
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
          '期望薪资', '在校时间', '电子邮箱', '工作经验','Email'
          ]
# Maps every label in `schema` to the English field name used in the output.
schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height',
               '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career',
               '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp','Email':'mail'}
# The extraction model is built once, on the first resume, and reused for all
# later files (constructing it is slow and may download model files).
ie = None
for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # each file is one resume
        # os.walk descends into sub-folders, so paths are built from `root`.
        url = os.path.join(root, file)
        suffix = os.path.splitext(file)[1]
        if suffix == '.pdf':
            pdf_docx(root, file)  # convert to docx
            # NOTE(review): assumes the helper writes "<name>.docx" next to the
            # source, with <name> being the text before the first dot - confirm.
            name = file.split('.')[0]
            open_txt = docx.Document(os.path.join(root, f"{name}.docx"))
            txt = getText_pdf(url)  # plain text comes from the pdf itself
        elif suffix == '.docx':
            open_txt = docx.Document(url)  # used below to read each paragraph
            txt = getText_docx(url)
        elif suffix == '.doc':
            doc_docx(root, file)  # convert to docx
            name = file.split('.')[0]
            docx_path = os.path.join(root, f"{name}.docx")
            open_txt = docx.Document(docx_path)
            txt = getText_docx(docx_path)
        else:
            # Unsupported file type: skip it rather than reuse the document
            # left over from the previous iteration.
            continue

        if ie is None:
            ie = Taskflow('information_extraction', schema=schema)
        # Raw extraction result, then the per-field values picked from it.
        text_lists = ie(txt)
        dates = get_date(schema, text_lists, schema_dict)

        # Collect resume sections from the document paragraphs.
        txt_list1 = []
        stop_int = 1
        txt1 = open_txt.paragraphs
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)

        # Collect the text of every table cell, skipping empty cells and text
        # that was already collected.
        table_list = []
        seen_cells = set()
        for table in open_txt.tables:
            row_count = len(table.rows)
            col_count = len(table.columns)
            for i in range(row_count):
                for j in range(col_count):
                    texts = re.sub(r'\s+', '', table.cell(i, j).text)
                    if not texts or texts in seen_cells:
                        continue
                    seen_cells.add(texts)
                    table_list.append(texts)
        if table_list:
            stop_table = 1
            table1 = table_list
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2

        # Merge the basic fields with the categorised sections:
        # review, project, work, upgrade (education), specialty (skills).
        datess = fmtList(txt_list1, dates)
        pprint(datess)