QQ mailbox
This commit is contained in:
parent b0e65dc77d
commit cc927e614b
utils/chkmail/chkjianli.py (new file, 298 lines added)
@@ -0,0 +1,298 @@
import re

import docx
import os
import copy
from pprint import pprint
from paddlenlp import Taskflow

from gongju import get_entity, getText_docx, getText_pdf, doc_docx, pdf_docx


def fmtTxt(txt, istable=0):
    # all section keywords
    chkStr = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历',
              '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    chkList1 = ['自我评价', '自我描述', '个人优势']
    chkList2 = ['项目经历', '项目经验', '项目描述']
    chkList3 = ['教育经历', '学习经历']
    chkList4 = ['工作经历', '工作经验', '实习经历']
    chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    fmtList = []  # list of concatenated section strings to return
    trueIndex = 0
    fmtStr = ''
    nowChkList = []
    # tells the caller's while loop whether it needs to keep going
    stop_int = 0
    for index, i in enumerate(txt):
        if istable:
            text = i
        else:
            text = i.text
        text = re.sub(r'\s+', '', text).lstrip()  # strip whitespace and newlines

        # no section keyword detected yet
        if not fmtStr:
            for i in chkList1:
                # only match when the text starts with the keyword
                if not text.startswith(i, 0):
                    continue
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList1]
                    # a keyword was found, so the caller should loop again
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList2:
                # only match when the text starts with the keyword
                if not text.startswith(i, 0):
                    continue
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList2]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList3:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList3]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList4:
                # only match when the text starts with the keyword
                if not text.startswith(i, 0):
                    continue
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList4]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList5:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList5]
                    stop_int = 1
                    break
            continue
        else:
            isTure = 1
            for i in nowChkList:
                if i in text:
                    isTure = 0
                    break
            if isTure:
                fmtStr += text
                continue
            else:
                fmtStrTrue = fmtStr
                fmtList.append(fmtStrTrue)
                trueIndex = index
                # fmtStr = ''
                # nowChkList = []
                # the part that has not been scanned yet
                txt1 = txt[trueIndex:]
                return fmtList, txt1, stop_int

    # the whole list has been scanned, so the caller's loop must stop
    if fmtStr:
        fmtStrTrue = fmtStr
        fmtList.append(fmtStrTrue)
    stop_int = 0
    txt1 = txt[trueIndex:]
    return fmtList, txt1, stop_int


def fmtList(txtlist):
    chkList1 = ['自我评价', '自我描述', '个人优势']
    chkList2 = ['项目经历', '项目经验', '项目描述']
    chkList3 = ['教育经历', '学习经历']
    chkList4 = ['工作经历', '工作经验', '实习经历']
    chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    # self-evaluation
    review = []
    # project experience
    project = []
    # work experience
    work = []
    # education
    upgrade = []
    # skills
    specialty = []
    for text in txtlist:
        ischk = 0
        # self-evaluation
        for i in chkList1:
            if i in text:
                review.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # project experience
        for i in chkList2:
            if i in text:
                project.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # work experience
        for i in chkList4:
            if i in text:
                work.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # education
        for i in chkList3:
            if i in text:
                upgrade.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # skills
        for i in chkList5:
            if i in text:
                specialty.append(text)
                ischk = 1
                break
        if ischk:
            continue

    return review, project, work, upgrade, specialty


def get_date(schema, dates):
    """
    Filter the raw third-party extraction results down to the basic fields we want.
    :param schema:
    :param dates:
    :return:
    """
    date = {}
    for i in schema:
        text = dates[0].get(i, '')
        # if the key is missing from the data, keep an empty string
        if text == '':
            date[i] = text
        else:
            if len(text) == 1:
                date[i] = text[0]['text']
            else:
                aa = {}
                num = []
                for dic in text:
                    aa[dic['probability']] = dic['text']
                    num.append(dic['probability'])
                date[i] = aa[max(num)]
    return date


# resume folder path
PATH_DATA = os.path.abspath("D:/wokerplay/面试简历1")
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
          '期望薪资', '在校时间', '到岗时间', '工作经历', '自我评价', '电子邮箱', '技能', '特长', '工作经验', '项目经验'
          ]
for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # each file is one resume
        url = PATH_DATA + f"/{file}"
        # name
        name = ''
        # phone
        phone = ''
        # email
        mail = ''
        # ethnicity
        nation = ''
        # school
        school = ''
        # major
        major = ''
        # work experience
        work_exp = ''
        # marital status
        gam = ''
        # address
        site = []
        # marital status
        marriage = ''
        # self-evaluation
        review = []
        # project experience
        project = []
        # work experience
        work = []
        # education
        upgrade = []
        # skills
        specialty = []
        if os.path.splitext(file)[1] == '.pdf':
            pdf_docx(PATH_DATA, file)  # convert to docx
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the docx
            txt = getText_pdf(url)  # read the pdf file as plain text
            # txt = getText_docx(PATH_DATA + f"\{name}.docx")
        elif os.path.splitext(file)[1] == '.docx':
            open_txt = docx.Document(url)  # open the docx; used to read every paragraph
            txt = getText_docx(url)  # read the docx file as plain text
        elif os.path.splitext(file)[1] == '.doc':
            doc_docx(PATH_DATA, file)  # convert to docx
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the docx
            txt = getText_docx(PATH_DATA + f"/{name}.docx")  # read the docx file as plain text
        # ie = Taskflow('information_extraction', schema=schema)  # slow the first time: the model files are downloaded
        # pprint(ie(txt))  # name, phone, email, ethnicity, school, major, work experience, marital status
        # # raw extraction results
        # text_lists = ie(txt)
        # every paragraph read from the docx
        txt_list = open_txt.paragraphs
        # collected document sections
        txt_list1 = []
        stop_int = 1
        txt1 = txt_list
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)

        numTables = open_txt.tables  # contents of any tables
        table_list = []
        if len(numTables) > 0:
            for table in numTables:
                row_count = len(table.rows)
                col_count = len(table.columns)
                for i in range(row_count):
                    for j in range(col_count):
                        texts = table.cell(i, j).text
                        texts = re.sub(r'\s+', '', texts).lstrip()  # strip whitespace and newlines
                        if not texts:
                            continue
                        if texts in table_list:
                            continue
                        table_list.append(texts)
        if table_list:
            stop_table = 1
            table1 = table_list
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2
            print(txt_list1)

        review, project, work, upgrade, specialty = fmtList(txt_list1)
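A minimal sketch, not taken from the committed files, of how fmtTxt and fmtList above can be exercised on their own: passing istable=1 lets fmtTxt work on plain strings instead of docx paragraph objects, so no document or model is needed. The sample resume lines are invented.

sample = [
    '自我评价:踏实肯干,学习能力强',
    '熟悉Python,了解Flask',
    '项目经验:简历解析系统',
    '负责邮件附件下载与文本抽取',
]
sections, rest, running = [], sample, 1
while running:
    found, rest, running = fmtTxt(rest, istable=1)
    sections += found
review, project, work, upgrade, specialty = fmtList(sections)
print(review)   # ['自我评价:踏实肯干,学习能力强熟悉Python,了解Flask']
print(project)  # ['项目经验:简历解析系统负责邮件附件下载与文本抽取']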
utils/chkmail/chkmail.py (new file, 315 lines added)
@@ -0,0 +1,315 @@
import re
import docx
import os
from pprint import pprint
from paddlenlp import Taskflow
from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx


def get_date(schema, dates, schema_dict):
    """
    Filter the raw third-party extraction results down to the basic fields we want.
    :param schema: Chinese field labels
    :param dates: raw extraction results
    :param schema_dict: English key for each Chinese label
    :return: the basic fields, keeping the highest-probability candidate for each
    """
    date = {}
    for i in schema:
        text = dates[0].get(i, '')
        # if the key is missing from the data, keep an empty string
        if text == '':
            date[schema_dict[i]] = text
        else:
            if len(text) == 1:
                date[schema_dict[i]] = text[0]['text']
            else:
                aa = {}
                num = []
                for dic in text:
                    aa[dic['probability']] = dic['text']
                    num.append(dic['probability'])
                # keep the value with the highest probability
                date[schema_dict[i]] = aa[max(num)]
    return date


def fmtTxt(txt, istable=0):
    # all section keywords
    chkStr = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历',
              '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能', '个人评价']
    # self-description
    chkList1 = ['自我评价', '自我描述', '个人优势', '个人评价']
    # project experience
    chkList2 = ['项目经历', '项目经验', '项目描述']
    # education background
    chkList3 = ['教育经历', '学习经历']
    # work history
    chkList4 = ['工作经历', '工作经验', '实习经历']
    # personal skills
    chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    fmtList = []  # list of concatenated section strings to return
    trueIndex = 0
    fmtStr = ''
    nowChkList = []
    # tells the caller's while loop whether it needs to keep going
    stop_int = 0
    for index, i in enumerate(txt):
        if istable:
            text = i
        else:
            text = i.text
        text = re.sub(r'\s+', '', text).lstrip()  # strip whitespace and newlines

        # no section keyword detected yet
        if not fmtStr:
            for i in chkList1:
                # only match when the text starts with the keyword
                if not text.startswith(i, 0):
                    continue
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList1]
                    # a keyword was found, so the caller should loop again
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList2:
                # only match when the text starts with the keyword
                if not text.startswith(i, 0):
                    continue
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList2]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList3:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList3]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList4:
                # only match when the text starts with the keyword
                if not text.startswith(i, 0):
                    continue
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList4]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList5:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList5]
                    stop_int = 1
                    break
            continue
        else:
            isTure = 1
            for i in nowChkList:
                if i in text:
                    isTure = 0
                    break
            if isTure:
                fmtStr += text
                continue
            else:
                fmtStrTrue = fmtStr
                fmtList.append(fmtStrTrue)
                trueIndex = index
                # fmtStr = ''
                # nowChkList = []
                # the part that has not been scanned yet
                txt1 = txt[trueIndex:]
                return fmtList, txt1, stop_int

    # the whole list has been scanned, so the caller's loop must stop
    if fmtStr:
        fmtStrTrue = fmtStr
        fmtList.append(fmtStrTrue)
    stop_int = 0
    txt1 = txt[trueIndex:]
    return fmtList, txt1, stop_int


def fmtList(txtlist, dates):
    chkList1 = ['自我评价', '自我描述', '个人优势']
    chkList2 = ['项目经历', '项目经验', '项目描述']
    chkList3 = ['教育经历', '学习经历']
    chkList4 = ['工作经历', '工作经验', '实习经历']
    chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    # self-evaluation
    review = []
    # project experience
    project = []
    # work experience
    work = []
    # education
    upgrade = []
    # skills
    specialty = []
    for text in txtlist:
        ischk = 0
        # self-evaluation
        for i in chkList1:
            if i in text:
                review.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # project experience
        for i in chkList2:
            if i in text:
                project.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # work experience
        for i in chkList4:
            if i in text:
                work.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # education
        for i in chkList3:
            if i in text:
                upgrade.append(text)
                ischk = 1
                break
        if ischk:
            continue
        # skills
        for i in chkList5:
            if i in text:
                specialty.append(text)
                ischk = 1
                break
        if ischk:
            continue
    # review = self-evaluation, project = project experience, work = work experience,
    # upgrade = education, specialty = skills
    dates.update({
        'review': review,
        'project': project,
        'work': work,
        'upgrade': upgrade,
        'specialty': specialty,
    })

    return dates


# resume folder path
PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
          '期望薪资', '在校时间', '电子邮箱', '工作经验', 'Email'
          ]
schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height',
               '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career',
               '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp', 'Email': 'mail'}
for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # each file is one resume
        url = PATH_DATA + f"/{file}"
        # # name
        # name = ''
        # # phone
        # phone = ''
        # # email
        # mail = ''
        # # ethnicity
        # nation = ''
        # # school
        # school = ''
        # # major
        # major = ''
        # # work experience
        # work_exp = ''
        # # marital status
        # gam = ''
        # # address
        # site = []
        # # marital status
        # marriage = ''
        # # self-evaluation
        # review = []
        # # project experience
        # project = []
        # # work experience
        # work = []
        # # education
        # upgrade = []
        # # skills
        # specialty = []
        if os.path.splitext(file)[1] == '.pdf':
            pdf_docx(PATH_DATA, file)  # convert to docx
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the docx
            txt = getText_pdf(url)  # read the pdf file as plain text
            # txt = getText_docx(PATH_DATA + f"\{name}.docx")
        elif os.path.splitext(file)[1] == '.docx':
            open_txt = docx.Document(url)  # open the docx; used to read every paragraph
            txt = getText_docx(url)  # read the docx file as plain text
        elif os.path.splitext(file)[1] == '.doc':
            doc_docx(PATH_DATA, file)  # convert to docx
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the docx
            txt = getText_docx(PATH_DATA + f"/{name}.docx")  # read the docx file as plain text
        ie = Taskflow('information_extraction', schema=schema)  # slow the first time: the model files are downloaded
        # pprint(ie(txt))  # name, phone, email, ethnicity, school, major, work experience, marital status
        # raw extraction results
        text_lists = ie(txt)
        # cleaned basic fields
        dates = get_date(schema, text_lists, schema_dict)
        # every paragraph read from the docx
        txt_list = open_txt.paragraphs
        # collected document sections
        txt_list1 = []
        stop_int = 1
        txt1 = txt_list
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)

        numTables = open_txt.tables  # contents of any tables
        table_list = []
        if len(numTables) > 0:
            for table in numTables:
                row_count = len(table.rows)
                col_count = len(table.columns)
                for i in range(row_count):
                    for j in range(col_count):
                        texts = table.cell(i, j).text
                        texts = re.sub(r'\s+', '', texts).lstrip()  # strip whitespace and newlines
                        if not texts:
                            continue
                        if texts in table_list:
                            continue
                        table_list.append(texts)
        if table_list:
            stop_table = 1
            table1 = table_list
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2
        # print(txt_list1)
        # review = self-evaluation, project = project experience, work = work experience,
        # upgrade = education, specialty = skills
        # merge both parts and return them to the front end
        datess = fmtList(txt_list1, dates)
        pprint(datess)
        a = 1
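A small sketch, not taken from the committed files, of what get_date above produces from Taskflow-style output, assuming the function is in scope; the sample values and probabilities are invented. For each field it keeps the single highest-probability candidate and maps the Chinese label to its English key.

sample_dates = [{
    '姓名': [{'text': '张三', 'start': 0, 'end': 2, 'probability': 0.99}],
    '电话': [{'text': '13800000000', 'start': 10, 'end': 21, 'probability': 0.61},
             {'text': '02788888888', 'start': 30, 'end': 41, 'probability': 0.43}],
}]
print(get_date(['姓名', '电话', '电子邮箱'], sample_dates,
               {'姓名': 'name', '电话': 'phone', '电子邮箱': 'mail'}))
# {'name': '张三', 'phone': '13800000000', 'mail': ''}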
utils/chkmail/gongju.py (new file, 164 lines added)
@@ -0,0 +1,164 @@
import fnmatch
import os

import docx
import numpy as np
import paddlehub as hub
import pdfplumber
import win32com.client
from pdf2docx import Converter
from win32com import client as wc

PATH_DATA = os.path.abspath(r"C:\Users\Administrator\Desktop\新建文件夹")  # folder holding the Word resumes


# convert docx/doc files into txt files
def docx_to_txt():
    wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        for root, dirs, files in os.walk(PATH_DATA):
            for _dir in dirs:
                pass
            for _file in files:
                if not (fnmatch.fnmatch(_file, '*.doc') or fnmatch.fnmatch(_file, '*.docx')) or _file.startswith("~"):
                    continue
                print('_file:', _file)
                file = os.path.join(root, _file)
                wordapp.Documents.Open(file)
                if fnmatch.fnmatch(_file, '*.docx'):  # docx document
                    file = file[:-4] + 'txt'
                else:  # doc document
                    file = file[:-3] + 'txt'
                wordapp.ActiveDocument.SaveAs(file, FileFormat=win32com.client.constants.wdFormatText,
                                              Encoding=65001)  # save directly as utf-8 txt
                # the MsoEncoding codes are listed at
                # https://docs.microsoft.com/zh-cn/office/vba/api/Office.MsoEncoding
                wordapp.ActiveDocument.Close()

    finally:
        wordapp.Quit()


# convert a pdf file into a txt file
def pdf_to_txt(pdf_path):
    with pdfplumber.open(pdf_path) as pdf_file:
        content = ''
        for i in range(len(pdf_file.pages)):
            page_text = pdf_file.pages[i]
            page_content = page_text.extract_text()
            if page_content:
                content = content + page_content + "\n"
    with open(f"{pdf_path.split('.')[0]}.txt", "w", encoding="utf-8") as file:
        file.write(content)


def get_model():
    lac = hub.Module(name='lac')
    return lac


def get_lac(text):
    inputs = {"text": [text]}
    lac = get_model()
    res = lac.lexical_analysis(data=inputs)
    tag = res[0]['tag']
    word = res[0]['word']
    return tag, word


def get_entity(text, label):
    '''
    label can be one of
        'PER'  : person name
        'LOC'  : place name
        'ORG'  : organization name
        'TIME' : time expression
    '''
    res = []
    tag, word = get_lac(text)
    tag = np.array(tag)
    indexs = np.where(tag == label)[0]
    for index in indexs:
        res.append(word[index])
    return res


def getText_docx(filename):  # docx to text
    """Read a docx into plain text."""
    doc = docx.Document(filename)
    fullText = []
    for i in doc.paragraphs:  # iterate over every paragraph in the docx
        fullText.append(i.text)  # collect the text of each paragraph
    numTables = doc.tables  # any table content lives here
    if len(numTables) > 0:
        for table in numTables:
            row_count = len(table.rows)
            col_count = len(table.columns)
            for i in range(row_count):
                for j in range(col_count):
                    fullText.append(table.cell(i, j).text)
    return '\n'.join(fullText)


def getText_pdf(filename):
    """Read a pdf into plain text."""
    with pdfplumber.open(filename) as pdf_file:
        content = ''
        for i in range(len(pdf_file.pages)):
            page_text = pdf_file.pages[i]
            page_content = page_text.extract_text()
            if page_content:
                content = content + page_content + "\n"
        return content


def doc_docx(url, filename):
    """
    Convert a doc file to docx.
    :param filename:
    :return:
    """
    word = wc.Dispatch("Word.Application")
    doc = word.Documents.Open(url + f"/{filename}")
    name = filename.split('.')[0]
    doc.SaveAs(url + f'/{name}.docx', 12)  # 12 = docx (wdFormatXMLDocument)
    doc.Close()
    word.Quit()


def pdf_docx(url, filename):
    """
    Convert a pdf file to docx.
    :param url:
    :param filename:
    :return:
    """
    # file name without extension
    file_name = filename.split('.')[0]
    # pdf path
    pdf_name = url + f"/{filename}"
    # docx path
    docx_name = url + f"/{file_name}.docx"
    # load the pdf (only the first 12 pages are converted)
    cv = Converter(pdf_name)
    cv.convert(docx_name, start=0, end=12)
    cv.close()


def read_tables(open_txt):
    """
    Read the table contents of a docx.
    :param open_txt: the opened docx Document object
    :return:
    """
    numTables = open_txt.tables
    for table in numTables:
        row_count = len(table.rows)
        col_count = len(table.columns)
        for i in range(row_count):
            for j in range(col_count):
                print(table.cell(i, j).text)
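A quick, hypothetical check of get_entity above, not taken from the committed files. It assumes paddlehub and its 'lac' model are installed; LAC is expected, though not guaranteed, to tag the invented sentence as indicated in the comments.

sentence = '张三2015年毕业于武汉大学'
print(get_entity(sentence, 'PER'))   # expected to contain '张三'
print(get_entity(sentence, 'ORG'))   # expected to contain '武汉大学'
print(get_entity(sentence, 'TIME'))  # expected to contain '2015年'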
utils/chkmail/qqemail.py (new file, 138 lines added)
@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
import poplib
import base64
import os
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr

email_user = {
    '李宗振': {
        'email': '1986461823@qq.com',
        'pwd': 'hoosihokeaqkifdf'
    },
    '吴操': {
        'email': '2787668634@qq.com',
        'pwd': 'jendjvizztqsdebb'
    }
}


def email_users(dirname, emaildict):
    # create the attachment folder if it does not exist yet
    dirpath = './{0}'.format(dirname)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    # connect to the POP3 server:
    server = poplib.POP3("pop.qq.com")
    # debug output can be switched on or off here:
    server.set_debuglevel(1)
    # optional: print the POP3 server's welcome message:
    print(server.getwelcome().decode('utf-8'))

    # authenticate:
    server.user(emaildict['email'])
    # server.user("1986461823@qq.com")
    # not the account password: the authorization code QQ Mail issues for third-party
    # clients, obtainable in the QQ Mail settings
    server.pass_(emaildict['pwd'])
    # server.pass_("hoosihokeaqkifdf")

    # stat() returns the message count and mailbox size:
    print('Messages: %s. Size: %s' % server.stat())
    # list() returns the number of every message:
    resp, mails, octets = server.list()
    # the returned list looks like [b'1 82923', b'2 2184', ...]
    print(mails)

    # walk through every message; note that indexes start at 1:
    index = len(mails)
    for i in range(1, index + 1):
        resp, lines, octets = server.retr(i)

        # lines holds every line of the raw message text;
        # join them to get the whole raw message:
        try:
            msg_content = b'\r\n'.join(lines).decode('utf-8')
        except UnicodeDecodeError:
            continue
        # parse the raw text into a message object:
        msg = Parser().parsestr(msg_content)

        print_info(msg, dirpath)
    # messages can also be deleted from the server by index:
    # server.dele(index)
    # close the connection:
    server.quit()


def print_info(msg, dirpath, indent=0):
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart' or part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        # save attachments
        if fileName:
            filename = ''
            transfer_encoding = part.get_all('Content-Transfer-Encoding')
            if transfer_encoding and transfer_encoding[0] == 'base64':
                filename_parts = fileName.split('?')
                filename = base64.b64decode(filename_parts[3]).decode(filename_parts[1])

            data = part.get_payload(decode=True)
            if filename:
                filename_path = dirpath + "/{0}".format(filename)
                # skip files that already exist
                if os.path.exists(filename_path):
                    continue
                fEx = open(filename_path, 'wb')
                fEx.write(data)
                fEx.close()
    if indent == 0:
        for header in ['From', 'To', 'Subject']:
            value = msg.get(header, '')
            if value:
                if header == 'Subject':
                    value = decode_str(value)
                else:
                    hdr, addr = parseaddr(value)
                    name = decode_str(hdr)
                    value = u'%s <%s>' % (name, addr)
            print('%s%s: %s' % (' ' * indent, header, value))
    if msg.is_multipart():
        parts = msg.get_payload()
        for n, part in enumerate(parts):
            print('%spart %s' % (' ' * indent, n))
            print('%s--------------------' % (' ' * indent))
            print_info(part, dirpath, indent + 1)
    else:
        content_type = msg.get_content_type()
        if content_type == 'text/plain' or content_type == 'text/html':
            content = msg.get_payload(decode=True)
            charset = guess_charset(msg)
            if charset:
                pass
        else:
            print('%sAttachment: %s' % (' ' * indent, content_type))


def decode_str(s):
    value, charset = decode_header(s)[0]
    if charset:
        value = value.decode(charset)
    return value


def guess_charset(msg):
    charset = msg.get_charset()
    if charset is None:
        content_type = msg.get('Content-Type', '').lower()
        pos = content_type.find('charset=')
        if pos >= 0:
            charset = content_type[pos + 8:].strip()
    return charset


if __name__ == '__main__':
    for dirname, email_dict in email_user.items():
        email_users(dirname, email_dict)
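The attachment handling in print_info splits an RFC 2047 encoded filename on '?' and base64-decodes the fourth field. A self-contained sketch of that decoding step, with an invented attachment name:

import base64

raw_name = '简历.pdf'
# what an encoded attachment name looks like: '=?utf-8?B?<base64>?='
fileName = '=?utf-8?B?' + base64.b64encode(raw_name.encode('utf-8')).decode('ascii') + '?='
parts = fileName.split('?')                           # ['=', 'utf-8', 'B', '<base64>', '=']
print(base64.b64decode(parts[3]).decode(parts[1]))    # 简历.pdf

The standard-library email.header.decode_header handles the same encoding, including non-base64 variants, so it could be a more robust choice for this step.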
utils/chkmail/teststr.py (new file, 63 lines added)
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import copy

txtlist = [{'公司名': [{'end': 353,
                     'probability': 0.8196403474903491,
                     'start': 341,
                     'text': '武汉漫维智能科技有限公司'},
                    {'end': 20,
                     'probability': 0.8494340282651791,
                     'start': 6,
                     'text': '武汉中软国际科技服务有限公司'},
                    {'end': 400,
                     'probability': 0.5690599404322967,
                     'start': 388,
                     'text': '武汉漫维智能科技有限公司'},
                    {'end': 733,
                     'probability': 0.9766299737741235,
                     'start': 721,
                     'text': '广州中道电子科技有限公司'}],
            '时间': [{'end': 34,
                    'probability': 0.6200274175388927,
                    'start': 22,
                    'text': '2018.03 - 至今'},
                   {'end': 383,
                    'probability': 0.4970208179496325,
                    'start': 366,
                    'text': '2017.09 - 2018.04'},
                   {'end': 752,
                    'probability': 0.5228238735354154,
                    'start': 735,
                    'text': '2015.11 - 2017.09'}]}]


def chkworlkandtime(dictdata):
    # for each label, keep only the highest-probability occurrence of every text
    # value and return the survivors sorted by their start offset
    res = {}
    for i in dictdata:
        for key, datalist in i.items():
            trueDict = {}
            for data in datalist:
                if data['text'] in trueDict:
                    if data['probability'] <= trueDict[data['text']]['probability']:
                        continue
                trueDict.update({
                    data['text']: {
                        'end': data['end'],
                        'probability': data['probability'],
                        'start': data['start'],
                    }
                })
            trueList = []
            for key1, value1 in trueDict.items():
                value1.update({
                    'text': key1
                })
                trueDict1 = copy.deepcopy(value1)
                trueList.append(trueDict1)
            trueList.sort(key=lambda item: item['start'])
            res.update({key: trueList})

    return res


chkworlkandtime(txtlist)