diff --git a/utils/chkmail/chkjianli.py b/utils/chkmail/chkjianli.py new file mode 100644 index 0000000..8c5f088 --- /dev/null +++ b/utils/chkmail/chkjianli.py @@ -0,0 +1,298 @@ +import re + +import docx +import os +import copy +from pprint import pprint +from paddlenlp import Taskflow + +from gongju import get_entity, getText_docx, getText_pdf, doc_docx, pdf_docx + + +def fmtTxt(txt, istable=0): + chkStr = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历', + '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'] + chkList1 = ['自我评价', '自我描述', '个人优势'] + chkList2 = ['项目经历', '项目经验', '项目描述'] + chkList3 = ['教育经历', '学习经历'] + chkList4 = ['工作经历', '工作经验', '实习经历'] + chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'] + fmtList = [] # 返回拼接好的字符串列表 + trueIndex = 0 + fmtStr = '' + nowChkList = [] + # 判断while循环是否需要停止 + stop_int = 0 + for index, i in enumerate(txt): + if istable: + text = i + else: + text = i.text + text = re.sub('\s+', '', text).lstrip() # 字符串去除空格和换行符 + + # 没有检测出关键字 + if not fmtStr: + + for i in chkList1: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + break + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList1] + # 检测出关键字证明需要继续循环 + stop_int = 1 + break + if fmtStr: + continue + for i in chkList2: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + break + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList2] + stop_int = 1 + break + if fmtStr: + continue + for i in chkList3: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList3] + stop_int = 1 + break + if fmtStr: + continue + for i in chkList4: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + break + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList4] + stop_int = 1 + break + if fmtStr: + continue + for i in chkList5: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + break + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList5] + stop_int = 1 + break + continue + else: + isTure = 1 + for i in nowChkList: + if i in text: + isTure = 0 + break + if isTure: + fmtStr += text + continue + else: + fmtStrTrue = fmtStr + fmtList.append(fmtStrTrue) + trueIndex = index + # fmtStr = '' + # nowChkList = [] + # 剩余没有检索的部分 + txt1 = txt[trueIndex:] + return fmtList, txt1, stop_int + + # 当列表全部检索完毕需要停止循环 + if fmtStr: + fmtStrTrue = fmtStr + fmtList.append(fmtStrTrue) + stop_int = 0 + txt1 = txt[trueIndex:] + return fmtList, txt1, stop_int + + +def fmtList(txtlist): + chkList1 = ['自我评价', '自我描述', '个人优势'] + chkList2 = ['项目经历', '项目经验', '项目描述'] + chkList3 = ['教育经历', '学习经历'] + chkList4 = ['工作经历', '工作经验', '实习经历'] + chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'] + # 自我评价 + review = [] + # 项目经验 + project = [] + # 工作经验 + work = [] + # 教育经验 + upgrade = [] + # 技能特长 + specialty = [] + for text in txtlist: + ischk = 0 + # 自我评价 + for i in chkList1: + if i in text: + review.append(text) + ischk = 1 + break + if ischk: + continue + # 项目经验 + for i in chkList2: + if i in text: + project.append(text) + ischk = 1 + break + if ischk: + continue + # 工作经验 + for i in chkList4: + if i in text: + work.append(text) + ischk = 1 + break + if ischk: + continue + # 教育经历 + for i in chkList3: + if i in text: + upgrade.append(text) + ischk = 1 + break + if ischk: + continue + # 自我评价 + for i in chkList5: + if i in text: + specialty.append(text) + ischk = 1 + break + if ischk: + continue + + return review, project, work, upgrade, specialty + + +def get_date(schema, dates): + """ + 把第三方获取的数据筛选出想要的基本信息 + :param schema: + :param dates: + :return: + """ + date = {} + for i in schema: + text = dates[0].get(i, '') + # 如果数据中没有搜到对应的键,返回空字符串 + if text == '': + date[i] = text + else: + if len(text) == 1: + date[i] = text[0]['text'] + else: + aa = {} + num = [] + for dic in text: + aa[dic['probability']] = dic['text'] + num.append(dic['probability']) + date[i] = aa[max(num)] + return date + + +# 文件路径 +PATH_DATA = os.path.abspath("D:/wokerplay/面试简历1") +schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业', + '期望薪资', '在校时间', '到岗时间', '工作经历', '自我评价', '电子邮箱', '技能', '特长', '工作经验', '项目经验' + ] +for root, dirs, files in os.walk(PATH_DATA): + for file in files: # 一个file就是一份简历 + url = PATH_DATA + f"/{file}" + # 名字 + name = '' + # 电话 + phone = '' + # 电子邮箱 + mail = '' + # 民族 + nation = '' + # 毕业院校 + school = '' + # 专业 + major = '' + # 工作经验 + work_exp = '' + # 婚姻状况 + gam = '' + # 地址 + site = [] + # 婚姻状况 + marriage = '' + # 自我评价 + review = [] + # 项目经验 + project = [] + # 工作经验 + work = [] + # 教育经验 + upgrade = [] + # 技能特长 + specialty = [] + if os.path.splitext(file)[1] == '.pdf': + pdf_docx(PATH_DATA, file) # 转为docx + name = file.split('.')[0] + open_txt = docx.Document(PATH_DATA + f"/{name}.docx") # 打开docx + txt = getText_pdf(url) # 打开pdf格式文件转txt + # txt = getText_docx(PATH_DATA + f"\{name}.docx") + elif os.path.splitext(file)[1] == '.docx': + open_txt = docx.Document(url) # 打开docx,将用来读取每一段的内容 + txt = getText_docx(url) # 打开docx格式文件转txt + elif os.path.splitext(file)[1] == '.doc': + doc_docx(PATH_DATA, file) # 转为docx + name = file.split('.')[0] + open_txt = docx.Document(PATH_DATA + f"/{name}.docx") # 打开docx + txt = getText_docx(PATH_DATA + f"/{name}.docx") # 打开docx格式文件转txt + # ie = Taskflow('information_extraction', schema=schema) # 花费时间会安装文件 + # pprint(ie(txt)) # 姓名,电话,电子邮箱,民族,毕业院校,专业,工作经验,婚姻状况 + # #获取的基础数据 + # text_lists=ie(txt) + # 打开docx获取的每一段数据 + txt_list = open_txt.paragraphs + # 获取的文档内容 + txt_list1 = [] + stop_int = 1 + txt1 = txt_list + while stop_int: + txt_list2, txt1, stop_int = fmtTxt(txt1) + txt_list1 += txt_list2 + print(txt_list1) + + numTables = open_txt.tables # 获取表格里面的内容 + table_list = [] + if len(numTables) > 0: + for table in numTables: + row_count = len(table.rows) + col_count = len(table.columns) + for i in range(row_count): + for j in range(col_count): + texts = table.cell(i, j).text + texts = re.sub('\s+', '', texts).lstrip() # 字符串去除空格和换行符 + if not texts: + continue + if texts in table_list: + continue + table_list.append(texts) + if table_list: + stop_table = 1 + table1 = table_list + while stop_table: + table_list2, table1, stop_table = fmtTxt(table1, istable=1) + txt_list1 += table_list2 + print(txt_list1) + + review, project, work, upgrade, specialty = fmtList(txt_list1) diff --git a/utils/chkmail/chkmail.py b/utils/chkmail/chkmail.py new file mode 100644 index 0000000..dd0aeee --- /dev/null +++ b/utils/chkmail/chkmail.py @@ -0,0 +1,315 @@ +import re +import docx +import os +from pprint import pprint +from paddlenlp import Taskflow +from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx + + +def get_date(schema, dates, schema_dict): + """ + 把第三方获取的数据筛选出想要的基本信息 + :param schema:中文的词性标注 + :param dates:原数据 + :param schema_dict:对应中文的英文 + :return: 返回取出概率最大的基本信息数据 + """ + date = {} + for i in schema: + text = dates[0].get(i, '') + # 如果数据中没有搜到对应的键,返回空字符串 + if text == '': + date[schema_dict[i]] = text + else: + if len(text) == 1: + date[schema_dict[i]] = text[0]['text'] + else: + aa = {} + num = [] + for dic in text: + aa[dic['probability']] = dic['text'] + num.append(dic['probability']) + # 取出概率最大的值 + date[schema_dict[i]] = aa[max(num)] + return date + + +def fmtTxt(txt, istable=0): + # 所有关键字 + chkStr = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历', + '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能', '个人评价'] + # 自我描述 + chkList1 = ['自我评价', '自我描述', '个人优势', '个人评价'] + # 项目经验 + chkList2 = ['项目经历', '项目经验', '项目描述'] + # 教育背景 + chkList3 = ['教育经历', '学习经历'] + # 工作经历 + chkList4 = ['工作经历', '工作经验', '实习经历'] + # 个人技能 + chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'] + fmtList = [] # 返回拼接好的字符串列表 + trueIndex = 0 + fmtStr = '' + nowChkList = [] + # 判断while循环是否需要停止 + stop_int = 0 + for index, i in enumerate(txt): + if istable: + text = i + else: + text = i.text + text = re.sub('\s+', '', text).lstrip() # 字符串去除空格和换行符 + + # 没有检测出关键字 + if not fmtStr: + + for i in chkList1: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + continue + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList1] + # 检测出关键字证明需要继续循环 + stop_int = 1 + break + if fmtStr: + continue + for i in chkList2: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + continue + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList2] + stop_int = 1 + break + if fmtStr: + continue + for i in chkList3: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList3] + stop_int = 1 + break + if fmtStr: + continue + for i in chkList4: + # 判断是不是以关键字开头 + if not text.startswith(i, 0): + continue + else: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList4] + stop_int = 1 + break + if fmtStr: + continue + for i in chkList5: + if i in text: + fmtStr = text + nowChkList = [chk for chk in chkStr if chk not in chkList5] + stop_int = 1 + break + continue + else: + isTure = 1 + for i in nowChkList: + if i in text: + isTure = 0 + break + if isTure: + fmtStr += text + continue + else: + fmtStrTrue = fmtStr + fmtList.append(fmtStrTrue) + trueIndex = index + # fmtStr = '' + # nowChkList = [] + # 剩余没有检索的部分 + txt1 = txt[trueIndex:] + return fmtList, txt1, stop_int + + # 当列表全部检索完毕需要停止循环 + if fmtStr: + fmtStrTrue = fmtStr + fmtList.append(fmtStrTrue) + stop_int = 0 + txt1 = txt[trueIndex:] + return fmtList, txt1, stop_int + + +def fmtList(txtlist, dates): + chkList1 = ['自我评价', '自我描述', '个人优势'] + chkList2 = ['项目经历', '项目经验', '项目描述'] + chkList3 = ['教育经历', '学习经历'] + chkList4 = ['工作经历', '工作经验', '实习经历'] + chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'] + # 自我评价 + review = [] + # 项目经验 + project = [] + # 工作经验 + work = [] + # 教育经验 + upgrade = [] + # 技能特长 + specialty = [] + for text in txtlist: + ischk = 0 + # 自我评价 + for i in chkList1: + if i in text: + review.append(text) + ischk = 1 + break + if ischk: + continue + # 项目经验 + for i in chkList2: + if i in text: + project.append(text) + ischk = 1 + break + if ischk: + continue + # 工作经验 + for i in chkList4: + if i in text: + work.append(text) + ischk = 1 + break + if ischk: + continue + # 教育经历 + for i in chkList3: + if i in text: + upgrade.append(text) + ischk = 1 + break + if ischk: + continue + # 自我评价 + for i in chkList5: + if i in text: + specialty.append(text) + ischk = 1 + break + if ischk: + continue + # review自我评价, project项目经验,work工作经验,upgrade教育经历,specialty技能特长 + dates.update({ + 'review': review, + 'project': project, + 'work': work, + 'upgrade': upgrade, + 'specialty': specialty, + }) + + return dates + + +# 文件路径 +PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历") +schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业', + '期望薪资', '在校时间', '电子邮箱', '工作经验','Email' + ] +schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height', + '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career', + '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp','Email':'mail'} +for root, dirs, files in os.walk(PATH_DATA): + for file in files: # 一个file就是一份简历 + url = PATH_DATA + f"/{file}" + # # 名字 + # name = '' + # # 电话 + # phone = '' + # # 电子邮箱 + # mail = '' + # # 民族 + # nation = '' + # # 毕业院校 + # school = '' + # # 专业 + # major = '' + # # 工作经验 + # work_exp = '' + # # 婚姻状况 + # gam = '' + # # 地址 + # site = [] + # # 婚姻状况 + # marriage = '' + # # 自我评价 + # review = [] + # # 项目经验 + # project = [] + # # 工作经验 + # work = [] + # # 教育经验 + # upgrade = [] + # # 技能特长 + # specialty = [] + if os.path.splitext(file)[1] == '.pdf': + pdf_docx(PATH_DATA, file) # 转为docx + name = file.split('.')[0] + open_txt = docx.Document(PATH_DATA + f"/{name}.docx") # 打开docx + txt = getText_pdf(url) # 打开pdf格式文件转txt + # txt = getText_docx(PATH_DATA + f"\{name}.docx") + elif os.path.splitext(file)[1] == '.docx': + open_txt = docx.Document(url) # 打开docx,将用来读取每一段的内容 + txt = getText_docx(url) # 打开docx格式文件转txt + elif os.path.splitext(file)[1] == '.doc': + doc_docx(PATH_DATA, file) # 转为docx + name = file.split('.')[0] + open_txt = docx.Document(PATH_DATA + f"/{name}.docx") # 打开docx + txt = getText_docx(PATH_DATA + f"/{name}.docx") # 打开docx格式文件转txt + ie = Taskflow('information_extraction', schema=schema) # 花费时间会安装文件 + # pprint(ie(txt)) # 姓名,电话,电子邮箱,民族,毕业院校,专业,工作经验,婚姻状况 + # 获取的基础数据 + text_lists = ie(txt) + # 处理后的基本数据 + dates = get_date(schema, text_lists, schema_dict) + # 打开docx获取的每一段数据 + txt_list = open_txt.paragraphs + # 获取的文档内容 + txt_list1 = [] + stop_int = 1 + txt1 = txt_list + while stop_int: + txt_list2, txt1, stop_int = fmtTxt(txt1) + txt_list1 += txt_list2 + print(txt_list1) + + numTables = open_txt.tables # 获取表格里面的内容 + table_list = [] + if len(numTables) > 0: + for table in numTables: + row_count = len(table.rows) + col_count = len(table.columns) + for i in range(row_count): + for j in range(col_count): + texts = table.cell(i, j).text + texts = re.sub('\s+', '', texts).lstrip() # 字符串去除空格和换行符 + if not texts: + continue + if texts in table_list: + continue + table_list.append(texts) + if table_list: + stop_table = 1 + table1 = table_list + while stop_table: + table_list2, table1, stop_table = fmtTxt(table1, istable=1) + txt_list1 += table_list2 + # print(txt_list1) + # review自我评价,project项目经验,work工作经验,upgrade教育经历,specialty技能特长 + # 把两部分的数据合起来返回前端 + datess = fmtList(txt_list1, dates) + pprint(datess) + a = 1 diff --git a/utils/chkmail/gongju.py b/utils/chkmail/gongju.py new file mode 100644 index 0000000..3d338f6 --- /dev/null +++ b/utils/chkmail/gongju.py @@ -0,0 +1,164 @@ +import fnmatch +import docx +# import win32com.client +import os +from pdf2docx import Converter + +PATH_DATA = os.path.abspath(r"C:\Users\Administrator\Desktop\新建文件夹") # word简历存放路径 + +# 将docx,doc文件转换成txt文件 +def docx_to_txt(): + wordapp = win32com.client.gencache.EnsureDispatch("Word.Application") + try: + for root, dirs, files in os.walk(PATH_DATA): + for _dir in dirs: + pass + for _file in files: + if not (fnmatch.fnmatch(_file, '*.doc') or fnmatch.fnmatch(_file, '*.docx')) or _file.startswith("~"): + continue + print('_file:', _file) + file = os.path.join(root, _file) + wordapp.Documents.Open(file) + if fnmatch.fnmatch(_file, '*.docx'): # 匹配doc文档 + file = file[:-3] + 'txt' + else: # 匹配docx文档 + file = file[:-4] + 'txt' + wordapp.ActiveDocument.SaveAs(file, FileFormat=win32com.client.constants.wdFormatText, + Encoding=65001) # 这里直接转换为 utf-8 格式的txt + # https://docs.microsoft.com/zh-cn/office/vba/api/Office.MsoEncoding 各种格式代码在这里查 + wordapp.ActiveDocument.Close() + + finally: + wordapp.Quit() + + +# 将pdf文件转换成txt文件 +def pdf_to_txt(pdf_path): + with pdfplumber.open(pdf_path) as pdf_file: + content = '' + for i in range(len(pdf_file.pages)): + page_text = pdf_file.pages[i] + page_content = page_text.extract_text() + if page_content: + content = content + page_content + "\n" + with open(f"{pdf_path.split('.')[0]}.txt", "w", encoding="utf-8") as file: + file.write(content) + file.close() + + + +import paddlehub as hub +import numpy as np + + +def get_model(): + lac = hub.Module(name='lac') + return lac + + +def get_lac(text): + inputs = {"text": [text]} + lac = get_model() + res = lac.lexical_analysis(data=inputs) + tag = res[0]['tag'] + word = res[0]['word'] + return tag, word + + +def get_entity(text, label): + ''' + label参数可以为 + 'PER' : 人名 + 'LOC' : 地名 + 'ORG' : 机构名 + 'TIME' : 时间 + ''' + res = [] + tag, word = get_lac(text) + tag = np.array(tag) + indexs = np.where(tag == label)[0] + for index in indexs: + res.append(word[index]) + return res + + +def getText_docx(filename): # docx 转text + """将docx读成text""" + doc = docx.Document(filename) + fullText = [] + for i in doc.paragraphs: # 迭代docx文档里面的每一个段落 + fullText.append(i.text) # 保存每一个段落的文本 + numTables = doc.tables #如果有表格的内容存放在这 + if len(numTables) > 0: + for table in numTables: + row_count = len(table.rows) + col_count = len(table.columns) + for i in range(row_count): + for j in range(col_count): + fullText.append(table.cell(i, j).text) + return '\n'.join(fullText) + + +import pdfplumber + + +def getText_pdf(filename): + """将pdf读成text""" + with pdfplumber.open(filename) as pdf_file: + content = '' + for i in range(len(pdf_file.pages)): + page_text = pdf_file.pages[i] + page_content = page_text.extract_text() + if page_content: + content = content + page_content + "\n" + return content + + +from win32com import client as wc + + +def doc_docx(url, filename): + """ + 将doc文件转为docx文件 + :param filename: + :return: + """ + word = wc.Dispatch("Word.Application") + doc = word.Documents.Open(url + f"/{filename}") + name = filename.split('.')[0] + doc.SaveAs(url + f'/{name}.docx', 12) # 12为docx + doc.Close() + word.Quit() + + +def pdf_docx(url,filename): + """ + 将pdf文件转为docx文件 + :param url: + :param filename: + :return: + """ + # 获取文件名称 + file_name = filename.split('.')[0] + # pdf文件名称 + pdf_name = url + f"/{filename}" + # docx文件名称 + docx_name = url +f"/{file_name}.docx" + # 加载pdf文档 + cv = Converter(pdf_name) + cv.convert(docx_name,start=0,end=12) + cv.close() + +def read_tables(open_txt): + """ + 读取docx的表格内容 + :param open_txt: 打开docx后的对象 + :return: + """ + numTables = open_txt.tables + for table in numTables: + row_count = len(table.rows) + col_count = len(table.columns) + for i in range(row_count): + for j in range(col_count): + print(table.cell(i, j).text) diff --git a/utils/chkmail/qqemail.py b/utils/chkmail/qqemail.py new file mode 100644 index 0000000..ac65dff --- /dev/null +++ b/utils/chkmail/qqemail.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +import poplib +import base64 +import os +from email.parser import Parser +from email.header import decode_header +from email.utils import parseaddr + +email_user = { + '李宗振': { + 'email': '1986461823@qq.com', + 'pwd': 'hoosihokeaqkifdf' + }, + '吴操': { + 'email': '2787668634@qq.com', + 'pwd': 'jendjvizztqsdebb' + } +} + + +def email_users(dirname, emaildict): + # 判断文件夹是否存在不存在创建文件夹 + dirpath = './{0}'.format(dirname) + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + # 连接到POP3服务器: + server = poplib.POP3("pop.qq.com") + # 可以打开或关闭调试信息: + server.set_debuglevel(1) + # 可选:打印POP3服务器的欢迎文字: + print(server.getwelcome().decode('utf-8')) + + # 身份认证: + server.user(emaildict['email']) + # server.user("1986461823@qq.com") + # 非密码,qq邮箱登录第三方需要的授权码,可在qq邮箱设置里获得 + server.pass_(emaildict['pwd']) + # server.pass_("hoosihokeaqkifdf") + + # stat()返回邮件数量和占用空间: + print('Messages: %s. Size: %s' % server.stat()) + # list()返回所有邮件的编号: + resp, mails, octets = server.list() + # 可以查看返回的列表类似[b'1 82923', b'2 2184', ...] + print(mails) + + # 获取最新一封邮件, 注意索引号从1开始: + index = len(mails) + for i in range(1, index + 1): + resp, lines, octets = server.retr(i) + + # lines存储了邮件的原始文本的每一行, + # 可以获得整个邮件的原始文本: + try: + msg_content = b'\r\n'.join(lines).decode('utf-8') + except: + continue + # 稍后解析出邮件: + msg = Parser().parsestr(msg_content) + + print_info(msg, dirpath) + # 可以根据邮件索引号直接从服务器删除邮件: + # server.dele(index) + # 关闭连接: + server.quit() + + +def print_info(msg, dirpath, indent=0): + for part in msg.walk(): + if part.get_content_maintype() == 'multipart' or part.get('Content-Disposition') is None: + continue + fileName = part.get_filename() + # 保存附件 + if fileName: + filename = '' + transfer_encoding = part.get_all('Content-Transfer-Encoding') + if transfer_encoding and transfer_encoding[0] == 'base64': + filename_parts = fileName.split('?') + filename = base64.b64decode(filename_parts[3]).decode(filename_parts[1]) + + data = part.get_payload(decode=True) + if filename: + filename_path = dirpath + "/{0}".format(filename) + # 文件存在则直接跳过 + if os.path.exists(filename_path): + continue + fEx = open(filename_path, 'wb') + fEx.write(data) + fEx.close() + if indent == 0: + for header in ['From', 'To', 'Subject']: + value = msg.get(header, '') + if value: + if header == 'Subject': + value = decode_str(value) + else: + hdr, addr = parseaddr(value) + name = decode_str(hdr) + value = u'%s <%s>' % (name, addr) + print('%s%s: %s' % (' ' * indent, header, value)) + if (msg.is_multipart()): + parts = msg.get_payload() + for n, part in enumerate(parts): + print('%spart %s' % (' ' * indent, n)) + print('%s--------------------' % (' ' * indent)) + print_info(part, dirpath, indent + 1) + else: + content_type = msg.get_content_type() + if content_type == 'text/plain' or content_type == 'text/html': + content = msg.get_payload(decode=True) + charset = guess_charset(msg) + if charset: + pass + else: + print('%sAttachment: %s' % (' ' * indent, content_type)) + + +def decode_str(s): + value, charset = decode_header(s)[0] + if charset: + value = value.decode(charset) + return value + + +def guess_charset(msg): + charset = msg.get_charset() + if charset is None: + content_type = msg.get('Content-Type', '').lower() + pos = content_type.find('charset=') + if pos >= 0: + charset = content_type[pos + 8:].strip() + return charset + + +if __name__ == '__main__': + for dirname, email_dict in email_user.items(): + email_users(dirname, email_dict) diff --git a/utils/chkmail/teststr.py b/utils/chkmail/teststr.py new file mode 100644 index 0000000..368c968 --- /dev/null +++ b/utils/chkmail/teststr.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +import copy + +txtlist = [{'公司名': [{'end': 353, + 'probability': 0.8196403474903491, + 'start': 341, + 'text': '武汉漫维智能科技有限公司'}, + {'end': 20, + 'probability': 0.8494340282651791, + 'start': 6, + 'text': '武汉中软国际科技服务有限公司'}, + {'end': 400, + 'probability': 0.5690599404322967, + 'start': 388, + 'text': '武汉漫维智能科技有限公司'}, + {'end': 733, + 'probability': 0.9766299737741235, + 'start': 721, + 'text': '广州中道电子科技有限公司'}], + '时间': [{'end': 34, + 'probability': 0.6200274175388927, + 'start': 22, + 'text': '2018.03 - 至今'}, + {'end': 383, + 'probability': 0.4970208179496325, + 'start': 366, + 'text': '2017.09 - 2018.04'}, + {'end': 752, + 'probability': 0.5228238735354154, + 'start': 735, + 'text': '2015.11 - 2017.09'}]}] + + +def chkworlkandtime(dictdata): + res = {} + for i in dictdata: + for key, datalist in i.items(): + trueDict = {} + for data in datalist: + if data['text'] in trueDict: + if data['probability'] <= trueDict[data['text']]['probability']: + continue + trueDict.update({ + data['text']: { + 'end': data['end'], + 'probability': data['probability'], + 'start': data['start'], + } + }) + trueList = [] + for key1, value1 in trueDict.items(): + value1.update({ + 'text': key1 + }) + trueDict1 = copy.deepcopy(value1) + trueList.append(trueDict1) + trueList.sort(key=lambda item: item['start']) + res.update({key: trueList}) + + return res + + +chkworlkandtime(txtlist)