This commit is contained in:
李宗振 2022-08-08 14:29:03 +08:00
parent b0e65dc77d
commit cc927e614b
5 changed files with 978 additions and 0 deletions

298
utils/chkmail/chkjianli.py Normal file
View File

@ -0,0 +1,298 @@
import re
import docx
import os
import copy
from pprint import pprint
from paddlenlp import Taskflow
from gongju import get_entity, getText_docx, getText_pdf, doc_docx, pdf_docx
def fmtTxt(txt, istable=0):
    """Join resume lines into per-section strings.

    Scans ``txt`` for the first line that begins with a known section
    heading (self evaluation / project / education / work / skills) and
    concatenates the lines that follow onto it, until a heading of a
    different section shows up.

    :param txt: list of docx paragraph objects, or of plain strings when
        ``istable`` is truthy (table-cell texts).
    :param istable: 0 -> items expose ``.text``; 1 -> items are strings.
    :return: tuple (fmtList, txt1, stop_int) -- the collected section
        string(s), the still-unprocessed tail of ``txt``, and a flag
        telling the caller's while-loop to continue (1) or stop (0).
    """
    # every heading keyword we recognise
    chkStr = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历',
              '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    chkList1 = ['自我评价', '自我描述', '个人优势']  # self evaluation
    chkList2 = ['项目经历', '项目经验', '项目描述']  # project experience
    chkList3 = ['教育经历', '学习经历']  # education
    chkList4 = ['工作经历', '工作经验', '实习经历']  # work experience
    chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']  # skills
    fmtList = []  # joined section strings to return
    trueIndex = 0  # where the next, unprocessed part of txt starts
    fmtStr = ''  # section string currently being accumulated
    nowChkList = []  # headings that would end the current section
    stop_int = 0  # whether the caller's while-loop should run again
    for index, i in enumerate(txt):
        if istable:
            text = i
        else:
            text = i.text
        text = re.sub('\s+', '', text).lstrip()  # strip whitespace/newlines
        if not fmtStr:
            # no heading found yet: try each heading family in turn
            for i in chkList1:
                # BUG FIX (consistent with chkmail.py): was `break`, which
                # abandoned the whole family after the first keyword
                # mismatch; `continue` tries every keyword.
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList1]
                        # heading found, so the caller must loop again
                        stop_int = 1
                        break
            if fmtStr:
                continue
            for i in chkList2:
                # BUG FIX: `continue` (was `break`), see chkList1 above
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList2]
                        stop_int = 1
                        break
            if fmtStr:
                continue
            for i in chkList3:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList3]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList4:
                # BUG FIX: `continue` (was `break`), see chkList1 above
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList4]
                        stop_int = 1
                        break
            if fmtStr:
                continue
            for i in chkList5:
                # BUG FIX: `continue` (was `break`), see chkList1 above
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList5]
                        stop_int = 1
                        break
            continue
        else:
            # a section is accumulating: stop at the next foreign heading
            isTure = 1
            for i in nowChkList:
                if i in text:
                    isTure = 0
                    break
            if isTure:
                fmtStr += text
                continue
            else:
                fmtStrTrue = fmtStr
                fmtList.append(fmtStrTrue)
                trueIndex = index
                # hand back the not-yet-scanned tail
                txt1 = txt[trueIndex:]
                return fmtList, txt1, stop_int
    # the whole list was scanned: tell the caller to stop looping
    if fmtStr:
        fmtStrTrue = fmtStr
        fmtList.append(fmtStrTrue)
    stop_int = 0
    txt1 = txt[trueIndex:]
    return fmtList, txt1, stop_int
def fmtList(txtlist):
    """Sort collected section strings into the five resume categories.

    Each entry of ``txtlist`` goes into the first category whose keyword
    list contains a substring of it; entries matching nothing are dropped.

    :param txtlist: section strings produced by fmtTxt.
    :return: tuple (review, project, work, upgrade, specialty) of lists.
    """
    review = []     # self evaluation
    project = []    # project experience
    work = []       # work experience
    upgrade = []    # education
    specialty = []  # skills
    # (keyword family, destination bucket) pairs, checked in this order
    rules = [
        (['自我评价', '自我描述', '个人优势'], review),
        (['项目经历', '项目经验', '项目描述'], project),
        (['工作经历', '工作经验', '实习经历'], work),
        (['教育经历', '学习经历'], upgrade),
        (['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'], specialty),
    ]
    for text in txtlist:
        for keywords, bucket in rules:
            if any(key in text for key in keywords):
                bucket.append(text)
                break
    return review, project, work, upgrade, specialty
def get_date(schema, dates):
    """Pick the highest-probability candidate for every schema key.

    :param schema: keys to extract.
    :param dates: third-party extraction output -- a one-element list of
        dicts mapping a key to a list of {'text', 'probability', ...}.
    :return: dict mapping each schema key to the best candidate's text
        (empty string when the key is absent from the result).
    """
    date = {}
    for key in schema:
        candidates = dates[0].get(key, '')
        if candidates == '':
            # key missing from the extraction result -> empty string
            date[key] = candidates
        elif len(candidates) == 1:
            date[key] = candidates[0]['text']
        else:
            # several candidates: keep the most probable one (on a tie,
            # the last occurrence wins, as before)
            best = max(reversed(candidates), key=lambda c: c['probability'])
            date[key] = best['text']
    return date
# Folder containing the resumes to parse.
PATH_DATA = os.path.abspath("D:/wokerplay/面试简历1")
# Fields the information-extraction model should look for.
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
          '期望薪资', '在校时间', '到岗时间', '工作经历', '自我评价', '电子邮箱', '技能', '特长', '工作经验', '项目经验'
          ]
for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # each file is one resume
        url = PATH_DATA + f"/{file}"
        # per-resume basic-info placeholders
        # name
        name = ''
        # phone number
        phone = ''
        # e-mail address
        mail = ''
        # ethnicity
        nation = ''
        # school graduated from
        school = ''
        # major
        major = ''
        # work experience
        work_exp = ''
        # marital status
        gam = ''
        # address
        site = []
        # marital status -- NOTE(review): duplicates `gam` above
        marriage = ''
        # self evaluation
        review = []
        # project experience
        project = []
        # work experience
        work = []
        # education
        upgrade = []
        # skills
        specialty = []
        # NOTE(review): files with any other extension leave open_txt/txt
        # unset (or stale from the previous iteration) -- confirm only
        # .pdf/.docx/.doc files live in PATH_DATA.
        if os.path.splitext(file)[1] == '.pdf':
            pdf_docx(PATH_DATA, file)  # convert to docx first
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the converted docx
            txt = getText_pdf(url)  # read the pdf itself as plain text
            # txt = getText_docx(PATH_DATA + f"\{name}.docx")
        elif os.path.splitext(file)[1] == '.docx':
            open_txt = docx.Document(url)  # docx object, read paragraph by paragraph
            txt = getText_docx(url)  # full text of the docx
        elif os.path.splitext(file)[1] == '.doc':
            doc_docx(PATH_DATA, file)  # convert to docx first
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the converted docx
            txt = getText_docx(PATH_DATA + f"/{name}.docx")  # full text of the converted docx
        # ie = Taskflow('information_extraction', schema=schema)  # downloads model files on first use
        # pprint(ie(txt))  # name, phone, e-mail, ethnicity, school, major, work experience, marital status
        # # raw extraction output
        # text_lists = ie(txt)
        # every paragraph of the opened docx
        txt_list = open_txt.paragraphs
        # collected section strings
        txt_list1 = []
        stop_int = 1
        txt1 = txt_list
        # fmtTxt returns one section per call; loop until it reports done
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)
        numTables = open_txt.tables  # table contents, if any
        table_list = []
        if len(numTables) > 0:
            for table in numTables:
                row_count = len(table.rows)
                col_count = len(table.columns)
                for i in range(row_count):
                    for j in range(col_count):
                        texts = table.cell(i, j).text
                        texts = re.sub('\s+', '', texts).lstrip()  # strip whitespace/newlines
                        if not texts:
                            continue
                        if texts in table_list:
                            continue  # merged cells repeat their text; keep one copy
                        table_list.append(texts)
        if table_list:
            stop_table = 1
            table1 = table_list
            # same section-collection loop, but over the table cell strings
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2
        print(txt_list1)
        review, project, work, upgrade, specialty = fmtList(txt_list1)

315
utils/chkmail/chkmail.py Normal file
View File

@ -0,0 +1,315 @@
import re
import docx
import os
from pprint import pprint
from paddlenlp import Taskflow
from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx
def get_date(schema, dates, schema_dict):
    """Pick the most probable candidate for every schema field.

    :param schema: Chinese field names to extract.
    :param dates: raw Taskflow output (one-element list of dicts mapping a
        field to a list of {'text', 'probability', ...} candidates).
    :param schema_dict: maps each Chinese field name to its English key.
    :return: dict keyed by the English names, holding the best candidate
        text (empty string when the field is absent).
    """
    date = {}
    for field in schema:
        out_key = schema_dict[field]
        candidates = dates[0].get(field, '')
        if candidates == '':
            # field not found by the extractor -> keep an empty string
            date[out_key] = candidates
        elif len(candidates) == 1:
            date[out_key] = candidates[0]['text']
        else:
            # several candidates: keep the one with the highest
            # probability (on a tie, the last occurrence wins, as before)
            best = max(reversed(candidates), key=lambda c: c['probability'])
            date[out_key] = best['text']
    return date
def fmtTxt(txt, istable=0):
    """Group resume lines into per-section strings.

    Finds the first line that starts with a known section heading, then
    appends the following lines onto it until a heading of a different
    section appears.

    :param txt: list of docx paragraph objects, or plain strings when
        ``istable`` is truthy (table-cell texts).
    :param istable: 0 -> items expose ``.text``; 1 -> items are strings.
    :return: (fmtList, txt1, stop_int) -- the collected section
        string(s), the unprocessed tail of ``txt``, and whether the
        caller's while-loop should continue (1) or stop (0).
    """
    # every heading keyword we recognise
    chkStr = ['自我评价', '自我描述', '个人优势', '项目经历', '项目经验', '项目描述', '教育经历', '学习经历', '工作经历', '工作经验', '实习经历',
              '技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能', '个人评价']
    # self description
    chkList1 = ['自我评价', '自我描述', '个人优势', '个人评价']
    # project experience
    chkList2 = ['项目经历', '项目经验', '项目描述']
    # education background
    chkList3 = ['教育经历', '学习经历']
    # work experience
    chkList4 = ['工作经历', '工作经验', '实习经历']
    # personal skills
    chkList5 = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
    fmtList = []  # joined section strings to return
    trueIndex = 0  # where the next, unprocessed part of txt starts
    fmtStr = ''  # section string currently being accumulated
    nowChkList = []  # headings that would end the current section
    # whether the caller's while-loop should run again
    stop_int = 0
    for index, i in enumerate(txt):
        if istable:
            text = i
        else:
            text = i.text
        text = re.sub('\s+', '', text).lstrip()  # strip whitespace/newlines
        # no heading detected yet
        if not fmtStr:
            for i in chkList1:
                # heading must be a prefix of the line
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList1]
                        # heading found, so the caller must loop again
                        stop_int = 1
                        break
            if fmtStr:
                continue
            for i in chkList2:
                # heading must be a prefix of the line
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList2]
                        stop_int = 1
                        break
            if fmtStr:
                continue
            for i in chkList3:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList3]
                    stop_int = 1
                    break
            if fmtStr:
                continue
            for i in chkList4:
                # heading must be a prefix of the line
                if not text.startswith(i, 0):
                    continue
                else:
                    if i in text:
                        fmtStr = text
                        nowChkList = [chk for chk in chkStr if chk not in chkList4]
                        stop_int = 1
                        break
            if fmtStr:
                continue
            for i in chkList5:
                if i in text:
                    fmtStr = text
                    nowChkList = [chk for chk in chkStr if chk not in chkList5]
                    stop_int = 1
                    break
            continue
        else:
            # a section is accumulating: stop at the next foreign heading
            isTure = 1
            for i in nowChkList:
                if i in text:
                    isTure = 0
                    break
            if isTure:
                fmtStr += text
                continue
            else:
                fmtStrTrue = fmtStr
                fmtList.append(fmtStrTrue)
                trueIndex = index
                # fmtStr = ''
                # nowChkList = []
                # hand back the part that has not been scanned yet
                txt1 = txt[trueIndex:]
                return fmtList, txt1, stop_int
    # the whole list was scanned: tell the caller to stop looping
    if fmtStr:
        fmtStrTrue = fmtStr
        fmtList.append(fmtStrTrue)
    stop_int = 0
    txt1 = txt[trueIndex:]
    return fmtList, txt1, stop_int
def fmtList(txtlist, dates):
    """Bucket section strings into the five resume categories and merge
    them into ``dates``.

    :param txtlist: section strings produced by fmtTxt.
    :param dates: dict of basic info; extended in place with the buckets.
    :return: ``dates`` with 'review' (self evaluation), 'project',
        'work', 'upgrade' (education) and 'specialty' (skills) added.
    """
    review = []     # self evaluation
    project = []    # project experience
    work = []       # work experience
    upgrade = []    # education
    specialty = []  # skills
    # (keyword family, destination bucket) pairs, checked in this order
    rules = [
        (['自我评价', '自我描述', '个人优势'], review),
        (['项目经历', '项目经验', '项目描述'], project),
        (['工作经历', '工作经验', '实习经历'], work),
        (['教育经历', '学习经历'], upgrade),
        (['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'], specialty),
    ]
    for text in txtlist:
        for keywords, bucket in rules:
            if any(key in text for key in keywords):
                bucket.append(text)
                break
    dates.update({
        'review': review,
        'project': project,
        'work': work,
        'upgrade': upgrade,
        'specialty': specialty,
    })
    return dates
# Folder containing the resumes to parse.
PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")
# Chinese field names the information-extraction model should look for.
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
          '期望薪资', '在校时间', '电子邮箱', '工作经验','Email'
          ]
# Chinese field name -> English key used in the result dict.
schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height',
               '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career',
               '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp','Email':'mail'}
for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # each file is one resume
        url = PATH_DATA + f"/{file}"
        # (commented-out per-field placeholder variables removed: the
        # basic info now comes from get_date() below)
        # NOTE(review): files with any other extension leave open_txt/txt
        # unset (or stale from the previous iteration) -- confirm only
        # .pdf/.docx/.doc files live in PATH_DATA.
        if os.path.splitext(file)[1] == '.pdf':
            pdf_docx(PATH_DATA, file)  # convert to docx first
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the converted docx
            txt = getText_pdf(url)  # read the pdf itself as plain text
            # txt = getText_docx(PATH_DATA + f"\{name}.docx")
        elif os.path.splitext(file)[1] == '.docx':
            open_txt = docx.Document(url)  # docx object, read paragraph by paragraph
            txt = getText_docx(url)  # full text of the docx
        elif os.path.splitext(file)[1] == '.doc':
            doc_docx(PATH_DATA, file)  # convert to docx first
            name = file.split('.')[0]
            open_txt = docx.Document(PATH_DATA + f"/{name}.docx")  # open the converted docx
            txt = getText_docx(PATH_DATA + f"/{name}.docx")  # full text of the converted docx
        # downloads model files on first use; NOTE(review): constructing
        # the Taskflow per file is expensive -- consider hoisting it
        # outside the loop
        ie = Taskflow('information_extraction', schema=schema)
        # pprint(ie(txt))  # name, phone, e-mail, ethnicity, school, major, work experience, marital status
        # raw extraction output
        text_lists = ie(txt)
        # cleaned basic info, keyed by the English names
        dates = get_date(schema, text_lists, schema_dict)
        # every paragraph of the opened docx
        txt_list = open_txt.paragraphs
        # collected section strings
        txt_list1 = []
        stop_int = 1
        txt1 = txt_list
        # fmtTxt returns one section per call; loop until it reports done
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)
        numTables = open_txt.tables  # table contents, if any
        table_list = []
        if len(numTables) > 0:
            for table in numTables:
                row_count = len(table.rows)
                col_count = len(table.columns)
                for i in range(row_count):
                    for j in range(col_count):
                        texts = table.cell(i, j).text
                        texts = re.sub('\s+', '', texts).lstrip()  # strip whitespace/newlines
                        if not texts:
                            continue
                        if texts in table_list:
                            continue  # merged cells repeat their text; keep one copy
                        table_list.append(texts)
        if table_list:
            stop_table = 1
            table1 = table_list
            # same section-collection loop, but over the table cell strings
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2
        # print(txt_list1)
        # merge the section buckets into the basic info for the frontend
        datess = fmtList(txt_list1, dates)
        pprint(datess)
        a = 1  # NOTE(review): leftover debugger anchor; safe to delete

164
utils/chkmail/gongju.py Normal file
View File

@ -0,0 +1,164 @@
import fnmatch
import docx
# import win32com.client
import os
from pdf2docx import Converter
PATH_DATA = os.path.abspath(r"C:\Users\Administrator\Desktop\新建文件夹")  # folder holding the Word resumes
# Batch-convert the .doc/.docx files under PATH_DATA to .txt files.
def docx_to_txt():
    """Convert every Word resume under PATH_DATA to a UTF-8 .txt file.

    NOTE(review): depends on win32com, whose import is commented out at
    the top of this module -- restore it before calling (Windows with MS
    Word installed only).
    """
    wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        for root, dirs, files in os.walk(PATH_DATA):
            for _dir in dirs:
                pass
            for _file in files:
                # skip non-Word files and Office lock files ("~$...")
                if not (fnmatch.fnmatch(_file, '*.doc') or fnmatch.fnmatch(_file, '*.docx')) or _file.startswith("~"):
                    continue
                print('_file:', _file)
                file = os.path.join(root, _file)
                wordapp.Documents.Open(file)
                if fnmatch.fnmatch(_file, '*.docx'):  # .docx document
                    # BUG FIX: strip the 4-char 'docx' suffix (was [:-3],
                    # which produced names like 'x.dtxt')
                    file = file[:-4] + 'txt'
                else:  # .doc document
                    # BUG FIX: strip the 3-char 'doc' suffix (was [:-4],
                    # which swallowed the dot: 'xtxt')
                    file = file[:-3] + 'txt'
                wordapp.ActiveDocument.SaveAs(file, FileFormat=win32com.client.constants.wdFormatText,
                                              Encoding=65001)  # 65001 = UTF-8
                # encoding codes: https://docs.microsoft.com/zh-cn/office/vba/api/Office.MsoEncoding
                wordapp.ActiveDocument.Close()
    finally:
        wordapp.Quit()
# Convert a pdf file into a txt file next to it.
def pdf_to_txt(pdf_path):
    """Extract all text from ``pdf_path`` and write it as ``<name>.txt``.

    :param pdf_path: path to the PDF file; the output keeps the same base
        name with a .txt extension (UTF-8).
    """
    with pdfplumber.open(pdf_path) as pdf_file:
        content = ''
        for page_text in pdf_file.pages:
            # extract_text() may return None for image-only pages
            page_content = page_text.extract_text()
            if page_content:
                content = content + page_content + "\n"
    with open(f"{pdf_path.split('.')[0]}.txt", "w", encoding="utf-8") as file:
        file.write(content)
        # BUG FIX: removed the redundant file.close() -- the with-block
        # already closes the file on exit.
import paddlehub as hub
import numpy as np
def get_model():
    """Return the PaddleHub LAC lexical-analysis module."""
    return hub.Module(name='lac')
def get_lac(text):
    """Run LAC lexical analysis on one string.

    :param text: the sentence to analyse.
    :return: (tag, word) lists for the single input sentence.
    """
    analysis = get_model().lexical_analysis(data={"text": [text]})
    first = analysis[0]
    return first['tag'], first['word']
def get_entity(text, label):
    '''Return every word of ``text`` whose LAC tag equals ``label``.

    label may be:
    'PER'  : person name
    'LOC'  : location
    'ORG'  : organisation
    'TIME' : time expression
    '''
    tag, word = get_lac(text)
    # keep words in their original order, same as the np.where version
    return [w for t, w in zip(tag, word) if t == label]
def getText_docx(filename):  # docx -> text
    """Read a .docx file and return its full text.

    Paragraph texts come first, then every table cell row-major, one item
    per line.
    """
    doc = docx.Document(filename)
    # one entry per paragraph
    fullText = [paragraph.text for paragraph in doc.paragraphs]
    # append table contents, cell by cell
    for table in doc.tables:
        n_rows = len(table.rows)
        n_cols = len(table.columns)
        for r in range(n_rows):
            for c in range(n_cols):
                fullText.append(table.cell(r, c).text)
    return '\n'.join(fullText)
import pdfplumber
def getText_pdf(filename):
    """Read a PDF and return its text, one page per chunk, each followed
    by a newline (image-only pages are skipped)."""
    parts = []
    with pdfplumber.open(filename) as pdf_file:
        for page in pdf_file.pages:
            extracted = page.extract_text()
            if extracted:
                parts.append(extracted + "\n")
    return ''.join(parts)
from win32com import client as wc
def doc_docx(url, filename):
    """Convert a .doc file to .docx through the Word COM interface.

    :param url: folder containing the file.
    :param filename: name of the .doc file inside ``url``.
    """
    word = wc.Dispatch("Word.Application")
    # BUG FIX: open the actual source file -- the path previously
    # contained a garbled "(unknown)" placeholder instead of the name.
    doc = word.Documents.Open(url + f"/{filename}")
    name = filename.split('.')[0]
    doc.SaveAs(url + f'/{name}.docx', 12)  # 12 = .docx format
    doc.Close()
    word.Quit()
def pdf_docx(url, filename):
    """Convert a .pdf file to .docx with pdf2docx.

    :param url: folder containing the file.
    :param filename: name of the .pdf file inside ``url``.
    """
    # base name without the extension
    file_name = filename.split('.')[0]
    # BUG FIX: the source path previously contained a garbled "(unknown)"
    # placeholder instead of the pdf file name.
    pdf_name = url + f"/{filename}"
    # destination docx path
    docx_name = url + f"/{file_name}.docx"
    # load and convert the pdf document
    cv = Converter(pdf_name)
    # NOTE(review): only pages 0-12 are converted -- confirm this page
    # cap is intentional for longer resumes.
    cv.convert(docx_name, start=0, end=12)
    cv.close()
def read_tables(open_txt):
    """Print every cell of every table in an opened docx document.

    :param open_txt: the object returned by docx.Document(...).
    :return: None; output goes to stdout, row-major per table.
    """
    for table in open_txt.tables:
        n_rows = len(table.rows)
        n_cols = len(table.columns)
        for r in range(n_rows):
            for c in range(n_cols):
                print(table.cell(r, c).text)

138
utils/chkmail/qqemail.py Normal file
View File

@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
import poplib
import base64
import os
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
# Mailbox accounts to poll, keyed by owner name.  'pwd' is a QQ-mail
# POP3 authorisation code (not the account login password).
# SECURITY(review): credentials are hard-coded in source -- move them to
# environment variables or a config file kept out of version control.
email_user = {
    '李宗振': {
        'email': '1986461823@qq.com',
        'pwd': 'hoosihokeaqkifdf'
    },
    '吴操': {
        'email': '2787668634@qq.com',
        'pwd': 'jendjvizztqsdebb'
    }
}
def email_users(dirname, emaildict):
    """Download attachments of every message in a QQ POP3 mailbox.

    :param dirname: sub-directory (under the CWD) to save attachments in.
    :param emaildict: {'email': address, 'pwd': POP3 authorisation code}.
    """
    # create the output folder if it does not exist yet
    dirpath = './{0}'.format(dirname)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    # connect to the POP3 server
    server = poplib.POP3("pop.qq.com")
    # debug output on: prints the whole protocol exchange
    server.set_debuglevel(1)
    # optional: show the server's welcome banner
    print(server.getwelcome().decode('utf-8'))
    # authenticate; 'pwd' is the QQ-mail authorisation code obtained in
    # the QQ mailbox settings, not the account password
    server.user(emaildict['email'])
    server.pass_(emaildict['pwd'])
    # stat() -> (message count, mailbox size)
    print('Messages: %s. Size: %s' % server.stat())
    # list() returns entries like [b'1 82923', b'2 2184', ...]
    resp, mails, octets = server.list()
    print(mails)
    # message indexes start at 1
    index = len(mails)
    for i in range(1, index + 1):
        resp, lines, octets = server.retr(i)
        # `lines` holds the raw message line by line; join into one text
        try:
            msg_content = b'\r\n'.join(lines).decode('utf-8')
        except UnicodeDecodeError:
            # BUG FIX: was a bare `except:` -- catch only the decode
            # failure this line can raise so real errors still surface.
            continue
        # parse the raw text into a Message and process it
        msg = Parser().parsestr(msg_content)
        print_info(msg, dirpath)
    # (server.dele(index) would delete a message on the server)
    server.quit()
def print_info(msg, dirpath, indent=0):
    """Save the attachments of ``msg`` and print a header/part summary.

    :param msg: an email.message.Message object.
    :param dirpath: directory where attachments are written.
    :param indent: recursion depth; From/To/Subject are printed at depth
        0 only.
    """
    # first pass: walk the whole MIME tree and save attachments
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart' or part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        # save the attachment
        if fileName:
            filename = ''
            transfer_encoding = part.get_all('Content-Transfer-Encoding')
            if transfer_encoding and transfer_encoding[0] == 'base64':
                # assumes an RFC 2047 encoded word '=?charset?B?data?=',
                # whose split('?') is ['=', charset, 'B', data, '='] --
                # TODO(review): confirm plain filenames never reach here
                filename_parts = fileName.split('?')
                filename = base64.b64decode(filename_parts[3]).decode(filename_parts[1])
            data = part.get_payload(decode=True)
            if filename:
                filename_path = dirpath + "/{0}".format(filename)
                # attachment already downloaded on an earlier run: skip
                if os.path.exists(filename_path):
                    continue
                fEx = open(filename_path, 'wb')
                fEx.write(data)
                fEx.close()
    if indent == 0:
        # top level only: print headers, decoding RFC 2047 encoded words
        for header in ['From', 'To', 'Subject']:
            value = msg.get(header, '')
            if value:
                if header == 'Subject':
                    value = decode_str(value)
                else:
                    hdr, addr = parseaddr(value)
                    name = decode_str(hdr)
                    value = u'%s <%s>' % (name, addr)
            print('%s%s: %s' % (' ' * indent, header, value))
    if (msg.is_multipart()):
        # recurse into each sub-part
        parts = msg.get_payload()
        for n, part in enumerate(parts):
            print('%spart %s' % (' ' * indent, n))
            print('%s--------------------' % (' ' * indent))
            print_info(part, dirpath, indent + 1)
    else:
        content_type = msg.get_content_type()
        if content_type == 'text/plain' or content_type == 'text/html':
            content = msg.get_payload(decode=True)
            charset = guess_charset(msg)
            if charset:
                # NOTE(review): body text is fetched but never used
                pass
        else:
            print('%sAttachment: %s' % (' ' * indent, content_type))
def decode_str(s):
    """Decode the first fragment of an RFC 2047 header value to str."""
    value, charset = decode_header(s)[0]
    return value.decode(charset) if charset else value
def guess_charset(msg):
    """Best-effort charset detection for a message.

    Uses the message's own charset when set, otherwise parses the
    Content-Type header; returns None when neither yields one.
    """
    charset = msg.get_charset()
    if charset is not None:
        return charset
    content_type = msg.get('Content-Type', '').lower()
    pos = content_type.find('charset=')
    if pos >= 0:
        # everything after 'charset=' (8 chars), trimmed
        return content_type[pos + 8:].strip()
    return None
if __name__ == '__main__':
    # poll every configured mailbox and download its attachments
    for dirname, email_dict in email_user.items():
        email_users(dirname, email_dict)

63
utils/chkmail/teststr.py Normal file
View File

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import copy
txtlist = [{'公司名': [{'end': 353,
'probability': 0.8196403474903491,
'start': 341,
'text': '武汉漫维智能科技有限公司'},
{'end': 20,
'probability': 0.8494340282651791,
'start': 6,
'text': '武汉中软国际科技服务有限公司'},
{'end': 400,
'probability': 0.5690599404322967,
'start': 388,
'text': '武汉漫维智能科技有限公司'},
{'end': 733,
'probability': 0.9766299737741235,
'start': 721,
'text': '广州中道电子科技有限公司'}],
'时间': [{'end': 34,
'probability': 0.6200274175388927,
'start': 22,
'text': '2018.03 - 至今'},
{'end': 383,
'probability': 0.4970208179496325,
'start': 366,
'text': '2017.09 - 2018.04'},
{'end': 752,
'probability': 0.5228238735354154,
'start': 735,
'text': '2015.11 - 2017.09'}]}]
def chkworlkandtime(dictdata):
    """De-duplicate extraction candidates per field.

    For every field, keeps only the highest-probability candidate of each
    distinct text (first wins on a probability tie), then orders the
    survivors by their start offset.

    :param dictdata: list of dicts mapping field name -> candidate list
        of {'text', 'start', 'end', 'probability'}.
    :return: dict mapping field name -> de-duplicated, start-sorted list.
    """
    res = {}
    for record in dictdata:
        for field, candidates in record.items():
            # best candidate seen so far for each distinct text
            best = {}
            for cand in candidates:
                known = best.get(cand['text'])
                if known is None or cand['probability'] > known['probability']:
                    best[cand['text']] = {
                        'end': cand['end'],
                        'probability': cand['probability'],
                        'start': cand['start'],
                    }
            survivors = []
            for text_key, info in best.items():
                info.update({'text': text_key})
                survivors.append(copy.deepcopy(info))
            survivors.sort(key=lambda item: item['start'])
            res[field] = survivors
    return res
# smoke run on the sample data above (result is discarded)
chkworlkandtime(txtlist)