prs_server/utils/chkmail/chkmail.py
2022-08-08 14:29:03 +08:00

316 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import docx
import os
from pprint import pprint
from paddlenlp import Taskflow
from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx
def get_date(schema, dates, schema_dict):
    """Pick the most probable extraction result for every schema field.

    :param schema: labels that were requested from the extractor.
    :param dates: raw extractor output; only ``dates[0]`` is read. It is
        expected to map a label to a list of candidate dicts carrying
        ``'text'`` and ``'probability'`` keys.
    :param schema_dict: maps every label in ``schema`` to the key used in
        the returned dict. Several labels may map to the same key.
    :return: dict of result key -> text of the most probable candidate, or
        ``''`` when none of the labels mapped to that key produced one.
    """
    # An empty result list is handled like a document without any match.
    found = dates[0] if dates else {}
    date = {}
    for label in schema:
        key = schema_dict[label]
        candidates = found.get(label, '')
        if not candidates:
            # Labels can share a key (two spellings of the same field);
            # a miss on one of them must not erase a hit from another.
            date.setdefault(key, '')
            continue
        if len(candidates) == 1:
            best = candidates[0]
        else:
            # reversed() keeps the historical tie-break: among equally
            # probable candidates the one listed last wins.
            best = max(reversed(candidates),
                       key=lambda item: item['probability'])
        date[key] = best['text']
    return date
def fmtTxt(txt, istable=0):
    """Extract the first résumé section that starts in ``txt``.

    Items are scanned in order. The first item recognised as a section
    header opens a section; following items are appended to it until an
    item contains a header keyword of a *different* section type.

    :param txt: sliceable sequence of items to scan. Each item must expose
        a ``.text`` attribute, unless ``istable`` is truthy, in which case
        the items are plain strings.
    :param istable: truthy when ``txt`` holds plain strings.
    :return: ``(sections, rest, stop_int)``. ``sections`` holds at most one
        string: header and body concatenated with all whitespace removed.
        When the section was ended by the header of another section,
        ``rest`` is ``txt`` from that item onwards and ``stop_int`` is 1,
        so the caller can call again with ``rest``. When the input was
        exhausted, ``rest`` is a copy of the whole ``txt`` and ``stop_int``
        is 0.
    """
    # (header keywords, prefix_only) per section type, in matching priority
    # order. prefix_only=True: the text must start with the keyword;
    # False: the keyword may appear anywhere in the text.
    section_rules = (
        (['自我评价', '自我描述', '个人优势', '个人评价'], True),    # self-evaluation
        (['项目经历', '项目经验', '项目描述'], True),                # project experience
        (['教育经历', '学习经历'], False),                           # education
        (['工作经历', '工作经验', '实习经历'], True),                # work experience
        (['技能特长', '技能', '特长', '专长', '技能专长',
          '专业技能', '职业技能'], False),                           # skills
    )
    every_keyword = [kw for keywords, _ in section_rules for kw in keywords]

    sections = []
    parts = []          # pieces of the section currently being collected
    stop_keywords = []  # headers of the other section types
    for index, item in enumerate(txt):
        text = item if istable else item.text
        text = re.sub(r'\s+', '', text)  # drop every space and line break

        if not parts:
            # No section open yet: check whether this item is a header.
            for keywords, prefix_only in section_rules:
                if prefix_only:
                    hit = text.startswith(tuple(keywords))
                else:
                    hit = any(kw in text for kw in keywords)
                if hit:
                    parts.append(text)
                    stop_keywords = [kw for kw in every_keyword
                                     if kw not in keywords]
                    break
            continue

        if any(kw in text for kw in stop_keywords):
            # Another section begins here: hand back what was collected
            # together with the unscanned remainder.
            sections.append(''.join(parts))
            return sections, txt[index:], 1

        parts.append(text)

    # Input exhausted: flush an open section and tell the caller to stop.
    if parts:
        sections.append(''.join(parts))
    return sections, txt[0:], 0
def fmtList(txtlist, dates):
    """Sort section strings into categories and merge them into ``dates``.

    A text is filed under the first category (in the order listed below)
    that has one of its keywords anywhere in the text; texts matching no
    category are dropped.

    :param txtlist: iterable of section strings.
    :param dates: dict to extend; it is updated in place.
    :return: the same ``dates`` object, with the keys ``review``,
        ``project``, ``work``, ``upgrade`` and ``specialty`` each set to a
        list of the matching texts.
    """
    # (result key, header keywords) in classification priority order.
    categories = (
        # self-evaluation; '个人评价' is included because fmtTxt accepts
        # it as a header of this section type.
        ('review', ['自我评价', '自我描述', '个人优势', '个人评价']),
        # project experience
        ('project', ['项目经历', '项目经验', '项目描述']),
        # work experience
        ('work', ['工作经历', '工作经验', '实习经历']),
        # education
        ('upgrade', ['教育经历', '学习经历']),
        # skills
        ('specialty', ['技能特长', '技能', '特长', '专长', '技能专长',
                       '专业技能', '职业技能']),
    )
    buckets = {key: [] for key, _ in categories}
    for text in txtlist:
        for key, keywords in categories:
            if any(kw in text for kw in keywords):
                buckets[key].append(text)
                break
    dates.update(buckets)
    return dates
# Folder that holds the résumé files to parse.
PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")

# Extraction label -> key used in the parsed result.
# Note that '电子邮箱' and 'Email' both map to 'mail'.
schema_dict = {
    '姓名': 'name',
    '所在地': 'location',
    '户口所在地': 'account',
    '婚姻状况': 'gam',
    '民族': 'nation',
    '身高': 'height',
    '电话': 'phone',
    '应聘职位': 'job',
    '到岗时间': 'come_time',
    '学历': 'education',
    '毕业学校': 'school',
    '专业': 'career',
    '期望薪资': 'money',
    '在校时间': 'at_school',
    '电子邮箱': 'mail',
    '工作经验': 'work_exp',
    'Email': 'mail',
}

# Labels handed to the extractor: the mapping's keys, in the same order.
schema = list(schema_dict)
# Extraction pipeline. Building it is slow (the original note says it may
# install files), so it is created on first use and shared by all résumés
# instead of being rebuilt for every file.
ie = None
for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # each file is one résumé
        # Resolve paths against the directory being walked so that files
        # in sub-folders are found as well.
        url = os.path.join(root, file)
        ext = os.path.splitext(file)[1]
        # NOTE(review): the converted file name is derived exactly as
        # before (text up to the first dot); it presumably has to match
        # the name pdf_docx/doc_docx write - confirm against gongju.
        name = file.split('.')[0]
        docx_path = os.path.join(root, f"{name}.docx")
        if ext == '.pdf':
            pdf_docx(root, file)  # convert to docx
            open_txt = docx.Document(docx_path)  # used to read paragraphs and tables
            txt = getText_pdf(url)  # plain text taken from the pdf itself
        elif ext == '.docx':
            open_txt = docx.Document(url)
            txt = getText_docx(url)
        elif ext == '.doc':
            doc_docx(root, file)  # convert to docx
            open_txt = docx.Document(docx_path)
            txt = getText_docx(docx_path)
        else:
            # Not a résumé format handled here; skipping avoids silently
            # re-processing the document left over from the previous file.
            continue

        if ie is None:
            ie = Taskflow('information_extraction', schema=schema)
        # Raw extraction result for the basic fields.
        text_lists = ie(txt)
        # Basic fields reduced to the most probable value each.
        dates = get_date(schema, text_lists, schema_dict)

        # Sections found in the body paragraphs.
        txt_list = open_txt.paragraphs
        txt_list1 = []
        stop_int = 1
        txt1 = txt_list
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)

        # Unique, non-empty cell texts of all tables, in reading order.
        numTables = open_txt.tables
        table_list = []
        for table in numTables:
            row_count = len(table.rows)
            col_count = len(table.columns)
            for i in range(row_count):
                for j in range(col_count):
                    # drop every space and line break
                    texts = re.sub(r'\s+', '', table.cell(i, j).text)
                    if not texts or texts in table_list:
                        continue
                    table_list.append(texts)

        # Sections found in the table cells.
        if table_list:
            stop_table = 1
            table1 = table_list
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2

        # Merge the basic fields with the categorised sections
        # (review / project / work / upgrade / specialty).
        datess = fmtList(txt_list1, dates)
        pprint(datess)
# NOTE(review): looks like a leftover debugging anchor; kept so the
# module namespace stays as it was.
a = 1