316 lines
11 KiB
Python
316 lines
11 KiB
Python
import re
|
||
import docx
|
||
import os
|
||
from pprint import pprint
|
||
from paddlenlp import Taskflow
|
||
from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx
|
||
|
||
|
||
def get_date(schema, dates, schema_dict):
    """
    Pick the most probable extracted value for every schema field.

    :param schema: field labels (the prompts that were given to the extractor)
    :param dates: raw extractor output; only the first element is read, and it
        is expected to map a label to a list of dicts carrying ``'text'`` and
        ``'probability'`` keys
    :param schema_dict: maps every label in ``schema`` to its English output key
    :return: dict of English key -> best text; ``''`` when nothing was found
    """
    # An empty extractor result is treated as "nothing found" for every field.
    extracted = dates[0] if dates else {}
    date = {}
    for label in schema:
        key = schema_dict[label]
        candidates = extracted.get(label)
        if not candidates:
            # Several labels may share one output key (two spellings of the
            # e-mail label both map to 'mail'); a miss on one label must not
            # erase a value already found through another.
            date.setdefault(key, '')
            continue
        if len(candidates) == 1:
            # A lone candidate is taken as-is; its probability is never read.
            best = candidates[0]
        else:
            # max() keeps the first maximal item, so scanning in reverse makes
            # the last candidate win when probabilities are equal.
            best = max(reversed(candidates), key=lambda c: c['probability'])
        date[key] = best['text']
    return date
|
||
|
||
|
||
def fmtTxt(txt, istable=0):
    """
    Extract the first résumé section found in ``txt``.

    Every item is stripped of all whitespace. Items are skipped until one
    opens a section; the following items are glued onto that section until an
    item mentions a keyword that belongs to a different section group.

    :param txt: sliceable sequence of items; plain strings when ``istable`` is
        truthy, otherwise objects exposing a ``.text`` attribute
    :param istable: truthy when the items of ``txt`` are already strings
    :return: ``(sections, remaining, more)``. ``sections`` holds at most one
        concatenated section string. When another group's keyword ended the
        section, ``remaining`` is ``txt`` from that item onwards and ``more``
        is 1. When ``txt`` was exhausted, ``remaining`` is a copy of the whole
        of ``txt`` and ``more`` is 0, which tells the caller to stop looping.
    """
    # (keywords, prefix_only) in detection priority order. prefix_only groups
    # open a section only when the item STARTS with a keyword; the other
    # groups open one when a keyword appears anywhere in the item.
    groups = (
        # self description
        (('自我评价', '自我描述', '个人优势', '个人评价'), True),
        # project experience
        (('项目经历', '项目经验', '项目描述'), True),
        # education
        (('教育经历', '学习经历'), False),
        # work experience
        (('工作经历', '工作经验', '实习经历'), True),
        # skills
        (('技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能'), False),
    )

    sections = []
    parts = []        # pieces of the section currently being collected
    terminators = ()  # keywords of every group except the one that is open

    for index, item in enumerate(txt):
        raw = item if istable else item.text
        text = re.sub(r'\s+', '', raw)  # drop spaces and line breaks

        if not parts:
            # No section is open yet: check whether this item opens one.
            for keywords, prefix_only in groups:
                if prefix_only:
                    opened = text.startswith(keywords)
                else:
                    opened = any(word in text for word in keywords)
                if opened:
                    parts.append(text)
                    terminators = tuple(
                        word
                        for other, _ in groups
                        if other is not keywords
                        for word in other
                    )
                    break
            continue

        if any(word in text for word in terminators):
            # A different section begins here: hand back what was collected
            # and let the caller resume from this very item.
            sections.append(''.join(parts))
            return sections, txt[index:], 1
        parts.append(text)

    # Input exhausted: flush any open section and signal the caller to stop.
    if parts:
        sections.append(''.join(parts))
    return sections, txt[:], 0
|
||
|
||
|
||
def fmtList(txtlist, dates):
    """
    Sort section texts into résumé categories and merge them into ``dates``.

    Each text is assigned to the first category (in the order listed below)
    that has a keyword occurring anywhere in the text; texts that match no
    category are dropped.

    :param txtlist: iterable of section strings
    :param dates: dict of already extracted fields; updated in place
    :return: the same ``dates`` dict, extended with the list-valued keys
        ``review``, ``project``, ``work``, ``upgrade`` and ``specialty``
    """
    # (output key, keywords) in matching priority order.
    categories = (
        # self evaluation; '个人评价' is included so that every header the
        # section splitter treats as a self description is classified here
        ('review', ('自我评价', '自我描述', '个人优势', '个人评价')),
        # project experience
        ('project', ('项目经历', '项目经验', '项目描述')),
        # work experience
        ('work', ('工作经历', '工作经验', '实习经历')),
        # education
        ('upgrade', ('教育经历', '学习经历')),
        # skills
        ('specialty', ('技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能')),
    )

    buckets = {key: [] for key, _ in categories}
    for text in txtlist:
        for key, keywords in categories:
            if any(word in text for word in keywords):
                buckets[key].append(text)
                break

    dates.update(buckets)
    return dates
|
||
|
||
|
||
# Directory that holds the résumé files to parse.
PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")
# Labels handed to the information-extraction model.
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
          '期望薪资', '在校时间', '电子邮箱', '工作经验','Email'
          ]
# Output key for every label in ``schema``.
schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height',
               '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career',
               '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp','Email':'mail'}

# The extraction pipeline is built on first use and then shared by all files,
# instead of being rebuilt for every résumé.
ie = None

for root, dirs, files in os.walk(PATH_DATA):
    for file in files:  # one file is one résumé
        # Join with the directory currently being walked so that files in
        # sub-directories resolve correctly.
        url = os.path.join(root, file)
        ext = os.path.splitext(file)[1].lower()
        # NOTE(review): the converters are project helpers not visible here;
        # their output is presumably named after the part of the file name
        # before the first dot, as the original lookup assumed - confirm.
        name = file.split('.')[0]
        converted = os.path.join(root, f"{name}.docx")

        if ext == '.pdf':
            pdf_docx(root, file)  # convert to docx
            open_txt = docx.Document(converted)
            txt = getText_pdf(url)  # plain text taken from the pdf itself
        elif ext == '.docx':
            open_txt = docx.Document(url)
            txt = getText_docx(url)
        elif ext == '.doc':
            doc_docx(root, file)  # convert to docx
            open_txt = docx.Document(converted)
            txt = getText_docx(converted)
        else:
            # Unsupported file type: skip it rather than fail on undefined
            # names or silently reuse the previous résumé's data.
            continue

        if ie is None:
            # The first construction may take a while (model files are fetched).
            ie = Taskflow('information_extraction', schema=schema)

        # Basic fields: raw extraction, then the best candidate per field.
        text_lists = ie(txt)
        dates = get_date(schema, text_lists, schema_dict)

        # Sections found in the document paragraphs; fmtTxt yields one
        # section per call until it reports that nothing is left.
        txt_list1 = []
        stop_int = 1
        txt1 = open_txt.paragraphs
        while stop_int:
            txt_list2, txt1, stop_int = fmtTxt(txt1)
            txt_list1 += txt_list2
        print(txt_list1)

        # Unique, non-empty cell texts of every table, in reading order.
        table_list = []
        seen = set()
        for table in open_txt.tables:
            row_count = len(table.rows)
            col_count = len(table.columns)
            for i in range(row_count):
                for j in range(col_count):
                    texts = re.sub(r'\s+', '', table.cell(i, j).text)  # drop spaces and line breaks
                    if not texts or texts in seen:
                        continue
                    seen.add(texts)
                    table_list.append(texts)

        # Sections found in the table cells.
        if table_list:
            stop_table = 1
            table1 = table_list
            while stop_table:
                table_list2, table1, stop_table = fmtTxt(table1, istable=1)
                txt_list1 += table_list2

        # Merge the categorised sections (review, project, work, upgrade,
        # specialty) into the basic fields and show the combined result.
        datess = fmtList(txt_list1, dates)
        pprint(datess)
|