"""Split resume documents (.pdf / .doc / .docx) into labelled sections.

The pure helpers (``fmtTxt``, ``fmtList``, ``get_date``) only need the
standard library.  The batch job in ``main()`` additionally needs
``python-docx``, ``paddlenlp`` and the project-local ``gongju`` module; those
imports are deferred to ``main()`` so the helpers can be imported and tested
without them.
"""
import copy
import os
import re
from pprint import pprint

# Heading keywords, one tuple per resume section.
_REVIEW_KEYS = ('自我评价', '自我描述', '个人优势')
_PROJECT_KEYS = ('项目经历', '项目经验', '项目描述')
_EDUCATION_KEYS = ('教育经历', '学习经历')
_WORK_KEYS = ('工作经历', '工作经验', '实习经历')
_SKILL_KEYS = ('技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能')
_ALL_KEYS = (_REVIEW_KEYS + _PROJECT_KEYS + _EDUCATION_KEYS
             + _WORK_KEYS + _SKILL_KEYS)

# Compiled once; used to strip every whitespace character from a text item.
_WHITESPACE = re.compile(r'\s+')


def _heading_group(text):
    """Return the keyword group whose heading opens *text*, or None.

    Groups are tried in the order review, project, education, work, skills.
    The education group matches when a keyword occurs anywhere in the text;
    every other group requires the text to start with one of its keywords.
    """
    if text.startswith(_REVIEW_KEYS):
        return _REVIEW_KEYS
    if text.startswith(_PROJECT_KEYS):
        return _PROJECT_KEYS
    if any(keyword in text for keyword in _EDUCATION_KEYS):
        return _EDUCATION_KEYS
    if text.startswith(_WORK_KEYS):
        return _WORK_KEYS
    if text.startswith(_SKILL_KEYS):
        return _SKILL_KEYS
    return None


def fmtTxt(txt, istable=0):
    """Extract the first section from *txt* and report what is left.

    Items are scanned in order.  Everything before the first section heading
    is skipped.  From the heading on, the whitespace-stripped texts are
    concatenated until an item contains a keyword that belongs to a
    *different* section group; that item ends the section.

    :param txt: sliceable sequence of items.  With ``istable`` falsy each
        item must expose a ``.text`` attribute; otherwise each item is
        already a string.
    :param istable: truthy when *txt* holds plain strings.
    :return: ``(sections, remaining, more)``.  ``sections`` holds at most one
        concatenated section string.  When a following section was detected,
        ``remaining`` starts at the item that ended the current one and
        ``more`` is 1, so the caller should call again with ``remaining``.
        Otherwise ``remaining`` is the whole input and ``more`` is 0.
    """
    sections = []
    current = ''
    terminators = ()

    for index, item in enumerate(txt):
        raw = item if istable else item.text
        text = _WHITESPACE.sub('', raw)

        if not current:
            # Still looking for a heading.  Every keyword of every group is
            # considered, not only the first keyword of each group.
            group = _heading_group(text)
            if group is not None:
                current = text
                terminators = [kw for kw in _ALL_KEYS if kw not in group]
            continue

        # NOTE: a keyword of another group *anywhere* in the text ends the
        # section, even when the text is not itself a heading.
        if any(keyword in text for keyword in terminators):
            sections.append(current)
            return sections, txt[index:], 1

        current += text

    # Input exhausted: flush the open section (if any) and stop the caller.
    if current:
        sections.append(current)
    return sections, txt[0:], 0


def fmtList(txtlist):
    """Sort section strings into per-category lists.

    Each string goes into the first category (in the order review, project,
    work, education, skills) that has a keyword occurring in it; strings that
    match no category are dropped.

    :param txtlist: iterable of section strings.
    :return: ``(review, project, work, upgrade, specialty)`` lists.
    """
    review, project, work, upgrade, specialty = [], [], [], [], []
    routing = (
        (_REVIEW_KEYS, review),        # self evaluation
        (_PROJECT_KEYS, project),      # project experience
        (_WORK_KEYS, work),            # work experience
        (_EDUCATION_KEYS, upgrade),    # education
        (_SKILL_KEYS, specialty),      # skills
    )
    for text in txtlist:
        for keywords, bucket in routing:
            if any(keyword in text for keyword in keywords):
                bucket.append(text)
                break
    return review, project, work, upgrade, specialty


def get_date(schema, dates):
    """Pick one value per schema field from an extraction result.

    :param schema: iterable of field names to look up.
    :param dates: sequence whose first element maps a field name to a list of
        candidate dicts, each with ``'text'`` and ``'probability'`` keys.
    :return: dict mapping every field in *schema* to the text of its most
        probable candidate (the last one on ties), or ``''`` when the field
        is missing or has no candidates.
    """
    record = dates[0] if dates else {}
    date = {}
    for field in schema:
        candidates = record.get(field, '')
        best = None
        for candidate in candidates or ():
            # ``>=`` keeps the last candidate among equally probable ones.
            if best is None or candidate['probability'] >= best['probability']:
                best = candidate
        date[field] = best['text'] if best is not None else ''
    return date


# Directory that holds the resumes.
PATH_DATA = os.path.abspath("D:/wokerplay/面试简历1")
# Fields for the information-extraction step.
# NOTE: '到岗时间' is listed twice.
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位',
          '到岗时间', '学历', '毕业学校', '专业', '期望薪资', '在校时间', '到岗时间',
          '工作经历', '自我评价', '电子邮箱', '技能', '特长', '工作经验', '项目经验'
          ]


def _split_all(items, istable=0):
    """Call ``fmtTxt`` repeatedly until it reports no further section."""
    sections = []
    remaining = items
    more = 1
    while more:
        found, remaining, more = fmtTxt(remaining, istable)
        sections += found
    return sections


def _table_cell_texts(tables):
    """Return the distinct, non-empty, whitespace-stripped cell texts."""
    texts = []
    seen = set()
    for table in tables:
        row_count = len(table.rows)
        col_count = len(table.columns)
        for row in range(row_count):
            for col in range(col_count):
                cell_text = _WHITESPACE.sub('', table.cell(row, col).text)
                # Repeated texts are only kept once.
                if not cell_text or cell_text in seen:
                    continue
                seen.add(cell_text)
                texts.append(cell_text)
    return texts


def main():
    """Walk ``PATH_DATA`` and split every resume found into sections."""
    # Deferred so that importing this module needs only the standard library.
    import docx
    from paddlenlp import Taskflow  # noqa: F401  (disabled extraction step)
    from gongju import (  # noqa: F401
        get_entity, getText_docx, getText_pdf, doc_docx, pdf_docx,
    )

    for root, dirs, files in os.walk(PATH_DATA):
        for file in files:
            # One file is one resume.  Build paths from ``root`` so that
            # files in sub-directories resolve correctly.
            url = os.path.join(root, file)
            stem = file.split('.')[0]
            extension = os.path.splitext(file)[1].lower()
            # NOTE(review): the converters are assumed to write
            # "<stem>.docx" into the directory they are given - confirm
            # against gongju.
            converted = os.path.join(root, f"{stem}.docx")

            if extension == '.pdf':
                pdf_docx(root, file)
                open_txt = docx.Document(converted)
                txt = getText_pdf(url)
            elif extension == '.docx':
                open_txt = docx.Document(url)
                txt = getText_docx(url)
            elif extension == '.doc':
                doc_docx(root, file)
                open_txt = docx.Document(converted)
                txt = getText_docx(converted)
            else:
                # Not a resume format handled here.
                continue

            # TODO: ``txt`` (the full text) is the input of the currently
            # disabled step ``Taskflow('information_extraction',
            # schema=schema)``, whose output ``get_date`` is written for.

            # Sections found in the document paragraphs.
            txt_list1 = _split_all(open_txt.paragraphs)
            print(txt_list1)

            # Sections found in the table cells.
            table_list = _table_cell_texts(open_txt.tables)
            if table_list:
                txt_list1 += _split_all(table_list, istable=1)
                print(txt_list1)

            review, project, work, upgrade, specialty = fmtList(txt_list1)


if __name__ == "__main__":
    main()