"""Extract basic information and free-text sections from resume files.

Every ``.pdf``, ``.doc`` and ``.docx`` file found under ``PATH_DATA`` is
converted to docx when necessary, run through a PaddleNLP
information-extraction pipeline for the basic fields listed in ``schema``,
and split into sections (self review, projects, work, education, skills)
by looking for well-known Chinese headings.
"""
import os
import re
from pprint import pprint

import docx
from paddlenlp import Taskflow

from gongju import getText_docx, getText_pdf, doc_docx, pdf_docx

# Heading keywords of each resume section.  fmtTxt() and fmtList() both use
# these tuples so the two functions can never disagree about what a heading is.
_REVIEW_KEYS = ('自我评价', '自我描述', '个人优势', '个人评价')
_PROJECT_KEYS = ('项目经历', '项目经验', '项目描述')
_EDUCATION_KEYS = ('教育经历', '学习经历')
_WORK_KEYS = ('工作经历', '工作经验', '实习经历')
_SKILL_KEYS = ('技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能')
_ALL_KEYS = (_REVIEW_KEYS + _PROJECT_KEYS + _EDUCATION_KEYS
             + _WORK_KEYS + _SKILL_KEYS)

# Rules fmtTxt() tries, in order, to detect the start of a section:
# (keywords, True if the text must START with the keyword,
#            False if containing it anywhere is enough).
_SECTION_RULES = (
    (_REVIEW_KEYS, True),
    (_PROJECT_KEYS, True),
    (_EDUCATION_KEYS, False),
    (_WORK_KEYS, True),
    (_SKILL_KEYS, False),
)

# Order in which fmtList() classifies a finished section:
# (result key, keywords).  Work is checked before education.
_CLASSIFY_RULES = (
    ('review', _REVIEW_KEYS),
    ('project', _PROJECT_KEYS),
    ('work', _WORK_KEYS),
    ('upgrade', _EDUCATION_KEYS),
    ('specialty', _SKILL_KEYS),
)

# Raw string: '\s' in a plain literal is an invalid escape sequence.
_WHITESPACE_RE = re.compile(r'\s+')


def _strip_whitespace(text):
    """Return *text* with every whitespace character (incl. newlines) removed."""
    return _WHITESPACE_RE.sub('', text)


def _contains_any(text, keywords):
    """Return True when at least one of *keywords* occurs in *text*."""
    return any(keyword in text for keyword in keywords)


def get_date(schema, dates, schema_dict):
    """Pick the most probable value of every basic field.

    :param schema: Chinese field labels to look up
    :param dates: raw extraction result; only ``dates[0]`` is read, and it is
        expected to map a label to a list of dicts carrying ``'text'`` and
        ``'probability'``
    :param schema_dict: maps each Chinese label to its English result key
    :return: dict from English key to the highest-probability text, or ``''``
        when nothing was extracted for that field
    """
    extracted = dates[0] if dates else {}
    date = {}
    for label in schema:
        candidates = extracted.get(label)
        if candidates:
            value = max(candidates, key=lambda item: item['probability'])['text']
        else:
            # Missing label (or an empty candidate list): no value.
            value = ''
        key = schema_dict[label]
        # Several labels may share one English key (e.g. '电子邮箱' and 'Email'
        # both map to 'mail'); an alias that found nothing must not erase a
        # value found through another alias.
        if value or key not in date:
            date[key] = value
    return date


def _match_section(text):
    """Return the keyword group whose heading *text* opens, or None."""
    for keywords, prefix_only in _SECTION_RULES:
        if prefix_only:
            matched = text.startswith(keywords)
        else:
            matched = _contains_any(text, keywords)
        if matched:
            return keywords
    return None


def fmtTxt(txt, istable=0):
    """Cut the first section out of a sequence of paragraphs or strings.

    Items are scanned in order.  The first one recognised as a section heading
    opens a section; following items are concatenated to it until an item
    mentions a keyword belonging to a *different* section.

    :param txt: sliceable sequence of strings (``istable`` truthy) or of
        objects exposing a ``.text`` attribute (``istable`` falsy)
    :param istable: truthy when the items of ``txt`` are plain strings
    :return: ``(sections, rest, more)`` where ``sections`` holds zero or one
        concatenated section text, ``rest`` is the part of ``txt`` starting at
        the item that ended the section (a copy of the whole ``txt`` when the
        end was reached), and ``more`` is 1 if ``rest`` should be scanned
        again, 0 otherwise
    """
    sections = []
    current = ''
    foreign_keys = ()
    for index, item in enumerate(txt):
        text = _strip_whitespace(item if istable else item.text)

        if not current:
            # Still looking for the heading that opens a section.
            keywords = _match_section(text)
            if keywords is not None:
                current = text
                foreign_keys = tuple(
                    key for key in _ALL_KEYS if key not in keywords
                )
            continue

        if _contains_any(text, foreign_keys):
            # This item belongs to another section: hand it back to the
            # caller so the next call starts from it.
            sections.append(current)
            return sections, txt[index:], 1
        current += text

    # Reached the end of the input; nothing is left to scan.
    if current:
        sections.append(current)
    return sections, txt[:], 0


def fmtList(txtlist, dates):
    """Sort section texts into categories and merge them into *dates*.

    Each text goes into the first category (review, project, work, education,
    skills - in that order) one of whose keywords it contains; texts matching
    no keyword are dropped.

    :param txtlist: section texts, as produced by fmtTxt()
    :param dates: dict of basic fields; updated in place
    :return: ``dates`` with the list-valued keys ``'review'``, ``'project'``,
        ``'work'``, ``'upgrade'`` (education) and ``'specialty'`` (skills)
    """
    buckets = {field: [] for field, _ in _CLASSIFY_RULES}
    for text in txtlist:
        for field, keywords in _CLASSIFY_RULES:
            if _contains_any(text, keywords):
                buckets[field].append(text)
                break
    dates.update(buckets)
    return dates


# Folder containing the resumes.
PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")

# Fields requested from the information-extraction model.
schema = ['姓名', '所在地', '户口所在地', '婚姻状况', '民族', '身高', '电话', '应聘职位', '到岗时间',
          '学历', '毕业学校', '专业', '期望薪资', '在校时间', '电子邮箱', '工作经验', 'Email']

# English result key of every field in ``schema``.
schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam',
               '民族': 'nation', '身高': 'height', '电话': 'phone', '应聘职位': 'job',
               '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school',
               '专业': 'career', '期望薪资': 'money', '在校时间': 'at_school',
               '电子邮箱': 'mail', '工作经验': 'work_exp', 'Email': 'mail'}


def _load_resume(folder, file):
    """Open one resume.

    :return: ``(document, text)`` - the python-docx document and the plain
        text handed to the extraction model - or ``None`` when the file
        extension is not ``.pdf``, ``.doc`` or ``.docx``
    """
    url = f"{folder}/{file}"
    extension = os.path.splitext(file)[1]

    if extension == '.docx':
        return docx.Document(url), getText_docx(url)
    if extension not in ('.pdf', '.doc'):
        return None

    # NOTE(review): the converters are assumed to write "<name>.docx" next to
    # the source file, <name> being the part before the first dot - confirm
    # against gongju.
    converted = f"{folder}/{file.split('.')[0]}.docx"
    if extension == '.pdf':
        pdf_docx(folder, file)
        document = docx.Document(converted)
        return document, getText_pdf(url)
    doc_docx(folder, file)
    document = docx.Document(converted)
    return document, getText_docx(converted)


def _collect_sections(items, istable=0):
    """Call fmtTxt() repeatedly until *items* is exhausted; return all sections."""
    sections = []
    remaining = items
    more = 1
    while more:
        found, remaining, more = fmtTxt(remaining, istable=istable)
        sections += found
    return sections


def _table_cells(document):
    """Return the distinct non-empty cell texts of every table, in reading order."""
    cells = []
    seen = set()
    for table in document.tables:
        for row in range(len(table.rows)):
            for column in range(len(table.columns)):
                text = _strip_whitespace(table.cell(row, column).text)
                # The same text can be reported more than once; keep only
                # its first occurrence.
                if text and text not in seen:
                    seen.add(text)
                    cells.append(text)
    return cells


def _process_resumes(folder):
    """Parse every supported resume found under *folder* and print the result."""
    ie = None
    for root, _dirs, files in os.walk(folder):
        for file in files:  # one file is one resume
            # Paths are built from ``root`` so files in sub-folders resolve.
            loaded = _load_resume(root, file)
            if loaded is None:
                # Unsupported file type: skip it instead of re-using the data
                # of the previously processed resume.
                continue
            open_txt, txt = loaded

            if ie is None:
                # Building the pipeline is expensive (it may download model
                # files), so do it once, and only when there is work to do.
                ie = Taskflow('information_extraction', schema=schema)

            # Basic fields: name, phone, e-mail, school, ...
            dates = get_date(schema, ie(txt), schema_dict)

            # Sections found in the document paragraphs.
            txt_list1 = _collect_sections(open_txt.paragraphs)
            print(txt_list1)

            # Sections found in the document tables.
            table_list = _table_cells(open_txt)
            if table_list:
                txt_list1 += _collect_sections(table_list, istable=1)

            # Merge both parts into the final result.
            pprint(fmtList(txt_list1, dates))


# Runs at import time, exactly as the original flat script did.
_process_resumes(PATH_DATA)