From e934b45e7773d144078b0a335b64963c7fa07d8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E4=BC=9F?= <250213850@qq.com>
Date: Mon, 11 Jul 2022 17:33:04 +0800
Subject: [PATCH] =?UTF-8?q?1.=E8=AF=86=E5=88=AB=E7=AE=80=E5=8E=86=E4=BF=A1?=
 =?UTF-8?q?=E6=81=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer note: the hunks for schemas/interview_plan.py, utils/func.py and
utils/jianli.py were revised by hand during review, so the blob ids in their
"index" lines still name the pre-review content.

 api/api_v1/endpoints/interview.py |  13 ++
 schemas/__init__.py               |   1 +
 schemas/interview_plan.py         |  16 ++
 utils/func.py                     |  12 +-
 utils/jianli.py                   | 375 ++++++++++++++++++++++++++++++
 5 files changed, 416 insertions(+), 1 deletion(-)
 create mode 100644 schemas/interview_plan.py
 create mode 100644 utils/jianli.py

diff --git a/api/api_v1/endpoints/interview.py b/api/api_v1/endpoints/interview.py
index cc51772..7a7b9ec 100644
--- a/api/api_v1/endpoints/interview.py
+++ b/api/api_v1/endpoints/interview.py
@@ -79,3 +79,16 @@ async def interview_insert(
     return schemas.Msg(code=200, msg='ok', data=data)
 
 
+@router.post("/interview_insert")
+async def interview_insert(
+        request: Request,
+        data_in: schemas.Interview,
+        ckdb: CKDrive = Depends(get_ck_db),
+) -> schemas.Msg:
+    """ 面试情况 """
+    await interview.init()
+    res = interview.insert_interview_sql()
+    sql = res['sql']
+    insert_data = res['insert_data']
+    data = await db.execute_dict(sql, insert_data)
+    return schemas.Msg(code=200, msg='ok', data=data)
\ No newline at end of file
diff --git a/schemas/__init__.py b/schemas/__init__.py
index 9c9e463..3987dc0 100644
--- a/schemas/__init__.py
+++ b/schemas/__init__.py
@@ -26,3 +26,4 @@ from .user_url import *
 from .api_module import *
 from .event_list import *
 from .interview import *
+from .interview_plan import *
\ No newline at end of file
diff --git a/schemas/interview_plan.py b/schemas/interview_plan.py
new file mode 100644
index 0000000..ebca0d3
--- /dev/null
+++ b/schemas/interview_plan.py
@@ -0,0 +1,16 @@
+from typing import List, Union, Dict
+
+from pydantic import BaseModel
+from typing import Optional
+
+
+class Interview(BaseModel):
+    job_name: Optional[str] = None  # position applied for
+    hr_name: str  # HR owner of the interview
+    interview_name: str  # interviewer
+    interview_type: str  # interview type
+    interview_sign: int  # interview sign-in
+    feedback: int  # interview feedback
+    interview_round: int  # interview round
+    pages: int = 1  # current page for pagination
+    time_type: str  # type of time range to query
diff --git a/utils/func.py b/utils/func.py
index 8dee789..d164861 100644
--- a/utils/func.py
+++ b/utils/func.py
@@ -149,4 +149,14 @@ def create_neidf(resp,columnName):
     columns.insert(0, '全部用户数')
     columns.insert(0, columnName)
     df = pd.DataFrame(data=date, columns=columns)
-    return df
\ No newline at end of file
+    return df
+
+def random_hex():
+    """
+    Generate a random 16-character uppercase hexadecimal string.
+
+    :return: hex string, zero-padded on the left to 16 characters
+    """
+    # randint() includes both endpoints, so the upper bound must be 16**16 - 1;
+    # 16**16 itself would be rendered with 17 hex digits.
+    return '{:016X}'.format(random.randint(0, 16 ** 16 - 1))
\ No newline at end of file
diff --git a/utils/jianli.py b/utils/jianli.py
new file mode 100644
index 0000000..23a25e7
--- /dev/null
+++ b/utils/jianli.py
@@ -0,0 +1,375 @@
+import re
+import docx
+import os
+import copy
+import functools
+from pprint import pprint
+from paddlenlp import Taskflow
+import pdfplumber
+from win32com import client as wc
+from pdf2docx import Converter
+
+# Folder that holds the resumes to parse
+PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历")
+# Labels the information-extraction model is asked to find
+schema = ['姓名', '所在地', '户口所在地', '籍贯', '婚姻状况', '民族', '身高', '电话', 'tel', '应聘职位', '到岗时间', '学历', '毕业学校', '专业',
+          '期望薪资', '在校时间', '电子邮箱', '工作经验', 'Email', '性别', '年龄'
+          ]
+# Extraction label -> key used in the returned data
+schema_dict = {'姓名': 'name', '所在地': 'location', '户口所在地': 'account', '婚姻状况': 'gam', '民族': 'nation', '身高': 'height',
+               '电话': 'phone', '应聘职位': 'job', '到岗时间': 'come_time', '学历': 'education', '毕业学校': 'school', '专业': 'career',
+               '期望薪资': 'money', '在校时间': 'at_school', '电子邮箱': 'mail', '工作经验': 'work_exp', 'Email': 'mails',
+               '性别': 'gender', '年龄': 'age', '籍贯': 'accounts', 'tel': 'tels'}
+
+# Heading keywords of each resume section
+_SELF_REVIEW_KEYS = ['自我评价', '自我描述', '个人优势', '个人评价']
+_PROJECT_KEYS = ['项目经历', '项目经验', '项目描述']
+_EDUCATION_KEYS = ['教育经历', '学习经历']
+_WORK_KEYS = ['工作经历', '工作经验', '实习经历']
+_SKILL_KEYS = ['技能特长', '技能', '特长', '专长', '技能专长', '专业技能', '职业技能']
+_ALL_SECTION_KEYS = _SELF_REVIEW_KEYS + _PROJECT_KEYS + _EDUCATION_KEYS + _WORK_KEYS + _SKILL_KEYS
+# (keywords, must_lead) in the order fmtTxt tries them; must_lead means the
+# keyword only opens a section when the text starts with it
+_SECTION_RULES = (
+    (_SELF_REVIEW_KEYS, True),
+    (_PROJECT_KEYS, False),
+    (_EDUCATION_KEYS, False),
+    (_WORK_KEYS, True),
+    (_SKILL_KEYS, False),
+)
+
+
+@functools.lru_cache(maxsize=None)
+def _extractor(labels):
+    """
+    Build the information-extraction pipeline once per label set and reuse it.
+
+    :param labels: tuple of labels to extract (a tuple so it can be a cache key)
+    :return: callable Taskflow pipeline
+    """
+    return Taskflow('information_extraction', schema=list(labels))
+
+
+def chkworlkandtime(listdata):
+    """
+    Pair each company name found in the work experience with its time span.
+
+    Per label, repeated texts are de-duplicated (the highest probability wins)
+    and the remaining hits are sorted by start offset; the n-th company is
+    then matched with the n-th time span.
+
+    :param listdata: list of ``{label: [hit, ...]}`` dicts; each hit has the
+        keys ``text``, ``start``, ``end`` and ``probability``
+    :return: list of ``{'name': company, 'time': time_span}`` dicts
+    """
+    res = {}
+    for item in listdata:
+        for key, datalist in item.items():
+            best = {}
+            for data in datalist:
+                known = best.get(data['text'])
+                if known is not None and data['probability'] <= known['probability']:
+                    continue
+                best[data['text']] = {
+                    'end': data['end'],
+                    'probability': data['probability'],
+                    'start': data['start'],
+                    'text': data['text'],
+                }
+            res[key] = sorted(best.values(), key=lambda hit: hit['start'])
+    # zip() stops at the shorter list, so a missing label or an unmatched
+    # company/time no longer raises KeyError/IndexError.
+    return [
+        {'name': company['text'], 'time': period['text']}
+        for company, period in zip(res.get('公司名', []), res.get('时间', []))
+    ]
+
+
+def getText_docx(filename):
+    """
+    Read a docx file into plain text.
+
+    :param filename: path of the docx file
+    :return: paragraph texts followed by every table cell text, one per line
+    """
+    doc = docx.Document(filename)
+    fullText = [paragraph.text for paragraph in doc.paragraphs]
+    for table in doc.tables:
+        for i in range(len(table.rows)):
+            for j in range(len(table.columns)):
+                fullText.append(table.cell(i, j).text)
+    return '\n'.join(fullText)
+
+
+def pdf_docx(url, filename):
+    """
+    Convert a pdf file into a docx file stored next to it.
+
+    :param url: folder that contains the pdf
+    :param filename: pdf file name inside ``url``
+    :return: None; ``<stem>.docx`` is written into ``url``
+    """
+    # splitext() removes only the extension, so dots inside the name survive
+    file_name = os.path.splitext(filename)[0]
+    pdf_name = url + f"/{filename}"
+    docx_name = url + f"/{file_name}.docx"
+    cv = Converter(pdf_name)
+    try:
+        cv.convert(docx_name, start=0, end=12)
+    finally:
+        cv.close()  # release the pdf even when the conversion fails
+
+
+def getText_pdf(filename):
+    """
+    Read a pdf file into plain text.
+
+    :param filename: path of the pdf file
+    :return: text of every page that yields text, each followed by a newline
+    """
+    content = ''
+    with pdfplumber.open(filename) as pdf_file:
+        for page in pdf_file.pages:
+            page_content = page.extract_text()
+            if page_content:
+                content = content + page_content + "\n"
+    return content
+
+
+def doc_docx(url, filename):
+    """
+    Convert a doc file into a docx file stored next to it, using Word via COM.
+
+    :param url: folder that contains the doc file
+    :param filename: doc file name inside ``url``
+    :return: None; ``<stem>.docx`` is written into ``url``
+    """
+    word = wc.Dispatch("Word.Application")
+    try:
+        doc = word.Documents.Open(url + f"/{filename}")
+        try:
+            name = os.path.splitext(filename)[0]
+            doc.SaveAs(url + f'/{name}.docx', 12)  # 12 selects the docx format
+        finally:
+            doc.Close()
+    finally:
+        word.Quit()  # never leave a Word process behind
+
+
+def clash(date, retain, pop):
+    """
+    Merge two alternative fields of ``date`` into one, in place.
+
+    Used when two labels describe the same thing (for example the two e-mail
+    labels): ``retain`` keeps its own value when it has one, otherwise it takes
+    the value of ``pop``; ``pop`` is removed either way.
+
+    :param date: data dict to modify
+    :param retain: key that stays in the result
+    :param pop: key that is removed
+    :return: None
+    """
+    fallback = date.pop(pop)
+    if date[retain] == '':
+        date[retain] = fallback
+
+
+def get_date(schema, dates, schema_dict):
+    """
+    Reduce the raw extraction result to one value per basic-information field.
+
+    :param schema: extraction labels to look up
+    :param dates: raw extraction result; only the first element is read
+    :param schema_dict: maps each label to the key used in the result
+    :return: dict holding, per field, the text of the most probable hit, or
+        '' when the label was not found
+    """
+    hits_by_label = dates[0] if dates else {}
+    date = {}
+    for label in schema:
+        hits = hits_by_label.get(label, '')
+        if hits:
+            date[schema_dict[label]] = max(hits, key=lambda hit: hit['probability'])['text']
+        else:
+            date[schema_dict[label]] = ''
+    # Fold the alternative labels for e-mail, registered residence and phone
+    clash(date, 'mail', 'mails')
+    clash(date, 'account', 'accounts')
+    clash(date, 'phone', 'tels')
+    return date
+
+
+def fmtTxt(txt, istable=0):
+    """
+    Cut the first resume section off the front of ``txt``.
+
+    Entries are skipped until one opens a section (see ``_SECTION_RULES``).
+    The following entries are appended to it until an entry contains a keyword
+    of a different section or the input ends.
+
+    :param txt: list of docx paragraphs, or of plain strings if ``istable``
+    :param istable: truthy when ``txt`` holds plain strings (table cells)
+    :return: ``(sections, rest, more)``: ``sections`` holds the collected
+        section text (at most one item); when another section follows,
+        ``rest`` starts at its first entry and ``more`` is 1; once the input
+        is exhausted ``rest`` is a copy of ``txt`` and ``more`` is 0
+    """
+    section = ''
+    foreign_keys = []
+    for index, item in enumerate(txt):
+        text = item if istable else item.text
+        if not section:
+            for keys, must_lead in _SECTION_RULES:
+                if must_lead:
+                    opened = text.startswith(tuple(keys))
+                else:
+                    opened = any(key in text for key in keys)
+                if opened:
+                    section = text
+                    foreign_keys = [key for key in _ALL_SECTION_KEYS if key not in keys]
+                    break
+            continue
+        if any(key in text for key in foreign_keys):
+            # Another section starts here: hand the tail back to the caller
+            return [section], txt[index:], 1
+        section += text
+    # Input exhausted: flush what was collected and tell the caller to stop
+    return ([section] if section else []), txt[0:], 0
+
+
+def fmtList(txtlist, dates):
+    """
+    Sort section texts into categories and store them in ``dates``.
+
+    Each text goes to the first category, in the order review, project, work,
+    upgrade, specialty, that has a keyword contained in the text; texts
+    without any keyword are dropped.  Company names and time spans are then
+    extracted from the work texts.
+
+    :param txtlist: section texts, as collected with ``fmtTxt``
+    :param dates: basic-information dict, updated in place
+    :return: ``dates`` with the keys ``review``, ``project``, ``work``,
+        ``work_list``, ``upgrade`` and ``specialty`` added
+    """
+    buckets = (
+        ('review', _SELF_REVIEW_KEYS),
+        ('project', _PROJECT_KEYS),
+        ('work', _WORK_KEYS),
+        ('upgrade', _EDUCATION_KEYS),
+        ('specialty', _SKILL_KEYS),
+    )
+    grouped = {name: [] for name, _ in buckets}
+    for text in txtlist:
+        for name, keys in buckets:
+            if any(key in text for key in keys):
+                grouped[name].append(text)
+                break
+    work_list = []
+    if grouped['work']:
+        work_text = ''.join(grouped['work'])
+        work_list = chkworlkandtime(_extractor(('公司名', '时间'))(work_text))
+    dates.update({
+        'review': grouped['review'],
+        'project': grouped['project'],
+        'work': grouped['work'],
+        'work_list': work_list,
+        'upgrade': grouped['upgrade'],
+        'specialty': grouped['specialty'],
+    })
+    return dates
+
+
+def _split_sections(entries, istable):
+    """
+    Call ``fmtTxt`` repeatedly until ``entries`` has been cut into sections.
+
+    :param entries: docx paragraphs, or plain strings if ``istable``
+    :param istable: passed through to ``fmtTxt``
+    :return: list of section texts in document order
+    """
+    sections = []
+    more = 1
+    while more:
+        found, entries, more = fmtTxt(entries, istable)
+        sections += found
+    return sections
+
+
+def _table_texts(document):
+    """
+    Collect the distinct non-empty cell texts of every table in the document.
+
+    :param document: opened docx document
+    :return: list of cell texts in row-by-row order, without repeats
+    """
+    texts = []
+    for table in document.tables:
+        for i in range(len(table.rows)):
+            for j in range(len(table.columns)):
+                cell_text = table.cell(i, j).text
+                if cell_text and cell_text not in texts:
+                    texts.append(cell_text)
+    return texts
+
+
+def _load_resume(folder, file):
+    """
+    Open one resume as a docx document and as plain text.
+
+    pdf and doc files are converted to a temporary ``<stem>.docx`` in the same
+    folder, which is removed again after reading.
+    NOTE(review): a real docx with the same stem would be overwritten and
+    deleted by this, exactly as before - confirm that this cannot happen.
+
+    :param folder: folder that contains the file
+    :param file: file name inside ``folder``
+    :return: ``(document, text)``, or None for an unsupported file type
+    """
+    url = folder + f"/{file}"
+    name, ext = os.path.splitext(file)
+    if ext == '.docx':
+        return docx.Document(url), getText_docx(url)
+    if ext not in ('.pdf', '.doc'):
+        return None
+    if ext == '.pdf':
+        pdf_docx(folder, file)
+    else:
+        doc_docx(folder, file)
+    converted = folder + f"/{name}.docx"
+    try:
+        document = docx.Document(converted)
+        # pdf text comes from the pdf itself; doc text from the converted copy
+        text = getText_pdf(url) if ext == '.pdf' else getText_docx(converted)
+    finally:
+        os.remove(converted)
+    return document, text
+
+
+def get_resume():
+    """
+    Parse every resume below ``PATH_DATA``.
+
+    Files that are not pdf, docx or doc are skipped.
+    NOTE(review): as before, only the data of the last parsed resume is
+    returned - confirm whether callers need the data of every resume.
+
+    :return: data dict of the last parsed resume, or {} if none was parsed
+    """
+    dates = {}
+    for root, dirs, files in os.walk(PATH_DATA):
+        for file in files:  # each file is one resume
+            loaded = _load_resume(root, file)
+            if loaded is None:
+                continue
+            open_txt, txt = loaded
+            # Basic information extracted from the whole text
+            dates = get_date(schema, _extractor(tuple(schema))(txt), schema_dict)
+            # Section texts from the paragraphs first, then from the tables
+            sections = _split_sections(open_txt.paragraphs, 0)
+            sections += _split_sections(_table_texts(open_txt), 1)
+            fmtList(sections, dates)
+    return dates
+
+
+if __name__ == '__main__':
+    get_resume()