diff --git a/api/api_v1/endpoints/interview.py b/api/api_v1/endpoints/interview.py index 945fdf0..6c4c531 100644 --- a/api/api_v1/endpoints/interview.py +++ b/api/api_v1/endpoints/interview.py @@ -2,16 +2,17 @@ import datetime import mimetypes from collections import defaultdict import time +import os from urllib.parse import quote import re from clickhouse_driver import Client import pandas as pd import numpy as np -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, Request, File, UploadFile from motor.motor_asyncio import AsyncIOMotorDatabase from pandas import DataFrame from starlette.responses import StreamingResponse - +from utils.jianli import get_resume import crud, schemas from common import * @@ -79,6 +80,33 @@ async def interview_insert( return schemas.Msg(code=200, msg='ok', data=data) +# 写入面试数据 +@router.post("/interview_file_insert") +async def interview_insert( + request: Request, + file: UploadFile = File(...), + db: CKDrive = Depends(get_ck_db), +) -> schemas.Msg: + """ interview面试数据写入 """ + path_data = os.getcwd() + '/jianli' # 当前文件所在的目录 + contents = await file.read() + filename = file.filename + try: + with open(path_data + filename, "wb") as f: + # 将获取的file文件内容,写入到新文件中 + f.write(contents) + f.close() + except: + return schemas.Msg(code=400, msg='上传文件有误', data=None) + insert_data = get_resume(filename, path_data) + sql = f"insert into HR.resumes(interview_name, interview_type, interview_sign, feedback, interview_round, star_time, end_time, event_time, uid, name, phone, job_name, hr_name, work_exp, interview_stage, owner_name, education, work_undergo, school, specialty, mmended_state, mail, account, id_card, gender, interview_state, graduate_time, counts) values" + data = await db.execute_dict(sql, insert_data) + return schemas.Msg(code=200, msg='ok', data=data) + + +@app.post("/file_upload") +async def file_upload(file: UploadFile = File(...)): + # @router.post("/interview_insert") # async def interview_insert( 
# request: Request, diff --git a/utils/jianli.py b/utils/jianli.py index 23a25e7..f0d38f5 100644 --- a/utils/jianli.py +++ b/utils/jianli.py @@ -9,7 +9,6 @@ from win32com import client as wc from pdf2docx import Converter # 文件路径 -PATH_DATA = os.path.abspath("C:/Users/Administrator/Desktop/面试简历") schema = ['姓名', '所在地', '户口所在地', '籍贯', '婚姻状况', '民族', '身高', '电话', 'tel', '应聘职位', '到岗时间', '学历', '毕业学校', '专业', '期望薪资', '在校时间', '电子邮箱', '工作经验', 'Email', '性别', '年龄' ] @@ -363,72 +362,73 @@ def fmtList(txtlist, dates): return dates -def get_resume(): - for root, dirs, files in os.walk(PATH_DATA): - for file in files: # 一个file就是一份简历 - url = PATH_DATA + f"/{file}" - if os.path.splitext(file)[1] == '.pdf': - pdf_docx(PATH_DATA, file) # 转为docx - name = file.split('.')[0] - open_txt = docx.Document(PATH_DATA + f"/{name}.docx") # 打开docx - os.remove(PATH_DATA + f"/{name}.docx") # 删除生成的文件 - txt = getText_pdf(url) # 打开pdf格式文件转txt - # txt = getText_docx(PATH_DATA + f"\{name}.docx") - elif os.path.splitext(file)[1] == '.docx': - open_txt = docx.Document(url) # 打开docx,将用来读取每一段的内容 - txt = getText_docx(url) # 打开docx格式文件转txt - elif os.path.splitext(file)[1] == '.doc': - doc_docx(PATH_DATA, file) # 转为docx - name = file.split('.')[0] - open_txt = docx.Document(PATH_DATA + f"/{name}.docx") # 打开docx - txt = getText_docx(PATH_DATA + f"/{name}.docx") # 打开docx格式文件转txt - os.remove(PATH_DATA + f"/{name}.docx") # 删除生成的文件 - ie = Taskflow('information_extraction', schema=schema) # 花费时间会安装文件 - # pprint(ie(txt)) # 姓名,电话,电子邮箱,民族,毕业院校,专业,工作经验,婚姻状况 - # 获取的基础数据 - text_lists = ie(txt) - # 处理后的基本数据 - dates = get_date(schema, text_lists, schema_dict) - # 打开docx获取的每一段数据 - txt_list = open_txt.paragraphs - # 获取的文档内容 - txt_list1 = [] - stop_int = 1 - txt1 = txt_list - while stop_int: - txt_list2, txt1, stop_int = fmtTxt(txt1) - txt_list1 += txt_list2 - # print(txt_list1) - numTables = open_txt.tables # 获取表格里面的内容 - table_list = [] - if len(numTables) > 0: - for table in numTables: - row_count = len(table.rows) - 
col_count = len(table.columns) - for i in range(row_count): - for j in range(col_count): - texts = table.cell(i, j).text - # texts = re.sub('\s+', '', texts).lstrip() # 字符串去除空格和换行符 - if not texts: - continue - if texts in table_list: - continue - table_list.append(texts) - if table_list: - stop_table = 1 - table1 = table_list - while stop_table: - table_list2, table1, stop_table = fmtTxt(table1, istable=1) - txt_list1 += table_list2 - # print(txt_list1) - # review自我评价,project项目经验,work工作经验,upgrade教育经历,specialty技能特长 - # 把两部分的数据合起来返回前端,数据都在dates中 - fmtList(txt_list1, dates) - # pprint(dates) - a = 1 +def get_resume(file, path_data): + # for root, dirs, files in os.walk(PATH_DATA): + # for file in files: # 一个file就是一份简历 - return dates + url = path_data + f"/{file}" + if os.path.splitext(file)[1] == '.pdf': + pdf_docx(path_data, file) # 转为docx + name = file.split('.')[0] + open_txt = docx.Document(path_data + f"/{name}.docx") # 打开docx + os.remove(path_data + f"/{name}.docx") # 删除生成的文件 + txt = getText_pdf(url) # 打开pdf格式文件转txt + # txt = getText_docx(PATH_DATA + f"\{name}.docx") + elif os.path.splitext(file)[1] == '.docx': + open_txt = docx.Document(url) # 打开docx,将用来读取每一段的内容 + txt = getText_docx(url) # 打开docx格式文件转txt + elif os.path.splitext(file)[1] == '.doc': + doc_docx(path_data, file) # 转为docx + name = file.split('.')[0] + open_txt = docx.Document(path_data + f"/{name}.docx") # 打开docx + txt = getText_docx(path_data + f"/{name}.docx") # 打开docx格式文件转txt + os.remove(path_data + f"/{name}.docx") # 删除生成的文件 + ie = Taskflow('information_extraction', schema=schema) # 花费时间会安装文件 + # pprint(ie(txt)) # 姓名,电话,电子邮箱,民族,毕业院校,专业,工作经验,婚姻状况 + # 获取的基础数据 + text_lists = ie(txt) + # 处理后的基本数据 + dates = get_date(schema, text_lists, schema_dict) + # 打开docx获取的每一段数据 + txt_list = open_txt.paragraphs + # 获取的文档内容 + txt_list1 = [] + stop_int = 1 + txt1 = txt_list + while stop_int: + txt_list2, txt1, stop_int = fmtTxt(txt1) + txt_list1 += txt_list2 + # print(txt_list1) + numTables = open_txt.tables # 
获取表格里面的内容 + table_list = [] + if len(numTables) > 0: + for table in numTables: + row_count = len(table.rows) + col_count = len(table.columns) + for i in range(row_count): + for j in range(col_count): + texts = table.cell(i, j).text + # texts = re.sub('\s+', '', texts).lstrip() # 字符串去除空格和换行符 + if not texts: + continue + if texts in table_list: + continue + table_list.append(texts) + if table_list: + stop_table = 1 + table1 = table_list + while stop_table: + table_list2, table1, stop_table = fmtTxt(table1, istable=1) + txt_list1 += table_list2 + # print(txt_list1) + # review自我评价,project项目经验,work工作经验,upgrade教育经历,specialty技能特长 + # 把两部分的数据合起来返回前端,数据都在dates中 + fmtList(txt_list1, dates) + # pprint(dates) + a = 1 + + return dates if __name__ == '__main__': - get_resume() + get_resume(file, path_data) diff --git a/接口文档/主页查询接口.txt b/接口文档/主页查询接口文档.txt similarity index 100% rename from 接口文档/主页查询接口.txt rename to 接口文档/主页查询接口文档.txt diff --git a/接口文档/导入候选人接口文档.txt b/接口文档/导入候选人接口文档.txt new file mode 100644 index 0000000..b38050a --- /dev/null +++ b/接口文档/导入候选人接口文档.txt @@ -0,0 +1,12 @@ +路径: /api/v1/itr/interview_file_insert + +参数: + 文件 + + +返回值: +{ + "code": 200, + "msg": "ok", + "data": 1 # 成功添加的条数 +} \ No newline at end of file