From aa3212676fab36309a16b0981889a04b8036bfdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=80=C3=AE=C3=97=C3=9A=C3=95=C3=B1?= Date: Mon, 24 Oct 2022 15:43:09 +0800 Subject: [PATCH] =?UTF-8?q?=E6=89=B9=E9=87=8F=E5=AF=BC=E5=85=A5=E7=AE=80?= =?UTF-8?q?=E5=8E=86=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/api_v1/endpoints/interview.py | 8 ++++---- utils/func.py | 13 ++++++++++++- utils/re_to_jianli.py | 21 ++++++++++++++++++--- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/api/api_v1/endpoints/interview.py b/api/api_v1/endpoints/interview.py index 7a79b21..8193d14 100644 --- a/api/api_v1/endpoints/interview.py +++ b/api/api_v1/endpoints/interview.py @@ -799,14 +799,14 @@ async def files_to_hw( fn = file_path + '/' + file end_str = file.split('.')[-1].lower() # 文件类型 # 文件类型支持 - if end_str not in ['pdf', 'doc', 'docx', 'png', 'jpg', 'jpeg', 'word']: + if end_str not in ['pdf', 'doc', 'docx', 'word']: continue if end_str in ['doc', 'docx', 'word']: # doc,docx, word转pdf fn, fil = doc2pdf(fn, file_path, file) file = fil - if end_str in ['png', 'jpg', 'jpeg']: # 图片转pdf - fn, fil = png2pdf(file_path, file) - file = fil + # if end_str in ['png', 'jpg', 'jpeg']: # 图片转pdf + # fn, fil = png2pdf(file_path, file) + # file = fil data_mode1 = deepcopy(data_mode) uid = get_uid() data_mode1['uid'] = uid diff --git a/utils/func.py b/utils/func.py index f5a03a9..ad77f61 100644 --- a/utils/func.py +++ b/utils/func.py @@ -29,6 +29,7 @@ my_pass = 'whrsugtgkstibjdj' # 发件人邮箱密码 subject = '入职通知' # 邮件的主题,也可以说是标题 mail_host = 'smtp.qq.com' + def get_uid(): return hex(int(time.time() * 10 ** 7) + random.randint(0, 10000))[2:] @@ -269,7 +270,7 @@ def send_affix_mail(str_msg: str, send_to: list, file_name: str = None): # ---附件部分--- if file_name != None: part = MIMEApplication(open(file_name, 'rb').read()) - filename=file_name.split('/')[-1] + filename = file_name.split('/')[-1] part.add_header('Content-Disposition', 'attachment', filename=filename) msg.attach(part) smpt = smtplib.SMTP_SSL(mail_host, 465, 'utf-8') @@ -416,6 +417,16 @@ def png2pdf(dir_path, filename): return res_path, new_filename +# pdf转换为txt文字 +def pdf_to_text(path): + res = '' + doc = fitz.open(path) + for page in doc: + text = page.getText() + res += text + return res + + def write_task(jsontext): with open('task.json', 'w', encoding='utf-8') as f: f.write(jsontext) diff --git a/utils/re_to_jianli.py b/utils/re_to_jianli.py index c6034d3..e9f035d 100644 --- a/utils/re_to_jianli.py +++ b/utils/re_to_jianli.py @@ -3,6 +3,9 @@ from LAC import LAC import re import copy +import pytesseract +from PIL import Image +from utils import png2pdf, pdf_to_text import pdfplumber lac = LAC(mode='lac') @@ -20,6 +23,15 @@ def getText_pdf(filename): return content +# 识别图片文字内容 +def get_png_txt(filename): + # 读取图片 + im = Image.open(filename) + # 识别文字,并指定语言 + string = pytesseract.image_to_string(im, lang='chi_sim') + return string + + def fmt_txt(chk_str): true_chkStr = chk_str regex = re.compile(r":\n*") @@ -324,7 +336,7 @@ def fmt_txt(chk_str): # if '实习经历' in true_chkStr: # work_undergo_str = true_chkStr.split('实习经历')[-1] # else: - work_undergo_str = true_chkStr.split('工作经验')[-1] + work_undergo_str = true_chkStr.split('工作经验')[-1] else: if '⼯作经历' in true_chkStr: work_undergo_str = true_chkStr.split('⼯作经历')[-1] @@ -1143,8 +1155,11 @@ def fmt_txt(chk_str): if __name__ == '__main__': # txt = getText_pdf('D:\wokerplay\面试简历\【Android开发工程师_武汉】陈超峰 10年.pdf') - # if txt: - # pass + txt = get_png_txt(r'D:\wokerplay\面试简历1\陈雨蝶.jpg') + # pdf_path, file_name = png2pdf('D:\wokerplay\面试简历1', '程敏谦-文案策划简历.JPG') + # txt = pdf_to_text(pdf_path) + if txt: + pass # 拉勾 chk_str1 = """