批量导入简历优化

This commit is contained in:
李宗振 2022-10-24 15:43:09 +08:00
parent 41219df9e6
commit aa3212676f
3 changed files with 34 additions and 8 deletions

View File

@ -799,14 +799,14 @@ async def files_to_hw(
fn = file_path + '/' + file
end_str = file.split('.')[-1].lower() # 文件类型
# 文件类型支持
if end_str not in ['pdf', 'doc', 'docx', 'png', 'jpg', 'jpeg', 'word']:
if end_str not in ['pdf', 'doc', 'docx', 'word']:
continue
if end_str in ['doc', 'docx', 'word']: # doc,docx, word转pdf
fn, fil = doc2pdf(fn, file_path, file)
file = fil
if end_str in ['png', 'jpg', 'jpeg']: # 图片转pdf
fn, fil = png2pdf(file_path, file)
file = fil
# if end_str in ['png', 'jpg', 'jpeg']: # 图片转pdf
# fn, fil = png2pdf(file_path, file)
# file = fil
data_mode1 = deepcopy(data_mode)
uid = get_uid()
data_mode1['uid'] = uid

View File

@ -29,6 +29,7 @@ my_pass = 'whrsugtgkstibjdj' # 发件人邮箱密码
subject = '入职通知' # 邮件的主题,也可以说是标题
mail_host = 'smtp.qq.com'
def get_uid():
    """Return a hexadecimal id string derived from the current time.

    The current Unix time is scaled by 10**7 and truncated to an int, a
    random offset in the range 0..10000 is added, and the sum is hex-encoded
    with the leading ``0x`` removed.

    :return: lowercase hexadecimal string without a ``0x`` prefix.
    """
    ticks = int(time.time() * 10 ** 7)
    jitter = random.randint(0, 10000)
    # hex() always prefixes "0x"; drop it so only the digits remain.
    return hex(ticks + jitter)[2:]
@ -269,7 +270,7 @@ def send_affix_mail(str_msg: str, send_to: list, file_name: str = None):
# ---附件部分---
if file_name != None:
part = MIMEApplication(open(file_name, 'rb').read())
filename=file_name.split('/')[-1]
filename = file_name.split('/')[-1]
part.add_header('Content-Disposition', 'attachment', filename=filename)
msg.attach(part)
smpt = smtplib.SMTP_SSL(mail_host, 465, 'utf-8')
@ -416,6 +417,16 @@ def png2pdf(dir_path, filename):
return res_path, new_filename
# Convert a PDF into plain text
def pdf_to_text(path):
    """Return the text of every page of the document at *path*, concatenated.

    :param path: document location, passed unchanged to ``fitz.open``.
    :return: the text of each page joined in page order with no separator;
        an empty string when the document has no pages.
    """
    doc = fitz.open(path)
    try:
        chunks = []
        for page in doc:
            # NOTE(review): assumes ``fitz`` is PyMuPDF, where getText() was
            # renamed to get_text(); prefer the new name and fall back to the
            # old one so either library version works.
            extract = getattr(page, 'get_text', None) or page.getText
            chunks.append(extract())
        return ''.join(chunks)
    finally:
        # Release the document handle even if extraction fails part-way.
        doc.close()
def write_task(jsontext):
    """Overwrite ``task.json`` in the current working directory.

    The file is opened in write mode, so any previous content is discarded,
    and the text is encoded as UTF-8.

    :param jsontext: string to write; it is written as-is and is not
        validated or serialised here.
    """
    with open('task.json', 'w', encoding='utf-8') as task_file:
        task_file.write(jsontext)

View File

@ -3,6 +3,9 @@
from LAC import LAC
import re
import copy
import pytesseract
from PIL import Image
from utils import png2pdf, pdf_to_text
import pdfplumber
lac = LAC(mode='lac')
@ -20,6 +23,15 @@ def getText_pdf(filename):
return content
# Recognise the text content of an image
def get_png_txt(filename):
    """Run OCR on the image at *filename* and return the recognised text.

    :param filename: path of the image, passed unchanged to ``Image.open``.
    :return: whatever ``pytesseract.image_to_string`` returns for the image
        when called with the ``chi_sim`` language.
    """
    # Open the image inside a context manager so its file handle is closed
    # once recognition has finished.
    with Image.open(filename) as im:
        # Recognise the text, specifying the language explicitly.
        return pytesseract.image_to_string(im, lang='chi_sim')
def fmt_txt(chk_str):
true_chkStr = chk_str
regex = re.compile(r"\n*")
@ -324,7 +336,7 @@ def fmt_txt(chk_str):
# if '实习经历' in true_chkStr:
# work_undergo_str = true_chkStr.split('实习经历')[-1]
# else:
work_undergo_str = true_chkStr.split('工作经验')[-1]
work_undergo_str = true_chkStr.split('工作经验')[-1]
else:
if '⼯作经历' in true_chkStr:
work_undergo_str = true_chkStr.split('⼯作经历')[-1]
@ -1143,8 +1155,11 @@ def fmt_txt(chk_str):
if __name__ == '__main__':
# txt = getText_pdf('D:\wokerplay\面试简历\【Android开发工程师_武汉】陈超峰 10年.pdf')
# if txt:
# pass
txt = get_png_txt(r'D:\wokerplay\面试简历1\陈雨蝶.jpg')
# pdf_path, file_name = png2pdf('D:\wokerplay\面试简历1', '程敏谦-文案策划简历.JPG')
# txt = pdf_to_text(pdf_path)
if txt:
pass
# 拉勾
chk_str1 = """