批量导入简历优化
This commit is contained in:
parent
41219df9e6
commit
aa3212676f
@ -799,14 +799,14 @@ async def files_to_hw(
|
|||||||
fn = file_path + '/' + file
|
fn = file_path + '/' + file
|
||||||
end_str = file.split('.')[-1].lower() # 文件类型
|
end_str = file.split('.')[-1].lower() # 文件类型
|
||||||
# 文件类型支持
|
# 文件类型支持
|
||||||
if end_str not in ['pdf', 'doc', 'docx', 'png', 'jpg', 'jpeg', 'word']:
|
if end_str not in ['pdf', 'doc', 'docx', 'word']:
|
||||||
continue
|
continue
|
||||||
if end_str in ['doc', 'docx', 'word']: # doc,docx, word转pdf
|
if end_str in ['doc', 'docx', 'word']: # doc,docx, word转pdf
|
||||||
fn, fil = doc2pdf(fn, file_path, file)
|
fn, fil = doc2pdf(fn, file_path, file)
|
||||||
file = fil
|
file = fil
|
||||||
if end_str in ['png', 'jpg', 'jpeg']: # 图片转pdf
|
# if end_str in ['png', 'jpg', 'jpeg']: # 图片转pdf
|
||||||
fn, fil = png2pdf(file_path, file)
|
# fn, fil = png2pdf(file_path, file)
|
||||||
file = fil
|
# file = fil
|
||||||
data_mode1 = deepcopy(data_mode)
|
data_mode1 = deepcopy(data_mode)
|
||||||
uid = get_uid()
|
uid = get_uid()
|
||||||
data_mode1['uid'] = uid
|
data_mode1['uid'] = uid
|
||||||
|
@ -29,6 +29,7 @@ my_pass = 'whrsugtgkstibjdj' # 发件人邮箱密码
|
|||||||
subject = '入职通知' # 邮件的主题,也可以说是标题
|
subject = '入职通知' # 邮件的主题,也可以说是标题
|
||||||
mail_host = 'smtp.qq.com'
|
mail_host = 'smtp.qq.com'
|
||||||
|
|
||||||
|
|
||||||
def get_uid():
    """Return a hexadecimal id string derived from the clock plus random jitter.

    The current ``time.time()`` value is scaled by 10 ** 7 and truncated to an
    integer, a random integer between 0 and 10000 (inclusive) is added, and
    the sum is rendered as lowercase hex without the ``0x`` prefix.
    """
    ticks = int(time.time() * 10 ** 7)
    jitter = random.randint(0, 10000)
    return hex(ticks + jitter)[2:]
|
||||||
|
|
||||||
@ -416,6 +417,16 @@ def png2pdf(dir_path, filename):
|
|||||||
return res_path, new_filename
|
return res_path, new_filename
|
||||||
|
|
||||||
|
|
||||||
|
# pdf转换为txt文字
|
||||||
|
def pdf_to_text(path):
    """Extract the plain text of every page of a PDF as a single string.

    :param path: location of the PDF file; handed unchanged to ``fitz.open``.
    :return: the text of all pages concatenated in page order (an empty
        string when the document has no pages).
    """
    doc = fitz.open(path)
    try:
        chunks = []
        for page in doc:
            # PyMuPDF renamed Page.getText() to Page.get_text(); prefer the
            # current name and fall back to the legacy alias so the function
            # keeps working on older installations.
            extract = getattr(page, 'get_text', None) or page.getText
            chunks.append(extract())
        return ''.join(chunks)
    finally:
        # Always release the underlying file handle, even when text
        # extraction raises part-way through the document.
        doc.close()
|
||||||
|
|
||||||
|
|
||||||
def write_task(jsontext):
|
def write_task(jsontext):
|
||||||
with open('task.json', 'w', encoding='utf-8') as f:
|
with open('task.json', 'w', encoding='utf-8') as f:
|
||||||
f.write(jsontext)
|
f.write(jsontext)
|
||||||
|
@ -3,6 +3,9 @@
|
|||||||
from LAC import LAC
|
from LAC import LAC
|
||||||
import re
|
import re
|
||||||
import copy
|
import copy
|
||||||
|
import pytesseract
|
||||||
|
from PIL import Image
|
||||||
|
from utils import png2pdf, pdf_to_text
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
|
||||||
lac = LAC(mode='lac')
|
lac = LAC(mode='lac')
|
||||||
@ -20,6 +23,15 @@ def getText_pdf(filename):
|
|||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
# 识别图片文字内容
|
||||||
|
def get_png_txt(filename, lang='chi_sim'):
    """Run OCR on an image file and return the recognised text.

    :param filename: path of the image to read; passed to ``Image.open``.
    :param lang: Tesseract language code handed to
        ``pytesseract.image_to_string``; defaults to ``'chi_sim'``, the value
        that was previously hard-coded.
    :return: whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Open the image in a context manager so its file handle is closed once
    # OCR has finished instead of lingering until garbage collection.
    with Image.open(filename) as im:
        return pytesseract.image_to_string(im, lang=lang)
|
||||||
|
|
||||||
|
|
||||||
def fmt_txt(chk_str):
|
def fmt_txt(chk_str):
|
||||||
true_chkStr = chk_str
|
true_chkStr = chk_str
|
||||||
regex = re.compile(r":\n*")
|
regex = re.compile(r":\n*")
|
||||||
@ -1143,8 +1155,11 @@ def fmt_txt(chk_str):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# txt = getText_pdf('D:\wokerplay\面试简历\【Android开发工程师_武汉】陈超峰 10年.pdf')
|
# txt = getText_pdf('D:\wokerplay\面试简历\【Android开发工程师_武汉】陈超峰 10年.pdf')
|
||||||
# if txt:
|
txt = get_png_txt(r'D:\wokerplay\面试简历1\陈雨蝶.jpg')
|
||||||
# pass
|
# pdf_path, file_name = png2pdf('D:\wokerplay\面试简历1', '程敏谦-文案策划简历.JPG')
|
||||||
|
# txt = pdf_to_text(pdf_path)
|
||||||
|
if txt:
|
||||||
|
pass
|
||||||
# 拉勾
|
# 拉勾
|
||||||
chk_str1 = """
|
chk_str1 = """
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user