批量导入简历优化

2022-10-24 15:43:09 +08:00 · 2022-10-24 15:43:09 +08:00 · aa3212676f
commit aa3212676f
parent 41219df9e6
3 changed files with 34 additions and 8 deletions
--- a/api/api_v1/endpoints/interview.py
+++ b/api/api_v1/endpoints/interview.py
@ -799,14 +799,14 @@ async def files_to_hw(
            fn = file_path + '/' + file
            end_str = file.split('.')[-1].lower()  # 文件类型
            # 文件类型支持
-            if end_str not in ['pdf', 'doc', 'docx', 'png', 'jpg', 'jpeg', 'word']:
+            if end_str not in ['pdf', 'doc', 'docx', 'word']:
                continue
            if end_str in ['doc', 'docx', 'word']:  # doc,docx, word转pdf
                fn, fil = doc2pdf(fn, file_path, file)
                file = fil
-            if end_str in ['png', 'jpg', 'jpeg']:  # 图片转pdf
+            # if end_str in ['png', 'jpg', 'jpeg']:  # 图片转pdf
-                fn, fil = png2pdf(file_path, file)
+            #     fn, fil = png2pdf(file_path, file)
-                file = fil
+            #     file = fil
            data_mode1 = deepcopy(data_mode)
            uid = get_uid()
            data_mode1['uid'] = uid
--- a/utils/func.py
+++ b/utils/func.py
@ -29,6 +29,7 @@ my_pass = 'whrsugtgkstibjdj'  # 发件人邮箱密码
 subject = '入职通知'  # 邮件的主题，也可以说是标题
 mail_host = 'smtp.qq.com'
 def get_uid():
    """Return a lowercase hexadecimal id string without the ``0x`` prefix.

    The value is the current time scaled by 10**7 and truncated to an int,
    plus a random integer offset in the inclusive range 0..10000.
    """
    scaled_now = int(time.time() * 10 ** 7)
    offset = random.randint(0, 10000)
    hex_text = hex(scaled_now + offset)
    # Drop the leading "0x" that hex() always emits.
    return hex_text[2:]
@ -416,6 +417,16 @@ def png2pdf(dir_path, filename):
    return res_path, new_filename
 # Convert a PDF into plain text
 def pdf_to_text(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    :param path: location of the PDF, passed straight to ``fitz.open``.
    :return: the text of all pages joined in page order; an empty string
        when the document yields no pages.
    """
    doc = fitz.open(path)
    try:
        chunks = []
        for page in doc:
            # Prefer the snake_case method name and fall back to the legacy
            # camelCase alias, so the call works with whichever name the
            # installed PyMuPDF provides.
            extract = getattr(page, 'get_text', None) or page.getText
            chunks.append(extract())
        return ''.join(chunks)
    finally:
        # Release the underlying file even if text extraction fails.
        doc.close()
 def write_task(jsontext):
    with open('task.json', 'w', encoding='utf-8') as f:
        f.write(jsontext)
--- a/utils/re_to_jianli.py
+++ b/utils/re_to_jianli.py
@ -3,6 +3,9 @@
 from LAC import LAC
 import re
 import copy
 import pytesseract
 from PIL import Image
 from utils import png2pdf, pdf_to_text
 import pdfplumber
 lac = LAC(mode='lac')
@ -20,6 +23,15 @@ def getText_pdf(filename):
    return content
 # Recognize the text content of an image
 def get_png_txt(filename):
    """Run OCR on the image file ``filename`` and return the recognized text.

    :param filename: path of the image, passed to ``Image.open``.
    :return: the string produced by ``pytesseract.image_to_string`` using
        the ``chi_sim`` language setting.
    """
    # Open the image in a context manager so its file handle is closed once
    # recognition is done instead of lingering until garbage collection.
    with Image.open(filename) as im:
        # Recognize the text with an explicitly chosen language.
        return pytesseract.image_to_string(im, lang='chi_sim')
 def fmt_txt(chk_str):
    true_chkStr = chk_str
    regex = re.compile(r"：\n*")
@ -1143,8 +1155,11 @@ def fmt_txt(chk_str):
 if __name__ == '__main__':
    # txt = getText_pdf('D:\wokerplay\面试简历\【Android开发工程师_武汉】陈超峰 10年.pdf')
-    # if txt:
+    txt = get_png_txt(r'D:\wokerplay\面试简历1\陈雨蝶.jpg')
-    #     pass
+    # pdf_path, file_name = png2pdf('D:\wokerplay\面试简历1', '程敏谦-文案策划简历.JPG')
    # txt = pdf_to_text(pdf_path)
    if txt:
        pass
    # 拉勾
    chk_str1 = """