From aa3212676fab36309a16b0981889a04b8036bfdd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AE=97=E6=8C=AF?= <lizz556@163.com>
Date: Mon, 24 Oct 2022 15:43:09 +0800
Subject: [PATCH] =?UTF-8?q?=E6=89=B9=E9=87=8F=E5=AF=BC=E5=85=A5=E7=AE=80?=
 =?UTF-8?q?=E5=8E=86=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api/api_v1/endpoints/interview.py |  8 ++++----
 utils/func.py                     | 13 ++++++++++++-
 utils/re_to_jianli.py             | 21 ++++++++++++++++++---
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/api/api_v1/endpoints/interview.py b/api/api_v1/endpoints/interview.py
index 7a79b21..8193d14 100644
--- a/api/api_v1/endpoints/interview.py
+++ b/api/api_v1/endpoints/interview.py
@@ -799,14 +799,14 @@ async def files_to_hw(
             fn = file_path + '/' + file
             end_str = file.split('.')[-1].lower()  # 文件类型
             # 文件类型支持
-            if end_str not in ['pdf', 'doc', 'docx', 'png', 'jpg', 'jpeg', 'word']:
+            if end_str not in ['pdf', 'doc', 'docx', 'word']:
                 continue
             if end_str in ['doc', 'docx', 'word']:  # doc,docx, word转pdf
                 fn, fil = doc2pdf(fn, file_path, file)
                 file = fil
-            if end_str in ['png', 'jpg', 'jpeg']:  # 图片转pdf
-                fn, fil = png2pdf(file_path, file)
-                file = fil
+            # if end_str in ['png', 'jpg', 'jpeg']:  # 图片转pdf
+            #     fn, fil = png2pdf(file_path, file)
+            #     file = fil
             data_mode1 = deepcopy(data_mode)
             uid = get_uid()
             data_mode1['uid'] = uid
diff --git a/utils/func.py b/utils/func.py
index f5a03a9..ad77f61 100644
--- a/utils/func.py
+++ b/utils/func.py
@@ -29,6 +29,7 @@ my_pass = 'whrsugtgkstibjdj'  # 发件人邮箱密码
 subject = '入职通知'  # 邮件的主题，也可以说是标题
 mail_host = 'smtp.qq.com'
 
+
 def get_uid():
     return hex(int(time.time() * 10 ** 7) + random.randint(0, 10000))[2:]
 
@@ -269,7 +270,7 @@ def send_affix_mail(str_msg: str, send_to: list, file_name: str = None):
     # ---附件部分---
     if file_name != None:
         part = MIMEApplication(open(file_name, 'rb').read())
-        filename=file_name.split('/')[-1]
+        filename = file_name.split('/')[-1]
         part.add_header('Content-Disposition', 'attachment', filename=filename)
         msg.attach(part)
     smpt = smtplib.SMTP_SSL(mail_host, 465, 'utf-8')
@@ -416,6 +417,16 @@ def png2pdf(dir_path, filename):
     return res_path, new_filename
 
 
+# Convert a PDF into plain text.
+def pdf_to_text(path):
+    """Return the text of every page of the PDF at ``path``, concatenated."""
+    # The context manager closes the document; the original never did.
+    with fitz.open(path) as doc:
+        # get_text() is the current PyMuPDF name; the camelCase getText()
+        # alias is deprecated and missing from recent releases.
+        return ''.join(page.get_text() for page in doc)
+
+
 def write_task(jsontext):
     with open('task.json', 'w', encoding='utf-8') as f:
         f.write(jsontext)
diff --git a/utils/re_to_jianli.py b/utils/re_to_jianli.py
index c6034d3..e9f035d 100644
--- a/utils/re_to_jianli.py
+++ b/utils/re_to_jianli.py
@@ -3,6 +3,9 @@
 from LAC import LAC
 import re
 import copy
+import pytesseract
+from PIL import Image
+from utils import png2pdf, pdf_to_text
 import pdfplumber
 
 lac = LAC(mode='lac')
@@ -20,6 +23,15 @@ def getText_pdf(filename):
     return content
 
 
+# Recognize the text content of an image via OCR.
+def get_png_txt(filename):
+    """Return the text Tesseract reads from the image at ``filename``."""
+    # The context manager releases the file handle Pillow keeps open.
+    with Image.open(filename) as im:
+        # Run recognition with the language set to 'chi_sim'.
+        return pytesseract.image_to_string(im, lang='chi_sim')
+
+
 def fmt_txt(chk_str):
     true_chkStr = chk_str
     regex = re.compile(r"：\n*")
@@ -324,7 +336,7 @@ def fmt_txt(chk_str):
         # if '实习经历' in true_chkStr:
         #     work_undergo_str = true_chkStr.split('实习经历')[-1]
         # else:
-            work_undergo_str = true_chkStr.split('工作经验')[-1]
+        work_undergo_str = true_chkStr.split('工作经验')[-1]
     else:
         if '⼯作经历' in true_chkStr:
             work_undergo_str = true_chkStr.split('⼯作经历')[-1]
@@ -1143,8 +1155,11 @@ def fmt_txt(chk_str):
 
 if __name__ == '__main__':
     # txt = getText_pdf('D:\wokerplay\面试简历\【Android开发工程师_武汉】陈超峰 10年.pdf')
-    # if txt:
-    #     pass
+    txt = get_png_txt(r'D:\wokerplay\面试简历1\陈雨蝶.jpg')
+    # pdf_path, file_name = png2pdf('D:\wokerplay\面试简历1', '程敏谦-文案策划简历.JPG')
+    # txt = pdf_to_text(pdf_path)
+    if txt:
+        pass
     # 拉勾
     chk_str1 = """