prs_server/utils/chkmail/gongju.py
2022-08-08 14:29:03 +08:00

165 lines
4.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fnmatch
import docx
# import win32com.client
import os
from pdf2docx import Converter
PATH_DATA = os.path.abspath(r"C:\Users\Administrator\Desktop\新建文件夹") # directory where the Word résumés are stored
# Convert .doc / .docx files to .txt files
def docx_to_txt():
    """Convert every Word document under ``PATH_DATA`` to a UTF-8 text file.

    Walks ``PATH_DATA`` recursively, opens each ``*.doc`` / ``*.docx`` file in
    Word through COM and saves it next to the original with a ``.txt``
    extension. Files whose name starts with ``~`` are skipped (presumably
    Word's temporary lock files).

    :return: None
    """
    # Imported locally: the module-level ``import win32com.client`` is
    # commented out, so the name ``win32com`` would otherwise be unbound.
    import win32com.client

    wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        for root, _dirs, files in os.walk(PATH_DATA):
            for _file in files:
                is_word_file = (fnmatch.fnmatch(_file, '*.doc')
                                or fnmatch.fnmatch(_file, '*.docx'))
                if not is_word_file or _file.startswith("~"):
                    continue
                print('_file:', _file)
                file = os.path.join(root, _file)
                # splitext handles both extensions; the former fixed-width
                # slicing had its branches swapped ("a.docx" -> "a.dtxt",
                # "a.doc" -> "atxt").
                txt_file = os.path.splitext(file)[0] + '.txt'
                doc = wordapp.Documents.Open(file)
                try:
                    # Encoding=65001 saves the text directly as UTF-8; see
                    # https://docs.microsoft.com/zh-cn/office/vba/api/Office.MsoEncoding
                    # for the other encoding codes.
                    doc.SaveAs(txt_file,
                               FileFormat=win32com.client.constants.wdFormatText,
                               Encoding=65001)
                finally:
                    # Close the document even when saving fails so that it
                    # does not stay open inside the Word instance.
                    doc.Close()
    finally:
        wordapp.Quit()
# Convert a PDF file to a .txt file
def pdf_to_txt(pdf_path):
    """Extract the text of a PDF and write it to a sibling ``.txt`` file.

    The output file has the same path as ``pdf_path`` with the final
    extension replaced by ``.txt`` and is written as UTF-8.

    :param pdf_path: path of the PDF file to read
    :return: None
    """
    chunks = []
    with pdfplumber.open(pdf_path) as pdf_file:
        for page in pdf_file.pages:
            page_content = page.extract_text()
            if page_content:  # skip pages for which no text was extracted
                chunks.append(page_content + "\n")
    # splitext strips only the last extension; splitting on the first "."
    # broke paths such as "./cv.pdf" or "v1.2/cv.pdf".
    txt_path = os.path.splitext(pdf_path)[0] + ".txt"
    with open(txt_path, "w", encoding="utf-8") as file:
        file.write("".join(chunks))
import paddlehub as hub
import numpy as np
# Cached LAC module; created on the first get_model() call.
_LAC_MODEL = None


def get_model():
    """Return the PaddleHub ``lac`` module, creating it only once.

    The module used to be re-instantiated on every call, so each lexical
    analysis paid the full model start-up cost. The instance is now cached
    at module level and reused.

    :return: the object returned by ``hub.Module(name='lac')``
    """
    global _LAC_MODEL
    if _LAC_MODEL is None:
        _LAC_MODEL = hub.Module(name='lac')
    return _LAC_MODEL
def get_lac(text):
    """Run LAC lexical analysis on ``text``.

    :param text: the text to analyse
    :return: a ``(tag, word)`` tuple taken from the first analysis result
    """
    analysis = get_model().lexical_analysis(data={"text": [text]})
    first_result = analysis[0]
    return first_result['tag'], first_result['word']
def get_entity(text, label):
    '''
    Return the words of ``text`` whose LAC tag equals ``label``.

    ``label`` can be:
    'PER'  : person name
    'LOC'  : place name
    'ORG'  : organisation name
    'TIME' : time
    '''
    tags, words = get_lac(text)
    # Positions at which the tag matches the requested label.
    hits = np.where(np.array(tags) == label)[0]
    return [words[pos] for pos in hits]
def getText_docx(filename):  # docx -> text
    """Read a docx file and return its content as text.

    Paragraph texts come first, followed by the text of every table cell
    (row by row); all pieces are joined with newlines.
    """
    document = docx.Document(filename)
    # One entry per paragraph of the document.
    pieces = [paragraph.text for paragraph in document.paragraphs]
    # Table content, if any, is appended after the paragraphs.
    for tbl in document.tables:
        n_rows = len(tbl.rows)
        n_cols = len(tbl.columns)
        pieces.extend(
            tbl.cell(r, c).text
            for r in range(n_rows)
            for c in range(n_cols)
        )
    return '\n'.join(pieces)
import pdfplumber
def getText_pdf(filename):
    """Read a PDF file and return its content as text.

    The extracted text of each page is followed by a newline; pages for
    which nothing was extracted contribute nothing.
    """
    parts = []
    with pdfplumber.open(filename) as pdf_file:
        for page in pdf_file.pages:
            extracted = page.extract_text()
            if extracted:
                parts.append(extracted + "\n")
    return ''.join(parts)
from win32com import client as wc
def doc_docx(url, filename):
    """
    Convert a .doc file to a .docx file using Word through COM.

    The .docx is written to the same directory, with the same base name.

    :param url: directory that contains the .doc file
    :param filename: name of the .doc file inside ``url``
    :return: None
    """
    # splitext drops only the final extension, so names that contain dots
    # (e.g. "john.smith.doc") keep their full base name.
    name = os.path.splitext(filename)[0]
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(url + f"/{filename}")
        try:
            doc.SaveAs(url + f'/{name}.docx', 12)  # 12 is the docx save format
        finally:
            doc.Close()
    finally:
        # Always shut Word down, otherwise a failed conversion leaves the
        # application running.
        word.Quit()
def pdf_docx(url, filename, start=0, end=12):
    """
    Convert a PDF file to a .docx file.

    The .docx is written to the same directory, with the same base name.

    :param url: directory that contains the PDF file
    :param filename: name of the PDF file inside ``url``
    :param start: ``start`` page argument forwarded to ``Converter.convert``
    :param end: ``end`` page argument forwarded to ``Converter.convert``;
        the default of 12 keeps the previously hard-coded page window
    :return: None
    """
    # Base name without its final extension; splitext keeps dots that are
    # part of the name itself.
    file_name = os.path.splitext(filename)[0]
    # Path of the source PDF.
    pdf_name = url + f"/{filename}"
    # Path of the docx to create.
    docx_name = url + f"/{file_name}.docx"
    # Load the PDF document.
    cv = Converter(pdf_name)
    try:
        cv.convert(docx_name, start=start, end=end)
    finally:
        # Release the converter even when the conversion fails.
        cv.close()
def read_tables(open_txt):
    """
    Print the text of every table cell of a docx document.

    Tables are visited in order, and each table is printed row by row.

    :param open_txt: the object obtained by opening the docx file
    :return: None
    """
    for tbl in open_txt.tables:
        n_rows, n_cols = len(tbl.rows), len(tbl.columns)
        coords = ((r, c) for r in range(n_rows) for c in range(n_cols))
        for r, c in coords:
            cell = tbl.cell(r, c)
            print(cell.text)