165 lines
4.6 KiB
Python
165 lines
4.6 KiB
Python
import fnmatch
|
||
import docx
|
||
# import win32com.client
|
||
import os
|
||
from pdf2docx import Converter
|
||
|
||
PATH_DATA = os.path.abspath(r"C:\Users\Administrator\Desktop\新建文件夹") # directory where the Word resumes are stored
|
||
|
||
# Convert .docx/.doc files to .txt files
|
||
def docx_to_txt():
    """Convert every .doc/.docx file under PATH_DATA to a UTF-8 .txt file.

    Walks PATH_DATA recursively, opens each matching file through the Word
    COM automation server and saves it next to the original as
    ``<name>.txt``. Files whose name starts with ``~`` are skipped
    (presumably Word's temporary/lock files).

    :return: None; the converted files are written to disk.
    """
    # Imported locally: the module-level ``import win32com.client`` is
    # commented out, which left the name ``win32com`` undefined here.
    import win32com.client

    wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        for root, _dirs, files in os.walk(PATH_DATA):
            for _file in files:
                is_word = (fnmatch.fnmatch(_file, '*.doc')
                           or fnmatch.fnmatch(_file, '*.docx'))
                if not is_word or _file.startswith("~"):
                    continue
                print('_file:', _file)
                source = os.path.join(root, _file)
                # splitext removes exactly the extension. The previous manual
                # slicing used the .doc length for .docx files and vice
                # versa, producing names such as "a.dtxt" and "atxt".
                target = os.path.splitext(source)[0] + '.txt'
                document = wordapp.Documents.Open(source)
                try:
                    # Encoding=65001 saves the text as UTF-8. Encoding codes:
                    # https://docs.microsoft.com/zh-cn/office/vba/api/Office.MsoEncoding
                    document.SaveAs(target,
                                    FileFormat=win32com.client.constants.wdFormatText,
                                    Encoding=65001)
                finally:
                    # Close even when saving fails so the document is not
                    # left open in Word.
                    document.Close()
    finally:
        wordapp.Quit()
|
||
|
||
|
||
# Convert PDF files to .txt files
|
||
def pdf_to_txt(pdf_path):
    """Extract the text of a PDF and write it to a sibling ``.txt`` file.

    Every page that yields text contributes that text followed by a newline;
    pages that yield no text are skipped.

    :param pdf_path: path of the PDF file to read.
    :return: None; the text is written, UTF-8 encoded, to *pdf_path* with its
        extension replaced by ``.txt``.
    """
    with pdfplumber.open(pdf_path) as pdf_file:
        page_texts = [page.extract_text() for page in pdf_file.pages]
    content = "".join(text + "\n" for text in page_texts if text)

    # splitext strips only the final extension. split('.')[0] cut the path at
    # its first dot, so "./cv.pdf" or "C:/a.b/cv.pdf" produced a wrong target.
    txt_path = os.path.splitext(pdf_path)[0] + ".txt"
    with open(txt_path, "w", encoding="utf-8") as file:
        file.write(content)
|
||
|
||
|
||
|
||
import paddlehub as hub
|
||
import numpy as np
|
||
|
||
|
||
_LAC_MODULE = None  # PaddleHub LAC module, created on first use and then reused


def get_model():
    """Return the PaddleHub ``lac`` module, creating it only once.

    get_lac() calls this function for every text it analyses; caching the
    module avoids constructing a new ``hub.Module`` on each of those calls.

    :return: the shared ``hub.Module(name='lac')`` instance.
    """
    global _LAC_MODULE
    if _LAC_MODULE is None:
        _LAC_MODULE = hub.Module(name='lac')
    return _LAC_MODULE
|
||
|
||
|
||
def get_lac(text):
    """Run LAC lexical analysis on one piece of text.

    :param text: the text to analyse.
    :return: a ``(tag, word)`` tuple taken from the ``'tag'`` and ``'word'``
        entries of the first analysis result.
    """
    analysis = get_model().lexical_analysis(data={"text": [text]})
    first_result = analysis[0]
    return first_result['tag'], first_result['word']
|
||
|
||
|
||
def get_entity(text, label):
    """Return the words of *text* whose LAC tag equals *label*.

    Possible values for *label* include:
        'PER'  : person name
        'LOC'  : place name
        'ORG'  : organisation name
        'TIME' : time

    :param text: the text to analyse.
    :param label: the tag to select.
    :return: list of the matching words, in their original order (empty when
        nothing matches).
    """
    tags, words = get_lac(text)
    # Pair tags and words directly instead of going list -> ndarray ->
    # np.where -> list. This also avoids relying on how NumPy compares an
    # empty array with a string when the analysis returns no tokens.
    return [word for tag, word in zip(tags, words) if tag == label]
|
||
|
||
|
||
def getText_docx(filename):  # docx -> text
    """Read a .docx file and return its text as a single string.

    The text of every paragraph comes first, followed by the text of every
    table cell (tables in order, each table walked row by row, column by
    column). All pieces are joined with newlines.

    :param filename: path of the .docx file.
    :return: the collected text.
    """
    document = docx.Document(filename)
    pieces = [paragraph.text for paragraph in document.paragraphs]
    for table in document.tables:
        n_rows = len(table.rows)
        n_cols = len(table.columns)
        pieces.extend(
            table.cell(r, c).text
            for r in range(n_rows)
            for c in range(n_cols)
        )
    return '\n'.join(pieces)
|
||
|
||
|
||
import pdfplumber
|
||
|
||
|
||
def getText_pdf(filename):
    """Read a PDF and return its text.

    Each page that yields text contributes that text followed by a newline;
    pages that yield no text are skipped.

    :param filename: path of the PDF file.
    :return: the concatenated page texts.
    """
    with pdfplumber.open(filename) as pdf:
        chunks = []
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                chunks.append(extracted + "\n")
        return "".join(chunks)
|
||
|
||
|
||
from win32com import client as wc
|
||
|
||
|
||
def doc_docx(url, filename):
    """Convert a .doc file to a .docx file using Word automation.

    :param url: directory that contains the file; the .docx is written there.
    :param filename: name of the .doc file inside *url*.
    :return: None
    """
    # Absolute path, so the location does not depend on Word's own current
    # directory when *url* is relative.
    source = os.path.abspath(os.path.join(url, filename))
    # splitext drops only the final extension: "a.b.doc" -> "a.b.docx".
    # split('.')[0] turned it into "a.docx".
    target = os.path.splitext(source)[0] + '.docx'

    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(source)
        try:
            doc.SaveAs(target, 12)  # 12 is the docx save format
        finally:
            doc.Close()
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a
        # Word process running.
        word.Quit()
|
||
|
||
|
||
def pdf_docx(url, filename, start=0, end=12):
    """Convert a PDF file to a .docx file with pdf2docx.

    :param url: directory that contains the PDF; the .docx is written there.
    :param filename: name of the PDF file inside *url*.
    :param start: ``start`` page argument forwarded to ``Converter.convert``
        (default 0, the previously hard-coded value).
    :param end: ``end`` page argument forwarded to ``Converter.convert``
        (default 12, the previously hard-coded value).
    :return: None
    """
    # Path of the source PDF.
    pdf_name = os.path.join(url, filename)
    # Path of the target docx. splitext drops only the final extension,
    # whereas split('.')[0] truncated names containing more than one dot.
    docx_name = os.path.splitext(pdf_name)[0] + ".docx"

    # Load the PDF document.
    cv = Converter(pdf_name)
    try:
        cv.convert(docx_name, start=start, end=end)
    finally:
        # Release the converter even when the conversion fails.
        cv.close()
|
||
|
||
def read_tables(open_txt):
    """Print the text of every table cell of an opened docx document.

    Tables are handled in order; each one is walked row by row and, within a
    row, column by column.

    :param open_txt: the object obtained by opening a docx file; its
        ``tables`` attribute is iterated.
    :return: None
    """
    for table in open_txt.tables:
        n_rows, n_cols = len(table.rows), len(table.columns)
        # A single flat counter visits the cells in row-major order.
        for position in range(n_rows * n_cols):
            row_idx, col_idx = divmod(position, n_cols)
            print(table.cell(row_idx, col_idx).text)
|