"""Helpers for converting resume files (.doc/.docx/.pdf) to text and tagging entities."""

import fnmatch
import functools
import os

import docx
import numpy as np
import paddlehub as hub
import pdfplumber
from pdf2docx import Converter
from win32com import client as wc

# Directory that holds the Word resumes to convert.
PATH_DATA = os.path.abspath(r"C:\Users\Administrator\Desktop\新建文件夹")


def _replace_extension(path, new_ext):
    """Return *path* with its final extension swapped for *new_ext* (e.g. ``".txt"``)."""
    return os.path.splitext(path)[0] + new_ext


def _iter_table_cell_text(document):
    """Yield the text of every cell of every table in *document*, row by row."""
    for table in document.tables:
        n_rows = len(table.rows)
        n_cols = len(table.columns)
        for row_idx in range(n_rows):
            for col_idx in range(n_cols):
                yield table.cell(row_idx, col_idx).text


def docx_to_txt():
    """Convert every .doc/.docx file found under ``PATH_DATA`` to a UTF-8 .txt file.

    The directory tree is walked recursively. Each matching file is opened in
    Word through COM and saved next to the original with a ``.txt`` extension.
    File names starting with ``~`` are skipped (presumably Word's temporary
    lock files). Word is always quit, even if a conversion fails.
    """
    # Uses the module-level ``wc`` alias: the bare name ``win32com`` is never
    # bound in this module, so ``win32com.client...`` would raise NameError.
    word_app = wc.gencache.EnsureDispatch("Word.Application")
    try:
        for root, _dirs, files in os.walk(PATH_DATA):
            for name in files:
                is_word_file = (
                    fnmatch.fnmatch(name, '*.doc') or fnmatch.fnmatch(name, '*.docx')
                )
                if not is_word_file or name.startswith("~"):
                    continue
                print('_file:', name)
                source = os.path.join(root, name)
                document = word_app.Documents.Open(source)
                try:
                    # splitext handles both ".doc" and ".docx"; fixed-width
                    # slicing previously produced names like "foo.dtxt".
                    # Encoding 65001 makes Word write the text as UTF-8; see
                    # https://docs.microsoft.com/zh-cn/office/vba/api/Office.MsoEncoding
                    # for the other encoding codes.
                    document.SaveAs(
                        _replace_extension(source, ".txt"),
                        FileFormat=wc.constants.wdFormatText,
                        Encoding=65001,
                    )
                finally:
                    document.Close()
    finally:
        word_app.Quit()


def pdf_to_txt(pdf_path):
    """Extract the text of the PDF at *pdf_path* and write it to a sibling .txt file.

    The output path is *pdf_path* with its extension replaced by ``.txt``; the
    file is written as UTF-8. Pages without extractable text are skipped.
    """
    content = getText_pdf(pdf_path)
    # splitext only strips the final extension, so dots elsewhere in the path
    # (directory names, "./file.pdf", "a.b.pdf") no longer truncate the name.
    with open(_replace_extension(pdf_path, ".txt"), "w", encoding="utf-8") as out_file:
        out_file.write(content)


@functools.lru_cache(maxsize=1)
def get_model():
    """Return the PaddleHub ``lac`` module, loading it only on the first call."""
    return hub.Module(name='lac')


def get_lac(text):
    """Run LAC lexical analysis on *text*.

    :param text: the string to analyse
    :return: ``(tag, word)`` taken from the first (only) result entry
    """
    inputs = {"text": [text]}
    result = get_model().lexical_analysis(data=inputs)
    first = result[0]
    return first['tag'], first['word']


def get_entity(text, label):
    """Return the words in *text* whose LAC tag equals *label*.

    *label* can be:
        'PER'  : person name
        'LOC'  : place name
        'ORG'  : organisation name
        'TIME' : time expression
    """
    tags, words = get_lac(text)
    return [word for tag, word in zip(tags, words) if tag == label]


def getText_docx(filename):
    """Read a .docx file into one string.

    Paragraph texts come first, followed by the text of every table cell
    (row by row); all pieces are joined with newlines.
    """
    document = docx.Document(filename)
    pieces = [paragraph.text for paragraph in document.paragraphs]
    pieces.extend(_iter_table_cell_text(document))
    return '\n'.join(pieces)


def getText_pdf(filename):
    """Read a PDF into one string, one newline-terminated chunk per page.

    Pages for which ``extract_text()`` returns nothing are skipped.
    """
    with pdfplumber.open(filename) as pdf_file:
        page_texts = (page.extract_text() for page in pdf_file.pages)
        # Consumed inside the ``with`` so pages are read while the file is open.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def doc_docx(url, filename):
    """Convert a .doc file to .docx using Word.

    :param url: directory containing the file
    :param filename: name of the .doc file inside *url*
    :return: None; the .docx is written next to the source file
    """
    # Word runs as a separate process with its own working directory, so hand
    # it an absolute path.
    source = os.path.abspath(os.path.join(url, filename))
    target = _replace_extension(source, ".docx")
    word = wc.Dispatch("Word.Application")
    try:
        document = word.Documents.Open(source)
        try:
            document.SaveAs(target, 12)  # 12 is the docx file format
        finally:
            document.Close()
    finally:
        # Always quit, otherwise a failed conversion leaves Word running.
        word.Quit()


def pdf_docx(url, filename, start=0, end=12):
    """Convert a PDF file to .docx using pdf2docx.

    :param url: directory containing the file
    :param filename: name of the PDF file inside *url*
    :param start: first page passed to ``Converter.convert`` (default 0)
    :param end: end page passed to ``Converter.convert``; the default of 12
        keeps the page range this function originally hard-coded
    :return: None; the .docx is written next to the source file
    """
    pdf_name = os.path.join(url, filename)
    docx_name = _replace_extension(pdf_name, ".docx")
    converter = Converter(pdf_name)
    try:
        converter.convert(docx_name, start=start, end=end)
    finally:
        converter.close()


def read_tables(open_txt):
    """Print the text of every table cell in an opened docx document.

    :param open_txt: the document object returned by ``docx.Document``
    :return: None
    """
    for cell_text in _iter_table_cell_text(open_txt):
        print(cell_text)