# Shared imports and constants for the text-extraction snippets below.
import os

import docx  # pip install python-docx; does not support .pdf and .doc
import PyPDF2
from docx import Document

# File extensions (without the leading dot) that the snippets deal with.
SUPPORTED_FORMATS = ['pdf', 'doc', 'docx']
# The subset of SUPPORTED_FORMATS that are Microsoft Word formats.
WORD_FORMATS = ['doc', 'docx']

# For DOCX
# Extract the text of every .docx file under the current directory tree.
#
# Results (module-level names):
#   f_list -- paths of the .docx files that were found
#   d_list -- the python-docx Document object for each path, same order
#   t_list -- the extracted text for each document, same order

# os.path.splitext is used instead of f.split(".")[1]: the latter raises
# IndexError for names without a dot and picks the wrong piece for names
# with several dots (e.g. "report.v2.docx"). The comparison is
# case-insensitive so "REPORT.DOCX" is found as well.
f_list = []
for dirpath, subdirs, files in os.walk("."):
    for f in files:
        # Names starting with "~" are skipped -- presumably Word's temporary
        # owner files such as "~$report.docx", which are not real documents.
        if os.path.splitext(f)[1].lower() == ".docx" and not f.startswith("~"):
            f_list.append(os.path.join(dirpath, f))

d_list = [Document(f) for f in f_list]

t_list = []
for d in d_list:
    # Every paragraph is prefixed with a single space, so the resulting text
    # starts with a space and paragraphs are space-separated.
    t_list.append("".join(" " + para.text for para in d.paragraphs))

# FOR PDF
# Extract the text of every .pdf file under the current directory tree.
#
# Results (module-level names):
#   f_list -- paths of the .pdf files that were found
#   d_list -- the PyPDF2 reader object for each path, same order
#   t_list -- the extracted text for each PDF, same order

# os.path.splitext is used instead of f.split(".")[1]: the latter raises
# IndexError for names without a dot and picks the wrong piece for names
# with several dots (e.g. "report.v2.pdf"). The comparison is
# case-insensitive so "REPORT.PDF" is found as well.
f_list = []
for dirpath, subdirs, files in os.walk("."):
    for f in files:
        if os.path.splitext(f)[1].lower() == ".pdf" and not f.startswith("~"):
            f_list.append(os.path.join(dirpath, f))

# PyPDF2 2.x renamed PdfFileReader to PdfReader and 3.x rejects the old name;
# 1.x only provides the old name, so fall back to it when needed.
_pdf_reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader

d_list = []
t_list = []
for f in f_list:
    # "with" guarantees the file is closed even if parsing or extraction fails.
    with open(f, 'rb') as pdf_file:
        pdf_reader = _pdf_reader_cls(pdf_file)
        # NOTE: the reader kept in d_list is backed by a file that is closed
        # when this iteration ends, so all text is extracted here.
        d_list.append(pdf_reader)
        page_texts = []
        for page in pdf_reader.pages:
            # Same rename story as the reader class: extract_text (new name)
            # versus extractText (PyPDF2 1.x).
            extract = getattr(page, "extract_text", None) or page.extractText
            # Every page is prefixed with a single space, so the resulting
            # text starts with a space and pages are space-separated.
            page_texts.append(" " + extract())
        t_list.append("".join(page_texts))

# For DOC
# Print the text of a legacy .doc file by driving Microsoft Word through COM.
# Requires the pywin32 package and an installed copy of Word.
import win32com.client

word = win32com.client.Dispatch("Word.Application")
# "Visible" is the documented property name; the lowercase spelling only works
# with late-bound dispatch objects.
word.Visible = False
try:
    wb = word.Documents.Open(r'D:\xyz.doc')
    try:
        doc = word.ActiveDocument
        print(doc.Range().Text)
    finally:
        # Close without saving (SaveChanges=False) so the source file is
        # never modified.
        wb.Close(False)
finally:
    # Without Quit() the hidden Word process stays alive after the script ends.
    word.Quit()
Wednesday, September 1, 2021
Extracting Text from DOCX, DOC, and PDF Files Using Python
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment