Wednesday, September 1, 2021

Extracting Text from Docx, Doc and Pdf files Using Python




import os
import docx # pip install python-docx # Does not support .pdf and .doc
import PyPDF2

from docx import Document

# Extensions (lower-case, without the leading dot) of Microsoft Word files.
WORD_FORMATS = ['doc', 'docx']

# Every extension covered by the extraction snippets in this post:
# PDF plus the Word formats above.
SUPPORTED_FORMATS = ['pdf'] + WORD_FORMATS

For DOCX

def find_docx_files(root="."):
    """Return the paths of all .docx files found recursively under *root*.

    The extension is taken with ``os.path.splitext`` so that names with
    several dots (``report.v2.docx``) are recognised, names without any dot
    (``README``) do not raise, and names that merely contain ``.docx`` in the
    middle (``notes.docx.bak``) are ignored.  The comparison is
    case-insensitive, so ``FILE.DOCX`` is picked up as well.

    Files whose name starts with ``~`` are skipped; Word creates such
    temporary/lock files (e.g. ``~$report.docx``) next to an open document
    and they are not readable documents.
    """
    matches = []
    for dirpath, _subdirs, files in os.walk(root):
        for name in files:
            extension = os.path.splitext(name)[1].lower()
            if extension == ".docx" and not name.startswith("~"):
                matches.append(os.path.join(dirpath, name))
    return matches


def extract_docx_text(document):
    """Return the text of every paragraph of *document* as one string.

    *document* is an object exposing ``.paragraphs`` whose items have a
    ``.text`` attribute (as returned by ``docx.Document``).  Each paragraph
    is preceded by a single space, so a non-empty result starts with a space.
    """
    return "".join(" " + para.text for para in document.paragraphs)


# Paths of the .docx files below the current working directory.
f_list = find_docx_files(".")

# Parsed documents, in the same order as f_list.
d_list = [Document(f) for f in f_list]

# Extracted text, one string per document, in the same order as f_list.
t_list = [extract_docx_text(d) for d in d_list]

For PDF

def find_pdf_files(root="."):
    """Return the paths of all .pdf files found recursively under *root*.

    The extension is taken with ``os.path.splitext`` so that names with
    several dots (``report.v2.pdf``) are recognised, names without any dot
    do not raise, and names that merely contain ``.pdf`` in the middle
    (``scan.pdf.bak``) are ignored.  The comparison is case-insensitive.
    Files whose name starts with ``~`` are skipped.
    """
    matches = []
    for dirpath, _subdirs, files in os.walk(root):
        for name in files:
            extension = os.path.splitext(name)[1].lower()
            if extension == ".pdf" and not name.startswith("~"):
                matches.append(os.path.join(dirpath, name))
    return matches


def _make_pdf_reader(file_obj):
    """Build a PyPDF2 reader for the binary file object *file_obj*.

    Newer PyPDF2 releases call the class ``PdfReader`` and PyPDF2 3.x no
    longer accepts ``PdfFileReader``; older releases only have
    ``PdfFileReader``.  Use whichever one the installed version provides.
    """
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    return reader_cls(file_obj)


def extract_pdf_text(pdf_reader):
    """Return the text of every page of *pdf_reader* as one string.

    Each page's text is preceded by a single space.  ``extract_text`` is
    preferred and the legacy ``extractText`` spelling is used only when the
    installed PyPDF2 does not offer the new name.
    """
    parts = []
    for page in pdf_reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        parts.append(" " + extract())
    return "".join(parts)


# Paths of the .pdf files below the current working directory.
f_list = find_pdf_files(".")

# Reader objects and extracted text, in the same order as f_list.
# NOTE: the readers in d_list outlive their underlying file, which is closed
# as soon as its text has been extracted; use t_list for the content.
d_list = []
t_list = []
for f in f_list:
    # The context manager closes the file even if parsing raises.
    with open(f, 'rb') as pdfFileObj:
        pdfReader = _make_pdf_reader(pdfFileObj)
        d_list.append(pdfReader)
        t_list.append(extract_pdf_text(pdfReader))

For DOC

# Legacy .doc files are read by automating an installed copy of Microsoft
# Word through COM (Windows only, requires the pywin32 package).
import win32com.client

word = win32com.client.Dispatch("Word.Application")
try:
    # Keep the Word window hidden while it is being automated.
    word.Visible = False
    wb = word.Documents.Open(r'D:\xyz.doc')
    # Open() returns the opened document, so there is no need to rely on
    # whichever document Word currently considers "active".
    doc = wb
    try:
        print(doc.Range().Text)
    finally:
        # Close without saving (SaveChanges=False); the file is only read.
        doc.Close(False)
finally:
    # Always shut Word down, otherwise a hidden WINWORD.EXE process is left
    # running after the script ends or fails.
    word.Quit()

# Tags: Technology, Python, Natural Language Processing

No comments:

Post a Comment