-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfReader.py
128 lines (94 loc) · 3.55 KB
/
pdfReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import pandas as pd
import os,sys
import xlrd
import math
import xlwt
import re
import numpy as np
from dateutil.parser import parse
import operator
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
class Document:
    """A file on disk whose text can be extracted.

    Only PDF files are handled: ``findFileType`` marks them with
    ``fType == 1`` and ``readFile`` then delegates to ``readPdf``.
    """

    def __init__(self, fileName, fPath):
        """
        Store the file location and initialise empty extraction results.

        fileName -- name of the file, without its directory.
        fPath    -- directory that contains the file.
        """
        self.fileName = fileName
        self.fPath = fPath
        # 0 = unknown/unsupported, 1 = PDF (set by findFileType).
        self.fType = 0
        # Text of every horizontal text box, newlines removed.
        self.content = []
        # One [text, x0, y0, y1, page_number] entry per text box.
        self.coordcont = []

    def readFile(self):
        """
        Dispatch to the reader that matches the detected file type.

        Does nothing when the type is unknown, i.e. when findFileType
        has not been called or the file is not a PDF.
        """
        if self.fType == 1:
            self.readPdf()

    def readPdf(self):
        """
        Extract the horizontal text boxes of the PDF, page by page.

        Fills ``self.content`` with the text of each box (newlines removed)
        and ``self.coordcont`` with ``[stripped_text, x0, y0, y1, page]``
        entries, where the coordinates are truncated to int and ``page``
        is 1-based. Extraction is best-effort: a page or text box that
        cannot be processed is logged and skipped.

        Returns ``self.content``.
        Raises PDFTextExtractionNotAllowed if the document forbids text
        extraction.
        """
        # Function-scope import so the module's import block is untouched.
        import logging

        logger = logging.getLogger(__name__)
        file1 = os.path.join(self.fPath, self.fileName)
        texts = []
        positioned = []

        # The context manager guarantees the handle is closed, including
        # when extraction is refused or parsing fails part-way through.
        with open(file1, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                except Exception as exc:
                    # Keep going: one damaged page must not lose the rest.
                    logger.warning("Skipping page %d of %s: %s",
                                   pageNumber + 1, file1, exc)
                    continue

                for obj in layout:
                    if not isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                        continue
                    try:
                        text = obj.get_text().replace('\n', '')
                        entry = [text.strip(),
                                 int(obj.bbox[0]),
                                 int(obj.bbox[1]),
                                 int(obj.bbox[3]),
                                 pageNumber + 1]
                    except Exception as exc:
                        logger.warning("Skipping text box on page %d of %s: %s",
                                       pageNumber + 1, file1, exc)
                        continue
                    # Append both together so the two lists stay aligned.
                    texts.append(text)
                    positioned.append(entry)

        self.coordcont = positioned
        self.content = texts
        return self.content

    def findFileType(self):
        """
        Set ``self.fType`` to 1 when the file name ends in ``.pdf``.

        The comparison is case-insensitive and looks only at the final
        extension, so ``report.v2.PDF`` is a PDF while ``a.pdf.bak`` and
        names without an extension are not.
        """
        extension = os.path.splitext(self.fileName)[1].lower()
        if extension == '.pdf':
            self.fType = 1
if __name__ == '__main__':
    # Walk the input directory tree and try to read every file found in it.
    inpPdfDir = r'path to dir with pdf files'
    for currentDir, _subDirs, names in os.walk(inpPdfDir):
        for name in names:
            print(name, "---------------")
            # Detect the type first; readFile only acts on recognised PDFs.
            doc = Document(name, currentDir)
            doc.findFileType()
            doc.readFile()
            isPdf = doc.fType == 1
            if isPdf:
                print("yahooo")