-
Notifications
You must be signed in to change notification settings - Fork 0
/
pd.py
87 lines (73 loc) · 3.39 KB
/
pd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import csv
import xml.etree.ElementTree as ET
# Parse the XML dump of the PDF. Every direct child of the root is treated as
# one page and must carry a 'pageid' attribute.
tree = ET.parse('PDFtestOUTPUT.xml')
root = tree.getroot()

fieldnames = ['S.No', 'Text', 'x0', 'y0', 'x1', 'y1', 'Page_No', 'height', 'width']

# All three outputs are opened in a single ``with`` so they are flushed and
# closed even if parsing fails part-way through ("newlin text.txt" was
# previously never closed at all).
# newline="" is required by the csv module; without it every row is followed
# by a blank line on Windows.
with open("extracted text.txt", "w", encoding="utf-8") as exmfile, \
        open("newlin text.txt", "w", encoding="utf-8") as newlineexmfile, \
        open("outCsv.csv", "w", encoding="utf-8", newline="") as outcsv:
    csvwriter = csv.DictWriter(outcsv, fieldnames=fieldnames)
    csvwriter.writeheader()

    page_number = None
    row_number = 1  # running serial number written to the 'S.No' column

    for item in root:
        page_number = item.attrib['pageid']
        # Position and text of the previously processed element; reset per page.
        temp_x = 0
        temp_y = 0
        temp_text = ''
        print("Page Number", page_number)

        for subElement in item:
            # iter() yields subElement itself followed by all its descendants.
            for child in subElement.iter():
                elementText = child.text
                if elementText is None:
                    continue

                attributDic = child.attrib
                # Coordinates arrive as decimal strings; truncate to whole units.
                y0 = int(float(attributDic['y0']))

                # Row building: an element on the same y0 as the previous one
                # is joined with the previous text, ordered by x0.
                if y0 == temp_y:
                    if temp_x < int(float(attributDic['x0'])):
                        # Current element lies to the right: append it.
                        temp_text += elementText
                        print('nested if:', temp_text)
                        newlineexmfile.write("\n" + temp_text)
                    else:
                        # Current element lies to the left (or same x): prepend it.
                        temp_text = elementText.rstrip() + temp_text
                        newlineexmfile.write("\n" + temp_text)
                        print('nest else:', temp_text)
                else:
                    # Different y0: start a new row with this element alone.
                    temp_text2 = elementText.lstrip()
                    print('else:', temp_text2)
                    newlineexmfile.write("\n" + temp_text2)

                # NOTE(review): the nesting here was reconstructed from a copy
                # of the script that had lost its indentation. It is assumed
                # that every text-bearing element gets a CSV row -- confirm.
                x0 = int(float(attributDic['x0']))
                x1 = int(float(attributDic['x1']))
                y1 = int(float(attributDic['y1']))
                height = int(float(attributDic['height']))
                width = int(float(attributDic['width']))

                # Remember this element so the next one can be compared to it.
                # NOTE(review): this overwrites the joined text built above, so
                # a "row" never spans more than two elements -- confirm intent.
                temp_text = elementText.strip()
                temp_y = y0
                temp_x = x0

                csvwriter.writerow({
                    'S.No': row_number,
                    'Text': elementText.strip(),
                    'x0': x0,
                    'y0': y0,
                    'x1': x1,
                    'y1': y1,
                    'Page_No': page_number,
                    'height': height,
                    'width': width,
                })
                exmfile.write("\n" + elementText)
                row_number += 1
# Load the CSV produced by the extraction step of this script into a DataFrame
# for inspection.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 is the documented replacement and keeps 'S.No' as
# the index. from_csv's parse_dates=True default is intentionally not carried
# over because the index is an integer serial number, not a date.
extractedTextCsvDF = pd.read_csv("outCsv.csv", index_col=0, encoding="utf-8")
print(extractedTextCsvDF)
print("first extracgted------------------------------------------------------")
# Rows belonging to the first page only.
print(extractedTextCsvDF.loc[extractedTextCsvDF['Page_No'] == 1])
# Sort order: x0 ascending, y0 descending, Page_No ascending, x1 descending.
a = extractedTextCsvDF.sort_values(
    ['x0', 'y0', 'Page_No', 'x1'],
    ascending=[True, False, True, False],
)
# To persist the sorted view: a.to_csv("sortedCsv.csv", sep=',', encoding="utf-8")
print('sorted', a)
print("Done")