-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.py
122 lines (114 loc) · 3.3 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import argparse
import csv
import os

import cv2
import pytesseract

from get_stations import get_station_names
# Tesseract options: restrict recognition to digits and '.', page segmentation
# mode 6, legacy engine (--oem 0).
OCR_CONFIG = "-c tessedit_char_whitelist=0123456789. -c tosp_min_sane_kn_sp=5 --oem 0 --psm 6"

# A table is complete once this many non-blank rows have been collected.
ROWS_PER_TABLE = 33

# Within a table, the first kept cell of these rows is overwritten with a label.
ROW_LABELS = {32: "Sum", 33: "Mean"}

# Fallback number of lines skipped when no table-start marker can be found.
# NOTE(review): presumably the usual height of the page/table header - confirm.
HEADER_LINES = 7

# First token of a line -> how many lines earlier the table is taken to start.
# NOTE(review): looks like a recovery heuristic for a misread first row number
# ("2" means the table began on the previous line, and so on) - confirm.
FIRST_TABLE_OFFSETS = {"1": 0, "2": 1, "3": 2}
NEXT_TABLE_OFFSETS = {"1": 0, "2": 1, "3": 2, "4": 3}


def _seek_table_start(content, start, back_offsets):
    """Scan forward from ``start`` for a line whose first token is a marker.

    Args:
        content: OCR output split into lines.
        start: index of the first line to inspect.
        back_offsets: maps a marker token to the number of lines to step back
            from the line it was found on.

    Returns:
        ``(index, skipped)``. ``index`` is the line at which parsing should
        resume, or ``None`` when no marker line exists. ``skipped`` is the
        number of non-blank, non-marker lines that were passed over.
    """
    skipped = 0
    for pos in range(start, len(content)):
        tokens = content[pos].split()
        if not tokens:
            # Empty and whitespace-only lines are ignored and not counted.
            continue
        if tokens[0] in back_offsets:
            return pos - back_offsets[tokens[0]], skipped
        skipped += 1
    return None, skipped


def _format_row(tokens, row_number):
    """Turn the tokens of one OCR line into a comma-separated row string.

    Tokens that are a lone "." are dropped, every "-" is removed from the
    remaining tokens, and for the rows listed in ``ROW_LABELS`` the first kept
    cell is replaced by the row label.
    """
    label = ROW_LABELS.get(row_number)
    cells = []
    for token in tokens:
        if token == ".":
            continue
        cell = token.replace("-", "")
        if label is not None and not cells:
            cell = label
        cells.append(cell)
    return ",".join(cells)


def parse_tables(content):
    """Group OCR text lines into tables of ``ROWS_PER_TABLE`` rows.

    Args:
        content: OCR output split into lines.

    Returns:
        A list of tables; each table is a list of ``ROWS_PER_TABLE``
        comma-separated row strings. A trailing table with fewer rows is
        discarded.
    """
    tables = []

    index, _ = _seek_table_start(content, 0, FIRST_TABLE_OFFSETS)
    if index is None:
        # No marker anywhere: assume the data starts after the header.
        index = HEADER_LINES
    if index < 0:
        # The marker implies a start before the first line; nothing can be
        # parsed (a negative index would silently wrap to the end of the list).
        return tables

    rows = []
    while index < len(content):
        tokens = content[index].split()
        index += 1
        if not tokens:
            # Blank lines never count as table rows.
            continue
        rows.append(_format_row(tokens, len(rows) + 1))
        if len(rows) < ROWS_PER_TABLE:
            continue

        tables.append(rows)
        rows = []
        found, skipped = _seek_table_start(content, index, NEXT_TABLE_OFFSETS)
        if found is None:
            # No further marker: rewind over the lines just scanned and skip
            # a header's worth of lines instead.
            index = len(content) - skipped + HEADER_LINES
        else:
            index = found
    return tables


def page_label(image_path):
    """Return the image file name without directory and extension.

    Falls back to "Page N/A" when the path has no usable file name.
    """
    stem = os.path.splitext(os.path.basename(image_path))[0]
    return stem or "Page N/A"


def write_results(csv_name, label, tables, stations):
    """Append the page label and every (station, table) pair to the CSV.

    The file is ``results/<csv_name>.csv``; the ``results`` directory is
    created if missing. Tables and stations are paired positionally and the
    longer of the two is truncated.
    """
    os.makedirs("results", exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open("results/" + csv_name + ".csv", mode="a", newline="") as ocr_out:
        writer = csv.writer(ocr_out, delimiter=",", quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow([label])
        writer.writerow(["\n"])
        for rows, station in zip(tables, stations):
            writer.writerow(station.split(","))
            for row in rows:
                writer.writerow(row.split(","))
            writer.writerow(["\n"])


def build_arg_parser():
    """Create the command-line parser for the OCR script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True,
                    help="path to input image to be OCR")
    ap.add_argument("-f", "--filename", required=False,
                    help="name of the csv file (defaults to the image's base name)")
    # The script has always read args["output"]; declaring the option keeps
    # that lookup from raising KeyError.
    # NOTE(review): confirm what get_station_names expects here when omitted.
    ap.add_argument("-o", "--output", required=False,
                    help="value forwarded to get_station_names")
    return ap


def main():
    """Run OCR on the input image and append the parsed tables to a CSV."""
    args = vars(build_arg_parser().parse_args())
    image_path = args["image"]

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise SystemExit("could not read image: " + image_path)

    ocr_text = pytesseract.image_to_string(image, config=OCR_CONFIG)
    data = parse_tables(ocr_text.split("\n"))

    stations_list = get_station_names(image_path, args["output"], data)

    label = page_label(image_path)
    write_results(args["filename"] or label, label, data, stations_list)


if __name__ == "__main__":
    main()