forked from openpaperwork/pyocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_test_data.sh
executable file
·190 lines (157 loc) · 3.78 KB
/
update_test_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/bin/sh
# Run the tesseract command-line tool on a single image.
#
# Arguments:
#   $1   - path of the input image
#   $2   - output base path handed to tesseract
#   $3   - language code; when empty, no "-l" option is passed
#   $4.. - extra arguments appended verbatim after the language
# Outputs:
#   Prints the command line being run, then "FAILED !" when tesseract exits
#   with a non-zero status. tesseract's own stdout and stderr are discarded.
run_tess() {
  img="$1"
  shift
  out="$1"
  shift
  lang="$1"
  shift

  # Rebuild the positional parameters as the exact argument vector so that
  # every argument stays one word, even with spaces or glob characters in it.
  if [ -n "${lang}" ]; then
    set -- "${img}" "${out}" -l "${lang}" "$@"
  else
    set -- "${img}" "${out}" "$@"
  fi

  echo tesseract "$@"
  if ! tesseract "$@" > /dev/null 2>&1; then
    echo "FAILED !"
  fi
}
# Regenerate the tesseract (command-line) reference outputs for every image
# found in input/<type>/.
#
# Arguments:
#   $1 - name of the sub-directory under input/ and output/ to process
# For each image, run_tess is invoked three times (plain, "batch.nochop
# makebox", and "hocr"); the resulting .hocr file is then renamed to .words
# and copied to .lines. Results go to output/<type>/tesseract/.
run_tess_all() {
  type="$1"
  mkdir -p "output/${type}/tesseract"

  for input in "input/${type}"/*; do
    # When the directory is empty the glob stays literal; skip that entry.
    [ -e "${input}" ] || continue

    # Strip the directory, then only a trailing .jpg / .png extension.
    output=${input##*/}
    output=${output%.jpg}
    output=${output%.png}

    # Pick the language / extra config from keywords in the file name.
    lang=eng
    extra_config=""
    case "${output}" in
      *digit*) extra_config="digits" ;; # language stays eng on purpose
      *french*) lang=fra ;;
      *japanese*) lang=jpn ;;
    esac

    base="output/${type}/tesseract/${output}"

    # ${var:+"..."} drops the argument entirely when extra_config is empty.
    run_tess "${input}" "${base}" "${lang}" \
      ${extra_config:+"${extra_config}"}
    run_tess "${input}" "${base}" "${lang}" \
      batch.nochop makebox ${extra_config:+"${extra_config}"}
    run_tess "${input}" "${base}" "${lang}" \
      hocr ${extra_config:+"${extra_config}"}

    mv "${base}.hocr" "${base}.words"
    cp "${base}.words" "${base}.lines"
  done
}
# Run OCR on one image through pyocr's libtesseract binding and write the
# result with the requested builder.
#
# Arguments:
#   $1 - path of the input image
#   $2 - path of the file to write
#   $3 - language code passed as lang= to image_to_string
#   $4 - name of a class in pyocr.builders (e.g. TextBuilder)
# Outputs:
#   Prints "<img> --> <out> (<lang> / <builder>)" before running Python.
run_tess_api() {
  img="$1"
  shift
  out="$1"
  shift
  lang="$1"
  shift
  builder="$1"
  shift

  echo "${img} --> ${out} (${lang} / ${builder})"

  # The values are handed over as argv rather than pasted into the Python
  # source, so quotes or backslashes in a path cannot alter the program.
  # The delimiter is quoted so the shell leaves the script body untouched.
  python3 - "${img}" "${out}" "${lang}" "${builder}" << 'EOF'
import sys

from PIL import Image
from pyocr import libtesseract
from pyocr import builders

img_path, out_path, lang, builder_name = sys.argv[1:5]

img = Image.open(img_path)
builder = getattr(builders, builder_name)()
out = libtesseract.image_to_string(img, lang=lang, builder=builder)
with open(out_path, "w") as fd:
    builder.write_file(fd, out)
EOF
}
# Regenerate the libtesseract (C-API) reference outputs for every image
# found in input/<type>/.
#
# Arguments:
#   $1 - name of the sub-directory under input/ and output/ to process
# For each image, three files are written to output/<type>/libtesseract/:
# <name>.txt, <name>.words and <name>.lines.
run_tess_api_all() {
  type="$1"
  mkdir -p "output/${type}/libtesseract"

  for input in "input/${type}"/*; do
    # When the directory is empty the glob stays literal; skip that entry.
    [ -e "${input}" ] || continue

    # Strip the directory, then only a trailing .jpg / .png extension.
    output=${input##*/}
    output=${output%.jpg}
    output=${output%.png}

    lang=eng
    txt_builder=TextBuilder
    lines_builder=LineBoxBuilder
    case "${output}" in
      *digit*)
        # Digit images use the digit builders for .txt and .lines. These
        # used to be written and then immediately overwritten by the generic
        # TextBuilder / LineBoxBuilder runs on the same paths.
        txt_builder=DigitBuilder
        lines_builder=DigitLineBoxBuilder
        ;;
      *french*) lang=fra ;;
      *japanese*) lang=jpn ;;
    esac

    base="output/${type}/libtesseract/${output}"

    run_tess_api "${input}" "${base}.txt" "${lang}" "${txt_builder}"
    run_tess_api "${input}" "${base}.words" "${lang}" WordBoxBuilder
    run_tess_api "${input}" "${base}.lines" "${lang}" "${lines_builder}"
  done
}
# Run the cuneiform command-line tool on a single image.
#
# Arguments:
#   $1   - path of the input image
#   $2   - path of the output file (passed with -o)
#   $3   - language code; when empty, no "-l" option is passed
#   $4.. - extra options inserted before "-o" (e.g. -f text)
# Outputs:
#   Prints the command line being run, then "FAILED !" when cuneiform exits
#   with a non-zero status. Only cuneiform's stdout is discarded; its stderr
#   is left visible.
run_cuneiform() {
  img="$1"
  shift
  out="$1"
  shift
  lang="$1"
  shift

  # Assemble the argument vector in the positional parameters so that each
  # argument stays one word, even with spaces or glob characters in it.
  if [ -n "${lang}" ]; then
    set -- -l "${lang}" "$@"
  fi
  set -- "$@" -o "${out}" "${img}"

  echo cuneiform "$@"
  if ! cuneiform "$@" > /dev/null; then
    echo "FAILED !"
  fi
}
# Regenerate the cuneiform reference outputs for every image found in
# input/<type>/.
#
# Arguments:
#   $1 - name of the sub-directory under input/ and output/ to process
# For each image, run_cuneiform is invoked three times, writing <name>.txt
# (-f text), <name>.words and <name>.lines (both -f hocr) into
# output/<type>/cuneiform/. Images whose name contains "japanese" are
# skipped.
run_cuneiform_all() {
  type="$1"
  mkdir -p "output/${type}/cuneiform"

  for input in "input/${type}"/*; do
    # When the directory is empty the glob stays literal; skip that entry.
    [ -e "${input}" ] || continue

    # Strip the directory, then only a trailing .jpg / .png extension.
    output=${input##*/}
    output=${output%.jpg}
    output=${output%.png}

    lang=eng
    case "${output}" in
      *french*) lang=fra ;;
      *japanese*) continue ;; # skip japanese for now
    esac

    base="output/${type}/cuneiform/${output}"

    run_cuneiform "${input}" "${base}.txt" "${lang}" -f text
    run_cuneiform "${input}" "${base}.words" "${lang}" -f hocr
    run_cuneiform "${input}" "${base}.lines" "${lang}" -f hocr
  done
}
# Entry point: regenerate all reference outputs under tests/output/ for the
# "real" and "specific" input sets, one OCR backend after the other.
# All paths used by the functions above are relative to tests/, so stop
# right away if that directory cannot be entered instead of writing output
# trees into the wrong place.
cd tests || {
  echo "cannot enter directory 'tests'" >&2
  exit 1
}

echo "=== Tesseract sh ==="
run_tess_all real
run_tess_all specific

echo "=== Tesseract C-api ==="
run_tess_api_all real
run_tess_api_all specific

echo "=== Cuneiform ==="
run_cuneiform_all real
run_cuneiform_all specific