-
Notifications
You must be signed in to change notification settings - Fork 0
/
do-convert-single-file
executable file
·75 lines (68 loc) · 2.29 KB
/
do-convert-single-file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
#
# Converts the image stored in ./input.blob: writes a thumbnail, then a PDF
# (0.blob) with its metadata (0.json) and text (0.txt). "$1" is a JSON
# document describing the file (languageCode, wantOcr, filename, metadata).
set -e
#set -x

# Ask file(1) for the bare MIME type of the input (no "input.blob:" prefix).
MIME_TYPE=$(file --brief --mime-type input.blob)

# The thumbnail keeps the input's format: PNG in, PNG out; JPEG in, JPEG out.
if [[ "$MIME_TYPE" == "image/png" ]]; then
  THUMBNAIL_FILENAME=0-thumbnail.png
elif [[ "$MIME_TYPE" == "image/jpeg" ]]; then
  THUMBNAIL_FILENAME=0-thumbnail.jpg
else
  echo "Unhandled image type: $MIME_TYPE"
  # Deliberately exit 0: reporting the problem on stdout counts as a
  # successful run of this script.
  exit 0
fi
#######################################
# Render a thumbnail of input.blob into $THUMBNAIL_FILENAME.
# Globals:   THUMBNAIL_FILENAME (read)
# Arguments: none
# Outputs:   writes $THUMBNAIL_FILENAME; the profile probe may also leave
#            profile.icc in the working directory
# Returns:   exit status of the final convert invocation
#######################################
generate_thumbnail() {
  # Build the command as an array so every argument reaches convert exactly
  # as written, with no reliance on word-splitting of a flat string.
  local -a cmd=(convert input.blob)

  # Probe: try to write the image's color profile to profile.icc. Errors are
  # silenced on purpose; presumably this fails when the image carries no
  # embedded profile, in which case no color conversion is requested.
  if convert input.blob profile.icc 2>/dev/null; then
    # Convert to sRGB
    cmd+=(-profile profile.icc -profile /usr/share/color/icc/colord/sRGB.icc)
  fi

  # Remove transparency
  cmd+=(-background white -alpha remove -alpha off)

  # Strip other PNG chunks
  cmd+=(-define png:include-chunk=none)

  # Resize and strip color profile. The geometry is quoted so the trailing
  # '>' is passed to convert instead of being read as a redirection.
  cmd+=(-thumbnail '700x700>')

  # Output
  cmd+=("$THUMBNAIL_FILENAME")

  "${cmd[@]}"
}
# Produce the thumbnail first, before the language code is validated.
generate_thumbnail

# jq prints .languageCode as a JSON value, so a string keeps its surrounding
# double quotes; the patterns below therefore include the quotes. Each
# two-letter code maps to the three-letter code later passed to tesseract -l.
LANG_2LETTER_JSON=$(echo "$1" | jq '.languageCode')
case "$LANG_2LETTER_JSON" in
  \"ar\") LANG_3LETTER=ara ;;
  \"ca\") LANG_3LETTER=cat ;;
  \"cs\") LANG_3LETTER=ces ;;
  \"de\") LANG_3LETTER=deu ;;
  \"en\") LANG_3LETTER=eng ;;
  \"es\") LANG_3LETTER=spa ;;
  \"fr\") LANG_3LETTER=fra ;;
  \"it\") LANG_3LETTER=ita ;;
  \"nl\") LANG_3LETTER=nld ;;
  \"no\") LANG_3LETTER=nor ;;
  \"pt\") LANG_3LETTER=por ;;
  \"ru\") LANG_3LETTER=rus ;;
  \"sv\") LANG_3LETTER=swe ;;
  *)
    echo "Unhandled languageCode: $LANG_2LETTER_JSON"
    # Deliberately exit 0: reporting the problem on stdout counts as a
    # successful run of this script.
    exit 0
    ;;
esac
# Remove alpha channel, if it exists. Only PNG input is touched; mogrify
# rewrites input.blob in place, flattening transparency onto white.
# The options are given before the file name, matching mogrify's documented
# "mogrify [options] input-file" form, so they are in effect when the image
# is processed.
if [ "$(file --brief --mime-type input.blob)" = "image/png" ]; then
  mogrify -background white -alpha remove -alpha off input.blob
fi
# Emit the result triple 0.json / 0.txt / 0.blob, with or without OCR
# depending on the .wantOcr flag of the JSON passed as "$1".
if [[ "$(echo "$1" | jq '.wantOcr')" != 'true' ]]; then
  # No OCR requested: the text output is an empty file, the metadata is
  # passed through unchanged, and the image is simply wrapped in a PDF.
  touch 0.txt
  echo "$1" | jq '{ filename: .filename, languageCode: .languageCode, wantOcr: false, wantSplitByPage: false, contentType: "application/pdf", metadata: .metadata }' > 0.json
  img2pdf input.blob > 0.blob
else
  # OCR requested: the emitted metadata is additionally tagged with
  # isFromOcr, and wantOcr is reset to false in the output description.
  echo "$1" | jq '{ filename: .filename, languageCode: .languageCode, wantOcr: false, wantSplitByPage: false, contentType: "application/pdf", metadata: (.metadata + { "isFromOcr": true }) }' > 0.json
  # OMP_THREAD_LIMIT=1 turns off tesseract's multithreading: Kubernetes
  # expects this job to stay on a single CPU.
  # Output base name is "0"; the pdf and txt outputs are requested, and the
  # resulting 0.pdf is then renamed to 0.blob.
  OMP_THREAD_LIMIT=1 tesseract input.blob 0 -l "$LANG_3LETTER" --psm 1 quiet pdf txt
  mv 0.pdf 0.blob
fi