-
Notifications
You must be signed in to change notification settings - Fork 0
/
xhtml_from_hocr.py
executable file
·112 lines (93 loc) · 3.31 KB
/
xhtml_from_hocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
import os as OS
from html.parser import HTMLParser
from roman_numeral.roman_numeral import roman_from_int
import html
class HtmlFromHocr(HTMLParser):
    """Write a simplified XHTML rendering of hOCR markup to stdout.

    Tags listed in ``skipped_tags`` are dropped (their text content is still
    processed); every other tag is re-emitted without its attributes. Text is
    HTML-escaped, and a chunk ending in "-" is joined to the next chunk.

    The handlers read the module-level globals ``is_new_page`` and
    ``page_num``. Both must be assigned before any markup is fed to the
    parser; ``handle_starttag`` resets ``is_new_page`` after emitting the
    page-break anchor.
    """

    # Tags whose opening and closing markers are never written to the output.
    skipped_tags = ["span", "div", "html", "head", "title", "meta", "body"]

    def handle_starttag(self, tag, attrs):
        """Emit ``<tag>`` and, once per page, a page-break anchor after it.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs; intentionally ignored.
        """
        global is_new_page
        global page_num
        if tag in self.skipped_tags:
            return
        # Attributes are deliberately not copied to the output.
        print(f"<{tag}>", end="")
        if is_new_page:
            # The anchor goes inside the first emitted element of the page.
            print(
                f'<a title="{page_num}" id="p{page_num}" epub:type="pagebreak"></a>',
                end="",
            )
            is_new_page = False

    def handle_endtag(self, tag):
        """Emit ``</tag>`` on its own line; add extra blank lines after ``</p>``."""
        if tag in self.skipped_tags:
            return
        print(f"</{tag}>")
        if tag == "p":
            # print("\n") writes two newline characters (argument + line end).
            print("\n")

    def handle_data(self, data):
        """Emit escaped text, joining chunks that end with a hyphen.

        Empty chunks and chunks containing a line break are skipped. A chunk
        ending in "-" is printed without the hyphen and without a trailing
        space so that the next chunk continues the same word; any other chunk
        is followed by a single space.
        """
        # Guard against empty text: indexing the last character would fail.
        if not data or "\n" in data or "\r" in data:
            return
        if data.endswith("-"):
            print(html.escape(data[:-1]), end="")
        else:
            print(f"{html.escape(data)} ", end="")
def split_prefix_num(name: str):
    """Split *name* into its non-numeric prefix and trailing integer.

    Args:
        name: A file name stem such as ``"page001"``.

    Returns:
        A ``(prefix, number)`` tuple. ``number`` is the integer value of the
        run of decimal digits at the end of *name*, or ``0`` when *name* does
        not end in a digit (in which case ``prefix`` is *name* unchanged).

    >>> split_prefix_num("page001")
    ('page', 1)
    >>> split_prefix_num("123")
    ('', 123)
    >>> split_prefix_num("cover")
    ('cover', 0)
    """
    # Count trailing decimal digits. isdecimal() (not isdigit()) is used so
    # that only characters int() can actually parse are treated as digits.
    digit_count = 0
    for ch in reversed(name):
        if not ch.isdecimal():
            break
        digit_count += 1

    if digit_count == 0:
        return name, 0
    # Slicing by the digit count is also correct when the whole name is
    # numeric, where the prefix must be the empty string.
    split_at = len(name) - digit_count
    return name[:split_at], int(name[split_at:])
def parse_file(file_path, args, parser):
    """Feed one hOCR file to *parser* after computing its page label.

    Sets the module-level globals ``is_new_page`` (to ``True``) and
    ``page_num`` before feeding. The page label is derived from the file
    name stem: when the stem's prefix equals ``args.prefix`` the label is the
    trailing number plus ``args.offset``; otherwise it is the lower-cased
    roman numeral of the trailing number plus ``args.foffset``.

    Args:
        file_path: Path of the hOCR file to read.
        args: Namespace providing ``prefix``, ``offset`` and ``foffset``.
        parser: Object with a ``feed(str)`` method that receives the file
            contents.
    """
    global is_new_page
    global page_num
    is_new_page = True
    name: str = OS.path.splitext(OS.path.basename(file_path))[0]
    file_prefix, number = split_prefix_num(name)
    if file_prefix == args.prefix:
        page_num = number + args.offset
    else:
        # NOTE(review): a stem without trailing digits yields 0 here; confirm
        # how roman_from_int treats values < 1.
        page_num = roman_from_int(number + args.foffset).lower()
    # Read explicitly as UTF-8 instead of the locale default; the generated
    # document declares itself as UTF-8.
    with open(file_path, "r", encoding="utf-8") as file:
        contents = file.read()
    parser.feed(contents)
def print_out(args):
    """Print the complete XHTML document for all input files to stdout.

    Writes the XML declaration and document head, feeds every path in
    ``args.file_path`` through a single ``HtmlFromHocr`` parser, then writes
    the closing tags.

    Args:
        args: Namespace providing ``language`` (str), ``title`` (str or
            ``None``), ``file_path`` (iterable of paths) and the attributes
            required by ``parse_file``.
    """
    parser = HtmlFromHocr()
    # Escape interpolated values so characters such as "&" or '"' cannot
    # break the markup; a missing title becomes an empty <title> element
    # rather than the literal text "None".
    language = html.escape(args.language)
    title = html.escape(args.title or "")
    print(
        "<?xml version='1.0' encoding='utf-8'?>\n"
        f'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="{language}" xml:lang="{language}">\n'
        "<head>\n"
        f" <title>{title}</title>\n"
        "</head>\n\n"
        "<body>\n"
    )
    for path in args.file_path:
        parse_file(path, args, parser)
    # close() processes any text still buffered in the parser, so it must
    # run before the document is terminated.
    parser.close()
    print("</body>\n</html>\n")
def main(args):
    """Generate the document, writing to ``args.output`` when it is set.

    Args:
        args: Parsed command-line namespace. ``args.output`` is either a
            writable file object or a falsy value, in which case the
            document goes to the current stdout.
    """
    if not args.output:
        print_out(args)
        return

    from contextlib import redirect_stdout

    # redirect_stdout restores the previous sys.stdout on exit, and the
    # outer context manager closes the output file afterwards, so stdout is
    # never left bound to a closed file.
    with args.output as out_file, redirect_stdout(out_file):
        print_out(args)
# Command-line entry point: parse the options and run the conversion.
if __name__ == "__main__":
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "file_path",
        nargs="*",
        type=str,
        help="File(s) to convert.",
    )
    argparser.add_argument(
        "-F",
        "--foffset",
        type=int,
        default=0,
        help="Same as -O, but for front matter.",
    )
    argparser.add_argument(
        "-l",
        "--language",
        type=str,
        default="en",
        help="Set the LCID for the document. Defaults to 'en'.",
    )
    argparser.add_argument(
        "-O",
        "--offset",
        type=int,
        default=0,
        help="Offset the page number of main body matter by this amount. Useful if the filename does not match the page number. For example, if page001.jpg contains page 3, set this to 2. Can be negative.",
    )
    # The generated document declares encoding='utf-8', so the output file
    # is opened as UTF-8 rather than with the locale default encoding.
    argparser.add_argument(
        "-o",
        "--output",
        type=argparse.FileType("w", encoding="utf-8"),
        help="Output file. Defaults to stdout.",
    )
    argparser.add_argument(
        "-p",
        "--prefix",
        type=str,
        default="page",
        help="Specify the filename prefix. Defaults to 'page'. Filenames with this prefix are considered main body matter. All other files are considered front matter and will be numbered using roman numerals.",
    )
    argparser.add_argument(
        "-t",
        "--title",
        type=str,
        help="Set the title of the ebook.",
    )
    args = argparser.parse_args()
    main(args)