-
Notifications
You must be signed in to change notification settings - Fork 0
/
transformations.py
75 lines (60 loc) · 1.97 KB
/
transformations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# This file contains the various transformations you can use
from markdownify import markdownify as md
import markdown
from bs4 import BeautifulSoup
def transformation_token_split(b, N=2000):
res = []
tokens = b.split()
for i in range(0, len(tokens), N):
res.append(" ".join(tokens[i : i + N]))
return res
def transformation_clean_epub(b):
# Convert the raw block of epub html to markdown
out = md(b, heading_style="ATX")
out = out.replace("xml version='1.0' encoding='utf-8'?", "")
out = out.replace("\n\n", "\n")
# Use markdown to conver the markdown to back html
out = markdown.markdown(out)
# Read into beautiful soup
soup = BeautifulSoup(out, "html.parser")
# Prettiy print the html
out = soup.prettify(formatter="html")
return out
#
# Split an HTML into blocks based on the h1 and h2 tags
#
def transformation_html_heading_split(b, splits):
# Construct a BeautifulSoup out of the raw HTML
soup = BeautifulSoup(b, "html.parser")
blocks = []
s = ""
tag = soup.find("h1") # Find the first tag
while tag:
if tag.name is not None:
s += repr(tag).replace("\n", " ")
else:
s += "\n\n"
tag = tag.nextSibling
if tag is not None and tag.name in splits:
blocks.append(s.replace("'\n'", "\n"))
s = ""
return blocks
def transformation_html2md(b):
out = md(b, heading_style="ATX")
out = out.replace("xml version='1.0' encoding='utf-8'?", "")
out = out.replace("\n\n", "\n")
return out
def transformation_html2txt(b):
soup = BeautifulSoup(b, "html.parser")
return soup.prettify()
def transformation_newline_split(b):
return b.split("\n")
def transformation_sentence_split(b):
out = b.split(".")
# Remove empty strings
out = list(filter(None, out))
# Trim the strings
out = [x.strip() for x in out]
# Add period to the end of each sentence
out = [x + "." for x in out]
return out