forked from allanlepp/te_rss
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsers_html.py
302 lines (218 loc) · 10.8 KB
/
parsers_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
"""
Assorted HTML parsers and helper functions.
"""
import re
from html import unescape
from lxml import html # sudo apt install python3-lxml
from lxml.html.clean import Cleaner
import rss_config
import rss_disk
import rss_print
def html_clean(htmlString):
    """
    Run an HTML string through lxml's ``Cleaner`` to drop unneeded content.

    The cleaner is used with its ``comments``, ``javascript``, ``scripts`` and
    ``style`` options switched on; the result of ``clean_html`` is returned.
    """
    sanitizer = Cleaner()
    # The options are assigned as attributes after construction rather than
    # being passed to the constructor.
    for option in ("comments", "javascript", "scripts", "style"):
        setattr(sanitizer, option, True)
    return sanitizer.clean_html(htmlString)
def html_change_short_urls(htmlPageString, curDomainShort):
    """
    Expand shortened URLs in double-quoted ``src`` and ``href`` attributes.

    For ``src`` first and ``href`` second, three plain-text replacements run
    in this order:

    * ``//host/...`` (protocol-relative) gets an ``http:`` scheme;
    * ``./path`` is prefixed with ``curDomainShort``;
    * ``/path`` is prefixed with ``curDomainShort``.

    Returns the rewritten page string.
    """
    for attribute in ("src", "href"):
        opener = attribute + '="'
        rewrites = (
            (opener + "//", opener + "http://"),
            (opener + "./", opener + curDomainShort + "/"),
            (opener + "/", opener + curDomainShort + "/"),
        )
        # Order matters: the protocol-relative form must be handled before
        # the bare leading-slash form.
        for needle, replacement in rewrites:
            htmlPageString = htmlPageString.replace(needle, replacement)
    return htmlPageString
def html_first_node(htmlString):
    """
    Cut the string down to its first element, judged by plain text matching.

    The opening tag name is taken from the start of the string (everything
    before the first space or ``>``), and the matching closing tag is derived
    from it. The string is split on that closing tag; the number of times the
    opening tag text occurs in the first piece decides how many pieces are
    glued back together before a final closing tag is appended.
    """
    startTag = htmlString.split(" ")[0].split(">")[0]
    endTag = startTag.replace("<", "</") + ">"

    pieces = htmlString.split(endTag)
    openCount = pieces[0].count(startTag)
    summary = "esimesest splitist leiti " + str(openCount) + " esimest tagi '" + startTag + "'"

    if openCount == 1:
        # No nested element of the same name before the first closing tag.
        rss_print.print_debug(__file__, summary, 2)
        return pieces[0] + endTag

    # Same-named opening tags seen in the first piece: keep that many pieces.
    rss_print.print_debug(__file__, summary + ": " + str(pieces[0]), 1)
    return endTag.join(pieces[0:openCount]) + endTag
def html_page_cleanup(htmlString):
    """
    Strip styles, comments, scripts and tracking parameters from an HTML string.

    The clean-up is purely textual (regular expressions and ``str.replace``)
    and runs in a fixed order: style blocks, comments, ``onclick`` attributes,
    script blocks, tracking query parameters, link normalisation, and finally
    whitespace and ``<br>`` normalisation.

    A falsy input is logged and returned unchanged.

    Returns the cleaned string with every whitespace run collapsed to a
    single space.
    """
    if not htmlString:
        rss_print.print_debug(__file__, "tühi sisend html string: '" + htmlString + "'", 1)
        return htmlString

    rss_print.print_debug(__file__, "puhastame html stringi üleliigsest jamast", 3)

    # remove styles
    htmlString = re.sub(r"<style[\s\S]*?<\/style>", "", htmlString)

    # remove comments
    htmlString = re.sub(r"<!--[\s\S]*?-->", "", htmlString)

    # remove scripts from links
    htmlString = re.sub(r' onclick=(\")[\s\S]*?(\")', "", htmlString)
    htmlString = re.sub(r" onclick=(')[\s\S]*?(')", "", htmlString)

    # remove scripts
    htmlString = re.sub(r"<script[\s\S]*?<\/script>", "", htmlString)

    # remove trackers from links
    # Decode entity-encoded ampersands first; the tracker patterns below match
    # a literal "&", so without this step "&amp;_x=..." would slip through.
    htmlString = htmlString.replace("&amp;", "&")
    htmlString = re.sub(r'(&|\?)_[0-9A-Za-z_-]*', "", htmlString)  # delfi
    htmlString = re.sub(r'_ga=[0-9.-]*', "", htmlString)  # _ga=2.22935807.513285745.1595741966-250801514.1594127878
    htmlString = re.sub(r'fbclid=[0-9A-Za-z-_]*', "", htmlString)
    htmlString = re.sub(r'gclid=[0-9A-Za-z-_]*', "", htmlString)
    htmlString = re.sub(r'refid=[0-9A-Za-z=.%_-]*', "", htmlString)
    htmlString = re.sub(r'utm_source=[0-9A-Za-z-_&=.]*', "", htmlString)

    # fix link without trackers
    htmlString = htmlString.replace("?&", "?")

    # fix site links
    htmlString = htmlString.replace('href="http://', 'href="https://')
    htmlString = htmlString.replace("https://twitter.com", "https://mobile.twitter.com")
    htmlString = htmlString.replace("https://facebook.com", "https://m.facebook.com")

    # remove whitespace runs that sit directly in front of a tag
    htmlString = re.sub(r"\s\s+(?=<)", "", htmlString)

    # remove leftover escape sequences; these literals are the two-character
    # texts backslash+n, backslash+r and backslash+t, not real control characters
    htmlString = htmlString.replace('\\n', " ")
    htmlString = htmlString.replace('\\r', " ")
    htmlString = htmlString.replace('\\t', " ")

    # br - required, otherwise xpath cannot resolve its paths
    htmlString = htmlString.replace("<br/>", "<br>")
    htmlString = htmlString.replace(" <br>", "<br>")
    htmlString = htmlString.replace("<br> ", "<br>")
    htmlString = htmlString.replace("<br><br>", "<br>")

    # collapse every remaining whitespace run to one space and trim the ends
    htmlString = " ".join(htmlString.split())

    return htmlString
def html_remove_single_parents(htmlString):
    """
    Repeatedly peel off a sole wrapping element from an HTML string.

    Each pass strips the string and checks a series of stop conditions in a
    fixed order; the first one that applies is logged and the current string
    is returned. If none applies, the string is replaced by the result of
    ``html_string_children`` and the loop repeats. An empty result from that
    call is logged and returned.
    """
    depth = 0
    while True:
        htmlString = htmlString.strip()

        # Pick the first stop condition that applies. The order is
        # significant: the parent-node count (which parses the string) is
        # evaluated only when every cheaper check has passed.
        stopMessage = None
        stopLevel = 4
        if not htmlString:
            stopMessage = "katkestame, tühi sisend: '"
            stopLevel = 0
        elif not htmlString.startswith("<"):
            stopMessage = "katkestame, algus pole tag: '"
        elif not htmlString.endswith(">"):
            stopMessage = "katkestame, lõpp pole tag: '"
        elif "</" not in htmlString:
            stopMessage = "katkestame, puudub lõpptag: '"
        elif htmlString.startswith("<a "):
            stopMessage = "katkestame, algus tag on a: '"
        elif htmlString.startswith("<i>"):
            stopMessage = "katkestame, algus tag on i: '"
        elif htmlString.startswith("<b>"):
            stopMessage = "katkestame, algus tag on b: '"
        elif len(htmlString) <= 7:  # <p></p>
            stopMessage = "katkestame, liiga lühike: '"
        elif html_string_count_parent_nodes(htmlString, "html_remove_single_parents") != 1:
            stopMessage = "katkestame, mitu parent node-i: '"

        if stopMessage is not None:
            rss_print.print_debug(__file__, stopMessage + htmlString + "'", stopLevel)
            return htmlString

        # so far so good: ask for the child candidate
        depth += 1
        htmlString = html_string_children(htmlString)

        # an empty child ends the loop
        if not htmlString:
            rss_print.print_debug(__file__, "child[" + str(depth) + "] hankimise lõpptulemus on tühjus: '" + htmlString + "'", 3)
            return htmlString

        rss_print.print_debug(__file__, "child[" + str(depth) + "] hankimise vahetulemus: '" + htmlString + "'", 4)
def html_string_children(htmlString):
    """
    Return the text between the outermost opening and closing tags.

    The slice runs from just after the first ``>`` to just before the last
    ``</`` and is stripped of surrounding whitespace.

    A non-string input is logged and yields ``""``. The input is logged and
    returned unchanged when it does not start with ``<``, does not end with
    ``>``, contains no ``</``, or is at most 7 characters long. An empty
    string falls under the "does not start with ``<``" case.
    """
    if not isinstance(htmlString, str):
        rss_print.print_debug(__file__, "sisend pole string, tagastame tühjuse", 0)
        return ""

    # startswith/endswith instead of indexing, so that an empty string is
    # handled like any other non-tag input rather than raising IndexError.
    if not htmlString.startswith("<"):
        rss_print.print_debug(__file__, "katkestame, algus pole tag: '" + htmlString + "'", 4)
        return htmlString

    if not htmlString.endswith(">"):
        rss_print.print_debug(__file__, "katkestame, lõpp pole tag: '" + htmlString + "'", 4)
        return htmlString

    if "</" not in htmlString:
        rss_print.print_debug(__file__, "sisendis pole child elementi, tagastame sisendi", 0)
        return htmlString

    if len(htmlString) <= 7:  # <b></b>
        rss_print.print_debug(__file__, "liiga lühike, tagastame sisendi: '" + htmlString + "'", 0)
        return htmlString

    tagOpening = htmlString.find(">") + 1
    tagClosing = htmlString.rfind("</")

    # cut the inner part out of the string
    htmlString = htmlString[tagOpening:tagClosing]
    htmlString = htmlString.strip()

    return htmlString
def html_object_count_parent_nodes(htmlTree):
    """
    Count the direct children of ``/html/body`` in the given tree.

    Evaluates the XPath expression ``count(/html/body/*)`` on ``htmlTree``
    and returns the result converted to ``int``.
    """
    return int(htmlTree.xpath('count(/html/body/*)'))
def html_string_count_parent_nodes(htmlString, caller):
    """
    Count the top-level elements an HTML string has once parsed.

    Returns 0 for an empty or whitespace-only string. Otherwise the stripped
    string is turned into a document tree via
    ``html_tree_from_document_string`` (``caller`` is passed through to it)
    and the direct children of its body are counted.
    """
    stripped = htmlString.strip()
    if stripped:
        # build the object tree, then count its top-level elements
        documentTree = html_tree_from_document_string(stripped, caller)
        return html_object_count_parent_nodes(documentTree)
    return 0
def html_to_string(htmlNode, prettyPrint=False):
    """
    Serialize an lxml HTML node to text with HTML entities unescaped.

    ``prettyPrint`` is forwarded to ``html.tostring`` as ``pretty_print``.
    If ``htmlNode`` is already a string, that is logged and the string is
    returned unchanged (it is not unescaped).
    """
    if not isinstance(htmlNode, str):
        serialized = str(html.tostring(htmlNode, encoding="unicode", pretty_print=prettyPrint))
        decoded = unescape(serialized)
        rss_print.print_debug(__file__, "htmlString = " + decoded, 4)
        return decoded

    rss_print.print_debug(__file__, "sisend on juba string, tagastame sisendi", 0)
    rss_print.print_debug(__file__, "sisend on juba string: htmlNode = " + htmlNode, 3)
    return htmlNode
def html_tree_debug(htmlPageName, pageTree):
    """
    Write the body element of a tree to the debug location on disk.

    The tree is serialized pretty-printed with ``html_to_string``. The part
    from the opening ``<body`` tag up to the first ``</body>`` after it is
    kept. If the serialized text has no ``<body`` tag at all, the whole text
    is wrapped in ``<body>...</body>`` instead.

    The result is handed to ``rss_disk.write_file`` together with
    ``rss_config.PATH_FILENAME_DEBUG`` and ``htmlPageName``.
    """
    htmlPageString = html_to_string(pageTree, prettyPrint=True)

    # Accept both "<body ...attributes>" and a bare "<body>"; splitting on
    # "<body " alone raised IndexError for a body tag without attributes.
    bodyStart = re.search(r"<body[\s>]", htmlPageString)
    if bodyStart:
        htmlPageString = htmlPageString[bodyStart.start():]
        htmlPageString = htmlPageString.split("</body>")[0] + "</body>"
    else:
        htmlPageString = "<body>" + htmlPageString + "</body>"

    rss_disk.write_file(rss_config.PATH_FILENAME_DEBUG, htmlPageName, htmlPageString)
def html_tree_from_document_string(htmlString, caller):
    """
    Parse a string into a full HTML document tree (with a root html element).

    ``caller`` is only used in log messages.

    An empty or whitespace-only input is logged and replaced by a minimal
    placeholder document. Input that starts with the exact declaration
    ``<?xml version="1.0" encoding="utf-8"?>`` is encoded to UTF-8 bytes
    before parsing, because unicode strings carrying an encoding declaration
    are rejected by the parser.

    Returns the tree produced by ``html.document_fromstring``.

    Raises whatever the parser raises; the failure is logged first.
    """
    if caller:
        rss_print.print_debug(__file__, "asume looma html objekti kutsujale: " + str(caller), 4)

    htmlString = htmlString.strip()
    if not htmlString:
        rss_print.print_debug(__file__, "puudub html stringi sisu kutsujal: '" + str(caller) + "'", 1)
        htmlString = "<html><head></head></html>"

    if htmlString.startswith('<?xml version="1.0" encoding="utf-8"?>'):
        # Unicode input fails here with: "Unicode strings with encoding
        # declaration are not supported. Please use bytes input or XML
        # fragments without declaration." -- so hand over UTF-8 bytes.
        htmlStringUtf = htmlString.encode('utf-8')
        return html.document_fromstring(htmlStringUtf)

    try:
        return html.document_fromstring(htmlString)
    except Exception as e:
        rss_print.print_debug(__file__, "ei õnnestunud luua mitteutf-8 html objekti kutsujal: '" + str(caller) + "'", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        rss_print.print_debug(__file__, "ei õnnestunud luua mitteutf-8 html objekti stringist: '" + htmlString + "'", 3)
        # Re-raise the parser error; falling through used to hit an unbound
        # local and replace the real cause with UnboundLocalError.
        raise
def html_tree_from_string(htmlString, caller):
    """
    Parse a string into an HTML tree without forcing a root html element.

    ``caller`` is only used in log messages.

    An empty or whitespace-only input is logged, and parsing is still
    attempted on it.

    Returns the tree produced by ``html.fromstring``.

    Raises whatever the parser raises; the failure is logged first.
    """
    if caller:
        rss_print.print_debug(__file__, "asume looma html objekti kutsujale: " + str(caller), 4)

    htmlString = htmlString.strip()
    if not htmlString:
        rss_print.print_debug(__file__, "puudub html stringi sisu kutsujal: '" + str(caller) + "'", 0)

    try:
        return html.fromstring(htmlString)
    except Exception as e:
        rss_print.print_debug(__file__, "ei õnnestunud luua mitteutf-8 html objekti kutsujal: '" + str(caller) + "'", 0)
        rss_print.print_debug(__file__, "exception = '" + str(e) + "'", 1)
        rss_print.print_debug(__file__, "ei õnnestunud luua mitteutf-8 html objekti stringist: '" + htmlString + "'", 3)
        # Re-raise the parser error; falling through used to hit an unbound
        # local and replace the real cause with UnboundLocalError.
        raise