# hreflang_classes.py
import requests
from bs4 import BeautifulSoup
import urllib.robotparser as robotparser
import urllib.parse
import time
import re
import validators
import gzip


class page_check:
    # fetch all of the page data once in __init__ so it is available to every check
    def __init__(self, page):
        self.page = page
        self.parsed_uri = urllib.parse.urlparse(page)
        self.home_page = '{uri.scheme}://{uri.netloc}/'.format(uri=self.parsed_uri)
        self.headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
        self.r = requests.get(self.page, allow_redirects=False, headers=self.headers)
        self.soup = BeautifulSoup(self.r.content, "lxml")
    # returns (True, 200) for a 200 response, otherwise (False, status code)
    def check_status(self):
        if self.r.status_code == 200:
            return True, self.r.status_code
        else:
            return False, self.r.status_code
    # returns (True, ...) if there is no canonical or it matches the page, otherwise (False, canonical URL)
    def check_canonical(self):
        canonical_element = self.soup.find("link", {"rel": "canonical"})
        if canonical_element is not None:
            canonical_link = canonical_element["href"]
            if canonical_link == self.page:
                return True, canonical_link
            else:
                return False, canonical_link
        else:
            return True, "No Canonical"
    # ensures the page is not blocked by a robots meta tag, returns (Bool, directive)
    def check_robots(self):
        robots_element = self.soup.find("meta", {"name": "robots"})
        if robots_element is not None:
            robot_directive = robots_element["content"]
            blocks = ["no index", "noindex"]
            if any(x in robot_directive.lower() for x in blocks):
                return False, robot_directive
            else:
                return True, robot_directive
        else:
            return True, "No Robots"
    # ensures the page is not blocked by robots.txt; a little awkward because the robot parser (rp) is
    # initialized globally so robots.txt only has to be fetched once, hopefully a better solution later
    def check_txt(self):
        if rp.can_fetch("*", self.page):
            return True, "Not txt Blocked"
        else:
            return False, "txt Blocked"
    # calls the previous checks and returns True if the page is indexable,
    # otherwise the (False, reason) tuple of the first failing check
    def indexable(self):
        status = self.check_status()
        canonical = self.check_canonical()
        robots = self.check_robots()
        txt = self.check_txt()
        checks = [status, canonical, robots, txt]
        over = list(map(lambda x: x[0], checks))
        try:
            index = over.index(False)
            return checks[index]
        except ValueError:
            return True
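    # Illustrative return values (assumed examples): True when every check passes, or for instance
    # (False, 404) from check_status or (False, "noindex, nofollow") from check_robots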
    # takes a few link formats and returns an absolute URL; may have missed some formats, it prints a notice if so
    def validate_link(self, link):
        link_parts = urllib.parse.urlparse(link)
        if "#" in link:
            return False, "nope"
        elif self.parsed_uri.scheme == link_parts.scheme and self.parsed_uri.netloc == link_parts.netloc:
            return True, link
        elif link_parts.scheme == "" and link_parts.netloc == "" and link[0] == "/":
            return True, '{}://{}{}'.format(self.parsed_uri.scheme, self.parsed_uri.netloc, link)
        elif link_parts.scheme == "" and link_parts.netloc == "":
            return True, '{}://{}/{}'.format(self.parsed_uri.scheme, self.parsed_uri.netloc, link)
        elif link_parts.scheme == "" and link_parts.netloc == self.parsed_uri.netloc:
            return True, '{}:{}'.format(self.parsed_uri.scheme, link)
        elif link_parts.netloc != self.parsed_uri.netloc:
            return False, "nope"
        else:
            print("did not account for this one: " + link)
            return False, "nope"
    # grabs the links on the page and runs them through validate_link, so it should only produce absolute links
    def get_links(self):
        links = []
        for link in self.soup.find_all("a"):
            try:
                checked_link = self.validate_link(link["href"].strip())
                if checked_link[0] and checked_link[1] not in links:
                    links.append(checked_link[1])
            except Exception:
                # anchor tags without a usable href are skipped
                pass
        return list(set(links))
    # gets the hreflang data from a page, returned as two parallel lists: (links, targets)
    def get_hreflang(self):
        hreflang = []
        for tag in self.soup.find_all("link", {"hreflang": re.compile(r".*")}):
            try:
                hreflang.append((tag["hreflang"], tag["href"]))
            except KeyError:
                # malformed tags missing an href are skipped
                pass
        return list(map(lambda x: x[1], hreflang)), list(map(lambda x: x[0], hreflang))
    # checks that the hreflang on the page includes a self-referencing tag
    def check_self(self):
        links, targets = self.get_hreflang()
        if self.page in links:
            return True
        else:
            return False
    # create a new page_check instance for each of the alternates; returns None when the page has no hreflang tags
    def create_alt_instances(self):
        links, targets = self.get_hreflang()
        alt_instances = {"tar": [], "instance": []}
        if len(targets) == 0:
            print("No Tags on: " + self.page)
        else:
            for iter, target in enumerate(targets):
                link = links[iter]
                if validators.url(link):
                    current = page_check(link)
                    alt_instances["tar"].append(target)
                    alt_instances["instance"].append(current)
                else:
                    print("badly formed link " + link)
            return alt_instances
    # checks that alternate pages being pointed to also point back
    def check_return(self, alt_instances):
        wrong = []
        for iter, tar in enumerate(alt_instances["tar"]):
            current = alt_instances["instance"][iter]
            links, targets = current.get_hreflang()
            if self.page not in links:
                wrong.append(current.page)
        if len(wrong) == 0:
            print(self.page + " has all its return links")
        else:
            print(self.page + " missing return link from " + str(wrong))
    # ensure alternates have self-referencing tags
    def check_alts_self(self, alt_instances):
        wrong = []
        for iter, tar in enumerate(alt_instances["tar"]):
            current = alt_instances["instance"][iter]
            if not current.check_self():
                wrong.append(current.page)
        if len(wrong) == 0:
            print(self.page + " points to pages that all have their self references")
        else:
            print(self.page + " points to pages which are missing their self references " + str(wrong))
    # ensure pages being pointed to are indexable
    def check_alts_indexable(self, alt_instances):
        wrong = []
        for iter, tar in enumerate(alt_instances["tar"]):
            current = alt_instances["instance"][iter]
            holder = current.indexable()
            if holder is not True:
                wrong.append((holder, current.page))
        if len(wrong) == 0:
            print(self.page + " points to only indexable pages")
        else:
            print(self.page + " points to pages which are not indexable " + str(wrong))
    # checks that the alternate pages all use the same hreflang code as this page's own self reference when pointing to it
    def check_targeting(self, alt_instances):
        wrong = []
        links, targets = self.get_hreflang()
        try:
            target = targets[links.index(self.page)]
            for iter, tar in enumerate(alt_instances["tar"]):
                current = alt_instances["instance"][iter]
                cur_links, cur_targets = current.get_hreflang()
                try:
                    cur_tar = cur_targets[cur_links.index(self.page)]
                    if cur_tar != target:
                        wrong.append((current.page, cur_tar))
                except ValueError:
                    pass
            if len(wrong) == 0:
                print(self.page + " has consistent targeting across its alts")
            else:
                print(self.page + " has pages pointing to it with the wrong targeting code " + str(wrong))
        except ValueError:
            pass
    # calls the hreflang checking functions for the current page
    def validate_alts(self):
        alt_instances = self.create_alt_instances()
        if alt_instances is not None:
            self.check_return(alt_instances)
            self.check_alts_self(alt_instances)
            self.check_alts_indexable(alt_instances)
            self.check_targeting(alt_instances)
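
# A minimal usage sketch for page_check on its own (the URL is an illustrative assumption, and the
# global rp robot parser must already be initialized as sketched above / in the example calls below):
#   checker = page_check("https://www.example.com/en/")
#   print(checker.indexable())
#   checker.validate_alts()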


# the crawler class; initializing just sets the start page and the starting crawl queues
class crawler:
    def __init__(self, page):
        self.current_page = page
        self.parsed_uri = urllib.parse.urlparse(page)
        self.home_page = '{uri.scheme}://{uri.netloc}/'.format(uri=self.parsed_uri)
        self.to_crawl = [page]
        self.crawled = []
    # the recursive crawler; it calls the hreflang checks on every page, so if you want a free crawl validation, this is what you need
    def rec_crawl(self):
        print(str(len(self.to_crawl)) + " pages in queue and " + str(len(self.crawled)) + " done, now on: " + self.current_page)
        self.crawled.append(self.current_page)
        if validators.url(self.current_page):
            class_instance = page_check(self.current_page)
            indexer = class_instance.indexable()
            if indexer is True:
                class_instance.validate_alts()
                self.to_crawl = list(set(self.to_crawl + class_instance.get_links()))
                self.to_crawl = list(set(self.to_crawl) - set(self.crawled))
                time.sleep(1)
                if len(self.to_crawl) == 0:
                    print("All Done")
                else:
                    self.current_page = self.to_crawl[0]
                    self.rec_crawl()
            else:
                time.sleep(1)
                print(self.current_page + " is not indexable " + str(indexer))
                self.to_crawl = list(set(self.to_crawl) - set(self.crawled))
                if len(self.to_crawl) == 0:
                    print("All Done")
                else:
                    self.current_page = self.to_crawl[0]
                    self.rec_crawl()
        else:
            print("badly formed link " + self.current_page)
            if len(self.to_crawl) == 0:
                print("All Done")
            else:
                self.crawled.append(self.current_page)
                self.to_crawl = list(set(self.to_crawl) - set(self.crawled))
                self.current_page = self.to_crawl[0]
                self.rec_crawl()
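
# A minimal usage sketch for the crawler (the start URL is an illustrative assumption); rec_crawl
# relies on the global rp robot parser being set up first, as in the commented example calls below:
#   c = crawler("https://www.example.com/")
#   rp = robotparser.RobotFileParser()
#   rp.set_url(c.home_page + "robots.txt")
#   rp.read()
#   c.rec_crawl()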


# my attempt at hreflang-in-sitemap validation, this got much more complicated than anticipated
# init just takes a url, gets the robots.txt and from there the sitemap location
class sitemap:
    def __init__(self, page):
        self.current_page = page
        self.parsed_uri = urllib.parse.urlparse(page)
        self.home_page = '{uri.scheme}://{uri.netloc}/'.format(uri=self.parsed_uri)
        self.roboter = self.home_page + "robots.txt"
    # get the sitemaps; this only returns what is pointed to in robots.txt, so if those are sitemap indexes then that is what is returned
    def check_robots_for_sitemap(self):
        sitemaps = []
        r = requests.get(self.roboter)
        lines = r.content.split(b"\n")
        for line in lines:
            if "Sitemap:" in line.decode("utf-8"):
                sitemaps.append(line.decode("utf-8").split(" ")[1].replace("\r", ""))
        if len(sitemaps) == 0:
            print("no sitemaps in the robots file")
            return False
        else:
            return sitemaps
    # this checks if the robots.txt sitemaps are sitemap indexes and if so parses them to get the actual sitemaps;
    # it only checks one level down, so if you have a sitemap index of sitemap indexes this gets wonky
    def get_sitemaps(self):
        sitemaps_now = self.check_robots_for_sitemap()
        sitemaps = []
        if sitemaps_now is not False:
            for sitemap in sitemaps_now:
                r = requests.get(sitemap)
                soup = BeautifulSoup(r.content, "lxml")
                if soup.find("sitemapindex"):
                    for loc in soup.find_all("loc"):
                        sitemaps.append(loc.text)
                else:
                    sitemaps.append(sitemap)
            return sitemaps
        else:
            print("no sitemaps in the robots file")
            return False
    # does the heavy lifting: grabs all sitemaps and parses them into a workable dictionary
    def get_data(self):
        sitemaps = self.get_sitemaps()
        if sitemaps is not False:
            data = {"urls": []}
            for sitemap in sitemaps:
                print("downloading sitemap: " + sitemap)
                r = requests.get(sitemap)
                if "Content-Type" in r.headers and "gz" in r.headers["Content-Type"]:
                    content = gzip.decompress(r.content)
                else:
                    content = r.content
                soup = BeautifulSoup(content, "lxml")
                for url in soup.find_all("url"):
                    deet = {"url": url.find("loc").text, "alts": []}
                    for alt in url.find_all(attrs={"rel": "alternate"}):
                        alter = {"target": alt["hreflang"], "link": alt["href"]}
                        deet["alts"].append(alter)
                    data["urls"].append(deet)
            return data
        else:
            print("no sitemaps in the robots file")
            return False
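    # Illustrative shape of the returned dict (the URLs and codes are assumed examples):
    #   {"urls": [{"url": "https://example.com/en/",
    #              "alts": [{"target": "de", "link": "https://example.com/de/"}]}]}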
    # checks a sitemap url element's hreflang entries for a self reference
    def check_self_ref(self, url_element):
        url = url_element["url"]
        alts = url_element["alts"]
        links = list(map(lambda x: x["link"], alts))
        if url in links:
            print(url + " has self reference")
            return True
        else:
            print(url + " is missing self reference")
            return False
    # checks that links pointed to in hreflang are also in the sitemap
    def check_link_in_map(self, url_element, data):
        alts = url_element["alts"]
        links = list(map(lambda x: x["link"], alts))
        check = []
        urls = list(map(lambda x: x["url"], data["urls"]))
        for link in links:
            if link in urls:
                print(link + " is in the sitemap and has a corresponding url element")
            else:
                print(link + " is pointed to but has no corresponding url element")
                check.append(link)
        if len(check) == 0:
            return True
        else:
            return check
    # checks if alternates exist in the sitemap and, if so, that they have alternates pointing back to the origin
    def check_return(self, url_element, data):
        url = url_element["url"]
        links = list(map(lambda x: x["link"], url_element["alts"]))
        urls = list(map(lambda x: x["url"], data["urls"]))
        check = []
        for link in links:
            if link in urls:
                index = urls.index(link)
                other_element = data["urls"][index]
                other_links = list(map(lambda x: x["link"], other_element["alts"]))
                if url in other_links:
                    print(url + " points to " + link + " and has a link pointing back to it")
                else:
                    print(url + " points to " + link + " but gets no return link")
                    check.append(link)
        if len(check) == 0:
            return True
        else:
            return check
    # checks if alternates exist in the sitemap and, if they do, ensures that the origin's targeting is
    # the same as the targeting applied by the alternate
    def check_target(self, url_element, data):
        url = url_element["url"]
        links = list(map(lambda x: x["link"], url_element["alts"]))
        targets = list(map(lambda x: x["target"], url_element["alts"]))
        urls = list(map(lambda x: x["url"], data["urls"]))
        check = []
        for iter, link in enumerate(links):
            target = targets[iter]
            if link in urls:
                index = urls.index(link)
                other_element = data["urls"][index]
                other_links = list(map(lambda x: x["link"], other_element["alts"]))
                if url in other_links:
                    other_index = other_links.index(link)
                    other_targets = list(map(lambda x: x["target"], other_element["alts"]))
                    other_target = other_targets[other_index]
                    if target == other_target:
                        print(url + " with target " + target + " is pointed to with matching target " + other_target)
                    else:
                        print(url + " with target " + target + " is pointed to with a different target " + other_target)
                        check.append(target)
        if len(check) == 0:
            return True
        else:
            return check
    # loops through each url in the sitemap applying the checks to each, then updates the dict with a "checks" list holding data on errors
    def check_data(self):
        data = self.get_data()
        if data is False:
            return False
        data["checks"] = []
        for url_element in data["urls"]:
            checks = []
            checks.append({"self ref": self.check_self_ref(url_element)})
            checks.append({"points to link in map": self.check_link_in_map(url_element, data)})
            checks.append({"has return": self.check_return(url_element, data)})
            checks.append({"same target": self.check_target(url_element, data)})
            data["checks"].append(checks)
        return data
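    # Illustrative shape of one entry in data["checks"] (the values are assumed examples): True means the
    # check passed, otherwise the offending links or targets are listed
    #   [{"self ref": True}, {"points to link in map": True},
    #    {"has return": ["https://example.com/de/"]}, {"same target": True}]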


# just some example urls I was using to test as I coded; some of the bigger ones cause a memory error for me
# when checking the sitemap, which is fairly understandable given the pretty huge processing requirements
next = "http://www.next.de/de"
canon = "https://www.canon.co.uk/"
balsam = "https://www.balsamhill.co.uk"
google = "https://www.google.com"
macidys = "https://www.mcdonalds.com/"
asics = "https://www.asics.com/"
nuanc = "https://www.nuance.com"
# Just some example calls
# x = crawler(balsam)
# roboter = x.home_page + "robots.txt"
# rp = robotparser.RobotFileParser()
# rp.set_url(roboter)
# rp.read()
a = sitemap(asics)
data = a.check_data()
# x.rec_crawl()