# -*- coding: utf-8 -*-
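"""Utilities for crawling a site with pylinkvalidator, re-checking the
flagged links, and exporting the surviving broken links as CSV."""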
import cookielib
import copy
import csv
import StringIO
import urllib2

from flask import make_response

from pylinkvalidator import api


def generate_csv(items):
    """Render crawl results as CSV text, one row per (source, URL) pair."""
    header = ['status', 'source', 'url']
    strIO = StringIO.StringIO()
    writer = csv.DictWriter(
        strIO,
        delimiter=',',
        lineterminator='\n',
        fieldnames=header,
    )
    writer.writeheader()
    # A broken URL may be referenced from several pages, so emit one
    # row per referencing source.
    for item in items:
        for s_item in item['sources_data']:
            data = {
                'status': item['item_status'],
                'source': s_item['source_url'],
                'url': item['item_url'],
            }
            writer.writerow(data)
    return strIO.getvalue()
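# Sketch of the expected input shape (the keys match what
# return_error_pages() below produces; the URLs are illustrative only):
#
#     generate_csv([{
#         'item_status': 404,
#         'item_url': 'http://example.com/missing',
#         'sources_data': [{'source_url': 'http://example.com/'}],
#     }])
#
# returns:
#
#     'status,source,url\n404,http://example.com/,http://example.com/missing\n'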


def csv_response(data):
    """Wrap CSV text in a Flask response served as a file download."""
    output = make_response(data)
    output.headers['Content-Disposition'] = 'attachment; filename=export.csv'
    output.headers['Content-type'] = 'text/csv'
    return output
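# Typical use from a (hypothetical) Flask view:
#
#     @app.route('/export')
#     def export():
#         items = return_error_pages(['http://example.com'])
#         return csv_response(generate_csv(items))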


def return_error_pages(site_links=None, config=None):
    """Crawl ``site_links`` and collect details on every error page."""
    # Use None defaults: mutable default arguments ([] / {}) are shared
    # across calls, a classic Python pitfall.
    site_links = site_links if site_links is not None else []
    config = config if config is not None else {}
    error_items = []
    crawled_site = api.crawl_with_options(
        site_links,
        config,
    )
    error_pages = crawled_site.error_pages
    for raw in error_pages.values():
        sources = raw.sources
        sources_data = []
        for source in sources:
            source_data = {
                'source_url': source.origin.geturl(),
                'source_html': source.origin_str,
            }
            sources_data.append(source_data)
        data = {
            'sources': len(sources),
            'sources_data': sources_data,
            'item_url': raw.url_split.geturl(),
            'item_status': raw.status,
            'item_status_message': raw.get_status_message(),
        }
        error_items.append(data)
    return error_items
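# Example call; 'run-once' and 'workers' are pylinkvalidator option
# names as I understand them (check its documentation for the full set):
#
#     return_error_pages(
#         ['http://example.com'],
#         {'run-once': True, 'workers': 4},
#     )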


def check_redirects(results):
    """Re-fetch each flagged URL like a browser and drop the ones that
    ultimately resolve with HTTP 200 (i.e. false positives that merely
    redirect)."""
    broken_links = copy.deepcopy(results)
    # Keep cookies across requests and redirects, as a browser would.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    for item in results:
        item_url = item['item_url']
        # Browser-like headers: some servers answer script-looking
        # clients differently.
        req = urllib2.Request(
            item_url,
            None,
            {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; '
                           'G518Rco3Yp0uLV40Lcc9hAzC1BOROTJADjicLjOmlr4=) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/44.0.2403.157 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/'
                       'xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'Accept-Language': 'en-US,en;q=0.8',
             'Connection': 'keep-alive'},
        )
        try:
            resp = opener.open(req)
            # urllib2 follows redirects transparently; a final 200 means
            # the link actually works.
            if resp.getcode() == 200:
                broken_links.remove(item)
        except Exception:
            # The request failed outright: leave the item in broken_links.
            pass
    return broken_links
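

if __name__ == '__main__':
    # Minimal end-to-end sketch (example URL; replace before real use):
    # crawl, drop links that turn out to resolve, and print the report.
    items = return_error_pages(['http://example.com'])
    really_broken = check_redirects(items)
    print generate_csv(really_broken)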