fetch_page.py
import gzip
import json
import requests
import sys

# cStringIO is the faster C implementation; fall back to the pure Python one
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO


class GetSource(object):
    def __init__(self):
        # The Common Crawl dataset (monthly crawl) to query
        #self.dataset = 'CC-MAIN-2018-13'
        self.dataset = 'CC-MAIN-2018-17'
    def get_html(self, url, dataset=None, filename=None, exit=False):
        if dataset is None:
            dataset = self.dataset
        # Ask the Common Crawl index which captures it has for this URL
        resp = requests.get('http://index.commoncrawl.org/{0}-index?url={1}&output=json'.format(dataset, url))
        try:
            pages = [json.loads(x) for x in resp.content.strip().split('\n')]
        except ValueError:
            print resp.content.strip().split('\n')
            raw_input('')
        # Multiple captures may have been found - we're only interested in one
        try:
            page = [pg for pg in pages if pg['status'] == '200']
        except KeyError:
            print 'Failed (No capture found)',
            return
        # Prefer the first capture that returned HTTP 200, otherwise fall back
        # to the first capture of any status
        if page:
            page = page[0]
        else:
            page = pages[0]
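        # For reference, each index line is a JSON record along these lines
        # (abridged and illustrative - the values here are made up, but
        # 'status', 'offset', 'length' and 'filename' are the keys used below):
        #   {"url": "...", "status": "200", "offset": "593142",
        #    "length": "4387", "filename": "crawl-data/CC-MAIN-2018-17/...warc.gz"}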
        # We need to calculate the start and the end of the relevant byte range
        # (each WARC file is composed of many small GZIP files stuck together)
        offset, length = int(page['offset']), int(page['length'])
        offset_end = offset + length - 1
        # We'll get the file via HTTPS so we don't need to worry about S3 credentials
        # Getting the file on S3 is equivalent however - you can request a Range
        prefix = 'https://commoncrawl.s3.amazonaws.com/'
        # We can then use the Range header to ask for just this set of bytes
        resp = requests.get(prefix + page['filename'],
                            headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
        # The page is stored compressed (gzip) to save space
        # We can extract it using the GZIP library
        raw_data = StringIO(resp.content)
        f = gzip.GzipFile(fileobj=raw_data)
        # What we have now is just the WARC response, formatted:
        data = f.read()
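        # For orientation, a WARC response record looks roughly like this
        # (illustrative sketch, not an exact capture):
        #   WARC/1.0
        #   WARC-Type: response
        #   ...more WARC headers...
        #
        #   HTTP/1.1 200 OK
        #   ...more HTTP headers...
        #
        #   <html>...        <- the page body we want to keep
        # i.e. three sections separated by blank lines, which is what the
        # split below relies on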
        try:
            warc, header, response = data.strip().split('\r\n\r\n', 2)
        except ValueError:
            if exit:
                print 'Failed',
                return
            # The capture may be a redirect rather than a full response;
            # follow its Location header once, saving under the original URL
            file_name = url
            try:
                url = [line.split(' ')[1] for line in data.split('\r\n') if 'Location:' in line][0]
                self.get_html(url, filename=file_name, exit=True)
            except IndexError:
                print 'Failed',
            return
        # Save the HTTP body to disk (the output directory must already exist)
        if filename is None:
            filename = url
        with open('pagesource_10k_march18/' + filename, 'w') as f:
            f.write(response)


if __name__ == '__main__':
    getSource = GetSource()
    urls_file = sys.argv[1]
    with open(urls_file, 'r') as f:
        for line in f:
            # Each input line is expected to look like "<id>,<url>"
            parts = line.split(',')
            url = parts[1].strip()
            print '[{0}] Fetching {1}'.format(parts[0], url),
            #try:
            getSource.get_html(url)
            print ' Done'
            #except Exception as e:
            #    print str(e) + ' Failed'
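

# Example usage (a sketch - the input file name and its rows below are
# assumptions; the script only expects "<id>,<url>" lines, and the
# pagesource_10k_march18/ directory must exist beforehand):
#
#   $ mkdir -p pagesource_10k_march18
#   $ cat urls.txt
#   1,example.com
#   2,commoncrawl.org
#   $ python fetch_page.py urls.txt
#
# Note that the URL doubles as the output file name, so URLs containing '/'
# need matching subdirectories under pagesource_10k_march18/.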