-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetpypidata.py
83 lines (67 loc) · 2.16 KB
/
getpypidata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
Build an offline archive of the PyPI package index.
Note that error handling is intentionally minimal: failed package downloads
are simply rescheduled until they succeed.
$ nix-shell -p pythonPackages.requests pythonPackages.nose \
pythonPackages.protobuf pythonPackages.eventlet \
pythonPackages.setuptools
"""
import os
import eventlet
import re
import requests
import raw_data_pb2
import time
import hashlib
import sys
# Size of the eventlet green pool that main() uses to run fetches.
CONCURRENCY = 20

# Index page listing the packages, and the per-package JSON endpoint
# (the placeholder is filled in with the package name).
INDEX_URL = 'https://pypi.python.org/simple'
PACKAGE_URL = 'https://pypi.python.org/pypi/{}/json'

# The index markup is regular enough that a regex is sufficient: capture
# whatever sits between the single quotes of each href='...' attribute.
PACKAGE_RE = re.compile(r"href='([^']+)'", flags=re.MULTILINE)
def fetch_and_store(package_name):
    """Download one package's JSON metadata and archive it under ./raw.

    The response body is wrapped in a ``RawData`` protobuf message together
    with the retrieval timestamp, the URL used and the package name. The
    serialized message is written to ``./raw/<sha1 of the serialized bytes>``.
    The ``./raw`` directory must already exist.

    Args:
        package_name: Name of the package; substituted into PACKAGE_URL.

    Returns:
        ``[package_name]`` when the HTTP request raised
        ``requests.RequestException`` (so the caller can reschedule it),
        otherwise an empty list.
    """
    url = PACKAGE_URL.format(package_name)
    try:
        content = requests.get(url).content
    except requests.RequestException:
        # Reschedule scraping. We don't keep count on how often this
        # fails and stop scraping because pypi's super solid and seems
        # to always work.
        return [package_name]

    pb = raw_data_pb2.RawData(
        retrieved_timestamp=int(time.time()),
        url_used=url,
        package_name=package_name,
        package_json=content,
    )
    raw_data = pb.SerializeToString()

    # Content-addressed file name: the SHA-1 of the serialized message.
    filename = hashlib.sha1(raw_data).hexdigest()

    # The serialized protobuf is binary data, so the file must be opened in
    # binary mode. Text mode fails on Python 3 and may corrupt the payload
    # through newline translation on some platforms.
    with open(os.path.join('./raw', filename), 'wb') as f:
        f.write(raw_data)
    return []
def main():
    """Scrape the package index and archive the JSON of every listed package.

    Fetches INDEX_URL, extracts the package names with PACKAGE_RE, makes sure
    the ``./raw`` output directory exists, and then runs ``fetch_and_store``
    for every name through an eventlet green pool of size CONCURRENCY.
    Names whose fetch failed are collected and retried in another pass; the
    loop ends once a pass produces no failures. A progress line is rewritten
    in place on stdout after every completed fetch.

    Raises:
        OSError: if ``./raw`` cannot be created and does not already exist.
    """
    # Use the decoded text rather than the raw bytes: PACKAGE_RE is a str
    # pattern and cannot be applied to a bytes object on Python 3.
    index = requests.get(INDEX_URL).text
    package_names = set(PACKAGE_RE.findall(index))

    try:
        os.makedirs('./raw')
    except OSError:
        # An already existing directory is fine. Any other failure (e.g. a
        # permission problem) would make every later write fail, so it is
        # surfaced instead of being silently ignored.
        if not os.path.isdir('./raw'):
            raise

    # fetch all the json
    # NOTE(review): nothing visible in this file monkey-patches the standard
    # library for eventlet; confirm the requests calls really run
    # concurrently inside the green pool.
    pool = eventlet.GreenPool(CONCURRENCY)
    while package_names:
        errors = []
        total = len(package_names)
        results = pool.imap(fetch_and_store, package_names)
        for counter, error in enumerate(results, 1):
            errors.extend(error)
            # The trailing "\r" makes the next update overwrite this line.
            sys.stdout.write("Fetched: {}, Total: {}, Percent: {:.1f}, Errors: {}\r".format(
                counter, total, counter * 100.0 / total, len(errors)))
            sys.stdout.flush()
        # Retry only the packages whose download failed in this pass.
        package_names = errors
def test_re():
    """Check that PACKAGE_RE pulls the package name out of an index link."""
    sample_markup = "\n<a href='115wangpan'>115wangpan</a><br/>\n"
    extracted = PACKAGE_RE.findall(sample_markup)
    assert extracted == ['115wangpan']
# Start scraping only when executed as a script, so that importing the
# module does not call main().
if __name__ == "__main__":
    main()