-
Notifications
You must be signed in to change notification settings - Fork 6
/
genproc-plan-grabber.py
114 lines (101 loc) · 4.06 KB
/
genproc-plan-grabber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python -u
# -*- coding: utf-8 -*-
# vim:et
# ---------------------------------------------------------------------------
# genproc-plan-grabber.py
# Author: Maxim Dubinin (sim@gis-lab.info)
# About: Grab http://plan.genproc.gov.ru/plan2014 data on organizations checks. Simply iterate through id and get contents.
# Created: 22:05 28.12.2013
# Usage example: python genproc-plan-grabber.py 1 100001
# ---------------------------------------------------------------------------
#import urllib
import urllib2
from httplib import BadStatusLine,IncompleteRead
import socket
import sys
import os
import datetime
import time
def console_out(text):
global time_prev
time_current = datetime.datetime.now()
timestamp = time_current.strftime('%Y-%m-%d %H:%M:%S')
if time_prev == 0:
timedif = ""
else:
timedif = str((time_current - time_prev).seconds)
spaces = " "*(4 - len(timedif))
print(timestamp + " " + timedif + spaces + text)
time_prev = time_current
def download_org(link,id):
global time_prev
numtries = 5
timeoutvalue = 40
timeinterval = 6
for i in range(1,numtries+1):
i = str(i)
try:
#user_agent = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)"
#headers = { 'User-Agent' : user_agent }
#values = {'name' : 'John Doe','language' : 'Python' }
#data = urllib.urlencode(values)
#req = urllib2.Request(link, data, headers)
#u = urllib2.urlopen(req)
u = urllib2.urlopen(link, timeout = timeoutvalue)
except BadStatusLine:
console_out('BadStatusLine for ID:' + id + '.' + ' Attempt: ' + i)
success = False
time.sleep(3)
except urllib2.URLError, e:
if hasattr(e, 'reason'):
console_out('We failed to reach a server for ID:' + id + ' Reason: ' + str(e.reason) + '.' + ' Attempt: ' + i)
elif hasattr(e, 'code'):
console_out('The server couldn\'t fulfill the request for ID: ' + id + ' Error code: ' + str(e.code) + '.' + ' Attempt: ' + i)
success = False
time.sleep(3)
except socket.timeout, e:
console_out('Connection timed out on urlopen() for ID: ' + id + '.' + ' Attempt: ' + i)
success = False
time.sleep(3)
else:
f = open("data/" + id + ".html","wb")
try:
r = u.read()
except socket.timeout, e:
console_out('Connection timed out on socket.read() for ID: ' + id + '.' + ' Attempt: ' + i)
success = False
u.close()
time.sleep(3)
except IncompleteRead:
console_out('Incomplete read on socket.read() for ID: ' + id + '.' + ' Attempt: ' + i)
success = False
u.close()
time.sleep(3)
else:
f.write(r)
f.close()
console_out("Listing " + id + " downloaded")
success = True
break
time_current = datetime.datetime.now()
time_diff = (time_current - time_prev).seconds
if time_diff < timeinterval:
time.sleep(timeinterval - time_diff)
return success
if __name__ == '__main__':
args = sys.argv[1:]
start_id = int(args[0])
end_id = int(args[1]) + 1
time_prev = 0
f_errors = open("errors" + "_" + str(start_id) + "_" + str(end_id) + ".csv","a")
if not os.path.exists("data"): os.makedirs("data")
for id in range(start_id,end_id):
id = str(id)
link = "http://plan.genproc.gov.ru/plan2014/detail.php?ID=" + id
if not os.path.exists("data/" + id + ".html") or os.stat("data/" + id + ".html")[6]==0:
success = download_org(link,id)
if success == False:
f_errors.write(id + "," + link + ", unavailable" + "\n")
else:
console_out("Listing for id " + id + " already exists")
f_errors.close()