forked from statfi/opendata
-
Notifications
You must be signed in to change notification settings - Fork 4
/
statfi_px_api.py
143 lines (122 loc) · 4.7 KB
/
statfi_px_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: UTF-8 -*-
"""
This module contains tools to load information about PC Axis files in
Statistics Finland's databases and download the files from the databases
using the open data API:
http://www.stat.fi/org/lainsaadanto/avoin_data_en.html
For license see LICENSE document
"""
import os, csv, datetime, urllib, urllib2, httplib, StringIO, gzip, os, time
import csv_tools
class PxInfo(object):
"""
A simple object representation of PX information in
Statistics Finland's open data API:
"""
_timeformat = '%Y-%m-%d %H:%M' #Just a cache place for dateformat
def __init__(self, path, size, created, updated, variables,
tablesize, type, language, title, *args):
self.path = path.strip()
self.size = size.strip()
self.created = created.strip()
self.updated = updated.strip()
self.variables = variables.strip()
self.tablesize = tablesize.strip()
self.type = type.strip()
self.language = language.strip()
self.title = title.strip()
def __unicode__(self):
return u'PX file %s: %s' % (self.path, self.title)
def __repr__(self):
return unicode(self).encode('utf-8')
@property
def created_dt(self):
return datetime.datetime.strptime(self.created, self._timeformat)
@property
def updated_dt(self):
return datetime.datetime.strptime(self.updated, self._timeformat)
def create_px(url="http://pxweb2.stat.fi/database/StatFin/StatFin_rap.csv"):
"""
Creates a list of Px-objects from a given url. Url should point to a CSV file.
Url's default value points to Statfin databases contents CSV.
"""
iterable = urllib.urlopen(url)
iterable.next() #First line is for metadata
return [PxInfo(*i) for i in csv_tools.UnicodeReader(iterable, delimiter=";", encoding="iso-8859-1")]
def fetch_px_zipped(px_objs, target_dir="."):
"""
Fetch PC Axis files for given list of Px objects with gzip compression on the transfer
Modified from http://www.diveintopython.net/http_web_services/gzip_compression.html
Save the files to target directory
WARNING: Statfin database contains over 2500 PX files with many gigabytes of data.
"""
import urllib2, httplib, StringIO, gzip, os.path, time
opener = urllib2.build_opener()
for px_obj in px_objs:
base, pxfile_path = os.path.split(px_obj.path)
pxfile = open(os.path.join(target_dir, pxfile_path), 'w+')
request = urllib2.Request(px_obj.path)
request.add_header('Accept-encoding', 'gzip')
f = opener.open(request)
compressedstream = StringIO.StringIO(f.read())
try:
for data in gzip.GzipFile(fileobj=compressedstream):
pxfile.write(data)
print f.headers
except IOError, e:
print e, px_obj, f.headers
break
pxfile.close()
time.sleep(1)
def fetch_px(px_objs, target_dir="."):
"""
Fetch PC Axis files for given list of Px objects
Save the files to target directory
WARNING: Statfin database contains over 2500 PX files with many gigabytes of data.
"""
import urllib2, httplib, StringIO, gzip, os
opener = urllib2.build_opener()
for n, px_obj in enumerate(px_objs):
protocol, url = urllib.splittype(px_obj.path)
base, pxfile_part = urllib.splithost(url)
if pxfile_part.startswith(os.sep):
pxfile_part = pxfile_part.replace(os.sep, '', 1)
pxfile_path = os.path.join(target_dir, pxfile_part)
if os.path.exists(pxfile_path):
updated = check_update(px_obj.path, pxfile_path, opener)
if not updated:
continue
request = urllib2.Request(px_obj.path)
try:
f = opener.open(request)
except urllib2.HTTPError, e:
print e, px_obj
continue
makedirs(pxfile_path)
pxfile = open(pxfile_path, 'w+')
try:
for data in f.read():
pxfile.write(data)
except IOError, e:
print e, px_obj, f.headers
break
pxfile.close()
def check_update(url, file_path, opener):
"""
Check that network resource is newer than file resource
"""
req = urllib2.Request(url)
try:
url_req = opener.open(req)
except urllib2.HTTPError, e:
print e, url
return False
file_mtime_dt = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
url_modified_dt = datetime.datetime.strptime(
url_req.headers.dict.get('last-modified'), '%a, %d %b %Y %H:%M:%S GMT')
return url_modified_dt > file_mtime_dt
def makedirs(px_path):
try:
os.makedirs(os.path.split(px_path)[0])
except OSError, e:
pass