-
Notifications
You must be signed in to change notification settings - Fork 3
/
web.py
242 lines (198 loc) · 7.89 KB
/
web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import requests
import logging
import contextlib
from io import BytesIO
from .storage import StorageObject
from lxml import etree
from urllib import request
logger = logging.getLogger(__name__)
class WebAPI:
    """Provides methods to access a web API.

    This class uses the python requests package.
    See https://2.python-requests.org/en/master/user/advanced/#request-and-response-objects

    Attributes:
        base_url: The base URL for all API endpoints.
            If base_url is specified, relative URLs can be used to make requests.
            A relative URL will be appended to the base URL when making the requests.
    """
    def __init__(self, base_url="", **kwargs):
        """Initializes the API client.

        Args:
            base_url: Base URL, the common URL prefix for all the API endpoints.
                Must start with "http://" or "https://".
            **kwargs: keyword arguments to be encoded as GET parameters in the URL
                in all future requests.

        Raises:
            ValueError: If base_url does not start with http:// or https://.
        """
        self.kwargs = kwargs
        self.headers = {}
        if base_url.startswith("http://") or base_url.startswith("https://"):
            self.base_url = base_url
        else:
            raise ValueError("Base URL should start with http:// or https://")

    def add_header(self, **kwargs):
        """Adds headers to be used in all future HTTP requests.

        Args:
            **kwargs: The key-value pairs to be added as the headers.
        """
        self.headers.update(kwargs)

    def request(self, method, url, **kwargs):
        """Sends a request to a URL endpoint.

        Data in self.headers will be added to the request headers.
        This method uses the same arguments as the python requests package:
        https://github.com/psf/requests/blob/master/requests/api.py

        Args:
            method: Request method, e.g. GET, OPTIONS, HEAD, POST, PUT, PATCH, or DELETE.
            url: URL endpoint for the request, which can be a relative URL.
            **kwargs: See https://github.com/psf/requests/blob/master/requests/api.py

        Returns: Request response.

        Raises:
            ValueError: If method is not the name of a function in the requests package.
        """
        url = self.build_url(url)
        method = str(method).lower()
        if not hasattr(requests, method):
            raise ValueError("Invalid method: %s" % method)
        request_func = getattr(requests, method)
        # Copy the caller's dict so the caller's "headers" kwarg is not
        # mutated as a side effect of this call.
        headers = dict(kwargs.get("headers", {}))
        headers.update(self.headers)
        kwargs["headers"] = headers
        return request_func(url, **kwargs)

    def get(self, url, **kwargs):
        """Makes a GET request.

        Use keyword arguments to specify the query strings in the request.

        Args:
            url (str): The URL/Endpoint of the API.
                This can be a relative URL if base_url is specified in initialization.
            **kwargs: keyword arguments to be encoded as GET parameters in the URL.

        Returns: A Response object.
        """
        url = self.build_url(url, **kwargs)
        # Lazy %-args: the message is only formatted if DEBUG is enabled.
        logger.debug("Requesting data from %s", url)
        response = requests.get(url, headers=self.headers)
        logger.debug("Response code: %s", response.status_code)
        if response.status_code != 200:
            logger.debug(response.content)
        return response

    def get_json(self, url, **kwargs):
        """Makes a GET request and decodes the response body as JSON."""
        return self.get(url, **kwargs).json()

    def post(self, url, data, **kwargs):
        """Makes a POST request with ``data`` serialized as the JSON body.

        Args:
            url (str): The URL/Endpoint of the API (may be relative to base_url).
            data: JSON-serializable payload sent as the request body.
            **kwargs: keyword arguments to be encoded as query parameters in the URL.

        Returns: A Response object.
        """
        url = self.build_url(url, **kwargs)
        logger.debug("Posting data to %s", url)
        response = requests.post(url, json=data, headers=self.headers)
        logger.debug("Response code: %s", response.status_code)
        if response.status_code != 200:
            logger.debug(response.content)
        return response

    def post_json(self, url, data, **kwargs):
        """Makes a POST request and decodes the response body as JSON."""
        return self.post(url, data, **kwargs).json()

    def delete(self, url, **kwargs):
        """Makes a DELETE request.

        Args:
            url (str): The URL/Endpoint of the API (may be relative to base_url).
            **kwargs: keyword arguments to be encoded as query parameters in the URL.

        Returns: A Response object.
        """
        url = self.build_url(url, **kwargs)
        logger.debug("Deleting data from %s", url)
        return requests.delete(url, headers=self.headers)

    def build_url(self, url, **kwargs):
        """Builds the URL/Endpoint for a request.

        Keyword arguments are converted to query string in the URL.
        Query parameters given at initialization are included first and can be
        overridden by **kwargs here.

        Args:
            url (str): The URL/Endpoint of the API.
                If url is a relative URL, it will be appended to the base_url.
                If url is an absolute URL (starts with https:// or http://),
                the base_url will be ignored.

        Returns:
            str: The absolute URL/Endpoint of the API with query string.
        """
        if not (url.startswith("http://") or url.startswith("https://")):
            url = "%s%s" % (self.base_url, url)
        query_dict = self.kwargs.copy()
        query_dict.update(kwargs)
        return self.append_query_string(url, **query_dict)

    @staticmethod
    def append_query_string(url, **kwargs):
        """Appends a query string to a URL.

        Query parameters are specified as keyword arguments. A list value
        produces one ``key=value`` pair per element. Values are NOT
        percent-encoded; callers must encode values containing reserved
        characters themselves.

        Args:
            url (str): URL, with or without an existing query string.

        Returns:
            str: URL with query string.
        """
        pairs = []
        for key, val in kwargs.items():
            if isinstance(val, list):
                pairs.extend("%s=%s" % (key, v) for v in val)
            else:
                pairs.append("%s=%s" % (key, val))
        if not pairs:
            return url
        # Start a new query string with "?" only if the URL has none yet;
        # previously the first pair was emitted as "?&key=val" (malformed).
        separator = "&" if "?" in url else "?"
        return url + separator + "&".join(pairs)
class HTML:
    """Represents an HTML document loaded from an HTTP(S) URL or a local file.

    Attributes:
        uri: The location of the document; an http/https URL is fetched with
            requests, anything else is treated as a local file path.
    """
    def __init__(self, uri):
        self.uri = uri
        # Lazily-populated caches for the parsed tree and raw content.
        self.__etree = None
        self.__content = None

    def read(self):
        """Reads the raw document and returns it as bytes.

        Returns:
            bytes: The raw content of the document.
        """
        obj = StorageObject(self.uri)
        if obj.scheme in ["http", "https"]:
            r = requests.get(self.uri)
            return r.content
        # Open in binary mode: `content` is fed into BytesIO by `etree`,
        # which requires bytes. Text mode ('r') returned str and made
        # parsing fail for local files.
        with open(self.uri, 'rb') as f:
            return f.read()

    @property
    def content(self):
        """bytes: Raw document content, read once and cached."""
        # Compare with None so an empty (but already-read) document is
        # not re-fetched on every access.
        if self.__content is None:
            self.__content = self.read()
        return self.__content

    @property
    def etree(self):
        """The lxml element tree parsed from `content`, built once and cached."""
        if self.__etree is None:
            self.__etree = etree.parse(BytesIO(self.content), etree.HTMLParser())
        return self.__etree

    @staticmethod
    def __tags_to_list(parent, tag):
        """Collects the inner HTML of each `tag` descendant of `parent`.

        Returns:
            list: One string (text plus serialized child markup) per matching
                element, or None if there is no match.
        """
        elements = parent.findall(".//%s" % tag)
        if not elements:
            return None
        results = []
        for element in elements:
            text = element.text if element.text else ""
            results.append(text + ''.join(etree.tostring(e).decode() for e in element))
        return results

    @staticmethod
    def __append_data(to_list, parent, tag):
        """Appends the `tag` cell values of `parent` to `to_list`, if any."""
        data = HTML.__tags_to_list(parent, tag)
        if data:
            to_list.append(data)

    def get_tables(self):
        """Gets the data of HTML tables in the web page as a list of dictionaries.

        Returns:
            list: A list of dictionaries, each containing data from a table in
                the web page. Each dictionary has two keys: "headers" and
                "data" (rows of th and td cells respectively).
                Both "headers" and "data" are 2D lists.
        """
        html = self.etree
        html_tables = html.findall('.//table')
        data_tables = []
        for html_table in html_tables:
            table = {
                "headers": [],
                "data": []
            }
            rows = html_table.findall(".//tr")
            for row in rows:
                self.__append_data(table["headers"], row, "th")
                self.__append_data(table["data"], row, "td")
            data_tables.append(table)
        return data_tables
def download(url, file_path):
    """Downloads a file from a URL and saves it to a local path.

    The transfer is streamed in 64 KiB chunks so arbitrarily large files
    can be downloaded without holding them in memory.

    Args:
        url (str): The URL of the file to be downloaded.
        file_path (str): The path to store the file.

    Returns: None
    """
    # Open the URL first so a connection failure does not create an
    # empty output file.
    response = request.urlopen(url)
    chunk_size = 1 << 16
    with open(file_path, 'wb') as out_file, contextlib.closing(response) as fp:
        logging.getLogger(__name__).debug("Downloading data from %s" % url)
        # iter() with a b'' sentinel reads until EOF.
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            out_file.write(chunk)