-
Notifications
You must be signed in to change notification settings - Fork 0
/
ytm_pageparser.py
210 lines (173 loc) · 7.58 KB
/
ytm_pageparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env python3
"""
Yandex Transport Monitor page parser
"""
# String Quotation Policy
# The string quotation policy is as follows:
# - Strings which are visible to end user should be double-quoted (print, log).
# - Strings which are "internal", such as values, dictionary keys etc. are single-quoted.
# - Do not mix single-quoted and double-quoted strings in one statement.
# - Since SQL queries usually contain single-quotes, it's better to put the whole query
# in double quotes.
import datetime
import re
import psycopg2
from bs4 import BeautifulSoup
class YTMPageParser:
    """
    Yandex Transport Monitor page parser class.

    Reads a saved page file, extracts one row per vehicle entry found in it
    and can store the extracted rows in a PostgreSQL database.
    """

    # Result codes returned by write_to_database().
    DB_RESULT_OK = 0
    DB_RESULT_NODATA = 1

    def __init__(self, filename):
        """
        Create a parser for a single page file.

        :param filename: path of the page file which parse() will read
        """
        self.filename = filename
        # Rows collected by parse(), stored as a tuple of tuples.
        self.data = ()
        # Default PostgreSQL settings, can be changed with set_database().
        self.db_host = 'localhost'
        self.db_port = '5432'
        self.db_name = 'ytmonitor'
        self.db_username = 'ytmonitor'
        self.db_password = 'password'

    def set_database(self,
                     db_host='localhost',
                     db_port='5432',
                     db_name='ytmonitor',
                     db_username='ytmonitor',
                     db_password='password'):
        """
        Set PostgreSQL database settings.

        :param db_host: PostgreSQL database host
        :param db_port: PostgreSQL database port
        :param db_name: PostgreSQL database name
        :param db_username: PostgreSQL database username
        :param db_password: PostgreSQL database password
        :return: None
        """
        self.db_host = db_host
        self.db_port = db_port
        self.db_name = db_name
        self.db_username = db_username
        self.db_password = db_password

    @staticmethod
    def _clean_text(node):
        """Return the text of a parsed tag with non-breaking spaces replaced
        by plain spaces and surrounding whitespace removed.

        Returns an empty string if the tag was not found (node is None).
        """
        if node is None:
            return ''
        text = node.string
        if text is None:
            # .string is None when the tag does not hold exactly one string
            # (e.g. nested markup); fall back to the concatenated text
            # instead of failing on None.replace().
            text = node.get_text()
        # NOTE: strip() is used to remove trailing \n, this was an issue for
        # "Троллейбус" transit type.
        return text.replace('\xa0', ' ').strip()

    @staticmethod
    def _frequency_to_minutes(frequency):
        """Convert a "<value> <units>" frequency string to minutes (as string).

        A value whose units are 'ч' is multiplied by 60, any other units
        leave the value unchanged. Strings without a units part (including
        the empty string) are returned as they are.
        """
        parts = frequency.split(' ')
        if len(parts) < 2:
            return frequency
        value = int(parts[0])
        if parts[1] == 'ч':
            return str(value * 60)
        return str(value)

    @staticmethod
    def _split_prognosis(text):
        """Split prognosis text into (prognosis, prognosis_more).

        Commas are dropped and the text is split on spaces: the first token
        is the prognosis, the last token (units) is discarded and everything
        in between is joined with single spaces into prognosis_more.
        """
        tokens = text.replace(',', '').split(' ')
        return tokens[0], ' '.join(tokens[1:-1])

    def parse(self):
        """Parse the file, will return tuple of tuples containing the result, and will
        also save the result to data variable (as tuple of tuples)

        Result format (every field is a string):
        (sequence_number, transit_route, transit_type, transit_frequency,
         prognosis, prognosis_more, departures)

        Fields which are not present on the page are empty strings.
        transit_frequency is recalculated to minutes.
        If prognosis data is available, usually no "transit_frequency" data is present.

        The rows are appended to the rows already stored in the data variable,
        so calling parse() more than once accumulates rows (sequence numbers
        start from 1 on every call).

        :return: the data variable after the new rows were appended
        """
        # The context manager guarantees the file is closed even if the
        # parser raises.
        with open(self.filename, 'r', encoding='utf-8') as page_file:
            soup = BeautifulSoup(page_file, 'lxml', from_encoding='utf-8')

        rows = soup.find_all('div', {'class': 'masstransit-stop-panel-view__vehicle-row'})

        parsed_rows = []
        for number, row in enumerate(rows, start=1):
            # Getting transit route number
            transit_number = self._clean_text(
                row.find('div', {'class': 'masstransit-stop-panel-view__vehicle-name'}))

            # Getting transit type
            transit_type = self._clean_text(
                row.find('div', {'class': 'masstransit-stop-panel-view__vehicle-type'}))

            # Bus frequency, recalculated to minutes
            transit_frequency = self._frequency_to_minutes(self._clean_text(
                row.find('span', {'class': 'masstransit-prognoses-view__frequency-time-value'})))

            # Transit prognosis
            transit_prognosis = ''
            transit_prognosis_more = ''
            node = row.find('span', {'class': 'masstransit-prognoses-view__less-hour'})
            if node is not None:
                transit_prognosis, transit_prognosis_more = \
                    self._split_prognosis(self._clean_text(node))

            # Expected exact times of next departures
            transit_departures = self._clean_text(
                row.find('span', {'class': 'masstransit-prognoses-view__more-hour'}))

            parsed_rows.append((str(number), transit_number, transit_type,
                                transit_frequency, transit_prognosis,
                                transit_prognosis_more, transit_departures))

        # Single concatenation instead of growing the tuple row by row.
        self.data = self.data + tuple(parsed_rows)
        return self.data

    def write_to_database(self, station_id, timestamp, data):
        """Write data to PostgreSQL database.

        All rows are written with one multi-row INSERT into the "transit"
        table. Values are passed as query parameters, so text taken from the
        page can not break or alter the SQL statement.

        :param station_id: stop identifier, stored in "stop_id" column as text
        :param timestamp: value stored in "stamp" column, converted with str()
        :param data: rows in the format returned by parse(), elements 1..6 of
                     every row are written
        :return: DB_RESULT_NODATA if data is empty,
                 1 if connection to the database failed (numerically the same
                 as DB_RESULT_NODATA, kept for backward compatibility),
                 DB_RESULT_OK if the rows were written
        """
        # 0. Check if data is not empty
        if not data:
            return self.DB_RESULT_NODATA

        # 1. Connect to database
        try:
            conn = psycopg2.connect(host=self.db_host,
                                    port=self.db_port,
                                    database=self.db_name,
                                    user=self.db_username,
                                    password=self.db_password,
                                    connect_timeout=10)
        except psycopg2.OperationalError as error:
            print("ERROR: " + str(datetime.datetime.now()) +
                  " Unable to connect to database (method: write_to_database)")
            print("ERROR: " + str(datetime.datetime.now()) + " " + str(error))
            return 1

        try:
            # 2. Write data
            row_template = "(%s, CAST(%s AS TIMESTAMP), %s, %s, %s, %s, %s, %s)"
            query = "INSERT INTO " \
                    "transit(stop_id, stamp, route, type, " \
                    "frequency, prognosis, prognosis_more, " \
                    "departures) " \
                    "VALUES " + ", ".join([row_template] * len(data)) + ";"
            params = []
            for line in data:
                params.append(str(station_id))
                params.append(str(timestamp))
                # Explicit indexes: a row which is too short raises IndexError
                # instead of silently producing a malformed statement.
                params.extend(str(line[index]) for index in range(1, 7))

            with conn.cursor() as cur:
                cur.execute(query, params)
            conn.commit()
        finally:
            # 3. Disconnect from database, also when the write has failed
            conn.close()

        return self.DB_RESULT_OK
if __name__ == '__main__':
    # This module is a library: when executed directly it only tells the
    # user so and does nothing else.
    print("Do not run this on its own!")