ScrapeMe.py
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 19 11:10:49 2021
@author: ClaudiaLorusso
"""
import requests
from bs4 import BeautifulSoup
import re
import json
import unicodedata
import os.path
from datetime import datetime
from time import sleep
import numpy as np
import winsound
import random
from urllib import request
class ScrapeMe:
"""
    Simple Python script that scrapes some public information
    (laws and regulations) from the "Consiglio Regionale della Puglia" web page,
    using BeautifulSoup.
    If the software suddenly crashes, you can simply restart it:
    it won't reload pages that have already been visited.
    DISCLAIMER: This is for didactic purposes only!
"""
def __load_page__(self, soup, page_num):
"""
Loads the page with the specified number.
Parameters
----------
soup : BeautifulSoup object
Data structure representing the parsed HTML page.
page_num : int
Number of the page you want to load.
Returns
-------
soup : BeautifulSoup object
Data structure representing the page_num parsed HTML page.
"""
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
}
payload = {
"__EVENTTARGET": "gwLeggi",
"__EVENTARGUMENT": "Page${}".format(page_num),
"__LASTFOCUS": "",
"__ASYNCPOST": "true",
}
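        # The results page is an ASP.NET WebForms grid: paging is a postback,
        # so every <input> already present on the page (__VIEWSTATE,
        # __EVENTVALIDATION, ...) has to be echoed back in the POST payload.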
for inp in soup.select("input"):
payload[inp["name"]] = inp.get("value")
api_url = "http://portale2015.consiglio.puglia.it/documentazione/leges/risultati.aspx"
soup = BeautifulSoup(
requests.post(api_url, data=payload, headers=headers).content,
"html.parser",
)
return soup
def __get_links__(self):
"""
        Scrapes the list of links pointing to every law and regulation
        contained in the "Consiglio della Regione Puglia" database.
        Every link is saved into a .txt file named laws_link.txt,
        which you can find in the Utils folder.
Returns
-------
link_list : list
            contains every link that points to the DB documents
"""
        if not os.path.isfile("Utils/laws_link.txt") or os.stat("Utils/laws_link.txt").st_size == 0:
url = "http://portale2015.consiglio.puglia.it/documentazione/leges/risultati.aspx"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
            page = 1
            link_list = list()
            # walk through the paged results grid (pages 1-250)
            while page < 251:
soup_page = self.__load_page__(soup, page)
table = soup_page.find_all("a")
pattern_url = "http://portale2015.consiglio.puglia.it/documentazione/leges/"
for row in table:
pattern = r"modulo.aspx.id=\d+"
match = re.findall(pattern, str(row.get("href")))
for i in match:
url = pattern_url + i
link_list.append(url)
                page += 1
            # write the scraped law links to file
            with open('Utils/laws_link.txt', 'w+') as f:
            #with open('Utils/laws_dif.txt', 'w+') as f:
                for item in link_list:
                    f.write("%s\n" % item)
        else:
            # resume mode: laws_dif.txt holds the links that have not been processed yet
            # (it is rebuilt at the bottom of this file as laws_link.txt minus laws_processed.txt)
            #link_file = open('Utils/laws_link.txt', 'r')
            link_file = open('Utils/laws_dif.txt', 'r')
            link_list = link_file.read().splitlines()
            link_file.close()
return link_list
def __get_content__(self, url):
"""
Scrapes the desired content from the specified url.
        All of the info is then saved into a .json file by the __save_info__
method.
Parameters
----------
url : string
            the URL of the page from which to scrape the desired info.
Returns
-------
string
            .json file path (testing purposes)
"""
        with open('Utils/laws_processed.txt', "a+") as link_file:
            # "a+" opens at the end of the file: rewind before reading
            link_file.seek(0)
            link_list = link_file.read().splitlines()
        if url not in set(link_list):
            with open('Utils/laws_processed.txt', "a") as myfile:
                myfile.write(url + '\n')
user_agent = self.get_random_ua()
referer = 'https://google.it'
headers = {
'user-agent': user_agent,
'referer': referer,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Pragma': 'no-cache'
}
            req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
#get year
pattern = r'(?:(?:18|19|20|21)[0-9]{2})'
reg = re.findall(pattern, str(soup.find(id = "lblAnno")))
year = reg[0]
#get number
# initializing tag
tag1 = "span id=\"lblNumero\""
tag2 = "span"
# regex to extract required strings
pattern = "<" + tag1 + ">(.*?)</" + tag2 + ">"
reg = re.findall(pattern, str(soup.find(id = "lblNumero")))
            try:
                number = reg[0]
            except IndexError:
                number = ""
                print("Error in getting number:\t", url)
#get date
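            # dd/mm/yyyy (also "-" or "." separators), with per-month day limits
            # and leap-year handling for 29/02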
pattern = r'(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))'
reg = re.findall(pattern, str(soup.find(id = "lblData")))
date_r = reg[0]
date = date_r[0]
#get title
# initializing tag
tag1 = "span id=\"lblTitolo\""
tag2 = "span"
# regex to extract required strings
pattern = "<" + tag1 + ">(.*?)</" + tag2 + ">"
reg = re.findall(pattern, str(soup.find(id = "lblTitolo")), re.DOTALL)
try:
title = reg[0]
except IndexError:
title = ""
print("Error in getting title:\t",url)
#get avviso
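            # the lblStorVig label tells regulations ("Regolamento ...") apart
            # from regional laws; map them to the REG / LR tags used in file names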
# initializing tag
tag1 = "span id=\"lblStorVig\""
tag2 = "span"
# regex to extract required strings
pattern = "<" + tag1 + ">(.*?)</" + tag2 + ">"
reg = re.findall(pattern, str(soup.find(id = "lblStorVig")), re.DOTALL)
try:
avviso = reg[0]
if (avviso == "Regolamento Storico"):
avviso = "REG"
elif (avviso == "Regolamento Vigente"):
avviso = "REG"
else:
avviso = "LR"
except IndexError:
avviso = "LR"
print("LR vs REG:\t", url)
#get body
            # re-fetch the page with urllib and parse the whole <body>
            gfg = BeautifulSoup(request.urlopen(url).read().decode('utf-8', 'ignore'), features='lxml')
            bodyHtml = gfg.find('body')
            # plain text of the law: normalize typographic quotes and
            # collapse every run of whitespace/newlines into a single space
            body = bodyHtml.get_text()
            body = re.sub("’", "'", body)
            body = re.sub("“", '"', body)
            body = re.sub("”", '"', body)
            body = re.sub(r"(?<!\\)\\n|\n", " ", body)
            body = " ".join(body.split())
#saving everything into a json file
return self.__save_info__(year, number, date, title, body, avviso)
def get_random_ua(self):
"""
        Returns a random User-Agent taken from a .txt file named
        ua_file.txt. You can find the file in the Utils folder.
        Returns
        -------
        random_ua : string
            random User-Agent
"""
random_ua = ''
ua_file = 'Utils/ua_file.txt'
try:
with open(ua_file) as f:
lines = f.readlines()
if len(lines) > 0:
                prng = np.random.RandomState()
                # pick a uniformly random line index
                idx = prng.permutation(len(lines))[0]
random_ua = unicodedata.normalize("NFKD",lines[int(idx)]).replace("\n", "")
except Exception as ex:
print('Exception in random_ua:')
print(str(ex))
finally:
return random_ua
def __save_info__(self, year, number, date, title, body, avviso):
"""
        Saves every law (or regulation) info into a .json file
Parameters
----------
year : string
law's emanation year
        number : string
            law's number
date : string
law's emanation date
title : string
law's description
body : string
law's content
        avviso : string
            = LR, if it is a law
            = REG, if it is a regulation
        Returns
        -------
        path_file : string
            .json file path (test purposes)
"""
try:
d = datetime.strptime(date, "%d/%m/%Y").strftime('%Y_%m_%d')
file_name = avviso+"-"+ d + "-" +number+".json"
path_file = "DB/"+file_name
file_details = {
'adv': avviso,
'year': year,
'date': date,
'number': number,
'title': title,
'body': body
}
            # 'x' mode raises FileExistsError if the file already exists,
            # so a law that is already in the DB folder is simply skipped
            with open(path_file, 'x+', encoding='utf-8-sig') as json_file:
                json.dump(file_details, json_file, ensure_ascii=False)
        except FileExistsError:
            pass
return path_file
def create_DB(self):
"""
        Creates the desired DB with every law and regulation.
        NO duplicates will be added to the folder.
Returns
-------
None.
"""
try:
link_list = self.__get_links__()
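            # visit the links in random order and pause between requests
            # so the load on the server is spread out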
link_list.reverse()
random.shuffle(link_list)
for link in link_list:
self.__get_content__(link)
sleep(.5)
        except ConnectionError as ce:
            print(ce)
            print("\nSlow down, kid! You overwhelmed the server!!")
def load_content(self, path):
"""
        You can use this method to correctly print a law's body from
        the json file in which it is stored.
Parameters
----------
path : string
path of the law
Returns
-------
None.
"""
        with open(path, encoding="utf-8-sig", errors="ignore") as f:
            data = json.load(f)
        print(data)
        print(data.get('body'))
"""
sc = ScrapeMe()
sc.create_DB()
#sc.load_content("Leggi_prova_DB\LR-1972_01_13-1.json")
winsound.Beep(440, 1000)
"""
if os.path.isfile("Utils/laws_link.txt") and os.stat("Utils/laws_link.txt").st_size != 0:
    link_file = open('Utils/laws_link.txt', 'r')
    link_list = link_file.read().splitlines()
    link_file.close()
    link_file = open('Utils/laws_processed.txt', 'r')
    link_processed = link_file.read().splitlines()
    link_file.close()
    diff = list(set(link_list) - set(link_processed))
    with open('Utils/laws_dif.txt', 'w+') as f:
        for item in diff:
            f.write("%s\n" % item)
    print(diff)
    if set(link_list) == set(link_processed):
        print("DONE :)")
    else:
        sc = ScrapeMe()
        sc.create_DB()
        winsound.Beep(440, 1000)
else:
    sc = ScrapeMe()
    sc.create_DB()
    winsound.Beep(440, 1000)