#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Scraper for TGCOM24
Created on Sat Apr 2 10:42:10 2022
Last edited on Thu Apr 7 12:30:00 2022
@author: Giulio Gabrieli
"""
# Import required libraries
import os
import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment
from tqdm import tqdm
# Dictionary to convert month names from Italian to English, required to
# build standardized timestamps
months_dict = {'Gennaio': 'January', 'Febbraio': 'February', 'Marzo': 'March',
               'Aprile': 'April', 'Maggio': 'May', 'Giugno': 'June',
               'Luglio': 'July', 'Agosto': 'August', 'Settembre': 'September',
               'Ottobre': 'October', 'Novembre': 'November', 'Dicembre': 'December'}
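
# A minimal sketch (hypothetical helper, not called below) showing the
# intended use of months_dict: rewrite an Italian date string so that
# datetime.datetime.strptime can parse it with the English '%d %B %Y' format.
def translate_month(date_string):
    """Replace Italian month names in date_string with their English names."""
    for ita, eng in months_dict.items():
        date_string = date_string.replace(ita, eng)
    return date_string
# e.g. translate_month('7 Aprile 2022') -> '7 April 2022'
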
# PARAMETERS
FIRSTPAGE = 1  # unless the developer was drunk, this should be 1
BASEPATH = '/home/giulio/Repositories/ukrainenewsscraper'
RAWPATH = BASEPATH + os.sep + 'Raw' + os.sep  # path to the raw data folder
BASEURL = 'https://www.tgcom24.mediaset.it/d/crisi-ucraina'  # URL to scrape
DB = RAWPATH + 'TGCOM24/tgcom24.csv'  # path of the database file
if __name__ == '__main__':
    print('Initializing scraper for TGCOM24')
    # Check whether the database already exists, otherwise initialize an empty dataframe
    if not os.path.exists(DB):
        print('Database not present in', DB)
        print('Initializing empty dataframe')
        df = pd.DataFrame([], columns=['Type', 'Title', 'url', 'Date', 'Subtitle', 'Article'])
    else:
        print('Database present in', DB)
        df = pd.read_csv(DB)
        print('Database contains', len(df), 'articles')

    print('Starting download of news details (ID, url, Title, Date)')
    data = []  # initialize the data array
    url = BASEURL  # URL of the index page
    r = requests.get(url, timeout=30)  # request the URL (timeout so a stalled request cannot hang the run)
    if r.status_code == 200:  # verify that the page exists
        html = r.text  # get the HTML of the page
        soup = BeautifulSoup(html, 'html.parser')  # initialize the parser
        headers = soup.find_all(class_='container-5-cose')
        # Get the list of daily index pages, one per day
        urlstocheck = [header.find('a').get('href') for header in headers]
        for dailyurl in tqdm(urlstocheck):
            r = requests.get(dailyurl, timeout=30)  # request the daily page
            if r.status_code == 200:
                html = r.text  # get the HTML of the page
                soup = BeautifulSoup(html, 'html.parser')  # initialize the parser
                # Get the daily summary
                title = soup.find('h1').text.replace('\n', '').replace('\t', '')
                subtitle = soup.find('h2').text.replace('\n', '').replace('\t', '')
                paragraph = soup.find(class_='txb_first_paragraph').text.replace('\n', '').replace('\t', '')
                publicationDate = soup.find(class_='rt__ribbon').find('time').get('datetime')
                publicationDate = datetime.datetime.strptime(publicationDate, '%Y-%m-%dT%H:%M:%S')
                data.append(['summary', title, dailyurl,
                             publicationDate.strftime('%Y-%m-%d %H:%M:%S'),
                             subtitle, paragraph])
                # Find the live-blog ("temporeale") URL and its number of pages;
                # the page count is published in an HTML comment of the form 'totpage N'
                temporeale = soup.find(class_='more').get('data-href').split('?')[0]
                comments = soup.find_all(string=lambda text: isinstance(text, Comment))
                lastpage = int([x.replace(' ', '') for x in comments
                                if x.replace(' ', '').startswith('totpage')][0].replace('totpage', ''))
                for page in range(FIRSTPAGE, lastpage + 1):
                    r = requests.get('https://www.tgcom24.mediaset.it/' + temporeale + '?' + str(page), timeout=30)
                    if r.status_code == 200:
                        html = r.text  # get the HTML of the page
                        soup = BeautifulSoup(html, 'html.parser')  # initialize the parser
                        for article in soup.find('ul').find_all('li'):
                            try:
                                date = article.find('date').text.replace('\r', '').replace('\t', '').replace('\n', '')
                                time = article.find('time').text.replace('\r', '').replace('\t', '').replace('\n', '')
                                title = article.find('h3').text.replace('\r', '').replace('\t', '').replace('\n', '')
                                text = article.find('p').text
                                # Convert abbreviated Italian months to numbers
                                # (only the months the live blog has covered so far)
                                date = date.replace('feb', '02').replace('mar', '03').replace('apr', '04')
                                date = date + ' 2022'  # the year is not reported on the page
                                publicationDate = date + ' ' + time
                                publicationDate = datetime.datetime.strptime(publicationDate, '%d %m %Y %H:%M')
                                data.append(['entry', title,
                                             'https://www.tgcom24.mediaset.it/' + temporeale + '?' + str(page),
                                             publicationDate.strftime('%Y-%m-%d %H:%M:%S'), '', text])
                            except (AttributeError, ValueError):
                                # skip list items that are not complete entries
                                pass
            else:
                print(dailyurl, 'skipped. Page returned', r.status_code)
    # Create or update the database
    print(len(data), 'new items to add to the database')
    if len(data) > 0:
        df = pd.concat([df, pd.DataFrame(data, columns=['Type', 'Title', 'url', 'Date', 'Subtitle', 'Article'])],
                       ignore_index=True)  # df.append() was removed in pandas 2.0
        if not os.path.exists(RAWPATH + 'TGCOM24'):
            os.makedirs(RAWPATH + 'TGCOM24')  # create parent folders as needed
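        # Re-running the scraper collects already-saved items again. A minimal
        # dedup sketch before saving (assumption: Title, Date, and Article
        # together identify an item; adjust the subset if that is too strict)
        df = df.drop_duplicates(subset=['Title', 'Date', 'Article'], keep='first')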
        df = df.sort_values(by='Date', ascending=False)
        df.to_csv(DB, index=False)
        print('Database saved as tgcom24.csv in', RAWPATH + 'TGCOM24')