scraperCNA.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 2 10:42:10 2022
Last edit on Sat Apr 2 13:26:00 2022
@author: Giulio Gabrieli
"""
#import required libraries
import os
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
#dictionary to convert months from Italian to English. Required to make
#standardized timestamps
months_dict = {'Gennaio': 'January', 'Febbraio': 'February', 'Marzo': 'March',
               'Aprile': 'April', 'Maggio': 'May', 'Giugno': 'June', 'Luglio': 'July',
               'Agosto': 'August', 'Settembre': 'September', 'Ottobre': 'October',
               'Novembre': 'November', 'Dicembre': 'December'}
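#NOTE: months_dict is not referenced anywhere below; for CNA the timestamps come
#from an ISO-formatted meta tag instead. A minimal sketch of the intended use
#(hypothetical helper, kept commented out so the script's behaviour is unchanged):
#
#   def translate_month(date_str):
#       #replace the Italian month name with its English equivalent
#       for it, en in months_dict.items():
#           date_str = date_str.replace(it, en)
#       return date_str
#
#   translate_month('2 Aprile 2022') #-> '2 April 2022'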
#PARAMETERS
FIRSTPAGE = 0 #index of the first results page; most sites start at 1, but for CNA it is 0
BASEPATH = '/home/giulio/Repositories/ukrainenewsscraper'
RAWPATH = BASEPATH + os.sep + 'Raw' + os.sep #Path to raw data folder
BASEURL = 'https://www.channelnewsasia.com/topic/ukraine-invasion?sort_by=field_release_date_value&sort_order=DESC&type%5Barticle%5D=article&page=' #URL to scrape
DB = RAWPATH+'cna/cna.csv' #name of the database
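#for reference, each request targets BASEURL + str(page); with FIRSTPAGE = 0 the
#first page of a run is:
#https://www.channelnewsasia.com/topic/ukraine-invasion?sort_by=field_release_date_value&sort_order=DESC&type%5Barticle%5D=article&page=0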
if(__name__ == '__main__'):
    print('Initializing scraper for Channel News Asia')
    #check if the database is already present, otherwise initialize an empty dataframe
    if not os.path.exists(DB):
        print('Database not present in', DB)
        print('Initializing empty dataframe')
        df = pd.DataFrame([], columns=['ID','Title','url','Date'])
    else:
        print('Database present in', DB)
        df = pd.read_csv(DB)
        df['ID'] = df.ID.astype(str) #IDs are read back as integers; compare them as strings
        print('Database contains', len(df), 'articles')
    print('Starting download of news\' details (ID, url, Title, Date)')
    #initialize the data array and set the page counter
    page = FIRSTPAGE
    data = []
    #initialize the download control
    download = True
    #prepare the request headers once; the User-Agent is the literal placeholder
    #string from the Firefox documentation
    headers = requests.utils.default_headers()
    headers.update(
        {
            'User-Agent': 'Mozilla/5.0 (platform; rv:geckoversion) Gecko/geckotrail Firefox/firefoxversion',
        }
    )
    #while download is true, loop over the pages (capped at 63) and get the article details
    while download and page <= 63:
        print('Scraping page:', page)
        url = BASEURL + str(page) #generate the URL
        r = requests.get(url, headers=headers) #request the URL
        if(r.status_code == 200): #verify that the page exists
            html = r.text #get the HTML of the page
            soup = BeautifulSoup(html, 'html.parser') #initialize the parser
            articles = soup.findAll('h6')
            for article in articles:
                newsID = article.a.get('href').split('-')[-1] #unique identifier of the news
                newsTitle = article.text
                newsUrl = 'https://www.channelnewsasia.com' + article.a.get('href')
                data.append([newsID, newsTitle, newsUrl]) #temporarily save the details in data
            page += 1 #update the page number
        else:
            #if the status code of the page is not 200 stop the scraper:
            #we have reached the last page containing news
            download = False
    #create or update the database
    print(len(data), 'new articles added to the database')
    if(len(data) > 0):
        #pandas removed DataFrame.append; pd.concat is the supported equivalent
        df = pd.concat([df, pd.DataFrame(data, columns=['ID','Title','url'])], ignore_index=True)
        if not os.path.exists(RAWPATH + 'cna'):
            os.mkdir(RAWPATH + 'cna')
        df = df.sort_values(by='Date', ascending=False)
        df.to_csv(DB, index=False)
        print('Database saved as cna.csv in', RAWPATH + 'cna')
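        #the saved CSV has one row per article (schematic, values are placeholders;
        #Date stays empty until the article body is downloaded below):
        #   ID,Title,url,Date
        #   <id>,<headline>,https://www.channelnewsasia.com/<slug>-<id>,<YYYY-MM-DD HH:MM:SS>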
    #create the articles folder if it is not present
    if not os.path.exists(RAWPATH + 'cna/Articles'):
        os.mkdir(RAWPATH + 'cna/Articles')
    #limit to the articles that are not yet present in the articles folder
    todownload = df[~df.ID.isin([x.replace('.txt', '') for x in os.listdir(RAWPATH + 'cna/Articles')])]
    print('There are', len(todownload), 'new articles to download')
    #if there are new articles to download
    publishedtime = []
    if(len(todownload) > 0):
        #get the article ID and url
        for ID, url in tqdm(np.transpose([todownload.ID.to_list(), todownload.url.to_list()])):
            r = requests.get(url, headers=headers) #get the page
            if(r.status_code == 200): #check the status code
                try:
                    html = r.text #get the HTML
                    soup = BeautifulSoup(html, 'html.parser') #initialize the parser
                    title = soup.article.h1.text #get the title
                    elements = [element.text for element in soup.findAll(['p','h2'])] #get all the elements of the article
                    text = title + '\n' + '\n' + '\n'.join(elements[:-2]) #merge the title and the other elements
                    publishedon = soup.findAll('meta', attrs={'name':'cXenseParse:recs:mdc-changedtime'})[0].get('content')
                    newsDate = datetime.datetime.strptime(publishedon, '%Y-%m-%dT%H:%M:%S+08:00')
                    newsDate = newsDate.strftime('%Y-%m-%d %H:%M:%S')
                    publishedtime.append(newsDate)
                except Exception:
                    print('Article skipped:', ID, url)
                    publishedtime.append(np.nan)
                    continue #skip the file write: text would be stale or undefined here
                #write the news to a text file; the name of the file is the ID of the news
                with open(RAWPATH + 'cna/Articles/' + ID + '.txt', 'w') as myText:
                    myText.write(text)
            else:
                #if the status code of the page is not 200 skip this news, the URL may be wrong
                print(ID, 'skipped. Status code:', r.status_code)
                publishedtime.append(np.nan) #keep publishedtime aligned with todownload
        #set the dates for the rows that were just downloaded (assigning the list to
        #df.Date directly would fail whenever len(publishedtime) != len(df))
        df.loc[df.ID.isin(todownload.ID), 'Date'] = publishedtime
        df.to_csv(DB, index=False)
        print('Database saved as cna.csv in', RAWPATH + 'cna')
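#usage (a sketch; BASEPATH above is machine-specific and needs adjusting):
#   $ python3 scraperCNA.py
#the run produces, under RAWPATH:
#   cna/cna.csv            index of the articles (ID, Title, url, Date)
#   cna/Articles/<ID>.txt  full text (title plus paragraphs) of each article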