sample_script.py
#!/usr/bin/env python3
#SAMPLE SCRIPT [assuming an ENGLISH website]
#For privacy reasons, the website URL and user agent are not shared.
#Importing the required libraries
import urllib.request,urllib.parse,urllib.error
from bs4 import BeautifulSoup
import ssl
from datetime import datetime
import re
import MySQLdb
#For calculating the time taken to scrape the data
starttime=datetime.now()
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
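#Note: the relaxed context above only takes effect when it is passed to
#each urllib.request.urlopen(..., context=ctx) call below.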
#Connecting to the MySQL server. The database itself is created below,
#so no db is selected here; selecting db='Scraped_Data' in connect()
#would fail on the first run, before the database exists.
conn=MySQLdb.connect(host='localhost',user='your_username',password='your_password')
cur=conn.cursor()
#To make MySQL store data in UTF-8
conn.set_character_set('utf8')
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')
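#Note: passing charset='utf8' directly to MySQLdb.connect() is a
#one-step alternative with much the same effect as the statements above.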
cur.execute('CREATE DATABASE IF NOT EXISTS Scraped_Data')
cur.execute('USE Scraped_Data')
cur.execute('CREATE TABLE IF NOT EXISTS Cryptocurrency_Data(Published_Date TEXT, News_URL VARCHAR(255) UNIQUE , Title TEXT, Content TEXT, Images_URL TEXT, News_Source TEXT, Language TEXT)')
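#News_URL is declared UNIQUE so that re-running the script is idempotent:
#the INSERT IGNORE in parsing() below silently skips articles whose URL
#is already stored.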
#First URL to fetch (not shared for privacy reasons)
p_url='enter your url here'
try:
    #Opening the URL with a custom User-Agent (not shared for privacy
    #reasons), using the relaxed SSL context created above
    p_req=urllib.request.Request(p_url, headers={'User-Agent': 'enter your user agent id here'})
    p_html=urllib.request.urlopen(p_req, context=ctx).read()
    #Parsing it using Beautiful Soup
    p_soup=BeautifulSoup(p_html, 'html.parser')
    #Extracting the total page number from the last pagination link
    last_url=p_soup.find('div',{'class':'pagination'}).findAll('a')[3]['href']
    page_total=re.search(r'\d+',last_url).group()
    print(page_total)
except Exception as e:
    #Without the page count the crawl cannot continue, so report the
    #error and stop instead of leaving page_total undefined
    print('Failed to determine the total page count:', e)
    raise
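#For example, if the last pagination link's href ends in 'page/57', the
#regex above yields page_total = '57'. This assumes the page count is the
#first run of digits in that href; adjust the pattern for other layouts.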
#Parsing Function
def parsing(arg):
    #'arg' is one news link (an <a> tag) from the current listing page
    gc_url=arg['href']
    gc_req=urllib.request.Request(gc_url, headers={'User-Agent': 'enter your user agent id here'})
    gc_html=urllib.request.urlopen(gc_req, context=ctx).read()
    gc_soup=BeautifulSoup(gc_html, 'html.parser')
    db1=c_url #News_Source.....Eg: page/1 (global, set in the loop below)
    db2=gc_url #News_URL
    db3=arg['title'] #Title
    db4=gc_soup.find('meta',{'property':'og:description'})['content'] #Content
    db6=str(gc_soup.find('meta',{'property':'article:published_time'})['content']) #Published_Date
    #The header image URL is embedded in an inline style such as
    #background-image:url('https://...'); slice off the url('...') wrapper
    gc_imageurl=gc_soup.find('div',{'class':'article-top-image-section'})['style']
    i=gc_imageurl.find('(')
    f=gc_imageurl.find(')')
    db5=gc_imageurl[i+2:f-1] #Images_URL
    cur.execute("INSERT IGNORE INTO Cryptocurrency_Data (Published_Date,News_URL,Title,Content,Images_URL,News_Source,Language) VALUES (%s,%s,%s,%s,%s,%s,'ENGLISH')", (db6,db2,db3,db4,db5,db1))
#Now moving sequentially from page to page, parsing each page's data and storing it in the database
for pg_no in range(1,int(page_total)+1):
    c_url='enter your url here'+str(pg_no)
    try:
        c_req=urllib.request.Request(c_url, headers={'User-Agent': 'enter your user agent id here'})
        c_html=urllib.request.urlopen(c_req, context=ctx).read()
        c_soup=BeautifulSoup(c_html, 'html.parser')
        gc_urls=c_soup.findAll('a',{'class':'standard-format-icon'})
        for gc in gc_urls:
            parsing(gc)
        #Committing once per page, so a crash loses at most one page of work
        conn.commit()
    except Exception as e:
        #Skipping a page that fails to download or parse, but reporting it
        #instead of failing silently
        print('Skipping', c_url, ':', e)
cur.execute("SELECT * FROM Cryptocurrency_Data ORDER BY Published_Date DESC")
cur.close()
endtime=datetime.now()
print(endtime-starttime)
#[code by AAYUSH GADIA]