-
Notifications
You must be signed in to change notification settings - Fork 1
/
aros-scrapper.py
executable file
·116 lines (95 loc) · 4.74 KB
/
aros-scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#https://ieeexplore.ieee.org/xpl/conhome/9187508/proceeding?isnumber=9196508&pageNumber=6
#ICRA 2020 = 9187508 , isnumber = 9196508
#['conference name', 'publication year','paper name','link to paper','ciation count','first author name','first author link','last author name','last author link', ... author name/link]
#out to a csv file
import os
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
# Shared Chrome options for every Selenium session opened by rendering().
options = Options()
# 'normal' blocks driver.get() until the document 'load' event fires
# (this is Selenium's default strategy, stated here explicitly).
options.page_load_strategy = 'normal'
def rendering(url):
    """Load *url* in a Selenium-driven Chrome and return the rendered HTML.

    IEEE Xplore builds its listing pages with JavaScript, so a plain
    requests.get() would miss the results; a real browser is needed.

    Parameters
    ----------
    url : str
        Fully-formed IEEE Xplore proceedings URL.

    Returns
    -------
    str
        The post-render page source (``driver.page_source``).
    """
    # Requires `chromedriver` on PATH (Selenium locates it automatically).
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Fixed wait for the JS results grid to populate; crude but simple.
        # NOTE(review): a WebDriverWait on the results selector would be
        # faster and more reliable — TODO if this script sees reuse.
        time.sleep(7)
        return driver.page_source
    finally:
        # Always shut Chrome down, even if driver.get() raises — the
        # original leaked a browser process on any exception here.
        driver.quit()
# arso has different url come back to it / hard code and copy paste cells?
# Maps a short conference tag to [punumber, isnumber] as used in the Xplore URL.
conferences = {'arso19': [8937294, 8948708]}  # 75 papers -> fits on one 100-row page
URL = "https://ieeexplore.ieee.org/xpl/conhome/{}/proceeding?isnumber={}&pageNumber={}&rowsPerPage=100"
outfile = 'arso2019.csv'

with open(outfile, 'w') as f:
    # Header row. (Bug fix: the closing quote used to sit AFTER the \n,
    # leaking a stray ' onto the start of the first data row.)
    f.write("'conference name','publication year','paper name','link to paper','ciation count','first author name','first author link','last author name','last author link','... remaining authors name and link'\n")
    for val in conferences.values():
        pagenum = 1
        # Fetch page 1 once up front just to read the conference title,
        # which is repeated in the first column of every row.
        firstpage = rendering(URL.format(val[0], val[1], pagenum))
        firstsoup = BeautifulSoup(firstpage, "html.parser")
        firstout_soup = firstsoup.find("xpl-root")
        conference = firstout_soup.find("div", class_="title-container text-lg-md")
        print(conference.text)
        # With rowsPerPage=100 and ~75 papers, a single page suffices.
        while pagenum <= 1:
            page = rendering(URL.format(val[0], val[1], pagenum))
            print(URL.format(val[0], val[1], pagenum))
            soup = BeautifulSoup(page, "html.parser")
            out_soup = soup.find("xpl-root")
            results = out_soup.find_all('div', class_="List-results-items")
            for result in results:
                # Conference name (text[1:] drops a leading whitespace char).
                f.write('"' + conference.text[1:] + '",')
                # Publication year: last 4 chars of the description span.
                cited = result.find('div', class_="description text-base-md-lh")
                year = cited.find('span')
                f.write("'" + year.text[-4:] + "',")
                # Paper title and absolute link.
                name = result.find('a')
                f.write('"' + name.text + '",')
                f.write("'" + "https://ieeexplore.ieee.org" + name["href"] + "',")
                # Citation count: the number in parentheses of the "Cited by"
                # anchor; papers with no citations have no anchor at all.
                cites = cited.find('a')
                if cites:
                    s = cites.text
                    f.write("'" + s[s.find('(') + 1:s.find(')')] + "',")
                else:
                    f.write("'0',")
                # Collect author names and (where present) profile links.
                # NOTE(review): assumes names and links stay index-aligned,
                # i.e. every linked author precedes every unlinked one —
                # verify against pages with mixed author anchors.
                authors_list = []
                authors_links = []
                authors = result.find('p', class_="author text-base-md-lh")
                if authors:
                    for author in authors.find_all('a'):
                        authors_list.append("'" + author.text + "'")
                        if author.has_attr('href'):
                            authors_links.append("'" + "https://ieeexplore.ieee.org" + author["href"] + "'")
                # Bug fix: the original indexed authors_list[0] unconditionally
                # and crashed with IndexError on any paper without an author
                # block; now author columns are simply left empty in that case.
                if authors_list:
                    # First author name + link.
                    f.write(authors_list[0])
                    if authors_links:
                        f.write("," + authors_links[0])
                    else:
                        f.write(",")
                    # Last author name + link (only when distinct from first).
                    if len(authors_list) > 1:
                        f.write("," + authors_list[-1])
                        if authors_links:
                            f.write("," + authors_links[-1])
                        else:
                            f.write(",")
                    # Remaining middle authors, in listing order.
                    for i in range(1, len(authors_list) - 1):
                        f.write("," + authors_list[i])
                        if authors_links:
                            f.write("," + authors_links[i])
                        else:
                            f.write(",")
                f.write('\n')
            pagenum += 1