-
Notifications
You must be signed in to change notification settings - Fork 1
/
fsr_scraper.py
99 lines (80 loc) · 3.82 KB
/
fsr_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Source: https://link.springer.com/book/10.1007/978-981-15-9460-1#toc
# Columns: ['conference name', 'publication year', 'paper name', 'link to paper', 'citation count', 'first author name', 'first author link', 'last author name', 'last author link', ... remaining author name/link pairs]
# The rows are written out to a CSV file.
import csv
import json
import os

import requests
from bs4 import BeautifulSoup
# Paginated table-of-contents URL of the proceedings volume; {} is the page number.
URL = "https://link.springer.com/book/10.1007/978-981-15-9460-1?page={}&oscar-books=true#toc"
# First table-of-contents page to scrape.
pagenum = 1
# Last table-of-contents page to scrape (inclusive).
LAST_PAGE = 2
# Semantic Scholar Graph API endpoints; {} is filled with the paper's DOI.
api_link = "https://api.semanticscholar.org/graph/v1/paper/{}?fields=citationCount"
api_author_link = "https://api.semanticscholar.org/graph/v1/paper/{}/authors?fields=url"
# Path of the CSV file produced by main().
outfile = 'fsr_2019.csv'
# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 30
# Prefix that turns a DOI into the chapter's landing-page URL.
CHAPTER_URL_PREFIX = "https://link.springer.com/chapter/"
# Marker the listing page uses when it truncates the author list.
ET_AL = "et al."
CSV_HEADER = [
    'conference name',
    'publication year',
    'paper name',
    'link to paper',
    'citation count',
    'first author name',
    'first author link',
    'last author name',
    'last author link',
    '... remaining authors name and link',
]


def extract_doi(href):
    """Return the DOI portion of a chapter link.

    Works for both relative ("/chapter/<doi>") and absolute
    ("https://.../chapter/<doi>") links. If the link has no "/chapter/"
    segment it is returned unchanged.
    """
    return href.rpartition("/chapter/")[2]


def clean_author_names(raw_text):
    """Split comma-separated author text into stripped, non-empty names."""
    stripped = (piece.strip() for piece in raw_text.split(","))
    return [name for name in stripped if name]


def strip_et_al(names):
    """Remove the "et al." marker from *names*, dropping entries left empty."""
    cleaned = (name.replace(ET_AL, "").strip() for name in names)
    return [name for name in cleaned if name]


def build_row(conf, year, title, link, cite_count, names, links):
    """Assemble one CSV row for a paper.

    The author columns are ordered: first author, last author (only when
    there is more than one author), then the remaining authors in their
    original order. Each name is followed by the link at the same index in
    *links*; a blank is used when *links* is shorter than *names*.
    """
    row = [conf, year, title, link, cite_count]
    if not names:
        return row
    paired = [
        (name, links[index] if index < len(links) else "")
        for index, name in enumerate(names)
    ]
    ordered = [paired[0]]
    if len(paired) > 1:
        ordered.append(paired[-1])
        ordered.extend(paired[1:-1])
    for name, url in ordered:
        row.extend((name, url))
    return row


def fetch_json(url):
    """GET *url* and return the decoded JSON object.

    Returns an empty dict (after printing a warning) when the request
    fails, the server answers with an error status, or the body is not a
    JSON object, so one bad lookup does not abort the whole scrape.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = json.loads(response.content)
    except (requests.RequestException, ValueError) as error:
        print("warning: lookup failed for {}: {}".format(url, error))
        return {}
    return payload if isinstance(payload, dict) else {}


def fetch_full_author_names(chapter_url):
    """Return every author name listed on the chapter's own page."""
    chapter_page = requests.get(chapter_url, timeout=REQUEST_TIMEOUT)
    chapter_soup = BeautifulSoup(chapter_page.content, "html.parser")
    items = chapter_soup.find_all('li', class_="c-article-author-list__item")
    return [item.a.text.strip() for item in items if item.a is not None]


def resolve_author_names(listing_names, chapter_url):
    """Return the complete author list for a paper.

    When the listing page truncated the authors with "et al.", the chapter
    page is fetched and its author list replaces the listing names
    entirely. If the chapter page yields no names, the listing names are
    used with the "et al." marker removed.
    """
    if not any(ET_AL in name for name in listing_names):
        return listing_names
    full_names = fetch_full_author_names(chapter_url)
    return full_names if full_names else strip_et_al(listing_names)


def scrape_page(page_number, writer):
    """Scrape one table-of-contents page and write one CSV row per paper."""
    page = requests.get(URL.format(page_number), timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    dividers = soup.find_all('div', class_="c-book-evaluation-divider")
    # Text of the link inside the fourth child node of the first divider;
    # presumably the conference name -- verify against the live page.
    conf = dividers[0].contents[3].a.text
    # Four characters at a fixed offset from the end of the third divider's
    # paragraph; presumably the publication year -- verify if the page
    # layout changes.
    yr = dividers[2].p.text[-48:-44]

    papers = soup.find_all('li', class_="c-card c-list-group__item c-card--flush u-pa-16")
    for paper in papers:
        title = paper.h3.a
        doi = extract_doi(title["href"])
        link = CHAPTER_URL_PREFIX + doi

        cite_count = fetch_json(api_link.format(doi)).get("citationCount")
        if cite_count is None:
            cite_count = ""

        author_item = paper.find('li', class_="c-author-list__item")
        listing_names = clean_author_names(author_item.text) if author_item is not None else []
        names = resolve_author_names(listing_names, link)

        author_records = fetch_json(api_author_link.format(doi)).get("data") or []
        links = [record.get("url") or "" for record in author_records]

        writer.writerow(build_row(conf, yr, title.text.strip(), link, cite_count, names, links))


def main():
    """Scrape pages pagenum..LAST_PAGE and write the results to outfile."""
    # newline="" lets the csv module control line endings; utf-8 keeps
    # non-ASCII titles and names writable on every platform.
    with open(outfile, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        for page_number in range(pagenum, LAST_PAGE + 1):
            scrape_page(page_number, writer)


if __name__ == "__main__":
    main()