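"""Scrape recent article metadata from IDEAS RePEc journal pages.

parse_article extracts citation metadata from a single article page;
parse_journal walks a journal's latest volumes and yields each release
together with the articles updated within the last n_months.
"""
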
from datetime import datetime as dt

import requests
from bs4 import BeautifulSoup
from lxml import etree


def parse_article(url):
"""
Parse article from IDEAS RePEc.
Args:
url (str): url of the article
Yields:
dict: Dictionary with the following keys:
- title: Title of the article
- date_updated: Date the article was updated
- author: Author of the article
- abstract: Abstract of the article
- jel: JEL codes of the article
- doi: DOI of the article
- link: Link to download the article
"""
    page = requests.get(url)  # Request page
    page.raise_for_status()  # Fail early on HTTP errors
    soup = BeautifulSoup(page.text, 'html.parser')  # Parse HTML source
    # Get metadata from the <meta> citation tags
    title = soup.find(
        'meta', attrs={'name': 'citation_title'}).attrs['content']
    author = soup.find(
        'meta', attrs={'name': 'citation_authors'}).attrs['content'].split(';')
    author = [x.strip() for x in author]
    abstract = soup.find(
        'meta', attrs={'name': 'citation_abstract'}).attrs['content']
    date = soup.find('meta', attrs={'name': 'date'}).attrs['content']
    jel = soup.find('meta', attrs={'name': 'jel_code'}
                    ).attrs['content'].split(';')
    if len(jel) == 1:  # Keep a bare string if there is only one JEL code
        jel = jel[0].strip()
    else:
        jel = [x.strip() for x in jel]  # Strip whitespace from each code
    try:  # DOI is not always available
        doi = soup.find('meta', attrs={'name': 'DOI'}).attrs['content']
    except AttributeError:
        doi = ""
    link = soup.find('input', attrs={'name': 'url'}).attrs['value']
    yield {
        'title': title,
        'date_updated': date,
        'author': author,
        'abstract': abstract,
        'jel': jel,
        'doi': doi,
        'link': link
    }


def parse_journal(url, n_months=1, n_volumes=1):
    """
    Parse a journal page from IDEAS RePEc.

    Args:
        url (str): Journal URL
        n_months (int): Number of months to look back; non-positive
            values collect all articles
        n_volumes (int): Number of volumes to scan

    Yields:
        dict: Dictionary with the following keys:
            - 'date': Date of the release
            - 'volume': Volume number
            - 'issue': Issue number
            - 'articles': List of articles
    """
    starting_url = 'https://ideas.repec.org'  # Base URL for relative links
    page = requests.get(url)  # Request page
    page.raise_for_status()  # Fail early on HTTP errors
    dom = etree.HTML(page.text)  # Parse HTML source
    # Release headers look like "January 2024, Volume 12, Issue 1"
    release = dom.xpath('//div[@id="content"]/h3/text()')
    date = [x.strip().split(', ')[0] for x in release]  # Release dates
    number = [int(x.split(', ')[1].split('Volume ')[1])
              for x in release]  # Volume numbers
    issue = [x.split(', ')[2].split('Issue ')[1]
             for x in release]  # Issue numbers
    # Get the latest volumes
    volumes = dom.xpath(
        '//h2[text()="Content"]/following-sibling::div')[:n_volumes]
    # Collect articles from the last n_months
    if n_months > 0:
        # Subtract n_months with a year rollover: a bare
        # today.replace(month=today.month - n_months) raises ValueError
        # whenever the target month falls outside 1-12
        today = dt.today()
        years_back, month = divmod(today.month - 1 - n_months, 12)
        start_date = today.replace(
            year=today.year + years_back, month=month + 1, day=1)
    else:
        start_date = dt(1900, 1, 1)  # Collect all articles if n_months <= 0
    for j, v in enumerate(volumes):
        d = date[j]  # Release date
        n = number[j]  # Volume number
        i = issue[j]  # Issue number
        a = []  # Articles collected for this volume
        p_list = v.xpath('ul/li/b/a/@href')  # Relative links to the papers
        for p in p_list:
            paper_url = starting_url + p  # Absolute paper URL
            new = list(parse_article(paper_url))  # Parse paper
            # Keep only articles updated on or after start_date
            new = [x for x in new if dt.strptime(
                x['date_updated'], '%Y-%m-%d') >= start_date]
            if new:
                a += new
            else:
                # Assume papers are listed newest first: stop this volume
                # at the first article older than start_date
                break
        if a:
            yield {'date': d, 'volume': n, 'issue': i, 'articles': a}
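

# Minimal usage sketch: the journal URL below is a hypothetical placeholder;
# substitute any real IDEAS RePEc journal listing page.
if __name__ == '__main__':
    journal_url = 'https://ideas.repec.org/s/example/journal.html'  # hypothetical
    for rel in parse_journal(journal_url, n_months=3, n_volumes=2):
        print(f"{rel['date']} - Volume {rel['volume']}, Issue {rel['issue']}")
        for art in rel['articles']:
            print(f"  {art['title']} ({art['date_updated']})")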