harvest_solr.py
import time
import os
import requests
from adsputils import setup_logging, load_config
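
# A minimal sketch of the config values this script expects (the file name and
# loading rules follow adsputils conventions; the URL below is the standard ADS
# API endpoint, stated here as an assumption):
#
#   API_URL = 'https://api.adsabs.harvard.edu/v1'
#   API_TOKEN = '<your ADS API token>'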
solr_config = load_config(proj_home=os.path.realpath(os.path.join(os.path.dirname(__file__), '.')))


def harvest_solr(bibcodes_list, start_index=0, fields='bibcode, title, abstract'):
    ''' Harvests records for an input list of bibcodes using the ADS API,
    performing minor cleaning of utf-8 control characters.
    Logs to output_dir/logs/harvest_clean.log (follow with: tail -f logs/harvest_clean.log).
    bibcodes_list: a list of bibcodes to harvest records for.
    start_index (optional): index into bibcodes_list at which to start harvesting.
    fields (optional): comma-separated fields to request from the API.
        Default is 'bibcode, title, abstract'.
    '''
    logger = setup_logging('harvest_clean', proj_home=os.path.realpath(os.path.dirname(__file__)))
    idx = start_index
    step_size = 2000
    # limit attempts per batch to 10
    total_attempts = 10
    # accumulate the transformed records from every batch
    records = []
    logger.info('Start of harvest')
    print('Harvesting titles and abstracts from Solr')
    # loop through the list of bibcodes and query solr in batches of step_size
    while idx < len(bibcodes_list):
        start_time = time.perf_counter()
        # string to log
        to_log = ''
        attempts = 0
        successful_req = False
        # extract the next batch of up to step_size bibcodes
        input_bibcodes = bibcodes_list[idx:idx + step_size]
        # the bigquery endpoint expects a CSV payload with a 'bibcode' header row
        bibcodes = 'bibcode\n' + '\n'.join(input_bibcodes)
        # retry until the request succeeds or total_attempts is reached
        while (not successful_req) and (attempts < total_attempts):
            r_json = None
            r = requests.post(solr_config['API_URL'] + '/search/bigquery',
                              params={'q': '*:*', 'wt': 'json', 'fq': '{!bitset}',
                                      'fl': fields, 'rows': len(input_bibcodes)},
                              headers={'Authorization': 'Bearer ' + solr_config['API_TOKEN'],
                                       'Content-Type': 'big-query/csv'},
                              data=bibcodes)
            # check that the request worked: proceed on status 200; otherwise
            # log r.text and retry, up to total_attempts tries
            if r.status_code == 200:
                successful_req = True
            else:
                to_log += 'REQUEST {} FAILED: CODE {}\n'.format(attempts, r.status_code)
                to_log += str(r.text) + '\n'
            # increment the attempt count
            attempts += 1
        # after the request: extract the json and accumulate this batch's records
        if successful_req:
            r_json = r.json()
            records.extend(transform_r_json(r_json))
            # info to log
            to_log += 'Harvested links up to {}\n'.format(idx)
        else:
            # log the batch that could not be harvested
            to_log += 'FAILING BIBCODES: {}\n'.format(input_bibcodes)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        print()
        print(f'Harvested bibcodes starting at: {idx} ({total_time:.1f} s)')
        # pause to stay under the API rate limit
        if len(bibcodes_list) > step_size:
            time.sleep(45)
        idx += step_size
        logger.info(to_log)
    return records


def transform_r_json(r_json):
    """
    Extract the needed information from the json response from the solr query,
    filling in None for any missing title or abstract.
    """
    record_list = []
    for doc in r_json['response']['docs']:
        # not every record has a title or an abstract; normalize missing fields to None
        if 'title' not in doc:
            doc['title'] = None
        if 'abstract' not in doc:
            doc['abstract'] = None
        record_list.append(doc)
    return record_list
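

if __name__ == '__main__':
    # Minimal usage sketch, assuming a valid API_TOKEN in the local config and
    # network access to the ADS API; the bibcodes below are illustrative
    # placeholders, not real records.
    example_bibcodes = ['2019AJ....157...98G', '2015RaSc...50..916A']
    for record in harvest_solr(example_bibcodes):
        print(record['bibcode'], record['title'], sep=': ')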