[SHARE-777][Feature][HOLD] Add tind.io harvester for AgEcon #667

Open · wants to merge 2 commits into develop
2 changes: 1 addition & 1 deletion setup.py
@@ -57,7 +57,6 @@
'com.peerj = share.harvesters.com_peerj:PeerJHarvester',
'com.researchregistry = share.harvesters.com_researchregistry:ResearchRegistryHarvester',
'com.springer = share.harvesters.com_springer:SpringerHarvester',
-'edu.ageconsearch = share.harvesters.edu_ageconsearch:AgEconHarvester',
'edu.gwu = share.harvesters.edu_gwu:GWScholarSpaceHarvester',
'edu.harvarddataverse = share.harvesters.edu_harvarddataverse:HarvardDataverseHarvester',
'gov.clinicaltrials = share.harvesters.gov_clinicaltrials:ClinicalTrialsHarvester',
@@ -67,6 +66,7 @@
'gov.scitech = share.harvesters.gov_scitech:SciTechHarvester',
'gov.usgs = share.harvesters.gov_usgs:USGSHarvester',
'io.osf = share.harvesters.io_osf:OSFHarvester',
+'io.tind = share.harvesters.io_tind:TindHarvester',
'oai = share.harvesters.oai:OAIHarvester',
'org.arxiv = share.harvesters.org_arxiv:ArxivHarvester',
'org.biorxiv = share.harvesters.org_biorxiv:BiorxivHarvester',
117 changes: 0 additions & 117 deletions share/harvesters/edu_ageconsearch.py

This file was deleted.

80 changes: 80 additions & 0 deletions share/harvesters/io_tind.py
@@ -0,0 +1,80 @@
import logging

from furl import furl
from lxml import etree
import pendulum

from share.harvest import BaseHarvester

logger = logging.getLogger(__name__)


class TindHarvester(BaseHarvester):
    """
    Expected harvester kwargs:
        collection: collection name to harvest
        page_size: records per request
        format_code:
            'xo': MODS XML
            'xd': Dublin Core-ish XML
            'xm': MARC XML
            'hm': MARC
            'hb': HTML

    API Query Parameters:
        dt (type of date filter: 'm' for date modified)
        d1d (start of date range day)
        d1m (start of date range month)
        d1y (start of date range year)
        d2d (end of date range day)
        d2m (end of date range month)
        d2y (end of date range year)
        sc (split by collection: 0 or 1)
        sf (sort field: e.g. 'latest first')
        so (sort order: 'a' for ascending, 'd' for descending)
        rg (page size)
        jrec (offset)
        of (format code, see above)
    """
    VERSION = 1

    namespaces = {
        'mods': 'http://www.loc.gov/mods/v3',
    }

    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
        page_size = self.kwargs['page_size']
        offset = 1
        url = furl(self.config.base_url)
        url.args.update({
            'c': self.kwargs['collection'],
            'of': self.kwargs['format_code'],
            'rg': page_size,
            'dt': 'm',
            'd1d': start_date.day,
            'd1m': start_date.month,
            'd1y': start_date.year,
            'd2d': end_date.day,
            'd2m': end_date.month,
            'd2y': end_date.year,
            'sc': 0,  # Splitting by collection screws up the page size
            'sf': 'latest first',
            'so': 'd',
        })

        # Page through the results until an empty page comes back; jrec is the 1-based record offset.
        while True:
            logger.debug('Making request to %s', url.url)
            resp = self.requests.get(url.url)
            resp.raise_for_status()

            parsed = etree.fromstring(resp.content, parser=etree.XMLParser(recover=True))
            records = parsed.xpath('/modsCollection/mods:mods', namespaces=self.namespaces)
            if not records:
                break

            for record in records:
                id = record.xpath('mods:recordInfo/mods:recordIdentifier', namespaces=self.namespaces)[0].text
                yield (id, etree.tostring(record, encoding=str))

            offset += page_size
            url.args['jrec'] = offset
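
For reference, with the kwargs the edu.ageconsearch source config below supplies (collection 'AgEcon Search', format code 'xo', page size 100), the first request do_harvest builds would look roughly like this; the date range is illustrative, and the exact parameter order and space encoding depend on furl:

http://ageconsearch.tind.io/search?c=AgEcon%20Search&of=xo&rg=100&dt=m&d1d=1&d1m=6&d1y=2017&d2d=30&d2m=6&d2y=2017&sc=0&sf=latest%20first&so=d

Each following page adds jrec=101, jrec=201, and so on, until a response contains no mods:mods records.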
19 changes: 15 additions & 4 deletions share/sources/edu.ageconsearch/source.yaml
@@ -1,12 +1,23 @@
 configs:
-- base_url: http://ageconsearch.umn.edu/browse-date
+- base_url: http://ageconsearch.tind.io/search
   disabled: false
   earliest_date: null
-  harvester: edu.ageconsearch
-  harvester_kwargs: {}
-  label: edu.ageconsearch
+  harvester: io.tind
+  harvester_kwargs:
+    collection: AgEcon Search
+    page_size: 100
+    format_code: xo
+  label: edu.ageconsearch.tind
   rate_limit_allowance: 1
   rate_limit_period: 2
+  transformer: mods
+  transformer_kwargs:
+    emitted_type: Preprint
+- base_url: http://ageconsearch.umn.edu/browse-date
+  disabled: true
+  earliest_date: null
+  harvester: null
+  label: edu.ageconsearch
   transformer: edu.ageconsearch
   transformer_kwargs: {}
 home_page: http://ageconsearch.umn.edu/
18 changes: 13 additions & 5 deletions share/transformers/mods.py
@@ -313,7 +313,11 @@ class MODSCreativeWork(Parser):
)
)

-    date_updated = tools.ParseDate(tools.Try(ctx.header.datestamp))
+    date_updated = tools.OneOf(
+        tools.ParseDate(ctx.header.datestamp),
+        tools.ParseDate(ctx['mods:recordInfo']['mods:recordChangeDate']),
+        tools.Static(None)
+    )

# TODO (in regulator) handle date ranges, uncertain dates ('1904-1941', '1890?', '1980-', '19uu', etc.)
date_published = tools.OneOf(
@@ -492,7 +496,11 @@ def do_transform(self, data):

def unwrap_data(self, data):
unwrapped_data = xmltodict.parse(data, process_namespaces=True, namespaces=self.kwargs.get('namespaces', self.NAMESPACES))
-        return {
-            **unwrapped_data['record'].get('metadata', {}).get('mods:mods', {}),
-            'header': unwrapped_data['record']['header'],
-        }
+        if 'record' in unwrapped_data:
+            return {
+                **unwrapped_data['record'].get('metadata', {}).get('mods:mods', {}),
+                'header': unwrapped_data['record']['header'],
+            }
+        elif 'mods:mods' in unwrapped_data:
+            return unwrapped_data['mods:mods']
+        raise ValueError('Unrecognized MODS wrapper!\n{}'.format(data))
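
For context, here is a minimal sketch of the two input shapes the reworked unwrap_data distinguishes. The XML strings are illustrative stand-ins (real records carry full MODS metadata), and plain xmltodict.parse is used here instead of the namespace-aware call in the transformer:

import xmltodict

# OAI-PMH wrapped record, as produced by the existing OAI-based sources;
# unwrap_data keeps the 'header', so date_updated is parsed from header.datestamp.
oai_record = '''<record>
  <header><datestamp>2017-06-01T00:00:00Z</datestamp></header>
  <metadata><mods:mods xmlns:mods="http://www.loc.gov/mods/v3"/></metadata>
</record>'''

# Bare MODS record, as yielded by the new TindHarvester; there is no header,
# so date_updated falls back to mods:recordInfo/mods:recordChangeDate.
tind_record = '''<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:recordInfo><mods:recordChangeDate>2017-06-01</mods:recordChangeDate></mods:recordInfo>
</mods:mods>'''

print('record' in xmltodict.parse(oai_record))      # True: handled by the first branch
print('mods:mods' in xmltodict.parse(tind_record))  # True: handled by the new elif branch

Any other top-level shape now raises the 'Unrecognized MODS wrapper!' ValueError instead of failing with a KeyError.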