-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This feature becomes part of extract_selected. If requested (via --gindoi), and if the dataset's origin is on GIN, the code gets the dataset landing page and parses it using BeautifulSoup to obtain a GIN DOI. This can be useful if a dataset has a DOI from GIN but it is not declared in any of the sources (for example, a CFF file is required to obtain a DOI from GIN, but people rarely add the DOI there once obtained). Closes #97
- Loading branch information
Showing
3 changed files
with
60 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,49 @@ | ||
import json | ||
from uuid import UUID | ||
|
||
from bs4 import BeautifulSoup | ||
import requests | ||
|
||
|
||
class MyEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``UUID`` objects as their string form."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        UUIDs are converted with ``str()``; everything else is delegated to
        the base class, which raises ``TypeError`` for unsupported types.
        """
        if isinstance(obj, UUID):
            return str(obj)
        # super() must be *called* to get the bound proxy; ``super.default``
        # looks the attribute up on the ``super`` type and raises
        # AttributeError instead of the expected TypeError.
        return super().default(obj)
|
||
|
||
def find_gin_url(ds):
    """Find a GIN URL in dataset siblings and return it as an https URL.

    Only the sibling named ``origin`` is considered, and only if its URL
    contains ``gin.g-node.org``. URLs starting with ``https`` are returned
    as-is; scp-style ``git@host:path`` URLs are rewritten to
    ``https://host/path``. A trailing ``.git`` is removed in both cases.

    Parameters
    ----------
    ds
        Object exposing ``siblings(result_renderer=...)`` that yields
        mappings with ``"name"`` and ``"url"`` keys.

    Returns
    -------
    str or None
        The https URL of the GIN repository, or ``None`` if no suitable
        sibling is found.
    """
    for sibling in ds.siblings(result_renderer="disabled"):
        if sibling["name"] != "origin":
            continue
        url = sibling["url"]
        if "gin.g-node.org" not in url:
            continue

        if url.startswith("https"):
            gin_url = url
        elif url.startswith("git@"):
            # scp-style remotes separate host and path with ":"; merely
            # swapping the prefix would leave that colon in the authority
            # part of the URL (e.g. "https://host:user/repo").
            host, colon, path = url[len("git@"):].partition(":")
            if colon:
                gin_url = f"https://{host}/{path.lstrip('/')}"
            else:
                gin_url = f"https://{host}"
        else:
            # Other schemes are not recognised; keep looking.
            continue

        return gin_url[:-4] if gin_url.endswith(".git") else gin_url
    return None
|
||
|
||
def find_gin_doi(ds, timeout=30):
    """Find a GIN DOI on the dataset's GIN landing page.

    Fetches the repository page found by ``find_gin_url`` and parses the
    HTML for a ``div`` whose class attribute is ``"gin doinr"``.

    Parameters
    ----------
    ds
        Dataset object passed through to ``find_gin_url``.
    timeout
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    str or None
        ``https://doi.org/<doi>`` if the tag is found with non-empty text;
        ``None`` if there is no GIN URL, the request fails, or the page
        has no such tag.
    """
    ginurl = find_gin_url(ds)
    if ginurl is None:
        return None

    try:
        # Without a timeout, requests may block forever on a stalled server.
        r = requests.get(ginurl, timeout=timeout)
    except requests.RequestException:
        # The lookup is best-effort: a network failure is treated the same
        # way as a non-OK response.
        return None
    if not r.ok:
        return None

    soup = BeautifulSoup(r.text, "html.parser")
    tag = soup.find("div", class_="gin doinr")
    if tag is None:
        return None

    # Strip whitespace from the markup so it does not end up in the URL.
    doi_number = tag.get_text(strip=True)
    if not doi_number:
        return None
    return f"https://doi.org/{doi_number}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters