Skip to content

Commit

Permalink
Add ability to look up GIN DOI
Browse files Browse the repository at this point in the history
This feature becomes part of extract_selected. If requested (via
--gindoi), and if the dataset's origin is on GIN, the code gets the
dataset landing page and parses it using BeautifulSoup to obtain a GIN
DOI.

This can be useful if a dataset has a DOI from GIN but it is not
declared in any of the sources (for example, a CFF file is required to
obtain a DOI from GIN, but people rarely add the DOI there once
obtained).

Closes #97
  • Loading branch information
mslw committed May 3, 2024
1 parent 5ce3209 commit 592330f
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 1 deletion.
20 changes: 19 additions & 1 deletion code/extract_selected.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
from datalad_next.datasets import Dataset

from list_files import list_files
from utils import MyEncoder
from utils import MyEncoder, find_gin_doi

parser = argparse.ArgumentParser()
parser.add_argument("dataset", type=Path, help="Dataset to extract from")
parser.add_argument("outdir", type=Path, help="Metadata output directory")
parser.add_argument("-c", "--catalog", type=Path, help="Catalog to add metadata to")
parser.add_argument("--files", action="store_true", help="Also list files")
parser.add_argument("--gindoi", action="store_true", help="Also try to look up GIN doi")
parser.add_argument(
"--filename",
help="Use this file name instead of deriving from folder names",
Expand Down Expand Up @@ -90,6 +91,23 @@
json.dump(metadata_item, json_file)
json_file.write("\n")


# If requested and origin is on GIN, search for DOI on GIN website
if args.gindoi:
if (doi := find_gin_doi(ds)) is not None:
basic_item = get_metadata_item(
item_type="dataset",
dataset_id=ds.id,
dataset_version=ds.repo.get_hexsha(),
source_name="gin_website",
source_version="0.1.0",
)
new_item = basic_item | {"doi": doi}
with translated_path.open("w") as json_file:
json.dump(new_item, json_file)
json_file.write("\n")


# update catalog if requested
if args.catalog is not None:
catalog_add(
Expand Down
40 changes: 40 additions & 0 deletions code/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,49 @@
import json
from uuid import UUID

from bs4 import BeautifulSoup
import requests


class MyEncoder(json.JSONEncoder):
    """JSON encoder that serializes UUID objects as strings."""

    def default(self, obj):
        if isinstance(obj, UUID):
            return str(obj)
        # Delegate to the base class so unserializable objects raise the
        # standard TypeError. NOTE: the previous code used ``super.default``
        # (an attribute lookup on the ``super`` builtin itself, never bound
        # to this instance), which raised AttributeError instead.
        return super().default(obj)


def find_gin_url(ds):
    """Return the https URL of the dataset's GIN origin sibling.

    Looks for a sibling named "origin" whose URL points at
    gin.g-node.org. SSH (scp-like) URLs are rewritten to their https
    equivalent and a trailing ".git" is stripped. Returns None when no
    suitable sibling is found.
    """
    for sibling in ds.siblings(result_renderer="disabled"):
        if sibling["name"] != "origin" or "gin.g-node.org" not in sibling["url"]:
            continue
        url = sibling["url"]
        if url.startswith("git@"):
            # scp-like syntax: git@host:path -> https://host/path.
            # The colon separating host and path must become a slash;
            # a plain "git@" -> "https://" substitution would leave
            # "https://host:path", where "path" parses as a port number.
            url = "https://" + url[len("git@"):].replace(":", "/", 1)
        elif not url.startswith("https"):
            # Unrecognized scheme (e.g. plain http or ssh://) — keep looking.
            continue
        return url[:-4] if url.endswith(".git") else url
    return None


def find_gin_doi(ds):
    """Find GIN DOI on GIN's dataset page.

    Fetches the dataset's landing page from GIN and parses the HTML for
    the displayed DOI. Returns the DOI as a "https://doi.org/..." URL,
    or None if the dataset's origin is not on GIN, the page cannot be
    fetched, or no DOI is shown.
    """
    ginurl = find_gin_url(ds)
    if ginurl is None:
        return None

    try:
        # Time-bound the request so a metadata run cannot hang forever on
        # an unresponsive server; this lookup is best-effort, so treat any
        # network failure the same as "no DOI found".
        r = requests.get(ginurl, timeout=30)
    except requests.RequestException:
        return None
    if not r.ok:
        return None

    soup = BeautifulSoup(r.text, "html.parser")
    # The DOI number is rendered in a <div class="gin doinr"> element on
    # the landing page — presumably only when a DOI has been registered.
    tag = soup.find("div", class_="gin doinr")

    return f"https://doi.org/{tag.get_text()}" if tag is not None else None
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ datalad-wackyextra @ git+https://github.com/mslw/datalad-wackyextra.git@main
datalad-catalog @ git+https://github.com/datalad/datalad-catalog@main
tomli >= 2.0.1
tomli_w >= 1.0.0
beautifulsoup4 >= 4.12.3

0 comments on commit 592330f

Please sign in to comment.