Skip to content

Commit

Permalink
Add ability to look up GIN DOI
Browse files Browse the repository at this point in the history
This feature becomes part of extract_selected. If requested (via
--gindoi), and if the dataset's origin is on GIN, the code gets the
dataset landing page and parses it using BeautifulSoup to obtain a GIN
DOI.

This can be useful if a dataset has a DOI from GIN but it is not
declared in any of the sources (for example, a CFF file is required to
obtain a DOI from GIN, but people rarely add the DOI there once
obtained).

Closes #97
  • Loading branch information
mslw committed May 3, 2024
1 parent 5ce3209 commit 592330f
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 1 deletion.
20 changes: 19 additions & 1 deletion code/extract_selected.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
from datalad_next.datasets import Dataset

from list_files import list_files
from utils import MyEncoder
from utils import MyEncoder, find_gin_doi

parser = argparse.ArgumentParser()
parser.add_argument("dataset", type=Path, help="Dataset to extract from")
parser.add_argument("outdir", type=Path, help="Metadata output directory")
parser.add_argument("-c", "--catalog", type=Path, help="Catalog to add metadata to")
parser.add_argument("--files", action="store_true", help="Also list files")
parser.add_argument("--gindoi", action="store_true", help="Also try to look up GIN doi")
parser.add_argument(
"--filename",
help="Use this file name instead of deriving from folder names",
Expand Down Expand Up @@ -90,6 +91,23 @@
json.dump(metadata_item, json_file)
json_file.write("\n")


# If requested and origin is on GIN, search for DOI on GIN website
if args.gindoi:
if (doi := find_gin_doi(ds)) is not None:
basic_item = get_metadata_item(
item_type="dataset",
dataset_id=ds.id,
dataset_version=ds.repo.get_hexsha(),
source_name="gin_website",
source_version="0.1.0",
)
new_item = basic_item | {"doi": doi}
with translated_path.open("w") as json_file:
json.dump(new_item, json_file)
json_file.write("\n")


# update catalog if requested
if args.catalog is not None:
catalog_add(
Expand Down
40 changes: 40 additions & 0 deletions code/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,49 @@
import json
from uuid import UUID

from bs4 import BeautifulSoup
import requests


class MyEncoder(json.JSONEncoder):
    """JSON encoder that serializes UUID objects as strings."""

    def default(self, obj):
        if isinstance(obj, UUID):
            return str(obj)
        # Delegate to the base class so unserializable objects raise the
        # standard TypeError. NOTE: the previous code used ``super.default``
        # (an attribute lookup on the ``super`` builtin itself, never bound
        # to this instance), which raised AttributeError instead.
        return super().default(obj)


def find_gin_url(ds):
    """Return the https URL of the dataset's GIN origin sibling.

    Looks for a sibling named "origin" whose URL points at
    gin.g-node.org. SSH (scp-like) URLs are rewritten to their https
    equivalent and a trailing ".git" is stripped. Returns None when no
    suitable sibling is found.
    """
    for sibling in ds.siblings(result_renderer="disabled"):
        if sibling["name"] != "origin" or "gin.g-node.org" not in sibling["url"]:
            continue
        url = sibling["url"]
        if url.startswith("git@"):
            # scp-like syntax: git@host:path -> https://host/path.
            # The colon separating host and path must become a slash;
            # a plain "git@" -> "https://" substitution would leave
            # "https://host:path", where "path" parses as a port number.
            url = "https://" + url[len("git@"):].replace(":", "/", 1)
        elif not url.startswith("https"):
            # Unrecognized scheme (e.g. plain http or ssh://) — keep looking.
            continue
        return url[:-4] if url.endswith(".git") else url
    return None


def find_gin_doi(ds):
    """Find GIN DOI on GIN's dataset page.

    Fetches the dataset's landing page from GIN and parses the HTML for
    the displayed DOI. Returns the DOI as a "https://doi.org/..." URL,
    or None if the dataset's origin is not on GIN, the page cannot be
    fetched, or no DOI is shown.
    """
    ginurl = find_gin_url(ds)
    if ginurl is None:
        return None

    try:
        # Time-bound the request so a metadata run cannot hang forever on
        # an unresponsive server; this lookup is best-effort, so treat any
        # network failure the same as "no DOI found".
        r = requests.get(ginurl, timeout=30)
    except requests.RequestException:
        return None
    if not r.ok:
        return None

    soup = BeautifulSoup(r.text, "html.parser")
    # The DOI number is rendered in a <div class="gin doinr"> element on
    # the landing page — presumably only when a DOI has been registered.
    tag = soup.find("div", class_="gin doinr")

    return f"https://doi.org/{tag.get_text()}" if tag is not None else None
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ datalad-wackyextra @ git+https://github.com/mslw/datalad-wackyextra.git@main
datalad-catalog @ git+https://github.com/datalad/datalad-catalog@main
tomli >= 2.0.1
tomli_w >= 1.0.0
beautifulsoup4 >= 4.12.3

0 comments on commit 592330f

Please sign in to comment.