diff --git a/code/extract_selected.py b/code/extract_selected.py
index f964a5ac..22047cbb 100644
--- a/code/extract_selected.py
+++ b/code/extract_selected.py
@@ -13,13 +13,14 @@
 from datalad_next.datasets import Dataset
 
 from list_files import list_files
-from utils import MyEncoder
+from utils import MyEncoder, find_gin_doi
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dataset", type=Path, help="Dataset to extract from")
 parser.add_argument("outdir", type=Path, help="Metadata output directory")
 parser.add_argument("-c", "--catalog", type=Path, help="Catalog to add metadata to")
 parser.add_argument("--files", action="store_true", help="Also list files")
+parser.add_argument("--gindoi", action="store_true", help="Also try to look up GIN doi")
 parser.add_argument(
     "--filename",
     help="Use this file name instead of deriving from folder names",
@@ -90,6 +91,24 @@
     json.dump(metadata_item, json_file)
     json_file.write("\n")
 
+
+# If requested and origin is on GIN, search for DOI on GIN website
+if args.gindoi:
+    if (doi := find_gin_doi(ds)) is not None:
+        basic_item = get_metadata_item(
+            item_type="dataset",
+            dataset_id=ds.id,
+            dataset_version=ds.repo.get_hexsha(),
+            source_name="gin_website",
+            source_version="0.1.0",
+        )
+        new_item = basic_item | {"doi": doi}
+        # append: mode "w" would truncate the metadata record written above
+        with translated_path.open("a") as json_file:
+            json.dump(new_item, json_file)
+            json_file.write("\n")
+
+
 # update catalog if requested
 if args.catalog is not None:
     catalog_add(
diff --git a/code/utils.py b/code/utils.py
index c4374563..5fea92b2 100644
--- a/code/utils.py
+++ b/code/utils.py
@@ -1,9 +1,51 @@
 import json
 from uuid import UUID
 
+from bs4 import BeautifulSoup
+import requests
+
 
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
         if type(obj) is UUID:
             return str(obj)
         return super.default(obj)
+
+
+def find_gin_url(ds):
+    """Find a GIN URL in dataset siblings, return https"""
+    for sibling in ds.siblings(result_renderer="disabled"):
+        if sibling["name"] == "origin" and "gin.g-node.org" in sibling["url"]:
+            if sibling["url"].startswith("https"):
+                return (
+                    sibling["url"]
+                    if not sibling["url"].endswith(".git")
+                    else sibling["url"][:-4]
+                )
+            elif sibling["url"].startswith("git@"):
+                # convert SSH form (git@host:owner/repo) to https://host/owner/repo
+                # -- the ":" before the path must become "/"
+                gin_url = "https://" + sibling["url"][4:].replace(":", "/", 1)
+                return gin_url if not gin_url.endswith(".git") else gin_url[:-4]
+    return None
+
+
+def find_gin_doi(ds):
+    """Find GIN DOI on GIN's dataset page
+
+    Gets html response and parses it.
+
+    """
+    ginurl = find_gin_url(ds)
+    if ginurl is None:
+        return None
+
+    r = requests.get(ginurl, timeout=30)
+    if not r.ok:
+        return None
+    soup = BeautifulSoup(r.text, "html.parser")
+    tag = soup.find("div", class_="gin doinr")
+
+    doi = f"https://doi.org/{tag.get_text()}" if tag is not None else None
+
+    return doi
diff --git a/requirements.txt b/requirements.txt
index 1bacaeb3..ae0ee1e9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,5 @@ datalad-wackyextra @ git+https://github.com/mslw/datalad-wackyextra.git@main
 datalad-catalog @ git+https://github.com/datalad/datalad-catalog@main
 tomli >= 2.0.1
 tomli_w >= 1.0.0
+beautifulsoup4 >= 4.12.3
+requests >= 2.31.0