From d09d47829608092d343a6d80fc7ab035d636e4ca Mon Sep 17 00:00:00 2001 From: Matthias Probst Date: Thu, 27 Jun 2024 15:49:29 +0200 Subject: [PATCH] major zenodo simplifaction. only one interface implementation needed --- docs/_static/repo_class_diagram.svg | 619 +++++++----------- ...lass_diagram.svg.2024_06_27_14_29_45.0.svg | 589 +++++++++++++++++ docs/userguide/repository/zenodo.ipynb | 54 +- h5rdmtoolbox/convention/core.py | 2 +- .../convention/standard_names/table.py | 2 +- h5rdmtoolbox/repository/interface.py | 48 +- h5rdmtoolbox/repository/zenodo/core.py | 327 ++++++++- h5rdmtoolbox/utils.py | 32 +- tests/conventions/test_conventions.py | 2 +- tests/repository/test_zenodo.py | 193 +++++- 10 files changed, 1393 insertions(+), 475 deletions(-) create mode 100644 docs/_static/repo_class_diagram.svg.2024_06_27_14_29_45.0.svg diff --git a/docs/_static/repo_class_diagram.svg b/docs/_static/repo_class_diagram.svg index 24eabc0..33bf6fd 100644 --- a/docs/_static/repo_class_diagram.svg +++ b/docs/_static/repo_class_diagram.svg @@ -2,9 +2,9 @@ + visible="false" /> - - - + width="46.513145" + height="50.694767" + x="35.817364" + y="54.78907" /> RepositoryInterface(ABC) - + download_file(...) + x="38.633457" + y="60.147915">RepositoryInterface(ABC) + download_files() - + get_filenames() - + _upload_file(...) + x="38.390629" + y="73.131584">+ _upload_file(...) - - - + upload_file(...) + x="38.53532" + y="101.90322">+ upload_file(...) + get_doi() + x="38.390629" + y="84.551346">+ get_doi() + exists() + x="38.390629" + y="78.841461">+ exists() + id="rect1-8" + width="75.262329" + height="89.921661" + x="20.619564" + y="120.32349" /> ZenodoSandboxDeposit - + id="tspan1-3" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:'sans-serif Bold';stroke-width:0.264583" + x="23.435642" + y="125.68233">ZenodoRecord + get_filenames() + x="23.110783" + y="179.86317">+ delete() + _upload_file(...) 
- + access_token + x="23.255474" + y="185.76414">+ json(...) - - ZenodoRecord - + get_metadata() - + set_metadata(...) + _upload_file(...) - - - AbstractZenodoInterface + id="tspan7" + style="stroke-width:0.265;stroke-dasharray:none" + x="23.255474" + y="138.01498">+ base_url + delete() + id="tspan8" + style="stroke-width:0.265;stroke-dasharray:none" + x="23.255474" + y="145.25493">+ __init__(source, sandbox=False) + json(...) + id="tspan18-4" + style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:'sans-serif Italic';stroke-width:0.265;stroke-dasharray:none" + x="38.390629" + y="90.26123">+ set_metadata() + get_doi() - - + id="tspan20-4" + style="font-style:italic;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:'sans-serif Italic';stroke-width:0.265;stroke-dasharray:none" + x="38.390629" + y="95.971123">+ get_metadata() + rec_url + x="23.255474" + y="191.74951">+ delete() + __init__(...) + x="23.255474" + y="197.25256">+ unlock() + exists() + x="23.255474" + y="202.75563">+ discard() + rec_id + x="23.255474" + y="208.25868">+ publish() + + set_metadata() + x="38.390629" + y="66.856186">+ files + deposit_url + x="23.255474" + y="132.72333">+ files: Dict[str: RepositoryFile] + get_metadata() + id="tspan14" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:sans-serif;stroke-width:0.265;stroke-dasharray:none" + x="23.110783" + y="151.16451">+ _upload_file(...) + get_metadata() + id="tspan17" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:sans-serif;stroke-width:0.265;stroke-dasharray:none" + x="23.110783" + y="162.61848">+ get_doi() + set_metadata() + x="23.110783" + y="157.11542">+ exists() ... 
+ id="tspan21" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:sans-serif;stroke-width:0.265;stroke-dasharray:none" + x="23.110783" + y="168.6004">+ set_metadata() + new_version() + x="23.110783" + y="174.10345">+ get_metadata() + delete() + x="108.36485" + y="130.43611">N + ... + id="tspan1-3-6-0" + style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-family:sans-serif;-inkscape-font-specification:'sans-serif Bold';stroke-width:0.264583" + x="114.35098" + y="132.3085">RepositoryFile + + download_file(...) + x="114.43713" + y="167.27322">+ get(filename:str) + + download_files() + id="tspan10" + style="stroke-width:0.265;stroke-dasharray:none" + x="114.43713" + y="155.48082">+ download_url + get_filenames() + id="tspan25" + style="stroke-width:0.265;stroke-dasharray:none" + x="114.43713" + y="144.63701">+ access_url + unlock() + x="114.43713" + y="150.19327">+ checksum + discard() + x="114.43713" + y="139.34947">+ name + publish() + x="113.1418" + y="158.9287">... + + diff --git a/docs/_static/repo_class_diagram.svg.2024_06_27_14_29_45.0.svg b/docs/_static/repo_class_diagram.svg.2024_06_27_14_29_45.0.svg new file mode 100644 index 0000000..cba56d2 --- /dev/null +++ b/docs/_static/repo_class_diagram.svg.2024_06_27_14_29_45.0.svg @@ -0,0 +1,589 @@ + + + + + + + + + + + + + + + + + + RepositoryInterface(ABC) + + _upload_file(...) + + + + upload_file(...) + + get_doi() + + exists() + + ZenodoRecord + + delete() + + json(...) + + + + base_url + + __init__(source, sandbox=False) + + set_metadata() + + get_metadata() + + delete() + + unlock() + + discard() + + publish() + + + files + + files + + _upload_file(...) 
+ + get_doi() + + exists() + + set_metadata() + + get_metadata() + + + + RepositoryFiles + collections.abc.Sequence + + + get(filename) -> RepositoryFile + + files: RepositoryFile + + + __getitem__() + + __len__() + + RepositoryFile + + + get(filename:str) + + + download_url + + access_url + + checksum + + name + ... + + diff --git a/docs/userguide/repository/zenodo.ipynb b/docs/userguide/repository/zenodo.ipynb index 9c5246d..b90707c 100644 --- a/docs/userguide/repository/zenodo.ipynb +++ b/docs/userguide/repository/zenodo.ipynb @@ -7,9 +7,13 @@ "source": [ "# Zenodo\n", "\n", - "There are two types of Zenodo interfaces. One interfaces to the public repositories (`ZenodoRecord`), the other is for testing and accessed the sandbox server (`ZenodoSandboxDeposit`).\n", + "The [Zenodo](https://zenodo.org/) repository is a concrete implementation of the `RepositoryInterface`. Other repositories such as `Figshare` (https://figshare.com/) could be possible future realizations of it.\n", "\n", - "The class diagram below shows how they are constructed. First, an abstract zenodo interface class (`AbstractZenodoInterface`) is derived. From this, the concrete interface classes are derived.\n", + "Zenodo provides a sandbox (testing environment) and a production environment. They work the same in principle. Therefore, only one implementation is needed, which is `ZenodoRecord` (the interface to a record in Zenodo). Pass `sandbox=True` to use the testing environment.\n", + "\n", + "The below diagram shows the abstract base class with its abstract methods (indicated by italics). Note, that `upload_file()` is *not* abstract. The subclasses must implement `__upload_file__`, which uploads a file to the repository record. `upload_file()` is basically a wrapper, which additionally allows generating metadata files of the uploaded files. We will explore this feature later in this section.\n", + "\n", + "The `RepositoryInterface` further defines the communication with files. 
A file object `RepositoryFile` is implemented, providing mandatory properties as well as a download method. A repository implementation (just like the one for Zenodo) must return a Dictionary of `RepositoryFile` objects for the `files` class property (see source code for in-depth explanation and the example at the end of this section).\n", "\n", "\"../../_static/repo_class_diagram.svg\"\n", 1\u001b[0m [f\u001b[38;5;241m.\u001b[39mname \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m repo\u001b[38;5;241m.\u001b[39mfiles]\n", + "Cell \u001b[1;32mIn[7], line 1\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[1;32m----> 1\u001b[0m [\u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m repo\u001b[38;5;241m.\u001b[39mfiles]\n", + "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'name'" + ] } ], "source": [ - "repo.get_filenames()" + "[f.name for f in repo.files]" ] }, { @@ -202,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "fbf04793-b7eb-4377-a904-edb91542b056", "metadata": {}, "outputs": [], @@ -220,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "02290359-b13b-413d-9b93-9706b3ab087d", "metadata": {}, "outputs": [], @@ -238,23 +244,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "4e75f4fc-6311-470c-9204-93c1c5d768d0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tmp0.txt\n", - "tmp0.hdf\n", - "tmp0.jsonld\n" - ] - } - ], + "outputs": [], "source": [ "for file in repo.files:\n", - " print(file.filename)" + " print(file.name)" ] }, { @@ -282,7 +278,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.8.19" } }, "nbformat": 4, diff --git a/h5rdmtoolbox/convention/core.py 
b/h5rdmtoolbox/convention/core.py index b95b226..6784224 100644 --- a/h5rdmtoolbox/convention/core.py +++ b/h5rdmtoolbox/convention/core.py @@ -755,7 +755,7 @@ def from_zenodo(doi_or_recid: str, if not filename.exists() or force_download: record = zenodo.ZenodoRecord(rec_id) - filenames = record.get_filenames() + filenames = list(record.files.keys()) if name is None: matches = [file for file in filenames if pathlib.Path(file).suffix == '.yaml'] else: diff --git a/h5rdmtoolbox/convention/standard_names/table.py b/h5rdmtoolbox/convention/standard_names/table.py index a06156a..399df3d 100644 --- a/h5rdmtoolbox/convention/standard_names/table.py +++ b/h5rdmtoolbox/convention/standard_names/table.py @@ -752,7 +752,7 @@ def from_zenodo(source: str = None, doi_or_recid=None) -> "StandardNameTable": z = zenodo.ZenodoRecord(rec_id) assert z.exists() - filenames = [file.download(target_folder=UserDir['standard_name_tables']) for file in z.files] + filenames = [file.download(target_folder=UserDir['standard_name_tables']) for file in z.files.values()] # filenames = z.download_files(target_folder=UserDir['standard_name_tables']) assert len(filenames) == 1 filename = filenames[0] diff --git a/h5rdmtoolbox/repository/interface.py b/h5rdmtoolbox/repository/interface.py index 4dbb3e6..2cfe4a3 100644 --- a/h5rdmtoolbox/repository/interface.py +++ b/h5rdmtoolbox/repository/interface.py @@ -6,6 +6,8 @@ import appdirs +from h5rdmtoolbox.utils import deprecated + logger = logging.getLogger('h5rdmtoolbox') @@ -37,14 +39,16 @@ def _HDF2JSON(filename: Union[str, pathlib.Path], **kwargs) -> pathlib.Path: return hdf2jsonld(filename=filename, skipND=1) -class RepositoryFile(abc.ABC): +class RepositoryFile: + """The interface class to files in a repository""" - def __init__(self, identifier, + def __init__(self, + identifier, identifier_url, download_url, access_url, checksum, - filename, + name, size, media_type, access_token=None, @@ -52,7 +56,7 @@ def __init__(self, identifier, 
self.download_url = download_url self.access_url = access_url self.checksum = checksum - self.filename = filename + self.name = name self.media_type = media_type self.size = size self.identifier = identifier @@ -60,6 +64,9 @@ def __init__(self, identifier, self.access_token = access_token self.additional_data = kwargs + def __repr__(self): + return f"{self.__class__.__name__}({self.name})" + def info(self) -> Dict: return dict(identifier=self.identifier, identifier_url=self.identifier_url, @@ -132,30 +139,31 @@ def set_metadata(self, metadata): @abc.abstractmethod def download_file(self, filename): - """Download a specific file from the repository.""" + """Download a specific file from the repository. + + ..note: This method is deprecated. Use method `.files.get(filename).download()` method instead. + """ @abc.abstractmethod def download_files(self): - """Download all files from the repository.""" + """Download all files from the repository. + ..note: This method is deprecated. Please iterate over `files` and call .download() on the items. + """ + + @deprecated(version='1.4.0rc1', + msg='Please use `list(self.files.keys())` instead') def get_filenames(self) -> List[str]: """Get a list of all filenames.""" - return [file.filename for file in self.files] + return list(self.files.keys()) @property @abc.abstractmethod - def files(self) -> List[RepositoryFile]: + def files(self) -> Dict[str, RepositoryFile]: """List of all files in the repository.""" - def file(self, filename: str) -> RepositoryFile: - """Return the file matching the filename, e.g. file.pdf""" - for file in self.files: - if file.filename == filename: - return file - raise FileNotFoundError(f'The file "{filename}" does not exist in the repository.') - @abc.abstractmethod - def _upload_file(self, filename: Union[str, pathlib.Path], overwrite: bool = False): + def __upload_file__(self, filename: Union[str, pathlib.Path], overwrite: bool = False): """Upload a file to the repository. 
This is a regular file uploader, hence the file can be of any type. This is a private method, which needs to be implemented by every repository interface. Will be called by `upload_file`""" @@ -206,12 +214,16 @@ def upload_file(self, else: meta_data_file = None - self._upload_file(filename=filename, overwrite=overwrite) + self.__upload_file__(filename=filename, overwrite=overwrite) if meta_data_file is not None: - self._upload_file(filename=meta_data_file, overwrite=overwrite) + self.__upload_file__(filename=meta_data_file, overwrite=overwrite) self.refresh() + @deprecated(version='1.4.0rc1', + msg='This method is deprecated. ' + 'Use `.upload_file(...)` instead and provide the ' + 'metamapper parameter there') def upload_hdf_file(self, filename, metamapper: Callable[[Union[str, pathlib.Path]], pathlib.Path], diff --git a/h5rdmtoolbox/repository/zenodo/core.py b/h5rdmtoolbox/repository/zenodo/core.py index 6a99ff3..a339792 100644 --- a/h5rdmtoolbox/repository/zenodo/core.py +++ b/h5rdmtoolbox/repository/zenodo/core.py @@ -11,6 +11,7 @@ from packaging.version import Version from rdflib import Graph +from h5rdmtoolbox.utils import deprecated from .metadata import Metadata from .tokens import get_api_token from ..interface import RepositoryInterface, RepositoryFile @@ -66,7 +67,6 @@ def __init__(self, DeprecationWarning) source = rec_id - if isinstance(source, int): rec_id = source elif isinstance(source, str): @@ -174,12 +174,7 @@ def refresh(self) -> None: self.json() @property - def files(self) -> List[RepositoryFile]: - # def _parse_download_url(filename): - # if filename is None: - # return filename - # return f"{self.rec_url}/{self.rec_id}/files/{filename}" - + def files(self) -> Dict[str, RepositoryFile]: is_submitted = self.submitted() def _parse_download_url(url, filename): @@ -201,7 +196,7 @@ def _get_media_type(filename: Optional[str]): def _parse(data: Dict): return dict(download_url=_parse_download_url(data['links']['download'], data['filename']), 
access_url=f"https://doi.org/{self.get_doi()}", - filename=data.get('filename', None), + name=data.get('filename', None), media_type=_get_media_type(data.get('filename', None)), identifier=data.get('id', None), identifier_url=data.get('id', None), @@ -209,8 +204,11 @@ def _parse(data: Dict): checksum=data.get('checksum', None), access_token=self.access_token) - return [RepositoryFile(**_parse(data)) for data in self.json()['files']] + rfiles = [RepositoryFile(**_parse(data)) for data in self.json()['files']] + return {f.name: f for f in rfiles} + @deprecated(version='1.4.0rc1', + msg='Please use `[file.download() for file in self.files.values()]` instead.') def download_files(self, target_folder: Union[str, pathlib.Path] = None, suffix: Union[str, List[str], None] = None) -> List[pathlib.Path]: @@ -228,9 +226,7 @@ def download_files(self, List[pathlib.Path] A list of all downloaded files. """ - warnings.warn("This method is deprecated. Please loop over `.files` and call `.download()` on the " - "items of the returned list", DeprecationWarning) - return [file.download(target_folder=target_folder) for file in self.files] + return [file.download(target_folder=target_folder) for file in self.files.values()] def download_file(self, filename: str, target_folder: Optional[Union[str, pathlib.Path]] = None) -> pathlib.Path: """Download a file based on URL. The url is validated using pydantic @@ -249,7 +245,7 @@ def download_file(self, filename: str, target_folder: Optional[Union[str, pathli pathlib.Path The path to the downloaded file. 
""" - warnings.warn("Please use `.file(filename).download()`", DeprecationWarning) + warnings.warn("Please use `.files.get(filename).download()`", DeprecationWarning) if target_folder is None: target_folder = pathlib.Path(appdirs.user_data_dir('h5rdmtoolbox')) / 'zenodo_downloads' / str( self.rec_id) @@ -258,7 +254,7 @@ def download_file(self, filename: str, target_folder: Optional[Union[str, pathli logger.debug(f'A target folder was specified. Downloading file to this folder: {target_folder}') target_folder = pathlib.Path(target_folder) - f = self.file(filename) + f = self.files.get(filename) return f.download(target_folder=target_folder) def delete(self) -> requests.Response: @@ -413,7 +409,7 @@ def get_file_infos(self, suffix=None) -> Dict[str, Dict]: file_dict.pop(r) return file_dict - def _upload_file(self, filename, overwrite: bool = False): + def __upload_file__(self, filename, overwrite: bool = False): """Add a file to the deposit. If the filename already exists, it can be overwritten with overwrite=True""" filename = pathlib.Path(filename) @@ -451,17 +447,78 @@ def _upload_file(self, filename, overwrite: bool = False): r.raise_for_status() -class ZenodoRecord(AbstractZenodoInterface): - """Interface to Zenodo records.""" +class ZenodoRecord(RepositoryInterface): + """Interface to Zenodo records. + + .. note: up to version 1.4.rc1, ZenodoRecord was inherited from `AbstractZenodoInterface`. + This is no longer needed. If you want to use the sandbox (testing) environment, + please init with sandbox=True. + """ + + def __init__(self, source: Union[int, str, None] = None, + sandbox: bool = False, + **kwargs): + rec_id = kwargs.pop('rec_id', None) + if rec_id is not None: + warnings.warn("The `rec_id` parameter is deprecated. 
Please use the source parameter instead.", + DeprecationWarning) + source = rec_id + self.sandbox = sandbox + self._cached_json = {} + if isinstance(source, int): + rec_id = source + elif isinstance(source, str): + """assuming it is a url""" + if not source.startswith('http'): + raise ValueError(f"String input should be a valid URL, which {source} seems not to be. If you intend " + "to provide a record id, please provide an integer.") + if source.startswith(f"{self.base_url}/record"): + rec_id = int(source.split('/')[-1]) + elif source.startswith('https://doi.org/'): + r = requests.get(source, allow_redirects=True) + # the redirected url contains the ID: + rec_id = int(r.url.split('/')[-1]) + elif source is None: + # create a new deposit (with new rec_id and without metadata!) + r = requests.post( + self.depositions_url, + json={}, + params={"access_token": self.access_token}, + headers={"Content-Type": "application/json"} + ) + r.raise_for_status() + rec_id = r.json()['id'] + self.rec_id = rec_id + assert self.rec_id is not None + + def __repr__(self) -> str: + return f"{self.__class__.__name__} (id={self.rec_id}, url={self.record_url})" @property - def base_url(self): + def base_url(self) -> str: + """Returns the base url of the repository""" + if self.sandbox: + return 'https://sandbox.zenodo.org' return 'https://zenodo.org' + @property + def depositions_url(self): + return f"{self.base_url}/api/deposit/depositions" + + @property + def records_url(self): + return f"{self.base_url}/api/deposit/depositions" + + @property + def record_url(self): + """Return the (published) url. Note, that it must not necessarily exist if you + just created a new record and have not published it yet!""" + return f"{self.base_url}/records/{self.rec_id}" + @property def access_token(self): """Get the access token for the Zenodo API. 
This is needed to upload files.""" - return get_api_token(sandbox=False) + return get_api_token(sandbox=self.sandbox) def get_metadata(self) -> Dict: return self.json()['metadata'] @@ -485,13 +542,239 @@ def set_metadata(self, metadata: Union[Dict, Metadata]): r.raise_for_status() self.refresh() - def _upload_file(self, filename, overwrite: bool = False): + def get_doi(self) -> str: + """Get the DOI of the deposit.""" + doi = self.json()['metadata'].get('doi', None) + if doi is None: + return self.json()['metadata']['prereserve_doi']['doi'] + return doi + + def exists(self) -> bool: + """Check if the deposit exists on Zenodo. Note, that only published records are detected!""" + return requests.get(self.record_url, params={'access_token': self.access_token}).ok + + def is_published(self) -> bool: + """Check if the deposit is published.""" + return self.json()['submitted'] + + submitted = is_published # alias + + def json(self, raise_for_status: bool = False): + """Get the deposit (json) data.""" + if not self._cached_json: + url = f"{self.depositions_url}/{self.rec_id}" + access_token = self.access_token + r = requests.get(url, params={"access_token": access_token}) + + if r.status_code == '403': + logger.critical( + f"You don't have the permission to request {url}. You may need to check your access token.") + r.raise_for_status() + + while r.status_code == 429: + logger.info(f"Too many requests message: {r.json()}. Sleep for 60 seconds and try again.") + time.sleep(60) + r = requests.get(url, params={"access_token": access_token}) + + while r.status_code == 500: + logger.info(f"Internal error: {r.json()}. 
Sleep for 60 seconds and try again.") + time.sleep(60) + r = requests.get(url, params={"access_token": access_token}) + + if raise_for_status: + r.raise_for_status() + + self._cached_json = r.json() + return self._cached_json + + def refresh(self) -> None: + """Since the json dict is cached, calling this method will refresh the json dict.""" + self._cached_json = {} + self.json() + + @property + def files(self) -> Dict[str, RepositoryFile]: + # def _parse_download_url(filename): + # if filename is None: + # return filename + # return f"{self.rec_url}/{self.rec_id}/files/{filename}" + + is_submitted = self.submitted() + + def _parse_download_url(url, filename): + if url is None: + return url + if is_submitted: + return f"{self.record_url}/files/{filename}" + if url.endswith('/content'): + return url.rsplit('/', 1)[0] + return url + + def _get_media_type(filename: Optional[str]): + if filename is None: + return None + suffix = pathlib.Path(filename).suffix + + return IANA_DICT.get(suffix, suffix[1:]) + + def _parse(data: Dict): + return dict(download_url=_parse_download_url(data['links']['download'], data['filename']), + access_url=f"https://doi.org/{self.get_doi()}", + name=data.get('filename', None), + media_type=_get_media_type(data.get('filename', None)), + identifier=data.get('id', None), + identifier_url=data.get('id', None), + size=data.get('filesize', None), + checksum=data.get('checksum', None), + access_token=self.access_token) + + rfiles = [RepositoryFile(**_parse(data)) for data in self.json()['files']] + return {f.name: f for f in rfiles} + + @deprecated(version='1.4.0rc1', + msg='Please use `[file.download() for file in self.files.values()]` instead.') + def download_files(self, + target_folder: Union[str, pathlib.Path] = None, + suffix: Union[str, List[str], None] = None) -> List[pathlib.Path]: + """Download all (!) files from Zenodo. You may specify one or multiple suffixes to only download certain files. 
+ + Parameters + ---------- + target_folder : str or pathlib.Path, optional + The target folder, by default None + suffix: Union[str, List[str], None], optional=None + Specify a suffix to only download certain files + + Returns + ------- + List[pathlib.Path] + A list of all downloaded files. + """ + warnings.warn("This method is deprecated. Please loop over `.files` and call `.download()` on the " + "items of the returned list", DeprecationWarning) + return [file.download(target_folder=target_folder) for file in self.files.values()] + + def download_file(self, filename: str, target_folder: Optional[Union[str, pathlib.Path]] = None) -> pathlib.Path: + """Download a file based on URL. The url is validated using pydantic + + Parameters + ---------- + filename : str + The filename. + target_folder : Union[str, pathlib.Path], optional + The target folder, by default None + If None, the file will be downloaded to the default folder, which is in + the user data directory of the h5rdmtoolbox package. + + Returns + ------- + pathlib.Path + The path to the downloaded file. + """ + warnings.warn("Please use `.files.get(filename).download()`", DeprecationWarning) + if target_folder is None: + target_folder = pathlib.Path(appdirs.user_data_dir('h5rdmtoolbox')) / 'zenodo_downloads' / str( + self.rec_id) + target_folder.mkdir(exist_ok=True, parents=True) + else: + logger.debug(f'A target folder was specified. Downloading file to this folder: {target_folder}') + target_folder = pathlib.Path(target_folder) + + f = self.files.get(filename) + return f.download(target_folder=target_folder) + + def delete(self) -> requests.Response: + """Delete the deposit.""" + r = requests.delete(f"{self.depositions_url}/{self.rec_id}", params={"access_token": self.access_token}) + if r.status_code == 405: + logger.error(f'Only unpublished records can be deleted. 
Record "{self.rec_id}" is published.') + return r + + def new_version(self, new_version_string: str): + """Sets the record into edit mode while creating a new version. You need to call `.publish()` after + adding new files, metadata etc. + + Parameters + ---------- + new_version_string : str + The new version string. It must be higher than the current version. This + is checked using the `packaging.version.Version` class. + + Returns + ------- + ZenodoInterface + The new ZenodoInterface with the new version. + + Raises + ------ + ValueError + If the new version is not higher than the current version. + APIError + If the new version cannot be created because permission is missing. + """ + self.unlock() + jdata = self.json() + + curr_version = Version(jdata['metadata']['version']) + new_version = Version(new_version_string) + if not new_version > curr_version: + raise ValueError(f'The new version must be higher than the current version {curr_version}.') + + new_vers_url = jdata['links'].get('newversion', None) + if new_vers_url is None: + raise APIError("Unable to create a new version. Please check your permission associated with " + "the Zenodo API Token.") + + r = requests.post(new_vers_url, + params={'access_token': self.access_token}) + + r.raise_for_status() + latest_draft = r.json()['links']['latest_draft'] + _id = latest_draft.split('/')[-1] + self.rec_id = _id + return self + + def publish(self) -> requests.Response: + """Be careful. The record cannot be deleted afterward!""" + r = requests.post(self.json()['links']['publish'], + # data=json.dumps({'publication_date': '2024-03-03', 'version': '1.2.3'}), + params={'access_token': self.access_token}) + r.raise_for_status() + self.refresh() + + def discard(self): + """Discard the latest action, e.g. creating a new version""" + jdata = self.json() + r = requests.post(jdata['links']['discard'], + params={'access_token': self.access_token}) + r.raise_for_status() + + def unlock(self): + """unlock the deposit. 
To lock it call publish() + + Raises + ------ + APIError + If the record cannot be unlocked because permission is missing. + """ + edit_url = self.json()['links'].get('edit', None) + if edit_url is None: + raise APIError('Unable to unlock the record. Please check your permission of the Zenodo API Token.') + + r = requests.post(edit_url, + params={'access_token': self.access_token}) + if r.status_code == 400: + print(f'Cannot publish data. This might be because metadata is missing. Check on the website, which ' + f'fields are required!') + r.raise_for_status() + + def __upload_file__(self, filename, overwrite: bool = False): """Uploading file to record""" filename = pathlib.Path(filename) if not filename.exists(): raise FileNotFoundError(f'File "{filename}" does not exist.') - existing_filenames = [file.filename for file in self.files] + existing_filenames = [file.name for file in self.files.values()] file_exists_in_record = filename.name in existing_filenames if not overwrite and file_exists_in_record: @@ -501,7 +784,7 @@ def _upload_file(self, filename, overwrite: bool = False): # file exists in record. 
get file id
         if file_exists_in_record:
-            file_id = self.file(filename.name).identifier
+            file_id = self.files.get(filename.name).identifier
             url = f"{self.depositions_url}/{self.rec_id}/files/{file_id}"
             logger.debug(f'requests.delete(url={url}, ...)')
             r = requests.delete(url=url,
diff --git a/h5rdmtoolbox/utils.py b/h5rdmtoolbox/utils.py
index ef76870..fc68418 100644
--- a/h5rdmtoolbox/utils.py
+++ b/h5rdmtoolbox/utils.py
@@ -1,23 +1,25 @@
 """utilities of the h5rdmtoolbox"""
 import datetime
-import h5py
+import functools
 import hashlib
 import json
 import logging
-import numpy as np
 import os
 import pathlib
-import pint
 import re
-import requests
 import warnings
-from h5py import File
-from pydantic import HttpUrl, validate_call
-from rdflib.plugins.shared.jsonld.context import Context
 from re import sub as re_sub
 from typing import Dict
 from typing import Union, Callable, List, Tuple
 
+import h5py
+import numpy as np
+import pint
+import requests
+from h5py import File
+from pydantic import HttpUrl, validate_call
+from rdflib.plugins.shared.jsonld.context import Context
+
 from . import _user, get_config, get_ureg
 from ._version import __version__
 from .wrapper import rdf
@@ -553,3 +555,37 @@ def download_context(url_source: Union[HttpUrl, List[HttpUrl]], force_download:
             raise RuntimeError(f'Failed to download context file from {_url}')
         filenames.append(str(context_file))
     return Context(filenames)
+
+
+def deprecated(version: str, msg: str, removing_in: str = None):
+    """Decorator factory that marks a function or method as deprecated.
+
+    Every call of the decorated callable emits a ``DeprecationWarning`` and
+    then forwards all arguments to the original callable.
+
+    Parameters
+    ----------
+    version : str
+        Version since which the callable is deprecated.
+    msg : str
+        Extra hint appended to the warning, e.g. what to use instead.
+    removing_in : str, optional
+        Version in which the callable will be removed; mentioned in the
+        warning if given.
+    """
+
+    def deprecated_decorator(func):
+        @functools.wraps(func)  # keep __name__/__doc__ of the wrapped callable
+        def depr_func(*args, **kwargs):
+            if removing_in:
+                warning_msg = (f"{func.__name__} is deprecated since {version}. "
+                               f"Will be removed in {removing_in}. {msg}")
+            else:
+                warning_msg = f"{func.__name__} is deprecated since {version}. {msg}"
+            # stacklevel=2 attributes the warning to the caller instead of this wrapper
+            warnings.warn(warning_msg, DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        return depr_func
+
+    return deprecated_decorator
diff --git a/tests/conventions/test_conventions.py b/tests/conventions/test_conventions.py
index ffe2f1b..df24220 100644
--- a/tests/conventions/test_conventions.py
+++ b/tests/conventions/test_conventions.py
@@ -119,7 +119,7 @@ def test_upload_convention(self):
         # download file from zenodo deposit:
         self.assertEqual(1, len(zsr.get_filenames()))
 
-        filename = zsr.file('tutorial_convention.yaml').download()
+        filename = zsr.files.get('tutorial_convention.yaml').download()
         self.assertTrue(filename.exists())
         download_dir = pathlib.Path(appdirs.user_data_dir('h5rdmtoolbox')) / 'zenodo_downloads'
         self.assertEqual(download_dir, filename.parent.parent)
diff --git a/tests/repository/test_zenodo.py b/tests/repository/test_zenodo.py
index b34b342..0f1a9b5 100644
--- a/tests/repository/test_zenodo.py
+++ b/tests/repository/test_zenodo.py
@@ -61,17 +61,22 @@ def test_ZenodoFile(self):
         self.assertNotEqual(z._cached_json, {})
         self.assertTrue(z.exists())
 
-        for file in z.files:
+        for file in z.files.values():
             self.assertIsInstance(file, RepositoryFile)
         self.assertEqual(len(z.files), 1)
-        r = requests.get(z.files[0].download_url)
-        self.assertEqual(r.status_code, 200)
-        # self.assertEqual(z.files[0].download_url,
-        #                  f"{z.rec_url}/{z.rec_id}/files/{z.files[0].filename}")
-        downloaded_filename = z.files[0].download()
-        self.assertTrue(downloaded_filename.exists())
-        self.assertTrue(downloaded_filename.is_file())
-        self.assertIsInstance(z.files[0].jsonld(), str)
+        for file in z.files.values():
+            r = requests.get(file.download_url)
+            self.assertEqual(r.status_code, 200)
+            downloaded_filename = file.download()
+            self.assertTrue(downloaded_filename.exists())
+            self.assertTrue(downloaded_filename.is_file())
+            self.assertIsInstance(file.jsonld(), str)
+
+    def test_newSandboxImplementation(self):
+        """from 1.4.0 on the sandbox can be init from ZenodoRecord"""
+        z = zenodo.ZenodoRecord(TutorialSNTZenodoRecordID, sandbox=True)
+        self.assertTrue(z.sandbox)
+        self.assertEqual(z.base_url, 'https://sandbox.zenodo.org')
 
     def test_ZenodoRecord_without_token(self):
         """remove all info about zenodo api token!"""
@@ -293,13 +298,13 @@ def test_upload_hdf(self):
         filenames = z.get_filenames()
         self.assertIn(hdf_file_name, filenames)
         self.assertIn(json_name, filenames)
-        with self.assertRaises(FileNotFoundError):
-            _ = z.file('invalid.hdf')
+
+        self.assertIsNone(z.files.get('invalid.hdf'))
 
         hdf_filenames = [f for f in z.get_filenames() if pathlib.Path(f).suffix == '.hdf']
         self.assertEqual(len(hdf_filenames), 1)
 
-        hdf_filename = z.file(hdf_file_name).download()
+        hdf_filename = z.files.get(hdf_file_name).download()
 
         self.assertTrue(hdf_filename.exists())
 
@@ -310,7 +315,7 @@ def test_upload_hdf(self):
             self.assertEqual(h5['grp1/test2'].attrs['test'], 1)
             self.assertEqual(h5['grp1/test2'].attrs['long_name'], 'dataset 2')
 
-        json_filename = z.file(json_name).download()
+        json_filename = z.files.get(json_name).download()
         self.assertTrue(json_filename.exists())
         with open(json_filename) as f:
             json_dict = json.loads(f.read())
@@ -318,17 +323,47 @@ def test_upload_hdf(self):
         self.assertTrue('@context' in json_dict)
         self.assertEqual(json_dict['@type'], 'hdf5:File')
 
-        #
-        # print(json_dict['h5rdmtoolbox']['attrs'])
-        # self.assertDictEqual(
-        #     json_dict['h5rdmtoolbox']['attrs'],
-        #     {
-        #         '@type': 'https://schema.org/SoftwareSourceCode',
-        #         rdf.RDF_PREDICATE_ATTR_NAME: '{"__h5rdmtoolbox_version__": "https://schema.org/softwareVersion"}',
-        #         '__h5rdmtoolbox_version__': h5tbx.__version__
-        #     }
-        # )
-        # z.delete()
+    def test_upload_hdf_new_implementation(self):
+        z = zenodo.ZenodoRecord(None, sandbox=True)
+
+        with h5tbx.File() as h5:
+            h5.attrs['long_name'] = 'root'
+            h5.create_dataset('test', data=1, attrs={'units': 'm/s', 'long_name': 'dataset 1'})
+            h5.create_dataset('grp1/test2', data=2, attrs={'test': 1, 'long_name': 'dataset 2'})
+
+        orig_hdf_filename = h5.hdf_filename
+
+        hdf_file_name = orig_hdf_filename.name
+        json_name = hdf_file_name.replace('.hdf', '.jsonld')
+
+        z.upload_file(orig_hdf_filename)  # metamapper per default converts to JSONLD file
+        filenames = z.get_filenames()
+        self.assertIn(hdf_file_name, filenames)
+        self.assertIn(json_name, filenames)
+
+        self.assertIsNone(z.files.get('invalid.hdf'))
+
+        hdf_filenames = [f for f in z.get_filenames() if pathlib.Path(f).suffix == '.hdf']
+        self.assertEqual(len(hdf_filenames), 1)
+
+        hdf_filename = z.files.get(hdf_file_name).download()
+
+        self.assertTrue(hdf_filename.exists())
+
+        with h5tbx.File(hdf_filename) as h5:
+            self.assertEqual(h5.attrs['long_name'], 'root')
+            self.assertEqual(h5['test'].attrs['units'], 'm/s')
+            self.assertEqual(h5['test'].attrs['long_name'], 'dataset 1')
+            self.assertEqual(h5['grp1/test2'].attrs['test'], 1)
+            self.assertEqual(h5['grp1/test2'].attrs['long_name'], 'dataset 2')
+
+        json_filename = z.files.get(json_name).download()
+        self.assertTrue(json_filename.exists())
+        with open(json_filename) as f:
+            json_dict = json.loads(f.read())
+
+        self.assertIn('@context', json_dict)
+        self.assertEqual(json_dict['@type'], 'hdf5:File')
 
     def test_ZenodoSandboxDeposit(self):
         z = zenodo.ZenodoSandboxDeposit(None)
@@ -439,3 +474,113 @@ def test_ZenodoSandboxDeposit(self):
         self.assertFalse(z.exists())
         # z.delete()
         # self.assertFalse(z.exists())
+
+    def test_ZenodoSandboxDeposit_newImplementation(self):
+        z = zenodo.ZenodoRecord(None, sandbox=True)
+        self.assertIsInstance(z.get_metadata(), dict)
+        self.assertEqual(z.get_doi(), f'10.5281/zenodo.{z.rec_id}')
+        self.assertIn('access_right', z.get_metadata())
+        self.assertIn('prereserve_doi', z.get_metadata())
+        self.assertEqual('open', z.get_metadata()['access_right'])
+        self.assertEqual(z.rec_id, z.get_metadata()['prereserve_doi']['recid'])
+        self.assertFalse(z.exists())  # not yet published!
+        self.assertFalse(z.is_published())
+
+        old_rec_id = z.rec_id
+
+        # z.delete()
+
+        with self.assertRaises(ValueError):
+            _ = zenodo.ZenodoRecord('123123123123', sandbox=True)
+
+        z = zenodo.ZenodoRecord(None, sandbox=True)
+        self.assertNotEqual(old_rec_id, z.rec_id)
+
+        # with self.assertRaises(TypeError):
+        #     z.metadata = {'access_right': 'closed'}
+
+        meta = Metadata(
+            version="0.1.0-rc.1+build.1",
+            title='[deleteme]h5tbxZenodoInterface',
+            description='A toolbox for managing HDF5-based research data management',
+            creators=[Creator(name="Probst, Matthias",
+                              affiliation="KIT - ITS",
+                              orcid="0000-0001-8729-0482")],
+            contributors=[Contributor(name="Probst, Matthias",
+                                      affiliation="KIT - ITS",
+                                      orcid="0000-0001-8729-0482",
+                                      type="ContactPerson")],
+            upload_type='image',
+            image_type='photo',
+            access_right='open',
+            keywords=['hdf5', 'research data management', 'rdm'],
+            publication_date=datetime.now(),
+            embargo_date='2020'
+        )
+
+        with self.assertRaises(TypeError):
+            z.set_metadata(12)
+
+        z.set_metadata(meta.model_dump())
+        z.set_metadata(meta)
+        ret_metadata = z.get_metadata()
+        self.assertEqual(ret_metadata['upload_type'],
+                         meta.model_dump()['upload_type'])
+        self.assertListEqual(ret_metadata['keywords'],
+                             meta.model_dump()['keywords'])
+
+        # add file:
+        tmpfile = pathlib.Path('testfile.txt')
+        with open(tmpfile, 'w') as f:
+            f.write('This is a test file.')
+
+        with self.assertRaises(FileNotFoundError):
+            z.upload_file('doesNotExist.txt', overwrite=True, metamapper=None)
+
+        z.upload_file(tmpfile, overwrite=True, metamapper=None)
+        self.assertIn('testfile.txt', z.get_filenames())
+
+        with self.assertWarns(UserWarning):
+            z.upload_file('testfile.txt', overwrite=False, metamapper=None)
+
+        upload_file(z, tmpfile, overwrite=True, metamapper=None)
+
+        with self.assertWarns(UserWarning):
+            upload_file(z, tmpfile, overwrite=False, metamapper=None)
+
+        # delete file locally:
+        tmpfile.unlink()
+        self.assertFalse(tmpfile.exists())
+
+        filename = z.download_file('testfile.txt', target_folder='.')
+        self.assertIsInstance(filename, pathlib.Path)
+        self.assertTrue(filename.exists())
+        with open(filename, 'r') as f:
+            self.assertEqual(f.read(), 'This is a test file.')
+        filename.unlink()
+
+        filenames = z.download_files(target_folder='.')
+
+        self.assertIsInstance(filenames, list)
+        self.assertIsInstance(filenames[0], pathlib.Path)
+        for filename in filenames:
+            self.assertTrue(filename.exists())
+            filename.unlink()
+
+        hdf5_filenames = z.download_files(target_folder='.', suffix='.hdf')
+        self.assertIsInstance(hdf5_filenames, list)
+        self.assertEqual(len(hdf5_filenames), 1)
+
+        txt_filenames = z.download_files(target_folder='.', suffix='.txt')
+        self.assertIsInstance(txt_filenames, list)
+        self.assertEqual(len(txt_filenames), 1)
+        self.assertEqual(txt_filenames[0].suffix, '.txt')
+
+        hdf_and_txt_filenames = z.download_files(target_folder='.', suffix=['.txt', '.hdf'])
+        self.assertIsInstance(hdf_and_txt_filenames, list)
+        self.assertEqual(len(hdf_and_txt_filenames), 1)
+        self.assertEqual(hdf_and_txt_filenames[0].suffix, '.txt')
+
+        self.assertFalse(z.exists())
+        # z.delete()
+        # self.assertFalse(z.exists())