diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5089f20..d175412e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,6 +47,11 @@ repos: - id: flake8 additional_dependencies: [ 'flake8-alphabetize', 'flake8-rst-docstrings' ] args: [ '--config=.flake8' ] + - repo: https://github.com/numpy/numpydoc + rev: v1.6.0 + hooks: + - id: numpydoc-validation + exclude: 'tests|docs/conf.py' - repo: https://github.com/keewis/blackdoc rev: v0.3.9 hooks: diff --git a/CHANGES.rst b/CHANGES.rst index cc67fe0e..00aedb6d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,6 +10,20 @@ Contributors to this version: Trevor James Smith (:user:`Zeitsperre`), Thomas-Ch New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * Added French language support to the documentation. (:issue:`53`, :pull:`55`). +* Added a new set of functions to support creating and updating `pooch` registries, caching testing datasets from `hydrologie/xhydro-testdata`, and ensuring that testing datasets can be loaded into temporary directories. +* `xhydro` is now configured to use `pooch` to download and cache testing datasets from `hydrologie/xhydro-testdata`. (:pull:`62`). + +Breaking changes +^^^^^^^^^^^^^^^^ +* Added `pooch` as an installation dependency. (:pull:`62`). + +Internal changes +^^^^^^^^^^^^^^^^ +* Added a new module for testing purposes: `xhydro.testing.helpers` with some new functions. (:pull:`62`): + * `generate_registry`: Parses data found in package (`xhydro.testing.data`), and adds it to the `registry.txt` + * `load_registry`: Loads installed (or custom) registry and returns dictionary + * `populate_testing_data`: Fetches the registry and optionally caches files at a different location (helpful for `pytest-xdist`). +* Added a `pre-commit` hook (`numpydoc`) to ensure that `numpy` docstrings are formatted correctly. (:pull:`62`). 
v0.3.0 (2023-12-01) ------------------- diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index f8205915..dbfff75f 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -106,6 +106,14 @@ Ready to contribute? Here's how to set up ``xhydro`` for local development. # Or, to run multiple build tests $ tox + .. note:: + + Running `pytest` or `tox` will automatically fetch and cache the testing data for the package to your local cache (using the `platformdirs` library). On Linux, this is located at ``XDG_CACHE_HOME`` (usually ``~/.cache``). On Windows, this is located at ``%LOCALAPPDATA%`` (usually ``C:\Users\username\AppData\Local``). On MacOS, this is located at ``~/Library/Caches``. + + If for some reason you wish to cache this data elsewhere, you can set the ``XHYDRO_DATA_DIR`` environment variable to a different location before running the tests. For example, to cache the data in the current working directory, run:: + + $ export XHYDRO_DATA_DIR=$(pwd)/.cache + #. Commit your changes and push your branch to GitHub:: $ git add . @@ -134,6 +142,12 @@ Ready to contribute? Here's how to set up ``xhydro`` for local development. You will have contributed your first changes to ``xhydro``! +.. warning:: + + If your Pull Request relies on modifications to the testing data of `xhydro`, you will need to update the testing data repository as well. As a preliminary testing measure, the branch of the testing data can be modified at testing time (from `main`) by setting the ``XHYDRO_TESTDATA_BRANCH`` environment variable to the branch name of the ``xhydro-testdata`` repository. + + Be sure to consult the ReadMe found at https://github.com/hydrologie/xhydro-testdata as well. 
+ Pull Request Guidelines ----------------------- diff --git a/environment-dev.yml b/environment-dev.yml index 60487057..4049c6e1 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -6,6 +6,8 @@ dependencies: # Don't forget to sync changes between environment.yml, environment-dev.yml, and pyproject.toml! # Main packages - numpy + - pooch >=1.8.0 + - pydantic >=2.0,<2.5.3 # FIXME: Remove pin once our dependencies (xclim, xscen) support pydantic 2.5.3 - statsmodels - xarray - xclim >=0.45.0 diff --git a/environment.yml b/environment.yml index ffbcea69..5f6dcca3 100644 --- a/environment.yml +++ b/environment.yml @@ -6,6 +6,8 @@ dependencies: # Don't forget to sync changes between environment.yml, environment-dev.yml, and pyproject.toml! # Main packages - numpy + - pooch >=1.8.0 + - pydantic >=2.0,<2.5.3 - statsmodels - xarray - xclim >=0.45.0 diff --git a/pyproject.toml b/pyproject.toml index e3e2e8ed..266672f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,8 @@ dynamic = ["description", "version"] dependencies = [ # Don't forget to sync changes between environment.yml, environment-dev.yml, and pyproject.toml!
"numpy", + "pooch>=1.8.0", + "pydantic>=2.0,<2.5.3", "statsmodels", "xarray", "xclim>=0.45.0", @@ -146,7 +148,8 @@ include = [ "docs/make.bat", "tests/*.py", "tox.ini", - "xhydro" + "xhydro", + "xhydro/testing/registry.txt" ] exclude = [ "*.py[co]", @@ -161,7 +164,8 @@ exclude = [ "Makefile", "docs/_*", "docs/apidoc/modules.rst", - "docs/apidoc/xhydro*.rst" + "docs/apidoc/xhydro*.rst", + "xhydro/testing/data/*" ] [tool.isort] @@ -178,6 +182,22 @@ warn_unused_configs = true module = [] ignore_missing_imports = true +[tool.numpydoc_validation] +checks = [ + "all", # report on all checks, except the below + "ES01", + "EX01", + "GL01", + "SA01" +] +exclude = [ + # don't report on objects that match any of these regex + '\.undocumented_method$', + '\.__repr__$', + # any object starting with an underscore is a private object + '\._\w+' +] + [tool.pytest.ini_options] addopts = [ "--verbose", diff --git a/tox.ini b/tox.ini index f53c4dcd..638d60e1 100644 --- a/tox.ini +++ b/tox.ini @@ -40,6 +40,7 @@ setenv = PYTHONPATH = {toxinidir} passenv = CI + ESMFMKFILE COVERALLS_* GITHUB_* extras = diff --git a/xhydro/cc.py b/xhydro/cc.py index faf92334..0723b93a 100644 --- a/xhydro/cc.py +++ b/xhydro/cc.py @@ -1,4 +1,5 @@ """Module to compute climate change statistics using xscen functions.""" +import xarray # Special imports from xscen from xscen import ( # FIXME: To be replaced with climatological_op once available @@ -17,8 +18,20 @@ # FIXME: To be deleted once climatological_op is available in xscen -def climatological_op(ds, **kwargs): - """Compute climatological operation. +def climatological_op(ds: xarray.Dataset, **kwargs: dict) -> xarray.Dataset: + r"""Compute climatological operation. + + Parameters + ---------- + ds : xarray.Dataset + Input dataset. + \*\*kwargs : dict + Keyword arguments passed to :py:func:`xscen.aggregate.climatological_mean`. + + Returns + ------- + xarray.Dataset + Output dataset. 
Notes ----- diff --git a/xhydro/indicators.py b/xhydro/indicators.py index 9d0a3b8f..455a657b 100644 --- a/xhydro/indicators.py +++ b/xhydro/indicators.py @@ -64,36 +64,37 @@ def get_yearly_op( missing_options: Optional[dict] = None, interpolate_na: bool = False, ) -> xr.Dataset: - """ - Compute yearly operations on a variable. + """Compute yearly operations on a variable. Parameters ---------- - ds: xr.Dataset + ds : xr.Dataset Dataset containing the variable to compute the operation on. - op: str + op : str Operation to compute. One of ["max", "min", "mean", "sum"]. - input_var: str + input_var : str Name of the input variable. Defaults to "streamflow". - window: int + window : int Size of the rolling window. A "mean" operation is performed on the rolling window before the call to xclim. This parameter cannot be used with the "sum" operation. - timeargs: dict, optional + timeargs : dict, optional Dictionary of time arguments for the operation. Keys are the name of the period that will be added to the results (e.g. "winter", "summer", "annual"). Values are up to two dictionaries, with both being optional. The first is {'freq': str}, where str is a frequency supported by xarray (e.g. "YS", "AS-JAN", "AS-DEC"). It needs to be a yearly frequency. Defaults to "AS-JAN". - The second is an indexer as supported by :py:func:`xclim.core.calendar.select_time`. Defaults to {}, which means the whole year. + The second is an indexer as supported by :py:func:`xclim.core.calendar.select_time`. + Defaults to {}, which means the whole year. See :py:func:`xclim.core.calendar.select_time` for more information. - Examples: {"winter": {"freq": "AS-DEC", "date_bounds": ['12-01', '02-28']}}, {"jan": {"freq": "YS", "month": 1}}, {"annual": {}}. - missing: str + Examples: {"winter": {"freq": "AS-DEC", "date_bounds": ["12-01", "02-28"]}}, {"jan": {"freq": "YS", "month": 1}}, {"annual": {}}. + missing : str How to handle missing values. One of "skip", "any", "at_least_n", "pct", "wmo". 
See :py:func:`xclim.core.missing` for more information. - missing_options: dict, optional + missing_options : dict, optional Dictionary of options for the missing values' method. See :py:func:`xclim.core.missing` for more information. - interpolate_na: bool - Whether to interpolate missing values before computing the operation. Only used with the "sum" operation. Defaults to False. + interpolate_na : bool + Whether to interpolate missing values before computing the operation. Only used with the "sum" operation. + Defaults to False. Returns ------- @@ -105,7 +106,6 @@ def get_yearly_op( ----- If you want to perform a frequency analysis on a frequency that is finer than annual, simply use multiple timeargs (e.g. 1 per month) to create multiple distinct variables. - """ missing_options = missing_options or {} timeargs = timeargs or {"annual": {}} @@ -174,7 +174,8 @@ def get_yearly_op( and freq != "AS-DEC" ): warnings.warn( - "The frequency is not AS-DEC, but the season indexer includes DJF. This will lead to misleading results." + "The frequency is not AS-DEC, but the season indexer includes DJF. " + "This will lead to misleading results." 
) elif ( "doy_bounds" in indexer.keys() diff --git a/xhydro/testing/__init__.py b/xhydro/testing/__init__.py index 92c8062e..8bb3e9cf 100644 --- a/xhydro/testing/__init__.py +++ b/xhydro/testing/__init__.py @@ -1 +1,4 @@ -"""Helpers for testing.""" +"""Testing utilities and helper functions.""" + +from .helpers import * +from .utils import * diff --git a/xhydro/testing/helpers.py b/xhydro/testing/helpers.py new file mode 100644 index 00000000..b5d32142 --- /dev/null +++ b/xhydro/testing/helpers.py @@ -0,0 +1,205 @@ +"""Helper functions for testing data management.""" + +import importlib.resources as ilr +import logging +import os +from pathlib import Path +from typing import Optional, Union +from urllib.parse import urljoin + +import pooch + +from xhydro import __version__ as __xhydro_version__ + +__all__ = [ + "DATA_DIR", + "DATA_URL", + "DEVEREAUX", + "generate_registry", + "load_registry", + "populate_testing_data", +] + +_default_cache_dir = pooch.os_cache("xhydro-testdata") + +DATA_DIR = os.getenv("XHYDRO_DATA_DIR", _default_cache_dir) +"""Sets the directory to store the testing datasets. + +If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). + +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: + +.. code-block:: console + + $ export XHYDRO_DATA_DIR="/path/to/my/data" + +or setting the variable at runtime: + +.. code-block:: console + + $ env XHYDRO_DATA_DIR="/path/to/my/data" pytest +""" + +TESTDATA_BRANCH = os.getenv("XHYDRO_TESTDATA_BRANCH", "main") +"""Sets the branch of hydrologie/xhydro-testdata to use when fetching testing datasets. + +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: + +.. code-block:: console + + $ export XHYDRO_TESTDATA_BRANCH="my_testing_branch" + +or setting the variable at runtime: + +.. 
code-block:: console + + $ env XHYDRO_TESTDATA_BRANCH="my_testing_branch" pytest +""" + +DATA_URL = f"https://github.com/hydrologie/xhydro-testdata/raw/{TESTDATA_BRANCH}" + + +def generate_registry( + filenames: Optional[list[str]] = None, base_url: str = DATA_URL +) -> None: + """Generate a registry file for the test data. + + Parameters + ---------- + filenames : list of str, optional + List of filenames to generate the registry file for. + If not provided, all files under xhydro/testing/data will be used. + base_url : str, optional + Base URL to the test data repository. + """ + # Gather the data folder and registry file locations from installed package_data + data_folder = ilr.files("xhydro").joinpath("testing/data") + registry_file = ilr.files("xhydro").joinpath("testing/registry.txt") + + # Download the files to the installed xhydro/testing/data folder + if filenames is None: + with ilr.as_file(data_folder) as data: + for file in data.rglob("*"): + filename = file.relative_to(data).as_posix() + pooch.retrieve( + url=urljoin(base_url, filename), + known_hash=None, + fname=filename, + path=data_folder, + ) + + # Generate the registry file + with ilr.as_file(data_folder) as data, ilr.as_file(registry_file) as registry: + pooch.make_registry(data.as_posix(), registry.as_posix()) + + +def load_registry(file: Optional[Union[str, Path]] = None) -> dict[str, str]: + """Load the registry file for the test data. + + Parameters + ---------- + file : str or Path, optional + Path to the registry file. If not provided, the registry file found within the package data will be used. + + Returns + ------- + dict + Dictionary of filenames and hashes. 
+ """ + # Get registry file from package_data + if file is None: + registry_file = ilr.files("xhydro").joinpath("testing/registry.txt") + if registry_file.is_file(): + logging.info("Registry file found in package_data: %s", registry_file) + else: + registry_file = Path(file) + if not registry_file.is_file(): + raise FileNotFoundError(f"Registry file not found: {registry_file}") + + # Load the registry file + registry = dict() + with registry_file.open() as buffer: + for entry in buffer.readlines(): + registry[entry.split()[0]] = entry.split()[1] + + return registry + + +DEVEREAUX = pooch.create( + path=pooch.os_cache("xhydro-testdata"), + base_url=DATA_URL, + version=__xhydro_version__, + version_dev="main", + env="XHYDRO_DATA_DIR", + allow_updates="XHYDRO_DATA_UPDATES", + registry=load_registry(), +) +"""Pooch registry instance for xhydro test data. + +Notes +----- +There are two environment variables that can be used to control the behaviour of this registry: + + - ``XHYDRO_DATA_DIR``: If this environment variable is set, it will be used as the base directory to store the data + files. The directory should be an absolute path (i.e., it should start with ``/``). Otherwise, + the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). + + - ``XHYDRO_DATA_UPDATES``: If this environment variable is set, then the data files will be downloaded even if the + upstream hashes do not match. This is useful if you want to always use the latest version of the data files. + +Examples +-------- +Using the registry to download a file: + +.. 
code-block:: python + + from xhydro.testing.helpers import DEVEREAUX + import xarray as xr + + example_file = DEVEREAUX.fetch("example.nc") + data = xr.open_dataset(example_file) +""" + + +def populate_testing_data( + registry: Optional[Union[str, Path]] = None, + temp_folder: Optional[Path] = None, + branch: str = TESTDATA_BRANCH, + _local_cache: Path = _default_cache_dir, +) -> None: + """Populate the local cache with the testing data. + + Parameters + ---------- + registry : str or Path, optional + Path to the registry file. If not provided, the registry file from package_data will be used. + temp_folder : Path, optional + Path to a temporary folder to use as the local cache. If not provided, the default location will be used. + branch : str, optional + Branch of hydrologie/xhydro-testdata to use when fetching testing datasets. + _local_cache : Path, optional + Path to the local cache. Defaults to the default location. + + Returns + ------- + None + The testing data will be downloaded to the local cache.
+ """ + # Get registry file from package_data or provided path + registry = load_registry(registry) + + # Set the local cache to the temp folder + if temp_folder is not None: + _local_cache = temp_folder + # Set the branch + DEVEREAUX.version_dev = branch + # Set the local cache + DEVEREAUX.path = _local_cache + + # Download the files + for filename in registry.keys(): + DEVEREAUX.fetch(filename) diff --git a/xhydro/testing/registry.txt b/xhydro/testing/registry.txt new file mode 100644 index 00000000..e69de29b diff --git a/xhydro/testing/utils.py b/xhydro/testing/utils.py index 419eaa2a..ff52d2f4 100644 --- a/xhydro/testing/utils.py +++ b/xhydro/testing/utils.py @@ -6,6 +6,10 @@ from pathlib import Path from typing import Optional, TextIO, Union +__all__ = [ + "publish_release_notes", +] + def publish_release_notes( style: str = "md", @@ -26,7 +30,8 @@ def publish_release_notes( Returns ------- - str, optional + str or None + Formatted release notes as a string, if `file` is not provided. Notes -----