diff --git a/.github/workflows/ci_tests.yml b/.github/workflows/ci_tests.yml index 4fb8e21..dbc1c57 100644 --- a/.github/workflows/ci_tests.yml +++ b/.github/workflows/ci_tests.yml @@ -21,6 +21,10 @@ jobs: uses: actions/setup-python@v5 with: python-version: "3.10" + - name: Install system packages + run: | + sudo apt update + sudo apt install libudunits2-dev - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/scripts/benchmark_relalt_filter.py b/scripts/benchmark_relalt_filter.py new file mode 100644 index 0000000..d9c3e71 --- /dev/null +++ b/scripts/benchmark_relalt_filter.py @@ -0,0 +1,50 @@ +import pyaro +import csv +import time +import random +import os + +# Generate test-data +if not os.path.exists("tmp_data.csv"): + with open("tmp_data.csv", "w", newline='') as csvfile: + writer = csv.writer(csvfile) + variable = "NOx" + unit = "Gg" + for i in range(75000): + name = f"station{i}" + lat = random.uniform(30.05, 81.95) + lon = random.uniform(-29.95, 89.95) + value = random.uniform(0, 1) + altitude = random.randrange(0, 1000) + start_time = "1997-01-01 00:00:00" + end_time = "1997-01-02 00:00:00" + + + writer.writerow((variable, name, lon, lat, value, unit, start_time, end_time, altitude)) + +# Benchmark +engines = pyaro.list_timeseries_engines() +with engines["csv_timeseries"].open( + filename="tmp_data.csv", + #filters=[pyaro.timeseries.filters.get("altitude", min_altitude=200)], # 0.023s + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "../tests/testdata/datadir_elevation/topography.nc", rdiff=90)], # 27.133s + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 8, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + }) as ts: + start_time = time.perf_counter() + ts.stations() + end_time = time.perf_counter() + + +print(f"Total time: {end_time-start_time:.3f} seconds") diff --git a/scripts/make_topography_subset.py b/scripts/make_topography_subset.py new file mode 100644 index 0000000..fa7f946 --- /dev/null +++ b/scripts/make_topography_subset.py @@ -0,0 +1,14 @@ +import xarray as xr + +# Script which extracts only the first time point and topography value from the emep topography +# file to reduce data for the tests. +data = xr.open_dataset("/lustre/storeB/project/fou/kl/emep/Auxiliary/topography.nc") + +start_time = data["time"][0] + +data = data.sel(time=slice(start_time)) + +data = data["topography"] + + +data.to_netcdf("tests/testdata/datadir_elevation/topography.nc") \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b26a7ce..266766e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,12 +23,20 @@ url = https://pyaro.readthedocs.io [options] python_version = >=3.10 install_requires = - numpy >= 1.13 + numpy >= 1.13, <2.0.0 importlib-metadata >= 3.6; python_version < "3.10" package_dir = =src packages = pyaro, pyaro.timeseries, pyaro.csvreader -test_require = tox:tox +test_require = + tox:tox + +[options.extras_require] +optional = + pandas + cf-units + xarray + netcdf4 [tox:tox] min_version = 4.0 @@ -36,11 +44,10 @@ env_list = py312 py311 py310 -depends = - pandas [testenv] commands = python3 -m unittest discover -s tests +extras = optional [options.entry_points] diff --git a/src/pyaro/csvreader/CSVTimeseriesReader.py b/src/pyaro/csvreader/CSVTimeseriesReader.py index 4b0833c..07120f8 100644 --- a/src/pyaro/csvreader/CSVTimeseriesReader.py +++ b/src/pyaro/csvreader/CSVTimeseriesReader.py @@ -180,7 +180,7 @@ def col_keys(cls): """ return cls._col_keys - def metadata(self) -> dict(): + def metadata(self) -> dict: return self._metadata def _unfiltered_data(self, varname) -> Data: diff --git a/src/pyaro/timeseries/AutoFilterReaderEngine.py b/src/pyaro/timeseries/AutoFilterReaderEngine.py index 82be4b8..9fac1d0 100644 --- a/src/pyaro/timeseries/AutoFilterReaderEngine.py +++ b/src/pyaro/timeseries/AutoFilterReaderEngine.py @@ -31,7 +31,7 @@ def supported_filters(cls) -> list[Filter]: :return: list of filters """ - supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,time_resolution,flags,time_variable_station,altitude".split( + supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,time_resolution,flags,time_variable_station,altitude,relaltitude".split( "," ) return [filters.get(name) for name in supported] diff --git a/src/pyaro/timeseries/Filter.py b/src/pyaro/timeseries/Filter.py index 87024bc..ac8d763 100644 --- a/src/pyaro/timeseries/Filter.py +++ b/src/pyaro/timeseries/Filter.py @@ -1,3 +1,4 @@ +import logging import math import abc from collections import defaultdict @@ -5,14 +6,22 @@ from datetime import datetime import inspect import re +import sys import types -from typing import Any import numpy as np from .Data import Data, Flag from .Station import Station +try: + # Optional dependencies required for relative altitude filter. + import xarray as xr + from cf_units import Unit +except ImportError: + pass + +logger = logging.getLogger(__name__) class Filter(abc.ABC): """Base-class for all filters used from pyaro-Readers""" @@ -855,4 +864,157 @@ def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]: if self._max_altitude is not None: stations = {n: s for n, s in stations.items() if (not math.isnan(s["altitude"]) and s["altitude"] <= self._max_altitude) } - return stations \ No newline at end of file + return stations + +@registered_filter +class RelativeAltitudeFilter(StationFilter): + """ + Filter class which filters stations based on the relative difference between + the station altitude, and the gridded topography altitude. + + https://github.com/metno/pyaro/issues/39 + """ + UNITS_METER = Unit("m") + # https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#latitude-coordinate + UNITS_LAT = set(["degrees_north", "degree_north", "degree_N", "degrees_N", "degreeN", "degreesN"]) + + # https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#longitude-coordinate + UNITS_LON = set(["degrees_east", "degree_east", "degree_E", "degrees_E", "degreeE", "degreesE"]) + + def __init__(self, topo_file: str | None = None, topo_var: str = "topography", rdiff: float = 0): + """ + :param topo_file : A .nc file from which to read gridded topography data. + :param topo_var : Name of variable that stores altitude. + :param rdiff : Relative difference (in meters). + + Note: + ----- + - Stations will be kept if abs(altobs-altmod) <= rdiff. + - Stations will not be kept if station altitude is NaN. + + Note: + ----- + This filter requires additional dependencies (xarray, netcdf4, cf-units) to function. These can be installed + with `pip install .[optional] + """ + if "cf_units" not in sys.modules: + logger.warning("relaltitude filter is missing required dependency 'cf-units'. Please install to use this filter.") + if "xarray" not in sys.modules: + logger.warning("relaltitude filter is missing required dependency 'xarray'. Please install to use this filter.") + + self._topo_file = topo_file + self._topo_var = topo_var + self._rdiff = rdiff + + self._topography = None + if topo_file is not None: + self._topography = xr.open_dataset(topo_file) + self._convert_altitude_to_meters() + self._find_lat_lon_variables() + self._extract_bounding_box() + else: + logger.warning("No topography data provided (topo_file='%s'). Relative elevation filtering will not be applied.", topo_file) + + def _convert_altitude_to_meters(self): + """ + Method which attempts to convert the altitude variable in the gridded topography data + to meters. + + :raises TypeError + If conversion isn't possible. + """ + # Convert altitude to meters + units = Unit(self._topography[self._topo_var].units) + if units.is_convertible(self.UNITS_METER): + self._topography[self._topo_var].values = self.UNITS_METER.convert(self._topography[self._topo_var].values, self.UNITS_METER) + self._topography[self._topo_var]["units"] = str(self.UNITS_METER) + else: + raise TypeError(f"Expected altitude units to be convertible to 'm', got '{units}'") + + def _find_lat_lon_variables(self): + """ + Determines the names of variables which represent the latitude and longitude + dimensions in the topography data. + + These are assigned to self._lat, self._lon, respectively for later use. + """ + for var_name in self._topography.coords: + unit_str = self._topography[var_name].attrs.get("units", None) + if unit_str in self.UNITS_LAT: + self._lat = var_name + continue + if unit_str in self.UNITS_LON: + self._lon = var_name + continue + + if any(x is None for x in [self._lat, self._lon]): + raise ValueError(f"Required variable names for lat, lon dimensions could not be found in file '{self._topo_file}") + + def _extract_bounding_box(self): + """ + Extract the bounding box of the grid. + """ + self._boundary_west = float(self._topography[self._lon].min()) + self._boundary_east = float(self._topography[self._lon].max()) + self._boundary_south = float(self._topography[self._lat].min()) + self._boundary_north = float(self._topography[self._lat].max()) + logger.info("Bounding box (NESW): %.2f, %.2f, %.2f, %.2f", self._boundary_north, self._boundary_east, self._boundary_south, self._boundary_west) + + def _gridded_altitude_from_lat_lon(self, lat: np.ndarray, lon: np.ndarray) -> np.ndarray: + altitude = self._topography.sel({"lat": xr.DataArray(lat, dims="latlon"), "lon": xr.DataArray(lon, dims="latlon")}, method="nearest") + + return altitude[self._topo_var].values[0] + + def _is_close(self, alt_gridded: np.ndarray, alt_station: np.ndarray) -> np.ndarray[bool]: + """ + Function to check if two altitudes are within a relative tolerance of each + other. + + :param alt_gridded : Gridded altitude (in meters). + :param alt_station : Observation / station altitude (in meters). + + :returns : + True if the absolute difference between alt_gridded and alt_station is + <= self._rdiff + """ + return np.abs(alt_gridded-alt_station) <= self._rdiff + + def init_kwargs(self): + return { + "topo_file": self._topo_file, + "topo_var": self._topo_var, + "rdiff": self._rdiff + } + + def name(self): + return "relaltitude" + + def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]: + if self._topography is None: + return stations + + names = np.ndarray(len(stations), dtype=np.dtypes.StrDType) + lats = np.ndarray(len(stations), dtype=np.float64) + lons = np.ndarray(len(stations), dtype=np.float64) + alts = np.ndarray(len(stations), dtype=np.float64) + + for i, name in enumerate(stations): + station = stations[name] + names[i] = name + lats[i] = station["latitude"] + lons[i] = station["longitude"] + alts[i] = station["altitude"] + + out_of_bounds_mask = np.logical_or(np.logical_or(lons < self._boundary_west, lons > self._boundary_east), np.logical_or(lats < self._boundary_south, lats > self._boundary_north)) + if np.sum(out_of_bounds_mask) > 0: + logger.warning("Some stations were removed due to being out of bounds of the gridded topography") + + topo = self._gridded_altitude_from_lat_lon(lats, lons) + + within_rdiff_mask = self._is_close(topo, alts) + + mask = np.logical_and(~out_of_bounds_mask, within_rdiff_mask) + + selected_names = names[mask] + + return {name: stations[name] for name in selected_names} \ No newline at end of file diff --git a/tests/test_CSVTimeSeriesReader.py b/tests/test_CSVTimeSeriesReader.py index dbd77bf..81f14e7 100644 --- a/tests/test_CSVTimeSeriesReader.py +++ b/tests/test_CSVTimeSeriesReader.py @@ -485,6 +485,149 @@ def test_altitude_filter_3(self): } ) as ts: self.assertEqual(len(ts.stations()), 1) + + def test_relaltitude_filter_emep_1(self): + engines = pyaro.list_timeseries_engines() + with engines["csv_timeseries"].open( + filename=self.elevation_file, + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "./tests/testdata/datadir_elevation/topography.nc", rdiff=0)], + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 9, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + } + ) as ts: + # Altitudes in test dataset: + # Station | Alt_obs | Modeobs | rdiff | + # Station 1 | 100 | 12.2554 | 87.7446 | + # Station 2 | 200 | 4.9016 | 195.0984 | + # Station 3 | 300 | 4.9016 | 195.0984 | + # Since rtol = 0, no station should be included. + self.assertEqual(len(ts.stations()), 0) + + def test_relaltitude_filter_emep_2(self): + engines = pyaro.list_timeseries_engines() + with engines["csv_timeseries"].open( + filename=self.elevation_file, + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "./tests/testdata/datadir_elevation/topography.nc", rdiff=90)], + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 9, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + } + ) as ts: + # At rdiff = 90, only the first station should be included. + self.assertEqual(len(ts.stations()), 1) + + def test_relaltitude_filter_emep_3(self): + engines = pyaro.list_timeseries_engines() + with engines["csv_timeseries"].open( + filename=self.elevation_file, + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "./tests/testdata/datadir_elevation/topography.nc", rdiff=300)], + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 9, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + } + ) as ts: + # Since rdiff=300, all stations should be included. + self.assertEqual(len(ts.stations()), 3) + + def test_relaltitude_filter_1(self): + engines = pyaro.list_timeseries_engines() + with engines["csv_timeseries"].open( + filename=self.elevation_file, + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "./tests/testdata/datadir_elevation/topography.nc", rdiff=0)], + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 9, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + } + ) as ts: + self.assertEqual(len(ts.stations()), 0) + + def test_relaltitude_filter_2(self): + engines = pyaro.list_timeseries_engines() + with engines["csv_timeseries"].open( + filename=self.elevation_file, + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "./tests/testdata/datadir_elevation/topography.nc", rdiff=90)], + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 9, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + } + ) as ts: + # At rdiff = 90, only the first station should be included. + self.assertEqual(len(ts.stations()), 1) + + def test_relaltitude_filter_3(self): + engines = pyaro.list_timeseries_engines() + with engines["csv_timeseries"].open( + filename=self.elevation_file, + filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "./tests/testdata/datadir_elevation/topography.nc", rdiff=300)], + columns={ + "variable": 0, + "station": 1, + "longitude": 2, + "latitude": 3, + "value": 4, + "units": 5, + "start_time": 6, + "end_time": 7, + "altitude": 9, + "country": "NO", + "standard_deviation": "NaN", + "flag": "0", + } + ) as ts: + # Since rdiff=300, all stations should be included. + self.assertEqual(len(ts.stations()), 3) + if __name__ == "__main__": diff --git a/tests/testdata/datadir_elevation/orography.nc b/tests/testdata/datadir_elevation/orography.nc new file mode 100644 index 0000000..26a5acc Binary files /dev/null and b/tests/testdata/datadir_elevation/orography.nc differ diff --git a/tests/testdata/datadir_elevation/topography.nc b/tests/testdata/datadir_elevation/topography.nc new file mode 100644 index 0000000..9672cd8 Binary files /dev/null and b/tests/testdata/datadir_elevation/topography.nc differ