Skip to content

Commit

Permalink
Merge pull request #42 from metno/obs-mod-relative-altitude-filter
Browse files Browse the repository at this point in the history
Implement relative altitude filter
  • Loading branch information
heikoklein authored Sep 6, 2024
2 parents 763f6ea + a69a48d commit 87ac570
Show file tree
Hide file tree
Showing 10 changed files with 388 additions and 8 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install system packages
run: |
sudo apt update
sudo apt install libudunits2-dev
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
50 changes: 50 additions & 0 deletions scripts/benchmark_relalt_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pyaro
import csv
import time
import random
import os

# Generate test-data
if not os.path.exists("tmp_data.csv"):
with open("tmp_data.csv", "w", newline='') as csvfile:
writer = csv.writer(csvfile)
variable = "NOx"
unit = "Gg"
for i in range(75000):
name = f"station{i}"
lat = random.uniform(30.05, 81.95)
lon = random.uniform(-29.95, 89.95)
value = random.uniform(0, 1)
altitude = random.randrange(0, 1000)
start_time = "1997-01-01 00:00:00"
end_time = "1997-01-02 00:00:00"


writer.writerow((variable, name, lon, lat, value, unit, start_time, end_time, altitude))

# Benchmark
engines = pyaro.list_timeseries_engines()
with engines["csv_timeseries"].open(
filename="tmp_data.csv",
#filters=[pyaro.timeseries.filters.get("altitude", min_altitude=200)], # 0.023s
filters=[pyaro.timeseries.filters.get("relaltitude", topo_file = "../tests/testdata/datadir_elevation/topography.nc", rdiff=90)], # 27.133s
columns={
"variable": 0,
"station": 1,
"longitude": 2,
"latitude": 3,
"value": 4,
"units": 5,
"start_time": 6,
"end_time": 7,
"altitude": 8,
"country": "NO",
"standard_deviation": "NaN",
"flag": "0",
}) as ts:
start_time = time.perf_counter()
ts.stations()
end_time = time.perf_counter()


print(f"Total time: {end_time-start_time:.3f} seconds")
14 changes: 14 additions & 0 deletions scripts/make_topography_subset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import xarray as xr

# Script which extracts only the first time point and topography value from the emep topography
# file to reduce data for the tests.
data = xr.open_dataset("/lustre/storeB/project/fou/kl/emep/Auxiliary/topography.nc")

start_time = data["time"][0]

data = data.sel(time=slice(start_time))

data = data["topography"]


data.to_netcdf("tests/testdata/datadir_elevation/topography.nc")
15 changes: 11 additions & 4 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,31 @@ url = https://pyaro.readthedocs.io
[options]
python_version = >=3.10
install_requires =
numpy >= 1.13
numpy >= 1.13, <2.0.0
importlib-metadata >= 3.6; python_version < "3.10"
package_dir =
=src
packages = pyaro, pyaro.timeseries, pyaro.csvreader
test_require = tox:tox
test_require =
tox:tox

[options.extras_require]
optional =
pandas
cf-units
xarray
netcdf4

[tox:tox]
min_version = 4.0
env_list =
py312
py311
py310
depends =
pandas

[testenv]
commands = python3 -m unittest discover -s tests
extras = optional


[options.entry_points]
Expand Down
2 changes: 1 addition & 1 deletion src/pyaro/csvreader/CSVTimeseriesReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def col_keys(cls):
"""
return cls._col_keys

def metadata(self) -> dict():
def metadata(self) -> dict:
return self._metadata

def _unfiltered_data(self, varname) -> Data:
Expand Down
2 changes: 1 addition & 1 deletion src/pyaro/timeseries/AutoFilterReaderEngine.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def supported_filters(cls) -> list[Filter]:
:return: list of filters
"""
supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,time_resolution,flags,time_variable_station,altitude".split(
supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,time_resolution,flags,time_variable_station,altitude,relaltitude".split(
","
)
return [filters.get(name) for name in supported]
Expand Down
166 changes: 164 additions & 2 deletions src/pyaro/timeseries/Filter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
import logging
import math
import abc
from collections import defaultdict
import csv
from datetime import datetime
import inspect
import re
import sys
import types
from typing import Any

import numpy as np

from .Data import Data, Flag
from .Station import Station

try:
# Optional dependencies required for relative altitude filter.
import xarray as xr
from cf_units import Unit
except ImportError:
pass

logger = logging.getLogger(__name__)

class Filter(abc.ABC):
"""Base-class for all filters used from pyaro-Readers"""
Expand Down Expand Up @@ -855,4 +864,157 @@ def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:
if self._max_altitude is not None:
stations = {n: s for n, s in stations.items() if (not math.isnan(s["altitude"]) and s["altitude"] <= self._max_altitude) }

return stations
return stations

@registered_filter
class RelativeAltitudeFilter(StationFilter):
"""
Filter class which filters stations based on the relative difference between
the station altitude, and the gridded topography altitude.
https://github.com/metno/pyaro/issues/39
"""
UNITS_METER = Unit("m")
# https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#latitude-coordinate
UNITS_LAT = set(["degrees_north", "degree_north", "degree_N", "degrees_N", "degreeN", "degreesN"])

# https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#longitude-coordinate
UNITS_LON = set(["degrees_east", "degree_east", "degree_E", "degrees_E", "degreeE", "degreesE"])

def __init__(self, topo_file: str | None = None, topo_var: str = "topography", rdiff: float = 0):
"""
:param topo_file : A .nc file from which to read gridded topography data.
:param topo_var : Name of variable that stores altitude.
:param rdiff : Relative difference (in meters).
Note:
-----
- Stations will be kept if abs(altobs-altmod) <= rdiff.
- Stations will not be kept if station altitude is NaN.
Note:
-----
This filter requires additional dependencies (xarray, netcdf4, cf-units) to function. These can be installed
with `pip install .[optional]
"""
if "cf_units" not in sys.modules:
logger.warning("relaltitude filter is missing required dependency 'cf-units'. Please install to use this filter.")
if "xarray" not in sys.modules:
logger.warning("relaltitude filter is missing required dependency 'xarray'. Please install to use this filter.")

self._topo_file = topo_file
self._topo_var = topo_var
self._rdiff = rdiff

self._topography = None
if topo_file is not None:
self._topography = xr.open_dataset(topo_file)
self._convert_altitude_to_meters()
self._find_lat_lon_variables()
self._extract_bounding_box()
else:
logger.warning("No topography data provided (topo_file='%s'). Relative elevation filtering will not be applied.", topo_file)

def _convert_altitude_to_meters(self):
"""
Method which attempts to convert the altitude variable in the gridded topography data
to meters.
:raises TypeError
If conversion isn't possible.
"""
# Convert altitude to meters
units = Unit(self._topography[self._topo_var].units)
if units.is_convertible(self.UNITS_METER):
self._topography[self._topo_var].values = self.UNITS_METER.convert(self._topography[self._topo_var].values, self.UNITS_METER)
self._topography[self._topo_var]["units"] = str(self.UNITS_METER)
else:
raise TypeError(f"Expected altitude units to be convertible to 'm', got '{units}'")

def _find_lat_lon_variables(self):
"""
Determines the names of variables which represent the latitude and longitude
dimensions in the topography data.
These are assigned to self._lat, self._lon, respectively for later use.
"""
for var_name in self._topography.coords:
unit_str = self._topography[var_name].attrs.get("units", None)
if unit_str in self.UNITS_LAT:
self._lat = var_name
continue
if unit_str in self.UNITS_LON:
self._lon = var_name
continue

if any(x is None for x in [self._lat, self._lon]):
raise ValueError(f"Required variable names for lat, lon dimensions could not be found in file '{self._topo_file}")

def _extract_bounding_box(self):
"""
Extract the bounding box of the grid.
"""
self._boundary_west = float(self._topography[self._lon].min())
self._boundary_east = float(self._topography[self._lon].max())
self._boundary_south = float(self._topography[self._lat].min())
self._boundary_north = float(self._topography[self._lat].max())
logger.info("Bounding box (NESW): %.2f, %.2f, %.2f, %.2f", self._boundary_north, self._boundary_east, self._boundary_south, self._boundary_west)

def _gridded_altitude_from_lat_lon(self, lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
altitude = self._topography.sel({"lat": xr.DataArray(lat, dims="latlon"), "lon": xr.DataArray(lon, dims="latlon")}, method="nearest")

return altitude[self._topo_var].values[0]

def _is_close(self, alt_gridded: np.ndarray, alt_station: np.ndarray) -> np.ndarray[bool]:
"""
Function to check if two altitudes are within a relative tolerance of each
other.
:param alt_gridded : Gridded altitude (in meters).
:param alt_station : Observation / station altitude (in meters).
:returns :
True if the absolute difference between alt_gridded and alt_station is
<= self._rdiff
"""
return np.abs(alt_gridded-alt_station) <= self._rdiff

def init_kwargs(self):
return {
"topo_file": self._topo_file,
"topo_var": self._topo_var,
"rdiff": self._rdiff
}

def name(self):
return "relaltitude"

def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:
if self._topography is None:
return stations

names = np.ndarray(len(stations), dtype=np.dtypes.StrDType)
lats = np.ndarray(len(stations), dtype=np.float64)
lons = np.ndarray(len(stations), dtype=np.float64)
alts = np.ndarray(len(stations), dtype=np.float64)

for i, name in enumerate(stations):
station = stations[name]
names[i] = name
lats[i] = station["latitude"]
lons[i] = station["longitude"]
alts[i] = station["altitude"]

out_of_bounds_mask = np.logical_or(np.logical_or(lons < self._boundary_west, lons > self._boundary_east), np.logical_or(lats < self._boundary_south, lats > self._boundary_north))
if np.sum(out_of_bounds_mask) > 0:
logger.warning("Some stations were removed due to being out of bounds of the gridded topography")

topo = self._gridded_altitude_from_lat_lon(lats, lons)

within_rdiff_mask = self._is_close(topo, alts)

mask = np.logical_and(~out_of_bounds_mask, within_rdiff_mask)

selected_names = names[mask]

return {name: stations[name] for name in selected_names}
Loading

0 comments on commit 87ac570

Please sign in to comment.