diff --git a/docs/examples/geo.md b/docs/examples/geo.md index 927b15a..6c02988 100644 --- a/docs/examples/geo.md +++ b/docs/examples/geo.md @@ -117,13 +117,13 @@ Return a 1-column geodataframe with pseudoabsences concatenated to presence reco ```python presence_points = gpd.read_file('/path/to/occurrence-records.gpkg') -ela.stack_geometries(presence_points, pseudoabsence_points) +point_stack = ela.stack_geometries(presence_points, pseudoabsence_points) ``` Return 2 columns, with class labels assigned (1 for presences, 0 for pseudoabsences): ```python -ela.stack_geometries( +point_stack = ela.stack_geometries( presence_points, pseudoabsence_points, add_class_label=True, @@ -133,9 +133,10 @@ ela.stack_geometries( If the geometries are in different crs, default is to reproject to the presence crs. Override this with target_crs="background": ```python -ela.stack_geometries( +point_stack = ela.stack_geometries( presence_points, pseudoabsence_points, + add_class_label=True, target_crs="background", ) ``` @@ -149,9 +150,9 @@ Annotation refers to reading and storing raster values at the locations of a ser Once you have your species presence and pseudo-absence records, you can annotate these records with the covariate data from each location. ```python -pseudoabsence_covariates = ela.annotate( - pseudoabsence_points, - list_of_raster_paths, +covariates = ela.annotate( + point_stack, + list_of_rasters, drop_na = True, ) ``` @@ -176,8 +177,8 @@ labels = [ "TMP-mean", ] -pseudoabsence_covariates = ela.annotate( - pseudoabsence_points, +covariates = ela.annotate( + point_stack, raster_paths, labels = labels drop_na = True, @@ -199,7 +200,13 @@ One way to add spatial information to a model is to compute geographically-expli `elapid` does this by calculating sample weights based on the distance to the nearest neighbor. Points nearby other points receive lower weight scores; far-away points receive higher weight scores. ```python -sample_weight = ela.distance_weights(pseudoabsence_points) +sample_weight = ela.distance_weights(point_stack) +``` + +The default is to compute weights based on the distance to the nearest point. You can instead compute the average distance to `n` nearest points instead to compute sample weights using point densities instead of the distance to the single nearest point. This may be useful if you have small clusters of a few points far away from large, densely populated regions. + +```python +sample_weight = ela.distance_weights(point_stack, n_neighbors=10) ``` These weights can be passed to many many model fitting routines, typically via `model.fit(x, y, sample_weight=sample_weight)`. This is supported for `ela.MaxentModel()`, as well as many `sklearn` methods. @@ -208,6 +215,34 @@ This function uses `ela.nearest_point_distance()`, a handy function for computin --- +## Train/test splits + +Uniformly random train/test splits are generally discouraged in spatial modeling because of the strong spatial structure inherent in many datasets. The non-independence of these data is referred to as spatial autocorrelation. Using distance- or density-based sample weights is one way to mitigate these effects. Another is to split the data into geographically distinct train/test regions to try and prioritize model generalization. + +One method is to use a "checkerbox" system for creating train/test splits. Points are intersected along a regular grid, and every other grid is used to split the data into train/test sets. + +```python +train, test = ela.checkerboard_split(point_stack, grid_size=1000) +``` + +The height and width of the grid used to split the data is controlled by the `grid_size` parameter. This should specify distance in the units of the point data's CRS. The above call would split data along a 1x1 km grid if the CRS units were in meters. + +The black and white structure of the checkerboard means this method can only generate one train/test split. + +Alternatively, you can create `k` geographically-clustered folds using the `GeographicKFold` cross validation strategy: + +```python +gfolds = ela.GeographicKFold(n_folds=4) +for train_idx, test_idx in gfolds.split(point_stack): + train_points = point_stack.iloc[train_idx] + test_points = point_stack.iloc[test_idx] + # split x/y data, fit models, evaluate, etc. +``` + +This method uses KMeans clustering, fit with the x/y locations of the point data, to group points into spatially distinct clusters. This cross-validation strategy is a good way to test how well models generalize outside of their training extents into novel geographic regions. + +--- + ## Zonal statistics In addition to the tools for working with Point data, `elapid` contains a routine for calculating zonal statistics from Polygon or MutliPolygon geometry types. diff --git a/docs/index.md b/docs/index.md index 8322e64..28f11e6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,6 +71,10 @@ Transform covariate data into derivative `features` to expand data dimensionalit Train and apply species distribution models based on annotated point data, configured with sensible defaults (like `elapid.MaxentModel()` and `elapid.NicheEnvelopeModel()`). +:satellite: **Training spatially-aware models** + +Compute spatially-explicit sample weights, checkerboard train/test splits, or geographically-clustered cross-validation splits to reduce spatial autocorellation effects (with `elapid.distance_weights()`, `elapid.checkerboard_split()` and `elapid.GeographicKFold()`). + :earth_asia: **Applying models to rasters** Apply any pixel-based model with a `.predict()` method to raster data to easily create prediction probability maps (like training a `RandomForestClassifier()` and applying with `elapid.apply_model_to_rasters()`). diff --git a/docs/module/train_test_split.md b/docs/module/train_test_split.md new file mode 100644 index 0000000..f3a06e4 --- /dev/null +++ b/docs/module/train_test_split.md @@ -0,0 +1,3 @@ +# elapid.train_test_split + +::: elapid.train_test_split diff --git a/elapid/__init__.py b/elapid/__init__.py index b8c6338..a027866 100644 --- a/elapid/__init__.py +++ b/elapid/__init__.py @@ -24,4 +24,5 @@ ) from elapid.models import MaxentModel, NicheEnvelopeModel from elapid.stats import normalize_sample_probabilities +from elapid.train_test_split import GeographicKFold, checkerboard_split from elapid.utils import load_object, load_sample_data, save_object diff --git a/elapid/__version__.py b/elapid/__version__.py index baaa23f..83f9d0b 100644 --- a/elapid/__version__.py +++ b/elapid/__version__.py @@ -1 +1 @@ -"0.3.13" +"0.3.14" diff --git a/elapid/train_test_split.py b/elapid/train_test_split.py new file mode 100644 index 0000000..c49f8a1 --- /dev/null +++ b/elapid/train_test_split.py @@ -0,0 +1,112 @@ +"""Methods for geographlically splitting data into train/test splits""" + +from typing import List, Tuple + +import geopandas as gpd +import numpy as np +from shapely.geometry import box +from sklearn.cluster import KMeans +from sklearn.model_selection import BaseCrossValidator + +from elapid.types import Vector + + +def checkerboard_split( + points: Vector, grid_size: float, buffer: float = 0, bounds: Tuple[float, float, float, float] = None +) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: + """Create train/test splits with a spatially-gridded checkerboard. + + Args: + points: point-format GeoSeries or GeoDataFrame + grid_size: the height and width of each checkerboard side to split + data using. Should match the units of the points CRS + (i.e. grid_size=1000 is a 1km grid for UTM data) + buffer: add an x/y buffer around the initial checkerboard bounds + bounds: instead of deriving the checkerboard bounds from `points`, + use this tuple of [xmin, ymin, xmax, ymax] values. + + Returns: + (train_points, test_points) split using a checkerboard grid. + """ + if isinstance(points, gpd.GeoSeries): + points = points.to_frame("geometry") + + bounds = points.total_bounds if bounds is None else bounds + xmin, ymin, xmax, ymax = bounds + + x0s = np.arange(xmin - buffer, xmax + buffer + grid_size, grid_size) + y0s = np.arange(ymin - buffer, ymax + buffer + grid_size, grid_size) + + train_cells = [] + test_cells = [] + for idy, y0 in enumerate(y0s): + offset = 0 if idy % 2 == 0 else 1 + for idx, x0 in enumerate(x0s): + cell = box(x0, y0, x0 + grid_size, y0 + grid_size) + cell_type = 0 if (idx + offset) % 2 == 0 else 1 + if cell_type == 0: + train_cells.append(cell) + else: + test_cells.append(cell) + + grid_crs = points.crs + train_grid = gpd.GeoDataFrame(geometry=train_cells, crs=grid_crs) + test_grid = gpd.GeoDataFrame(geometry=test_cells, crs=grid_crs) + train_points = ( + gpd.sjoin(points, train_grid, how="left", predicate="within") + .dropna() + .drop(columns="index_right") + .reset_index(drop=True) + ) + test_points = ( + gpd.sjoin(points, test_grid, how="left", predicate="within") + .dropna() + .drop(columns="index_right") + .reset_index(drop=True) + ) + + return train_points, test_points + + +class GeographicKFold(BaseCrossValidator): + """Compute geographically-clustered train/test folds using KMeans clustering""" + + def __init__(self, n_splits: int = 4): + self.n_splits = n_splits + + def split(self, points: Vector) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: + """Split point data into geographically-clustered train/test folds and + return their array indices. + + Args: + points: point-format GeoSeries or GeoDataFrame. + + Yields: + (train_idxs, test_idxs) the train/test splits for each geo fold. + """ + for train, test in super().split(points): + yield train, test + + def _iter_test_indices(self, X, y=None, groups=None): + """The method used by the base class to split train/test data""" + kmeans = KMeans(n_clusters=self.n_splits) + xy = np.array(list(zip(X.geometry.x, X.geometry.y))) + kmeans.fit(xy) + clusters = kmeans.predict(xy) + indices = np.arange(len(xy)) + for cluster in range(self.n_splits): + test = clusters == cluster + yield indices[test] + + def get_n_splits(self, X=None, y=None, groups=None) -> int: + """Returns the number of splitting iterations in the cross-validator + + Args: + X: ignored, exists for compatibility. + y: ignored, exists for compatibility. + groups: ignored, exists for compatibility. + + Returns: + The number of splitting iterations in the cross-validator. + """ + return self.n_splits diff --git a/mkdocs.yml b/mkdocs.yml index 07e2dbb..7e28d51 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -36,6 +36,7 @@ nav: - elapid.geo: 'module/geo.md' - elapid.models: 'module/models.md' - elapid.stats: 'module/stats.md' + - elapid.train_test_split: 'module/train_test_split.md' - elapid.types: 'module/types.md' - elapid.utils: 'module/utils.md' - Contributing to elapid: 'contributing.md' diff --git a/tests/test_train_test_split.py b/tests/test_train_test_split.py new file mode 100644 index 0000000..7aac0e9 --- /dev/null +++ b/tests/test_train_test_split.py @@ -0,0 +1,34 @@ +import os + +import geopandas as gpd +import numpy as np + +from elapid import train_test_split + +# set the test raster data paths +directory_path, script_path = os.path.split(os.path.abspath(__file__)) +data_path = os.path.join(directory_path, "data") +points = gpd.read_file(os.path.join(data_path, "test-point-samples.gpkg")) + + +def test_checkerboard_split(): + train, test = train_test_split.checkerboard_split(points, grid_size=1000) + assert isinstance(train, gpd.GeoDataFrame) + + buffer = 500 + xmin, ymin, xmax, ymax = points.total_bounds + buffered_bounds = [xmin - buffer, ymin - buffer, xmax + buffer, ymax + buffer] + train_buffered, test_buffered = train_test_split.checkerboard_split(points, grid_size=1000, bounds=buffered_bounds) + assert len(train_buffered) > len(train) + + +def test_GeographicKFold(): + n_folds = 4 + gfolds = train_test_split.GeographicKFold(n_splits=n_folds) + counted_folds = 0 + for train_idx, test_idx in gfolds.split(points): + train = points.iloc[train_idx] + test = points.iloc[test_idx] + assert len(train) > len(test) + counted_folds += 1 + assert gfolds.get_n_splits() == n_folds == counted_folds