diff --git a/docs/day2.html b/docs/day2.html index 34826a3..03be557 100644 --- a/docs/day2.html +++ b/docs/day2.html @@ -689,8 +689,6 @@

Correction for edge effect?

density(CSR) |> plot()
-# Warning in seq.default(xrange[1L], xrange[2L], length = n + 1L):
-# partial argument match of 'length' to 'length.out'
 plot(CSR, add = TRUE, col = 'green')
@@ -699,8 +697,6 @@
density(ppi) |> plot()
-# Warning in seq.default(xrange[1L], xrange[2L], length = n + 1L):
-# partial argument match of 'length' to 'length.out'
 plot(ppi, add = TRUE, col = 'green')
@@ -709,8 +705,6 @@
density(ppi, sigma = .05) |> plot()
-# Warning in seq.default(xrange[1L], xrange[2L], length = n + 1L):
-# partial argument match of 'length' to 'length.out'
 plot(ppi, add = TRUE, col = 'green')
@@ -849,7 +843,6 @@
# assuming Inhomogeneous Poisson:
 ppm(ppi, ~x)
-# Warning in FIT$coef: partial match of 'coef' to 'coefficients'
 # Nonstationary Poisson process
 # Fitted to point pattern dataset 'ppi'
 # 
@@ -864,7 +857,6 @@
 # x               1.96 0.247    1.48    2.45   ***  7.96
 # assuming Inhomogeneous clustered:
 kppm(Y, ~x)
-# Warning in FIT$coef: partial match of 'coef' to 'coefficients'
 # Inhomogeneous cluster point process model
 # Fitted to point pattern dataset 'Y'
 # Fitted by minimum contrast
diff --git a/docs/day3.html b/docs/day3.html
index 743f24e..06c8d16 100644
--- a/docs/day3.html
+++ b/docs/day3.html
@@ -418,7 +418,7 @@
 
  • Which model do you like the best? Can the SSErr attribute of the fitted model be used to compare the models? How else can variogram model fits be compared?
  • 3.5 Interpolation

    -

    For interpolation, we first need a target grid (point patterns have an observation window, geostatistical data not!)

    +

    For interpolation, we first need a target grid (point patterns have an observation window, geostatistical data do not!)

    A simple interpolator (that is hard to beat) is the inverse distance interpolator, \[\hat{Z}(s_0) = \sum_{j=1}^n \lambda_j Z(s_i)\] with \(\lambda_j\) proportional to \(||s_i - s_0||^{-p}\) and normalized to sum to one (weighted mean), and \(p\) tunable but defaulting to 2.

    Using the data range:

    @@ -1308,7 +1308,7 @@ ## Interpolation -For interpolation, we first need a target grid (point patterns have an observation window, geostatistical data not!) +For interpolation, we first need a target grid (point patterns have an observation window, geostatistical data do not!) A simple interpolator (that is hard to beat) is the inverse distance interpolator, $$\hat{Z}(s_0) = \sum_{j=1}^n \lambda_j Z(s_i)$$ with $\lambda_j$ proportional to $||s_i - s_0||^{-p}$ and normalized to sum to one (weighted mean), and $p$ tunable but defaulting to 2. diff --git a/docs/day4.html b/docs/day4.html index e925d65..89ed472 100644 --- a/docs/day4.html +++ b/docs/day4.html @@ -585,8 +585,6 @@ # # combine rf = randomForest(NO2~., as.data.frame(no2.sf)[c("NO2", "x", "y")]) -# Warning in seq.default(along = m): partial argument match of -# 'along' to 'along.with' g4$rf = predict(rf, as.data.frame(g4)) plot(g4["rf"], breaks = "equal", reset = FALSE, main = "random forest") plot(st_cast(st_geometry(de), "MULTILINESTRING"), add = TRUE, col = 'green')
    @@ -603,8 +601,6 @@ no2.sf$x1 = no2.sf$x + no2.sf$y no2.sf$y1 = no2.sf$x - no2.sf$y rf = randomForest(NO2~., as.data.frame(no2.sf)[c("NO2", "x1", "y1")]) -# Warning in seq.default(along = m): partial argument match of -# 'along' to 'along.with' g4$x1 = g4$x + g4$y g4$y1 = g4$x - g4$y g4$rf_rot = predict(rf, as.data.frame(g4)) @@ -650,8 +646,6 @@
    library(randomForest)
     rf = randomForest(NO2~., as.data.frame(no2.sf)[c("NO2", n[grepl("d_", n)])])
    -# Warning in seq.default(along = m): partial argument match of
    -# 'along' to 'along.with'
     g4$rf_d = predict(rf, as.data.frame(g4))
     plot(g4["rf_d"], breaks = "equal", reset = FALSE, main = "random forest")
     plot(st_cast(st_geometry(de), "MULTILINESTRING"), add = TRUE, col = 'green')
    @@ -727,12 +721,8 @@ # [201] "d_184" "d_185" "d_186" "d_187" # [205] "d_188" "d_189" "d_190" "d_191" # [209] "d_192" "d_193" "d_194" "d_195" -# [213] "d_196" "d_197" "d_198" "d_199" -# [217] "d_200" "d_201" "d_202" "d_203" -# [221] "d_204" +# [213] "d_196" "d_197" "d_198" rf = randomForest(NO2~., as.data.frame(no2.sf)[c("NO2", n[grepl("d_", n)])]) -# Warning in seq.default(along = m): partial argument match of -# 'along' to 'along.with' g4$rf_dm = predict(rf, as.data.frame(g4)) plot(g4["rf_dm"], breaks = "equal", reset = FALSE, main = "random forest") plot(st_cast(st_geometry(de), "MULTILINESTRING"), add = TRUE, col = 'green')
    @@ -791,75 +781,6 @@ x = st_drop_geometry(splotdata)[,6:16] y = splotdata$Species_richness tr = train(x, y) # chooses a random forest by default -# Warning in seq.default(along = pkg): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = models$library): partial argument -# match of 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = data): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = x): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = outcome): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(along = x): partial argument match of -# 'along' to 'along.with' -# Warning in seq.default(2, to = p, length = len): partial argument -# match of 'length' to 'length.out' -# Warning in seq.default(along = resampleIndex): partial argument -# match of 'along' to 'along.with' -# Warning in tmp$resample: partial match of 'resample' to 'resamples' -# Warning in seq.default(along = paramNames): partial argument match -# of 'along' to 'along.with' -# Warning in seq.default(along = y): partial argument match of -# 'along' to 'along.with' predict(split(r), tr) |> plot()
    @@ -886,16 +807,7 @@

    Explained here;

    aoa <- aoa(r, tr, verbose = FALSE)
    -# Warning in seq.default(along = pkg): partial argument match of
    -# 'along' to 'along.with'
    -# Warning in seq.default(along = code$library): partial argument
    -# match of 'along' to 'along.with'
    -# Warning in seq.default(along = pkg): partial argument match of
    -# 'along' to 'along.with'
    -# Warning in seq.default(along = code$library): partial argument
    -# match of 'along' to 'along.with'
     # note: Either no model was given or no CV was used for model training. The DI threshold is therefore based on all training data
    -# Warning in trainDI$thres: partial match of 'thres' to 'threshold'
     plot(aoa)
     # Warning: Removed 395 rows containing non-finite outside the scale range
     # (`stat_density()`).
    diff --git a/docs/day4_files/figure-html/unnamed-chunk-11-1.png b/docs/day4_files/figure-html/unnamed-chunk-11-1.png index a483189..ff6f701 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-11-1.png and b/docs/day4_files/figure-html/unnamed-chunk-11-1.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-12-1.png b/docs/day4_files/figure-html/unnamed-chunk-12-1.png index b8738df..765241a 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-12-1.png and b/docs/day4_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-13-1.png b/docs/day4_files/figure-html/unnamed-chunk-13-1.png index f401257..a301509 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-13-1.png and b/docs/day4_files/figure-html/unnamed-chunk-13-1.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-15-1.png b/docs/day4_files/figure-html/unnamed-chunk-15-1.png index dd0da09..b17dd6d 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-15-1.png and b/docs/day4_files/figure-html/unnamed-chunk-15-1.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-15-2.png b/docs/day4_files/figure-html/unnamed-chunk-15-2.png index c0f0e80..bf837bd 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-15-2.png and b/docs/day4_files/figure-html/unnamed-chunk-15-2.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-15-3.png b/docs/day4_files/figure-html/unnamed-chunk-15-3.png index 6eda966..32a0030 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-15-3.png and b/docs/day4_files/figure-html/unnamed-chunk-15-3.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-16-1.png b/docs/day4_files/figure-html/unnamed-chunk-16-1.png index d44e19c..d825e41 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-16-1.png and b/docs/day4_files/figure-html/unnamed-chunk-16-1.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-8-1.png b/docs/day4_files/figure-html/unnamed-chunk-8-1.png index ec2833f..809c65f 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-8-1.png and b/docs/day4_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/docs/day4_files/figure-html/unnamed-chunk-9-1.png b/docs/day4_files/figure-html/unnamed-chunk-9-1.png index 3910208..d86802c 100644 Binary files a/docs/day4_files/figure-html/unnamed-chunk-9-1.png and b/docs/day4_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/search.json b/docs/search.json index ee62de4..55f87ff 100644 --- a/docs/search.json +++ b/docs/search.json @@ -184,7 +184,7 @@ "href": "day2.html#estimating-density", "title": "\n2  Point Pattern data\n", "section": "\n2.6 Estimating density", - "text": "2.6 Estimating density\n\nmain parameter: bandwidth (sigma): determines the amound of smoothing.\nif sigma is not specified: uses bw.diggle, an automatically tuned bandwidth\n\nCorrection for edge effect?\n\ndensity(CSR) |> plot()\n# Warning in seq.default(xrange[1L], xrange[2L], length = n + 1L):\n# partial argument match of 'length' to 'length.out'\nplot(CSR, add = TRUE, col = 'green')\n\n\n\n\n\n\ndensity(ppi) |> plot()\n# Warning in seq.default(xrange[1L], xrange[2L], length = n + 1L):\n# partial argument match of 'length' to 'length.out'\nplot(ppi, add = TRUE, col = 'green')\n\n\n\n\n\n\ndensity(ppi, sigma = .05) |> plot()\n# Warning in seq.default(xrange[1L], xrange[2L], length = n + 1L):\n# partial argument match of 'length' to 'length.out'\nplot(ppi, add = TRUE, col = 'green')", + "text": "2.6 Estimating density\n\nmain parameter: bandwidth (sigma): determines the amound of smoothing.\nif sigma is not specified: uses bw.diggle, an automatically tuned bandwidth\n\nCorrection for edge effect?\n\ndensity(CSR) |> plot()\nplot(CSR, add = TRUE, col = 'green')\n\n\n\n\n\n\ndensity(ppi) |> plot()\nplot(ppi, add = TRUE, col = 'green')\n\n\n\n\n\n\ndensity(ppi, sigma = .05) |> plot()\nplot(ppi, add = TRUE, col = 'green')", "crumbs": [ "2  Point Pattern data" ] @@ -204,7 +204,7 @@ "href": "day2.html#fitting-models-to-clustered-data", "title": "\n2  Point Pattern data\n", "section": "\n2.8 Fitting models to clustered data", - "text": "2.8 Fitting models to clustered data\n\n# assuming Inhomogeneous Poisson:\nppm(ppi, ~x)\n# Warning in FIT$coef: partial match of 'coef' to 'coefficients'\n# Nonstationary Poisson process\n# Fitted to point pattern dataset 'ppi'\n# \n# Log intensity: ~x\n# \n# Fitted trend coefficients:\n# (Intercept) x \n# 4.33 1.96 \n# \n# Estimate S.E. CI95.lo CI95.hi Ztest Zval\n# (Intercept) 4.33 0.174 3.99 4.67 *** 24.91\n# x 1.96 0.247 1.48 2.45 *** 7.96\n# assuming Inhomogeneous clustered:\nkppm(Y, ~x)\n# Warning in FIT$coef: partial match of 'coef' to 'coefficients'\n# Inhomogeneous cluster point process model\n# Fitted to point pattern dataset 'Y'\n# Fitted by minimum contrast\n# Summary statistic: inhomogeneous K-function\n# \n# Log intensity: ~x\n# \n# Fitted trend coefficients:\n# (Intercept) x \n# 3.69 1.47 \n# \n# Cluster model: Thomas process\n# Fitted cluster parameters:\n# kappa scale \n# 7.731 0.038 \n# Mean cluster size: [pixel image]\n# \n# Cluster strength: phi = 7.122\n# Sibling probability: psib = 0.8769", + "text": "2.8 Fitting models to clustered data\n\n# assuming Inhomogeneous Poisson:\nppm(ppi, ~x)\n# Nonstationary Poisson process\n# Fitted to point pattern dataset 'ppi'\n# \n# Log intensity: ~x\n# \n# Fitted trend coefficients:\n# (Intercept) x \n# 4.33 1.96 \n# \n# Estimate S.E. CI95.lo CI95.hi Ztest Zval\n# (Intercept) 4.33 0.174 3.99 4.67 *** 24.91\n# x 1.96 0.247 1.48 2.45 *** 7.96\n# assuming Inhomogeneous clustered:\nkppm(Y, ~x)\n# Inhomogeneous cluster point process model\n# Fitted to point pattern dataset 'Y'\n# Fitted by minimum contrast\n# Summary statistic: inhomogeneous K-function\n# \n# Log intensity: ~x\n# \n# Fitted trend coefficients:\n# (Intercept) x \n# 3.69 1.47 \n# \n# Cluster model: Thomas process\n# Fitted cluster parameters:\n# kappa scale \n# 7.731 0.038 \n# Mean cluster size: [pixel image]\n# \n# Cluster strength: phi = 7.122\n# Sibling probability: psib = 0.8769", "crumbs": [ "2  Point Pattern data" ] @@ -294,7 +294,7 @@ "href": "day3.html#interpolation", "title": "\n3  Geostatistical data\n", "section": "\n3.5 Interpolation", - "text": "3.5 Interpolation\nFor interpolation, we first need a target grid (point patterns have an observation window, geostatistical data not!)\nA simple interpolator (that is hard to beat) is the inverse distance interpolator, \\[\\hat{Z}(s_0) = \\sum_{j=1}^n \\lambda_j Z(s_i)\\] with \\(\\lambda_j\\) proportional to \\(||s_i - s_0||^{-p}\\) and normalized to sum to one (weighted mean), and \\(p\\) tunable but defaulting to 2.\nUsing the data range:\n\nlibrary(stars)\n# Loading required package: abind\ng1 = st_as_stars(st_bbox(no2.sf))\nlibrary(gstat)\nidw(NO2~1, no2.sf, g1) |> plot(reset = FALSE)\n# [inverse distance weighted interpolation]\nplot(st_geometry(no2.sf), add = TRUE, col = 'yellow')\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'red')\n\n\n\n\n\n\n\nBetter to use the outer polygon:\n\ng2 = st_as_stars(st_bbox(de))\nidw(NO2~1, no2.sf, g2) |> plot(reset = FALSE)\n# [inverse distance weighted interpolation]\nplot(st_geometry(no2.sf), add = TRUE, col = 'yellow')\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'red')\n\n\n\n\n\n\n\nAnd crop to (mask out outside) the area of interest:\n\ng3 = st_crop(g2, de)\ni = idw(NO2~1, no2.sf, g3) \n# [inverse distance weighted interpolation]\nplot(i, reset = FALSE, main = \"yearly mean NO2, rural background\")\nplot(st_geometry(no2.sf), add = TRUE, col = 'yellow')\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'red')\n\n\n\n\n\n\n\nGeostatistical approaches compute weights based on covariances between observations \\(Z(s_i)\\), and between observations and the value at the interpolation location \\(Z(s_0)\\). These covariances are obtained from a model fitted to the sample variogram.", + "text": "3.5 Interpolation\nFor interpolation, we first need a target grid (point patterns have an observation window, geostatistical data do not!)\nA simple interpolator (that is hard to beat) is the inverse distance interpolator, \\[\\hat{Z}(s_0) = \\sum_{j=1}^n \\lambda_j Z(s_i)\\] with \\(\\lambda_j\\) proportional to \\(||s_i - s_0||^{-p}\\) and normalized to sum to one (weighted mean), and \\(p\\) tunable but defaulting to 2.\nUsing the data range:\n\nlibrary(stars)\n# Loading required package: abind\ng1 = st_as_stars(st_bbox(no2.sf))\nlibrary(gstat)\nidw(NO2~1, no2.sf, g1) |> plot(reset = FALSE)\n# [inverse distance weighted interpolation]\nplot(st_geometry(no2.sf), add = TRUE, col = 'yellow')\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'red')\n\n\n\n\n\n\n\nBetter to use the outer polygon:\n\ng2 = st_as_stars(st_bbox(de))\nidw(NO2~1, no2.sf, g2) |> plot(reset = FALSE)\n# [inverse distance weighted interpolation]\nplot(st_geometry(no2.sf), add = TRUE, col = 'yellow')\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'red')\n\n\n\n\n\n\n\nAnd crop to (mask out outside) the area of interest:\n\ng3 = st_crop(g2, de)\ni = idw(NO2~1, no2.sf, g3) \n# [inverse distance weighted interpolation]\nplot(i, reset = FALSE, main = \"yearly mean NO2, rural background\")\nplot(st_geometry(no2.sf), add = TRUE, col = 'yellow')\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'red')\n\n\n\n\n\n\n\nGeostatistical approaches compute weights based on covariances between observations \\(Z(s_i)\\), and between observations and the value at the interpolation location \\(Z(s_0)\\). These covariances are obtained from a model fitted to the sample variogram.", "crumbs": [ "3  Geostatistical data" ] @@ -374,7 +374,7 @@ "href": "day4.html#spatial-coordinates-as-predictor", "title": "\n4  Machine Learning methods applied to spatial data\n", "section": "\n4.1 Spatial coordinates as predictor", - "text": "4.1 Spatial coordinates as predictor\nWe’ll rename coordinates to x and y:\n\nlibrary(dplyr)\n# \n# Attaching package: 'dplyr'\n# The following objects are masked from 'package:stats':\n# \n# filter, lag\n# The following objects are masked from 'package:base':\n# \n# intersect, setdiff, setequal, union\nlibrary(sf)\n# Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE\ncrs <- st_crs(\"EPSG:32632\") # a csv doesn't carry a CRS!\nno2 <- read.csv(system.file(\"external/no2.csv\",\n package = \"gstat\")) \nno2 |> rename(x = station_longitude_deg, y = station_latitude_deg) |> \n st_as_sf(crs = \"OGC:CRS84\", coords =\n c(\"x\", \"y\"), remove = FALSE) |>\n st_transform(crs) -> no2.sf\n# we need to reassign x and y:\ncc = st_coordinates(no2.sf)\nno2.sf$x = cc[,1]\nno2.sf$y = cc[,2]\nhead(no2.sf)\n# Simple feature collection with 6 features and 21 fields\n# Geometry type: POINT\n# Dimension: XY\n# Bounding box: xmin: 495000 ymin: 5320000 xmax: 816000 ymax: 5930000\n# Projected CRS: WGS 84 / UTM zone 32N\n# station_european_code station_local_code country_iso_code\n# 1 DENI063 DENI063 DE\n# 2 DEBY109 DEBY109 DE\n# 3 DEBE056 DEBE056 DE\n# 4 DEBE062 DEBE062 DE\n# 5 DEBE032 DEBE032 DE\n# 6 DEHE046 DEHE046 DE\n# country_name station_name station_start_date\n# 1 Germany Altes Land 1999-02-11\n# 2 Germany Andechs/Rothenfeld 2003-04-17\n# 3 Germany B Friedrichshagen 1994-02-01\n# 4 Germany B Frohnau, Funkturm (3.5 m) 1996-02-01\n# 5 Germany B Grunewald (3.5 m) 1986-10-01\n# 6 Germany Bad Arolsen 1999-05-11\n# station_end_date type_of_station station_ozone_classification\n# 1 NA Background rural\n# 2 NA Background rural\n# 3 NA Background rural\n# 4 NA Background rural\n# 5 NA Background rural\n# 6 NA Background rural\n# station_type_of_area station_subcat_rural_back street_type x\n# 1 rural unknown 545414\n# 2 rural regional 665711\n# 3 rural near city 815741\n# 4 rural near city 790544\n# 5 rural near city 786923\n# 6 rural unknown 495007\n# y station_altitude station_city lau_level1_code\n# 1 5930802 3 NA\n# 2 5315213 700 NA\n# 3 5820995 35 NA\n# 4 5842367 50 BERLIN NA\n# 5 5822067 50 BERLIN NA\n# 6 5697747 343 BAD AROLSEN/KOHLGRUND NA\n# lau_level2_code lau_level2_name EMEP_station NO2\n# 1 3359028 Jork no 13.10\n# 2 9188117 Andechs no 7.14\n# 3 11000000 Berlin, Stadt no 12.80\n# 4 11000000 Berlin, Stadt no 11.83\n# 5 11000000 Berlin, Stadt no 11.98\n# 6 6635002 Bad Arolsen, Stadt no 8.94\n# geometry\n# 1 POINT (545414 5930802)\n# 2 POINT (665711 5315213)\n# 3 POINT (815741 5820995)\n# 4 POINT (790544 5842367)\n# 5 POINT (786923 5822067)\n# 6 POINT (495007 5697747)\n\"https://github.com/edzer/sdsr/raw/main/data/de_nuts1.gpkg\" |>\n read_sf() |>\n st_transform(crs) -> de\n\n\nlibrary(stars)\n# Loading required package: abind\ng2 = st_as_stars(st_bbox(de))\ng3 = st_crop(g2, de)\ng4 = st_rasterize(de, g3)\ng4$ID_1[g4$ID_1 == 758] = NA\ng4$ID1 = as.factor(g4$ID_1) # now a factor:\nplot(g4[\"ID1\"], reset = FALSE)\nplot(st_geometry(no2.sf), add = TRUE, col = 'green')\n\n\n\n\n\n\nno2.sf$ID1 = st_extract(g4, no2.sf)$ID1\nno2.sf$ID1 |> summary()\n# 753 754 755 756 759 760 761 762 763 764 765 766 767 \n# 4 8 3 4 10 6 6 6 6 1 6 5 1 \n# 768 NA's \n# 6 2\n\nSimple ANOVA type predictor:\n\nlm1 = lm(NO2~ID1, no2.sf)\nsummary(lm1)\n# \n# Call:\n# lm(formula = NO2 ~ ID1, data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -6.324 -1.599 -0.311 0.859 12.358 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|) \n# (Intercept) 8.136 1.873 4.34 5.7e-05 ***\n# ID1754 1.883 2.294 0.82 0.41 \n# ID1755 4.068 2.861 1.42 0.16 \n# ID1756 -1.593 2.649 -0.60 0.55 \n# ID1759 1.015 2.216 0.46 0.65 \n# ID1760 -2.215 2.418 -0.92 0.36 \n# ID1761 1.353 2.418 0.56 0.58 \n# ID1762 3.697 2.418 1.53 0.13 \n# ID1763 -1.724 2.418 -0.71 0.48 \n# ID1764 1.091 4.188 0.26 0.80 \n# ID1765 0.591 2.418 0.24 0.81 \n# ID1766 -2.282 2.513 -0.91 0.37 \n# ID1767 1.024 4.188 0.24 0.81 \n# ID1768 -2.358 2.418 -0.98 0.33 \n# ---\n# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n# \n# Residual standard error: 3.75 on 58 degrees of freedom\n# (2 observations deleted due to missingness)\n# Multiple R-squared: 0.268, Adjusted R-squared: 0.104 \n# F-statistic: 1.63 on 13 and 58 DF, p-value: 0.102\ng4$NO2_aov = predict(lm1, as.data.frame(g4))\nplot(g4[\"NO2_aov\"], breaks = \"equal\", reset = FALSE)\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nSimple linear models in coordinates: trend surfaces\n\nlm2 = lm(NO2~x+y, no2.sf)\nsummary(lm2)\n# \n# Call:\n# lm(formula = NO2 ~ x + y, data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -6.880 -2.634 -0.991 1.431 11.660 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|)\n# (Intercept) 9.47e+00 1.36e+01 0.70 0.49\n# x -3.66e-06 3.00e-06 -1.22 0.23\n# y 1.91e-07 2.45e-06 0.08 0.94\n# \n# Residual standard error: 3.95 on 71 degrees of freedom\n# Multiple R-squared: 0.0212, Adjusted R-squared: -0.00637 \n# F-statistic: 0.769 on 2 and 71 DF, p-value: 0.467\ncc = st_coordinates(g4)\ng4$x = cc[,1]\ng4$y = cc[,2]\ng4$NO2_lm2 = predict(lm2, g4)\nplot(g4[\"NO2_lm2\"], breaks = \"equal\", reset = FALSE, main = \"1st order polynomial\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\n\nlm3 = lm(NO2~x+y+I(x^2)+I(y^2)+I(x*y), no2.sf)\nsummary(lm3)\n# \n# Call:\n# lm(formula = NO2 ~ x + y + I(x^2) + I(y^2) + I(x * y), data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -5.480 -2.583 -0.585 1.523 12.750 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|) \n# (Intercept) -4.24e+02 3.27e+02 -1.30 0.199 \n# x 1.34e-04 9.13e-05 1.47 0.147 \n# y 1.39e-04 1.14e-04 1.21 0.230 \n# I(x^2) 2.52e-11 1.91e-11 1.32 0.190 \n# I(y^2) -1.06e-11 1.01e-11 -1.05 0.296 \n# I(x * y) -2.96e-11 1.65e-11 -1.79 0.077 .\n# ---\n# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n# \n# Residual standard error: 3.87 on 68 degrees of freedom\n# Multiple R-squared: 0.0972, Adjusted R-squared: 0.0308 \n# F-statistic: 1.46 on 5 and 68 DF, p-value: 0.213\ng4$NO2_lm3 = predict(lm3, g4)\nplot(g4[\"NO2_lm3\"], breaks = \"equal\", reset = FALSE, main = \"2nd order polynomial\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\n\nlm4 = lm(NO2~x+y+I(x^2)+I(y^2)+I(x*y)+I(x^3)+I(x^2*y)+I(x*y^2)+I(y^3), no2.sf)\nsummary(lm4)\n# \n# Call:\n# lm(formula = NO2 ~ x + y + I(x^2) + I(y^2) + I(x * y) + I(x^3) + \n# I(x^2 * y) + I(x * y^2) + I(y^3), data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -5.285 -2.582 -0.796 2.074 12.693 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|) \n# (Intercept) 3.38e+03 9.12e+03 0.37 0.712 \n# x 5.15e-03 2.50e-03 2.06 0.043 *\n# y -2.40e-03 4.84e-03 -0.49 0.622 \n# I(x^2) -1.19e-09 7.46e-10 -1.60 0.115 \n# I(y^2) 5.15e-10 8.59e-10 0.60 0.551 \n# I(x * y) -1.54e-09 8.57e-10 -1.80 0.077 .\n# I(x^3) 1.13e-16 1.31e-16 0.86 0.394 \n# I(x^2 * y) 1.80e-16 1.38e-16 1.30 0.197 \n# I(x * y^2) 1.14e-16 7.75e-17 1.47 0.146 \n# I(y^3) -3.49e-17 5.10e-17 -0.68 0.496 \n# ---\n# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n# \n# Residual standard error: 3.82 on 64 degrees of freedom\n# Multiple R-squared: 0.173, Adjusted R-squared: 0.0572 \n# F-statistic: 1.49 on 9 and 64 DF, p-value: 0.17\ng4$NO2_lm4 = predict(lm4, g4)\nplot(g4[\"NO2_lm4\"], breaks = \"equal\", reset = FALSE, main = \"3rd order polynomial\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\n\n\n\n\n\n\nBreakout session 1\n\n\n\nDiscuss:\n\nhow will the predicted surface look like when instead of (functions of) coordinates, the variable elevation is used (e.g. to predict average temperatures)?\nwhat will be the value range, approximately, of the resulting predicted values?\n\n\n\nregression tree\n\nlibrary(rpart)\ntree = rpart(NO2~., as.data.frame(no2.sf)[c(\"NO2\", \"x\", \"y\")])\ng4$tree = predict(tree, as.data.frame(g4))\nplot(g4[\"tree\"], breaks = \"equal\", reset = FALSE, main = \"regression tree\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nRandom forest\n\nlibrary(randomForest)\n# randomForest 4.7-1.2\n# Type rfNews() to see new features/changes/bug fixes.\n# \n# Attaching package: 'randomForest'\n# The following object is masked from 'package:dplyr':\n# \n# combine\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", \"x\", \"y\")])\n# Warning in seq.default(along = m): partial argument match of\n# 'along' to 'along.with'\ng4$rf = predict(rf, as.data.frame(g4))\nplot(g4[\"rf\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nRotated coordinates:\n\nlibrary(randomForest)\nno2.sf$x1 = no2.sf$x + no2.sf$y\nno2.sf$y1 = no2.sf$x - no2.sf$y\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", \"x1\", \"y1\")])\n# Warning in seq.default(along = m): partial argument match of\n# 'along' to 'along.with'\ng4$x1 = g4$x + g4$y\ng4$y1 = g4$x - g4$y\ng4$rf_rot = predict(rf, as.data.frame(g4))\nplot(g4[\"rf_rot\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nUsing distance variables:\n\nst_bbox(de) |> st_as_sfc() |> st_cast(\"POINT\") -> pts\npts = c(pts[1:4], st_centroid(st_geometry(de)))\nd = st_distance(st_as_sfc(g4, as_points = TRUE), pts)\nfor (i in seq_len(ncol(d))) {\n g4[[ paste0(\"d_\", i) ]] = d[,i]\n}\ne = st_extract(g4, no2.sf)\nfor (i in seq_len(ncol(d))) {\n no2.sf[[ paste0(\"d_\", i) ]] = e[[16+i]]\n}\n(n = names(g4))\n# [1] \"ID_0\" \"ID_1\" \"Shape_Leng\" \"Shape_Area\"\n# [5] \"ID1\" \"NO2_aov\" \"x\" \"y\" \n# [9] \"NO2_lm2\" \"NO2_lm3\" \"NO2_lm4\" \"tree\" \n# [13] \"rf\" \"x1\" \"y1\" \"rf_rot\" \n# [17] \"d_1\" \"d_2\" \"d_3\" \"d_4\" \n# [21] \"d_5\" \"d_6\" \"d_7\" \"d_8\" \n# [25] \"d_9\" \"d_10\" \"d_11\" \"d_12\" \n# [29] \"d_13\" \"d_14\" \"d_15\" \"d_16\" \n# [33] \"d_17\" \"d_18\" \"d_19\" \"d_20\"\nplot(merge(g4[grepl(\"d_\", n)]))\n\n\n\n\n\n\n\n\nlibrary(randomForest)\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", n[grepl(\"d_\", n)])])\n# Warning in seq.default(along = m): partial argument match of\n# 'along' to 'along.with'\ng4$rf_d = predict(rf, as.data.frame(g4))\nplot(g4[\"rf_d\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nAdding more…\n\npts = st_sample(de, 200, type = \"regular\")\nd = st_distance(st_as_sfc(g4, as_points = TRUE), pts)\nfor (i in seq_len(ncol(d))) {\n g4[[ paste0(\"d_\", i) ]] = d[,i]\n}\ne = st_extract(g4, no2.sf)\nfor (i in seq_len(ncol(d))) {\n no2.sf[[ paste0(\"d_\", i) ]] = e[[16+i]]\n}\n(n = names(g4))\n# [1] \"ID_0\" \"ID_1\" \"Shape_Leng\" \"Shape_Area\"\n# [5] \"ID1\" \"NO2_aov\" \"x\" \"y\" \n# [9] \"NO2_lm2\" \"NO2_lm3\" \"NO2_lm4\" \"tree\" \n# [13] \"rf\" \"x1\" \"y1\" \"rf_rot\" \n# [17] \"d_1\" \"d_2\" \"d_3\" \"d_4\" \n# [21] \"d_5\" \"d_6\" \"d_7\" \"d_8\" \n# [25] \"d_9\" \"d_10\" \"d_11\" \"d_12\" \n# [29] \"d_13\" \"d_14\" \"d_15\" \"d_16\" \n# [33] \"d_17\" \"d_18\" \"d_19\" \"d_20\" \n# [37] \"rf_d\" \"d_21\" \"d_22\" \"d_23\" \n# [41] \"d_24\" \"d_25\" \"d_26\" \"d_27\" \n# [45] \"d_28\" \"d_29\" \"d_30\" \"d_31\" \n# [49] \"d_32\" \"d_33\" \"d_34\" \"d_35\" \n# [53] \"d_36\" \"d_37\" \"d_38\" \"d_39\" \n# [57] \"d_40\" \"d_41\" \"d_42\" \"d_43\" \n# [61] \"d_44\" \"d_45\" \"d_46\" \"d_47\" \n# [65] \"d_48\" \"d_49\" \"d_50\" \"d_51\" \n# [69] \"d_52\" \"d_53\" \"d_54\" \"d_55\" \n# [73] \"d_56\" \"d_57\" \"d_58\" \"d_59\" \n# [77] \"d_60\" \"d_61\" \"d_62\" \"d_63\" \n# [81] \"d_64\" \"d_65\" \"d_66\" \"d_67\" \n# [85] \"d_68\" \"d_69\" \"d_70\" \"d_71\" \n# [89] \"d_72\" \"d_73\" \"d_74\" \"d_75\" \n# [93] \"d_76\" \"d_77\" \"d_78\" \"d_79\" \n# [97] \"d_80\" \"d_81\" \"d_82\" \"d_83\" \n# [101] \"d_84\" \"d_85\" \"d_86\" \"d_87\" \n# [105] \"d_88\" \"d_89\" \"d_90\" \"d_91\" \n# [109] \"d_92\" \"d_93\" \"d_94\" \"d_95\" \n# [113] \"d_96\" \"d_97\" \"d_98\" \"d_99\" \n# [117] \"d_100\" \"d_101\" \"d_102\" \"d_103\" \n# [121] \"d_104\" \"d_105\" \"d_106\" \"d_107\" \n# [125] \"d_108\" \"d_109\" \"d_110\" \"d_111\" \n# [129] \"d_112\" \"d_113\" \"d_114\" \"d_115\" \n# [133] \"d_116\" \"d_117\" \"d_118\" \"d_119\" \n# [137] \"d_120\" \"d_121\" \"d_122\" \"d_123\" \n# [141] \"d_124\" \"d_125\" \"d_126\" \"d_127\" \n# [145] \"d_128\" \"d_129\" \"d_130\" \"d_131\" \n# [149] \"d_132\" \"d_133\" \"d_134\" \"d_135\" \n# [153] \"d_136\" \"d_137\" \"d_138\" \"d_139\" \n# [157] \"d_140\" \"d_141\" \"d_142\" \"d_143\" \n# [161] \"d_144\" \"d_145\" \"d_146\" \"d_147\" \n# [165] \"d_148\" \"d_149\" \"d_150\" \"d_151\" \n# [169] \"d_152\" \"d_153\" \"d_154\" \"d_155\" \n# [173] \"d_156\" \"d_157\" \"d_158\" \"d_159\" \n# [177] \"d_160\" \"d_161\" \"d_162\" \"d_163\" \n# [181] \"d_164\" \"d_165\" \"d_166\" \"d_167\" \n# [185] \"d_168\" \"d_169\" \"d_170\" \"d_171\" \n# [189] \"d_172\" \"d_173\" \"d_174\" \"d_175\" \n# [193] \"d_176\" \"d_177\" \"d_178\" \"d_179\" \n# [197] \"d_180\" \"d_181\" \"d_182\" \"d_183\" \n# [201] \"d_184\" \"d_185\" \"d_186\" \"d_187\" \n# [205] \"d_188\" \"d_189\" \"d_190\" \"d_191\" \n# [209] \"d_192\" \"d_193\" \"d_194\" \"d_195\" \n# [213] \"d_196\" \"d_197\" \"d_198\" \"d_199\" \n# [217] \"d_200\" \"d_201\" \"d_202\" \"d_203\" \n# [221] \"d_204\"\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", n[grepl(\"d_\", n)])])\n# Warning in seq.default(along = m): partial argument match of\n# 'along' to 'along.with'\ng4$rf_dm = predict(rf, as.data.frame(g4))\nplot(g4[\"rf_dm\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nFurther approaches:\n\nuse linear regression on Gaussian kernel basis functions, \\(\\exp(-h^2)\\)\n\nuse splines in \\(x\\) and \\(y\\), with a given degree of smoothing (or effective degrees of freedom)\nuse additional, non-distance/coordinate functions as base function(s)\n\nprovided they are available “everywhere” (as coverage)\nexamples: elevation, bioclimatic variables, (values derived from) satellite imagery bands\n\n\n\n\n\n\n\n\n\nBreakout session 2\n\n\n\nDiscuss:\n\nHow would you assess whether residuals from your fitted model are spatially correlated?\nDoes resampling using random partitioning (as is done in random forest) implicitly assume that observations are independent?\n\n\n\nExample from CAST / caret\n\nlibrary(CAST)\nlibrary(caret)\n# Loading required package: ggplot2\n# \n# Attaching package: 'ggplot2'\n# The following object is masked from 'package:randomForest':\n# \n# margin\n# Loading required package: lattice\ndata(splotdata)\nclass(splotdata)\n# [1] \"sf\" \"data.frame\"\nr = read_stars(system.file(\"extdata/predictors_chile.tif\", \n package = \"CAST\"))\nx = st_drop_geometry(splotdata)[,6:16]\ny = splotdata$Species_richness\ntr = train(x, y) # chooses a random forest by default\n# Warning in seq.default(along = pkg): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = models$library): partial argument\n# match of 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = data): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = x): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = outcome): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = x): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(2, to = p, length = len): partial argument\n# match of 'length' to 'length.out'\n# Warning in seq.default(along = resampleIndex): partial argument\n# match of 'along' to 'along.with'\n# Warning in tmp$resample: partial match of 'resample' to 'resamples'\n# Warning in seq.default(along = paramNames): partial argument match\n# of 'along' to 'along.with'\n# Warning in seq.default(along = y): partial argument match of\n# 'along' to 'along.with'\npredict(split(r), tr) |> plot()\n\n\n\n\n\n\n\nClustered data?\n\nplot(r[,,,1], reset = FALSE)\nplot(st_geometry(splotdata), add = TRUE, col = 'green')", + "text": "4.1 Spatial coordinates as predictor\nWe’ll rename coordinates to x and y:\n\nlibrary(dplyr)\n# \n# Attaching package: 'dplyr'\n# The following objects are masked from 'package:stats':\n# \n# filter, lag\n# The following objects are masked from 'package:base':\n# \n# intersect, setdiff, setequal, union\nlibrary(sf)\n# Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE\ncrs <- st_crs(\"EPSG:32632\") # a csv doesn't carry a CRS!\nno2 <- read.csv(system.file(\"external/no2.csv\",\n package = \"gstat\")) \nno2 |> rename(x = station_longitude_deg, y = station_latitude_deg) |> \n st_as_sf(crs = \"OGC:CRS84\", coords =\n c(\"x\", \"y\"), remove = FALSE) |>\n st_transform(crs) -> no2.sf\n# we need to reassign x and y:\ncc = st_coordinates(no2.sf)\nno2.sf$x = cc[,1]\nno2.sf$y = cc[,2]\nhead(no2.sf)\n# Simple feature collection with 6 features and 21 fields\n# Geometry type: POINT\n# Dimension: XY\n# Bounding box: xmin: 495000 ymin: 5320000 xmax: 816000 ymax: 5930000\n# Projected CRS: WGS 84 / UTM zone 32N\n# station_european_code station_local_code country_iso_code\n# 1 DENI063 DENI063 DE\n# 2 DEBY109 DEBY109 DE\n# 3 DEBE056 DEBE056 DE\n# 4 DEBE062 DEBE062 DE\n# 5 DEBE032 DEBE032 DE\n# 6 DEHE046 DEHE046 DE\n# country_name station_name station_start_date\n# 1 Germany Altes Land 1999-02-11\n# 2 Germany Andechs/Rothenfeld 2003-04-17\n# 3 Germany B Friedrichshagen 1994-02-01\n# 4 Germany B Frohnau, Funkturm (3.5 m) 1996-02-01\n# 5 Germany B Grunewald (3.5 m) 1986-10-01\n# 6 Germany Bad Arolsen 1999-05-11\n# station_end_date type_of_station station_ozone_classification\n# 1 NA Background rural\n# 2 NA Background rural\n# 3 NA Background rural\n# 4 NA Background rural\n# 5 NA Background rural\n# 6 NA Background rural\n# station_type_of_area station_subcat_rural_back street_type x\n# 1 rural unknown 545414\n# 2 rural regional 665711\n# 3 rural near city 815741\n# 4 rural near city 790544\n# 5 rural near city 786923\n# 6 rural unknown 495007\n# y station_altitude station_city lau_level1_code\n# 1 5930802 3 NA\n# 2 5315213 700 NA\n# 3 5820995 35 NA\n# 4 5842367 50 BERLIN NA\n# 5 5822067 50 BERLIN NA\n# 6 5697747 343 BAD AROLSEN/KOHLGRUND NA\n# lau_level2_code lau_level2_name EMEP_station NO2\n# 1 3359028 Jork no 13.10\n# 2 9188117 Andechs no 7.14\n# 3 11000000 Berlin, Stadt no 12.80\n# 4 11000000 Berlin, Stadt no 11.83\n# 5 11000000 Berlin, Stadt no 11.98\n# 6 6635002 Bad Arolsen, Stadt no 8.94\n# geometry\n# 1 POINT (545414 5930802)\n# 2 POINT (665711 5315213)\n# 3 POINT (815741 5820995)\n# 4 POINT (790544 5842367)\n# 5 POINT (786923 5822067)\n# 6 POINT (495007 5697747)\n\"https://github.com/edzer/sdsr/raw/main/data/de_nuts1.gpkg\" |>\n read_sf() |>\n st_transform(crs) -> de\n\n\nlibrary(stars)\n# Loading required package: abind\ng2 = st_as_stars(st_bbox(de))\ng3 = st_crop(g2, de)\ng4 = st_rasterize(de, g3)\ng4$ID_1[g4$ID_1 == 758] = NA\ng4$ID1 = as.factor(g4$ID_1) # now a factor:\nplot(g4[\"ID1\"], reset = FALSE)\nplot(st_geometry(no2.sf), add = TRUE, col = 'green')\n\n\n\n\n\n\nno2.sf$ID1 = st_extract(g4, no2.sf)$ID1\nno2.sf$ID1 |> summary()\n# 753 754 755 756 759 760 761 762 763 764 765 766 767 \n# 4 8 3 4 10 6 6 6 6 1 6 5 1 \n# 768 NA's \n# 6 2\n\nSimple ANOVA type predictor:\n\nlm1 = lm(NO2~ID1, no2.sf)\nsummary(lm1)\n# \n# Call:\n# lm(formula = NO2 ~ ID1, data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -6.324 -1.599 -0.311 0.859 12.358 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|) \n# (Intercept) 8.136 1.873 4.34 5.7e-05 ***\n# ID1754 1.883 2.294 0.82 0.41 \n# ID1755 4.068 2.861 1.42 0.16 \n# ID1756 -1.593 2.649 -0.60 0.55 \n# ID1759 1.015 2.216 0.46 0.65 \n# ID1760 -2.215 2.418 -0.92 0.36 \n# ID1761 1.353 2.418 0.56 0.58 \n# ID1762 3.697 2.418 1.53 0.13 \n# ID1763 -1.724 2.418 -0.71 0.48 \n# ID1764 1.091 4.188 0.26 0.80 \n# ID1765 0.591 2.418 0.24 0.81 \n# ID1766 -2.282 2.513 -0.91 0.37 \n# ID1767 1.024 4.188 0.24 0.81 \n# ID1768 -2.358 2.418 -0.98 0.33 \n# ---\n# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n# \n# Residual standard error: 3.75 on 58 degrees of freedom\n# (2 observations deleted due to missingness)\n# Multiple R-squared: 0.268, Adjusted R-squared: 0.104 \n# F-statistic: 1.63 on 13 and 58 DF, p-value: 0.102\ng4$NO2_aov = predict(lm1, as.data.frame(g4))\nplot(g4[\"NO2_aov\"], breaks = \"equal\", reset = FALSE)\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nSimple linear models in coordinates: trend surfaces\n\nlm2 = lm(NO2~x+y, no2.sf)\nsummary(lm2)\n# \n# Call:\n# lm(formula = NO2 ~ x + y, data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -6.880 -2.634 -0.991 1.431 11.660 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|)\n# (Intercept) 9.47e+00 1.36e+01 0.70 0.49\n# x -3.66e-06 3.00e-06 -1.22 0.23\n# y 1.91e-07 2.45e-06 0.08 0.94\n# \n# Residual standard error: 3.95 on 71 degrees of freedom\n# Multiple R-squared: 0.0212, Adjusted R-squared: -0.00637 \n# F-statistic: 0.769 on 2 and 71 DF, p-value: 0.467\ncc = st_coordinates(g4)\ng4$x = cc[,1]\ng4$y = cc[,2]\ng4$NO2_lm2 = predict(lm2, g4)\nplot(g4[\"NO2_lm2\"], breaks = \"equal\", reset = FALSE, main = \"1st order polynomial\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\n\nlm3 = lm(NO2~x+y+I(x^2)+I(y^2)+I(x*y), no2.sf)\nsummary(lm3)\n# \n# Call:\n# lm(formula = NO2 ~ x + y + I(x^2) + I(y^2) + I(x * y), data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -5.480 -2.583 -0.585 1.523 12.750 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|) \n# (Intercept) -4.24e+02 3.27e+02 -1.30 0.199 \n# x 1.34e-04 9.13e-05 1.47 0.147 \n# y 1.39e-04 1.14e-04 1.21 0.230 \n# I(x^2) 2.52e-11 1.91e-11 1.32 0.190 \n# I(y^2) -1.06e-11 1.01e-11 -1.05 0.296 \n# I(x * y) -2.96e-11 1.65e-11 -1.79 0.077 .\n# ---\n# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n# \n# Residual standard error: 3.87 on 68 degrees of freedom\n# Multiple R-squared: 0.0972, Adjusted R-squared: 0.0308 \n# F-statistic: 1.46 on 5 and 68 DF, p-value: 0.213\ng4$NO2_lm3 = predict(lm3, g4)\nplot(g4[\"NO2_lm3\"], breaks = \"equal\", reset = FALSE, main = \"2nd order polynomial\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\n\nlm4 = lm(NO2~x+y+I(x^2)+I(y^2)+I(x*y)+I(x^3)+I(x^2*y)+I(x*y^2)+I(y^3), no2.sf)\nsummary(lm4)\n# \n# Call:\n# lm(formula = NO2 ~ x + y + I(x^2) + I(y^2) + I(x * y) + I(x^3) + \n# I(x^2 * y) + I(x * y^2) + I(y^3), data = no2.sf)\n# \n# Residuals:\n# Min 1Q Median 3Q Max \n# -5.285 -2.582 -0.796 2.074 12.693 \n# \n# Coefficients:\n# Estimate Std. Error t value Pr(>|t|) \n# (Intercept) 3.38e+03 9.12e+03 0.37 0.712 \n# x 5.15e-03 2.50e-03 2.06 0.043 *\n# y -2.40e-03 4.84e-03 -0.49 0.622 \n# I(x^2) -1.19e-09 7.46e-10 -1.60 0.115 \n# I(y^2) 5.15e-10 8.59e-10 0.60 0.551 \n# I(x * y) -1.54e-09 8.57e-10 -1.80 0.077 .\n# I(x^3) 1.13e-16 1.31e-16 0.86 0.394 \n# I(x^2 * y) 1.80e-16 1.38e-16 1.30 0.197 \n# I(x * y^2) 1.14e-16 7.75e-17 1.47 0.146 \n# I(y^3) -3.49e-17 5.10e-17 -0.68 0.496 \n# ---\n# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n# \n# Residual standard error: 3.82 on 64 degrees of freedom\n# Multiple R-squared: 0.173, Adjusted R-squared: 0.0572 \n# F-statistic: 1.49 on 9 and 64 DF, p-value: 0.17\ng4$NO2_lm4 = predict(lm4, g4)\nplot(g4[\"NO2_lm4\"], breaks = \"equal\", reset = FALSE, main = \"3rd order polynomial\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\n\n\n\n\n\n\nBreakout session 1\n\n\n\nDiscuss:\n\nhow will the predicted surface look like when instead of (functions of) coordinates, the variable elevation is used (e.g. to predict average temperatures)?\nwhat will be the value range, approximately, of the resulting predicted values?\n\n\n\nregression tree\n\nlibrary(rpart)\ntree = rpart(NO2~., as.data.frame(no2.sf)[c(\"NO2\", \"x\", \"y\")])\ng4$tree = predict(tree, as.data.frame(g4))\nplot(g4[\"tree\"], breaks = \"equal\", reset = FALSE, main = \"regression tree\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nRandom forest\n\nlibrary(randomForest)\n# randomForest 4.7-1.2\n# Type rfNews() to see new features/changes/bug fixes.\n# \n# Attaching package: 'randomForest'\n# The following object is masked from 'package:dplyr':\n# \n# combine\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", \"x\", \"y\")])\ng4$rf = predict(rf, as.data.frame(g4))\nplot(g4[\"rf\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nRotated coordinates:\n\nlibrary(randomForest)\nno2.sf$x1 = no2.sf$x + no2.sf$y\nno2.sf$y1 = no2.sf$x - no2.sf$y\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", \"x1\", \"y1\")])\ng4$x1 = g4$x + g4$y\ng4$y1 = g4$x - g4$y\ng4$rf_rot = predict(rf, as.data.frame(g4))\nplot(g4[\"rf_rot\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nUsing distance variables:\n\nst_bbox(de) |> st_as_sfc() |> st_cast(\"POINT\") -> pts\npts = c(pts[1:4], st_centroid(st_geometry(de)))\nd = st_distance(st_as_sfc(g4, as_points = TRUE), pts)\nfor (i in seq_len(ncol(d))) {\n g4[[ paste0(\"d_\", i) ]] = d[,i]\n}\ne = st_extract(g4, no2.sf)\nfor (i in seq_len(ncol(d))) {\n no2.sf[[ paste0(\"d_\", i) ]] = e[[16+i]]\n}\n(n = names(g4))\n# [1] \"ID_0\" \"ID_1\" \"Shape_Leng\" \"Shape_Area\"\n# [5] \"ID1\" \"NO2_aov\" \"x\" \"y\" \n# [9] \"NO2_lm2\" \"NO2_lm3\" \"NO2_lm4\" \"tree\" \n# [13] \"rf\" \"x1\" \"y1\" \"rf_rot\" \n# [17] \"d_1\" \"d_2\" \"d_3\" \"d_4\" \n# [21] \"d_5\" \"d_6\" \"d_7\" \"d_8\" \n# [25] \"d_9\" \"d_10\" \"d_11\" \"d_12\" \n# [29] \"d_13\" \"d_14\" \"d_15\" \"d_16\" \n# [33] \"d_17\" \"d_18\" \"d_19\" \"d_20\"\nplot(merge(g4[grepl(\"d_\", n)]))\n\n\n\n\n\n\n\n\nlibrary(randomForest)\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", n[grepl(\"d_\", n)])])\ng4$rf_d = predict(rf, as.data.frame(g4))\nplot(g4[\"rf_d\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nAdding more…\n\npts = st_sample(de, 200, type = \"regular\")\nd = st_distance(st_as_sfc(g4, as_points = TRUE), pts)\nfor (i in seq_len(ncol(d))) {\n g4[[ paste0(\"d_\", i) ]] = d[,i]\n}\ne = st_extract(g4, no2.sf)\nfor (i in seq_len(ncol(d))) {\n no2.sf[[ paste0(\"d_\", i) ]] = e[[16+i]]\n}\n(n = names(g4))\n# [1] \"ID_0\" \"ID_1\" \"Shape_Leng\" \"Shape_Area\"\n# [5] \"ID1\" \"NO2_aov\" \"x\" \"y\" \n# [9] \"NO2_lm2\" \"NO2_lm3\" \"NO2_lm4\" \"tree\" \n# [13] \"rf\" \"x1\" \"y1\" \"rf_rot\" \n# [17] \"d_1\" \"d_2\" \"d_3\" \"d_4\" \n# [21] \"d_5\" \"d_6\" \"d_7\" \"d_8\" \n# [25] \"d_9\" \"d_10\" \"d_11\" \"d_12\" \n# [29] \"d_13\" \"d_14\" \"d_15\" \"d_16\" \n# [33] \"d_17\" \"d_18\" \"d_19\" \"d_20\" \n# [37] \"rf_d\" \"d_21\" \"d_22\" \"d_23\" \n# [41] \"d_24\" \"d_25\" \"d_26\" \"d_27\" \n# [45] \"d_28\" \"d_29\" \"d_30\" \"d_31\" \n# [49] \"d_32\" \"d_33\" \"d_34\" \"d_35\" \n# [53] \"d_36\" \"d_37\" \"d_38\" \"d_39\" \n# [57] \"d_40\" \"d_41\" \"d_42\" \"d_43\" \n# [61] \"d_44\" \"d_45\" \"d_46\" \"d_47\" \n# [65] \"d_48\" \"d_49\" \"d_50\" \"d_51\" \n# [69] \"d_52\" \"d_53\" \"d_54\" \"d_55\" \n# [73] \"d_56\" \"d_57\" \"d_58\" \"d_59\" \n# [77] \"d_60\" \"d_61\" \"d_62\" \"d_63\" \n# [81] \"d_64\" \"d_65\" \"d_66\" \"d_67\" \n# [85] \"d_68\" \"d_69\" \"d_70\" \"d_71\" \n# [89] \"d_72\" \"d_73\" \"d_74\" \"d_75\" \n# [93] \"d_76\" \"d_77\" \"d_78\" \"d_79\" \n# [97] \"d_80\" \"d_81\" \"d_82\" \"d_83\" \n# [101] \"d_84\" \"d_85\" \"d_86\" \"d_87\" \n# [105] \"d_88\" \"d_89\" \"d_90\" \"d_91\" \n# [109] \"d_92\" \"d_93\" \"d_94\" \"d_95\" \n# [113] \"d_96\" \"d_97\" \"d_98\" \"d_99\" \n# [117] \"d_100\" \"d_101\" \"d_102\" \"d_103\" \n# [121] \"d_104\" \"d_105\" \"d_106\" \"d_107\" \n# [125] \"d_108\" \"d_109\" \"d_110\" \"d_111\" \n# [129] \"d_112\" \"d_113\" \"d_114\" \"d_115\" \n# [133] \"d_116\" \"d_117\" \"d_118\" \"d_119\" \n# [137] \"d_120\" \"d_121\" \"d_122\" \"d_123\" \n# [141] \"d_124\" \"d_125\" \"d_126\" \"d_127\" \n# [145] \"d_128\" \"d_129\" \"d_130\" \"d_131\" \n# [149] \"d_132\" \"d_133\" \"d_134\" \"d_135\" \n# [153] \"d_136\" \"d_137\" \"d_138\" \"d_139\" \n# [157] \"d_140\" \"d_141\" \"d_142\" \"d_143\" \n# [161] \"d_144\" \"d_145\" \"d_146\" \"d_147\" \n# [165] \"d_148\" \"d_149\" \"d_150\" \"d_151\" \n# [169] \"d_152\" \"d_153\" \"d_154\" \"d_155\" \n# [173] \"d_156\" \"d_157\" \"d_158\" \"d_159\" \n# [177] \"d_160\" \"d_161\" \"d_162\" \"d_163\" \n# [181] \"d_164\" \"d_165\" \"d_166\" \"d_167\" \n# [185] \"d_168\" \"d_169\" \"d_170\" \"d_171\" \n# [189] \"d_172\" \"d_173\" \"d_174\" \"d_175\" \n# [193] \"d_176\" \"d_177\" \"d_178\" \"d_179\" \n# [197] \"d_180\" \"d_181\" \"d_182\" \"d_183\" \n# [201] \"d_184\" \"d_185\" \"d_186\" \"d_187\" \n# [205] \"d_188\" \"d_189\" \"d_190\" \"d_191\" \n# [209] \"d_192\" \"d_193\" \"d_194\" \"d_195\" \n# [213] \"d_196\" \"d_197\" \"d_198\"\nrf = randomForest(NO2~., as.data.frame(no2.sf)[c(\"NO2\", n[grepl(\"d_\", n)])])\ng4$rf_dm = predict(rf, as.data.frame(g4))\nplot(g4[\"rf_dm\"], breaks = \"equal\", reset = FALSE, main = \"random forest\")\nplot(st_cast(st_geometry(de), \"MULTILINESTRING\"), add = TRUE, col = 'green')\n\n\n\n\n\n\n\nFurther approaches:\n\nuse linear regression on Gaussian kernel basis functions, \\(\\exp(-h^2)\\)\n\nuse splines in \\(x\\) and \\(y\\), with a given degree of smoothing (or effective degrees of freedom)\nuse additional, non-distance/coordinate functions as base function(s)\n\nprovided they are available “everywhere” (as coverage)\nexamples: elevation, bioclimatic variables, (values derived from) satellite imagery bands\n\n\n\n\n\n\n\n\n\nBreakout session 2\n\n\n\nDiscuss:\n\nHow would you assess whether residuals from your fitted model are spatially correlated?\nDoes resampling using random partitioning (as is done in random forest) implicitly assume that observations are independent?\n\n\n\nExample from CAST / caret\n\nlibrary(CAST)\nlibrary(caret)\n# Loading required package: ggplot2\n# \n# Attaching package: 'ggplot2'\n# The following object is masked from 'package:randomForest':\n# \n# margin\n# Loading required package: lattice\ndata(splotdata)\nclass(splotdata)\n# [1] \"sf\" \"data.frame\"\nr = read_stars(system.file(\"extdata/predictors_chile.tif\", \n package = \"CAST\"))\nx = st_drop_geometry(splotdata)[,6:16]\ny = splotdata$Species_richness\ntr = train(x, y) # chooses a random forest by default\npredict(split(r), tr) |> plot()\n\n\n\n\n\n\n\nClustered data?\n\nplot(r[,,,1], reset = FALSE)\nplot(st_geometry(splotdata), add = TRUE, col = 'green')", "crumbs": [ "4  Machine Learning methods applied to spatial data" ] @@ -394,7 +394,7 @@ "href": "day4.html#transferrability-of-models-area-of-applicability", "title": "\n4  Machine Learning methods applied to spatial data\n", "section": "\n4.3 Transferrability of models: “area of applicability”", - "text": "4.3 Transferrability of models: “area of applicability”\nExplained here;\n\naoa <- aoa(r, tr, verbose = FALSE)\n# Warning in seq.default(along = pkg): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = code$library): partial argument\n# match of 'along' to 'along.with'\n# Warning in seq.default(along = pkg): partial argument match of\n# 'along' to 'along.with'\n# Warning in seq.default(along = code$library): partial argument\n# match of 'along' to 'along.with'\n# note: Either no model was given or no CV was used for model training. The DI threshold is therefore based on all training data\n# Warning in trainDI$thres: partial match of 'thres' to 'threshold'\nplot(aoa)\n# Warning: Removed 395 rows containing non-finite outside the scale range\n# (`stat_density()`).\n\n\n\n\n\n\nplot(aoa$DI)\n\n\n\n\n\n\nplot(aoa$AOA)", + "text": "4.3 Transferrability of models: “area of applicability”\nExplained here;\n\naoa <- aoa(r, tr, verbose = FALSE)\n# note: Either no model was given or no CV was used for model training. The DI threshold is therefore based on all training data\nplot(aoa)\n# Warning: Removed 395 rows containing non-finite outside the scale range\n# (`stat_density()`).\n\n\n\n\n\n\nplot(aoa$DI)\n\n\n\n\n\n\nplot(aoa$AOA)", "crumbs": [ "4  Machine Learning methods applied to spatial data" ] diff --git a/docs/sitemap.xml b/docs/sitemap.xml index ce04f98..ac58d56 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -14,7 +14,7 @@ https://edzer.github.io/sswr/day3.html - 2024-11-13T14:30:27.026Z + 2024-11-13T18:05:38.237Z https://edzer.github.io/sswr/day4.html