Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add codes to extract SoilGrids soil texture data and derive ensemble … #3406

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions models/sipnet/R/write.configs.SIPNET.R
Original file line number Diff line number Diff line change
Expand Up @@ -462,22 +462,40 @@ write.config.SIPNET <- function(defaults, trait.values, settings, run.id, inputs
}
} ## end loop over PFTS
####### end parameter update
#working on reading soil file (only working for 1 soil file)
if(length(settings$run$inputs$soilinitcond$path)==1){
soil_IC_list <- PEcAn.data.land::pool_ic_netcdf2list(settings$run$inputs$soilinitcond$path)
#SoilWHC and LitterWHC
if("volume_fraction_of_water_in_soil_at_saturation"%in%names(soil_IC_list$vals)){
#SoilWHC
param[which(param[, 1] == "soilWHC"), 2] <- mean(unlist(soil_IC_list$vals["volume_fraction_of_water_in_soil_at_saturation"]))*100

#LitterWHC
#param[which(param[, 1] == "litterWHC"), 2] <- unlist(soil_IC_list$vals["volume_fraction_of_water_in_soil_at_saturation"])[1]*100
#working on reading soil file
if (length(settings$run$inputs$soilinitcond$path) > 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still no clear how you're handling the case if >1 file is passed in

template.soilinit <- settings$run$inputs$soilinitcond$path ## read from settings

if (!is.null(inputs)) {
## override if specified in inputs
if ("soilinitcond" %in% names(inputs)) {
template.soilinit <- inputs$soilinit$path
}
}
if("soil_hydraulic_conductivity_at_saturation"%in%names(soil_IC_list$vals)){
mdietze marked this conversation as resolved.
Show resolved Hide resolved
soil_IC_list <- PEcAn.data.land::pool_ic_netcdf2list(template.soilinit)
# Calculate the thickness of soil layers based on the assumption that the depth values are at bottoms and the first layer top is at 0
if ("depth" %in% names(soil_IC_list$dims)) {
thickness<-c(soil_IC_list$dims$depth[1],diff(soil_IC_list$dims$depth))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this line still work if the product only has one layer?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, should/could there be an "else" here? A bunch of code below continues to check for "depth" being defined, but doesn't ever use depth again -- instead you're just using this to ensure thickness is defined.

}
#SoilWHC
if ("depth" %in% names(soil_IC_list$dims) && "volume_fraction_of_water_in_soil_at_saturation" %in% names(soil_IC_list$vals)) {
# Calculate the soilWHC for the whole soil profile in cm
soilWHC_total <- sum(unlist(soil_IC_list$vals["volume_fraction_of_water_in_soil_at_saturation"])*thickness)
param[which(param[, 1] == "soilWHC"), 2] <- soilWHC_total
#LitterWHC
param[which(param[, 1] == "litterWHC"), 2] <- soilWHC_total
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is incorrect. If there's a litter layer it would have a much much smaller WHC than the entire soil pool.

}
if ("depth" %in% names(soil_IC_list$dims) && "soil_hydraulic_conductivity_at_saturation" %in% names(soil_IC_list$vals)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

per comment below about this being for surface litter, I'm not sure you need "depth" here

# Calculate the soil_hydraulic_conductivity for the whole soil profile (soilHC_total) if vertical flow dominates for
# unsaturated conditions or in deep, well-drained profiles. If horizontal flow dominates such as for
# saturated conditions or where barriers to vertical flow exist, the overall conductivity is the
# weighted arithmetic mean based on the proportion of the total thickness each layer represents as:
# soilHC_total <- sum(unlist(soil_IC_list$vals["soil_hydraulic_conductivity_at_saturation"])*thickness)/sum(thickness)
soilHC_total <- sum(thickness)/sum(thickness/(unlist(soil_IC_list$vals["soil_hydraulic_conductivity_at_saturation"])))
#litwaterDrainrate
param[which(param[, 1] == "litWaterDrainRate"), 2] <- unlist(soil_IC_list$vals["soil_hydraulic_conductivity_at_saturation"])[1]*100/(3600*24)
param[which(param[, 1] == "litWaterDrainRate"), 2] <- soilHC_total
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First, I'm not sure why you're using the average hydraulic conductivity of the entire soil column to control the rate at which moisture leaves the litter and enters the first soil layer. Second, we should check that the way SIPNET uses the conductivity (as a proportional decay rate, see https://github.com/PecanProject/sipnet/blob/becec4d2d6d857fa25dc47f974c48621a0b9044b/sipnet.c#L1516) is compatible with the way the parameter is defined in the soils file (Darcy's law) or whether additional assumptions or unit conversions are needed

}
}
}
if (!is.null(IC)) {
ic.names <- names(IC)
## plantWoodInit gC/m2
Expand Down
7 changes: 4 additions & 3 deletions modules/assim.sequential/R/metSplit.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,18 @@ metSplit <- function(conf.settings, inputs, settings, model, no_split = FALSE, o
# Loading the model package - this is required bc of the furrr
library(paste0("PEcAn.",model), character.only = TRUE)

inputs.split <- list()
# Initialize inputs.split with inputs to keep the sub-list of soil parameters "soilinitcond"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not following the logic here. The metSplit code should only ever be touching the met. The comment here about soil parameters is confusing, as metSplit shouldn't be touching ANY of the other inputs, not just soils. I'm having a hard time telling if the code below is incorrect, or if it's fixing a long standing bug, or if the larger issue is that metSplit shouldn't ever have been passed the entirety of inputs to begin with.

inputs.split <- inputs
if (!no_split) {
for (i in seq_len(nens)) {
#---------------- model specific split inputs
inputs.split$samples[i] <- do.call(
inputs.split$met$samples[i] <- do.call(
my.split_inputs,
args = list(
settings = settings,
start.time = (lubridate::ymd_hms(start.time, truncated = 3) + lubridate::second(lubridate::hms("00:00:01"))),
stop.time = lubridate::ymd_hms(stop.time, truncated = 3),
inputs = inputs$samples[[i]])
inputs = inputs$met$samples[[i]])
)
}
} else{
Expand Down
36 changes: 25 additions & 11 deletions modules/assim.sequential/R/sda.enkf_MultiSite.R
Original file line number Diff line number Diff line change
Expand Up @@ -355,17 +355,28 @@ sda.enkf.multisite <- function(settings,
#reformatting params
new.params <- sda_matchparam(settings, ensemble.samples, site.ids, nens)
}
#sample met ensemble members
#sample met and soil parameter ensemble members
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be sampling ALL input ensemble members, not just met, not just (met + soils)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  #TODO: incorporate Phyllis's restart work
 #      sample all inputs specified in the settings$ensemble not just met

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI Phyllis's work lives on the BU server at /projectnb/dietzelab/yixuanli and still needs going through to make sure (a) write.ensemble.configs is saving ALL the info it needs about ensembles, not just params and (b) that you can have the option to pass in a specific already-sampled ensemble of params, met, soil, IC, etc. without having the code resample it. This is important in the general case to allow for sensitivity analysis designs. It's also important for the SDA case as the sample from t==1 needs to be the same sample that's used for t>1.

#TODO: incorporate Phyllis's restart work
# sample all inputs specified in the settings$ensemble not just met
inputs <- PEcAn.settings::papply(conf.settings,function(setting) {
PEcAn.uncertainty::input.ens.gen(
settings = setting,
input = "met",
method = setting$ensemble$samplingspace$met$method,
parent_ids = NULL
)
})
input_name <- c("met","soilinitcond")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should not be hard coded. The set of inputs that are ensemble-based needs to be detected from the settings

inputs <- list()
new_inputs <- list()
for (i_input in seq_along(input_name)){
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inputs can't be sampled in an arbitrary order. See the logic in write.ensemble.configs. Indeed, if the SDA can't call write.ensemble.configs directly, then there's a lot of code there than needs to be refactored into a function that can be called by both forward and SDA code, since maintaining two separate versions of the sampling code is clearly resulting in code divergence.

inputs[[input_name[i_input]]]<-PEcAn.settings::papply(conf.settings,function(setting) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you explain what the papply is doing here?

PEcAn.uncertainty::input.ens.gen(
settings = setting,
input = input_name[i_input],
method = setting$ensemble$samplingspace[[input_name[i_input]]]$method,
parent_ids = NULL
)
})
#reformat the input list to make the multisite setting as the first sub-category
for (setting_name in names(inputs[[input_name[i_input]]])) {
new_inputs[[setting_name]][[input_name[i_input]]] <- inputs[[input_name[i_input]]][[setting_name]]
}
}


###------------------------------------------------------------------------------------------------###
### loop over time ###
###------------------------------------------------------------------------------------------------###
Expand All @@ -379,7 +390,9 @@ sda.enkf.multisite <- function(settings,
if (t>1){
#for next time step split the met if model requires
#-Splitting the input for the models that they don't care about the start and end time of simulations and they run as long as their met file.
inputs.split <- metSplit(conf.settings, inputs, settings, model, no_split = FALSE, obs.times, t, nens, restart_flag = FALSE, my.split_inputs)
inputs.split <- list()
inputs.split <- metSplit(conf.settings, new_inputs, settings, model, no_split = FALSE, obs.times, t, nens, restart_flag = FALSE, my.split_inputs)


#---------------- setting up the restart argument for each site separately and keeping them in a list
restart.list <-
Expand Down Expand Up @@ -768,4 +781,5 @@ sda.enkf.multisite <- function(settings,
# }
## MCD: I commented the above "if" out because if you are restarting from a previous forecast, this might delete the files in that earlier folder
} ### end loop over time
} # sda.enkf
} # sda.enkf

Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ template <- PEcAn.settings::Settings(list(
ensemble = structure(list(size = 25, variable = "NPP",
samplingspace = structure(list(
parameters = structure(list(method = "lhc")),
soilinitcond = structure(list(method = "sampling")),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check to make sure there aren't other inputs that are being left out

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, is the operational SDA using a LHC sampling of the posterior parameters? That's not random.

met = structure(list(method = "sampling"))
))
)),
Expand Down
1 change: 1 addition & 0 deletions modules/data.land/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ export(soil2netcdf)
export(soil_params)
export(soil_process)
export(soilgrids_soilC_extract)
export(soilgrids_texture_extraction)
export(subset_layer)
export(to.Tag)
export(to.TreeCode)
Expand Down
2 changes: 2 additions & 0 deletions modules/data.land/R/extract_soil_nc.R
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ extract_soil_nc <- function(in.file,outdir,lat,lon){
#' * `soil_thermal_conductivity_at_saturation`
#' * `soil_thermal_capacity`
#' * `soil_albedo`

#'
#' @param varname character vector. See details
#'
Expand Down Expand Up @@ -383,6 +384,7 @@ soil.units <- function(varname = NA){
"soil_thermal_conductivity_at_saturation","W m-1 K-1",
"soil_thermal_capacity","J kg-1 K-1",
"soil_albedo","1"

),
ncol=2,byrow = TRUE))
colnames(variables) <- c('var','unit')
Expand Down
223 changes: 223 additions & 0 deletions modules/data.land/R/soil_params_ensemble.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
##' A function to estimate individual alphas for Dirichlet distribution to approximate the observed quantiles with means as known moments
##' for SoilGrids soil texture data.
##' Dirichlet distribution is assumed as soil texture data follow categorical distribution and the probability of each category is in the range 0 to 1,
##' and all must sum to 1.
##'
##' @param means A vector of means of sand, clay, and silt proportion for one soil layer at one site from SoilGrids data
##' @param quantiles A list of 5th, 50th, and 95th percentiles for sand, clay, and silt for one soil layer at one site from SoilGrids data
##'
##' @example
##' \dontrun{
##' means <- c(0.566,0.193,0.241) # means of sand, clay, and silt at one site and one depth
##' quantiles <-list(
##' q5 = c(0.127,0.034,0.052), # 5th percentile for each category: sand, clay, and silt at one site and one depth
##' q50 = c(0.615,0.15,0.191), # 50th percentile (median) for each category: sand, clay, and silt at one site and one depth
##' q95 = c(0.799,0.66,0.616)) # 95th percentile for each category: sand, clay, and silt at one site and one depth
##' alpha_est <- estimate_dirichlet_parameters(means, quantiles)
##' }
##' @return The individual alphas that work best to fit the observed quantiles
##' @export
##' @author Qianyu Li
estimate_dirichlet_parameters <- function(means, quantiles) {

# A function to optimize alpha0, which is the sum of individual alphas.
estimate_alpha0 <- function(means, quantiles) {
# Objective function to minimize the difference between observed and simulated quantiles with means as a known moment
objective_function <- function(alpha0) {
if (alpha0 <= 0)
return(Inf) # alpha0 couldn't be zero or negative as it is the sum of individual alpha which are positive reals
# Estimate individual alpha based on that the means of each categorical data are individual alpha divided by alpha0 in Dirichlet distribution
alpha <- means * alpha0
# Generate samples based on estimated alpha
samples <- MCMCpack::rdirichlet(10000, alpha) # Generate samples
# Compute differences with observed quantiles
estimated_quantiles <- apply(samples, 2, quantile, probs = c(0.05, 0.5, 0.95),na.rm = TRUE)
quantile_diff <- sum((estimated_quantiles - do.call(rbind, quantiles))^2)
return(quantile_diff)
}

# Optimize alpha0
result <- optim(
par = 1, # Initial guess for alpha0
fn = objective_function,
method = "L-BFGS-B",
lower = 0.01 # alpha0 must be positive
)
return(result$par)
}

alpha0 <- estimate_alpha0(means, quantiles)
if (alpha0 <= 0) {
stop("Estimated alpha0 is non-positive, which is invalid.")
}
alphas <- means * alpha0
mdietze marked this conversation as resolved.
Show resolved Hide resolved
return(alphas)
}



##' A function to estimate the soil parameters based on SoilGrids soil texture data and write the parameter paths into settings
##'
##' @param settings A multi-site settings
##' @param sand A data frame containing sand fraction in percentage from SoilGrids250m v2.0 with columns "Depth", "Quantile", "Siteid", and "Value"
##' @param clay A data frame containing clay fraction in percentage from SoilGrids250m v2.0 with columns "Depth", "Quantile", "Siteid", and "Value"
##' @param silt A data frame containing silt fraction in percentage from SoilGrids250m v2.0 with columns "Depth", "Quantile", "Siteid", and "Value"
##' @param outdir Provide the path to store the parameter files
##' @param write_into_settings Whether to write the path of parameter file into the setting. The default is TRUE
##'
##' @example
##' \dontrun{
##'
##' outdir <- "/projectnb/dietzelab/Cherry/SoilGrids_texture/39NEON"
##' sand <- readRDS("/projectnb/dietzelab/Cherry/SoilGrids_texture/sand_percent.rds") #sand fraction in percentage
##' clay <- readRDS("/projectnb/dietzelab/Cherry/SoilGrids_texture/clay_percent.rds") #clay fraction in percentage
##' silt <- readRDS("/projectnb/dietzelab/Cherry/SoilGrids_texture/silt_percent.rds") #silt fraction in percentage
##' settings <-read.settings("/projectnb/dietzelab/Cherry/xml/pecan_monthly_SDA_soilwater.xml")
##' soil_params_ensemble_soilgrids(settings,sand,clay,silt,outdir)
##' }
##'
##' @return Ensemble soil parameter files defined in outdir and file paths in xml file
##' @export
##' @author Qianyu Li
##' @importFrom magrittr %>%
##'

soil_params_ensemble_soilgrids <- function(settings,sand,clay,silt,outdir,write_into_settings=TRUE){

# A function to rescale the sums of mean texture fractions to 1 as the original sums are slightly different from 1 for some layers
rescale_sum_to_one <- function(sand, clay, silt) {
total <- sand + clay + silt
rescaled_sand <- sand / total
rescaled_clay <- clay / total
rescaled_silt <- silt / total
return(list(
sand = rescaled_sand,
clay = rescaled_clay,
silt = rescaled_silt))
}

# A function to write to settings
create_mult_list <- function(list.names, paths) {
out <- as.list(paths)
names(out) <- list.names
out
}

# Convert values to proportion (0-1) from percentage
if (any(c(sand$Value, clay$Value, silt$Value) > 2)) {
sand$Value <- if (is.null(sand$Value)) { NULL } else { sand$Value / 100 }
clay$Value <- if (is.null(clay$Value)) { NULL } else { clay$Value / 100 }
silt$Value <- if (is.null(silt$Value)) { NULL } else { silt$Value / 100 }
}
ens_n <- as.numeric(settings$ensemble$size)
# Merge all soil texture data together
texture_all <-merge(sand, clay, by=c("Depth", "Quantile", "Siteid")) %>% merge(silt, by=c("Depth", "Quantile", "Siteid")) %>%
`colnames<-`(c(
"soil_depth", #"soil_depth" will be used in "soil2netcdf" function
"quantile",
"siteid",
"fraction_of_sand_in_soil",
"fraction_of_clay_in_soil",
"fraction_of_silt_in_soil"))

# Substitute the depth range with the bottom depth values (with the assumption that the first layer's top is at 0)
texture_all$soil_depth <-
gsub("100-200cm", 200, gsub("60-100cm", 100, gsub(
"30-60cm", 60, gsub("15-30cm", 30, gsub(
"5-15cm", 15, gsub("0-5cm", 5, texture_all$soil_depth))))))
texture_all$soil_depth <- as.numeric(texture_all$soil_depth)
# Reformat the list based on site id
f1 <- factor(texture_all$siteid, levels = unique(texture_all$siteid))
dat <- split(texture_all, f1)
# Grab Site IDs from settings
settings_id <-lapply(settings, function(x) as.numeric(x$run$site$id))

for (i in seq_along(dat)) {
samples_ens <- list()
paths <- c()
siteid <- as.numeric(unique(dat[[i]]$siteid))
soil_depth <- unique(dat[[i]]$soil_depth)
str_ns <- paste0(siteid %/% 1e+09, "-", siteid %% 1e+09)
temp_outdir <- file.path(outdir, siteid)
dir.create(temp_outdir)
# Estimate Dirichlet parameters for each depth at each site
for (depths in sort(unique(texture_all$soil_depth))) {
quantiles <- list(
q5 = dplyr::filter(dat[[i]], quantile == "0.05", soil_depth == depths) %>% dplyr::select(
fraction_of_sand_in_soil,
fraction_of_clay_in_soil,
fraction_of_silt_in_soil), # 5th percentile for each category
q50 = dplyr::filter(dat[[i]], quantile == "0.5", soil_depth == depths) %>% dplyr::select(
fraction_of_sand_in_soil,
fraction_of_clay_in_soil,
fraction_of_silt_in_soil), # 50th percentile (median) for each category
q95 = dplyr::filter(dat[[i]], quantile == "0.95", soil_depth == depths) %>% dplyr::select(
fraction_of_sand_in_soil,
fraction_of_clay_in_soil,
fraction_of_silt_in_soil)) # 95th percentile for each category

# Extract the means
means <- dplyr::filter(dat[[i]], quantile == "Mean", soil_depth == depths) %>% dplyr::select(fraction_of_sand_in_soil,fraction_of_clay_in_soil,fraction_of_silt_in_soil)
soil_rescaled <-rescale_sum_to_one(means$fraction_of_sand_in_soil,means$fraction_of_clay_in_soil,means$fraction_of_silt_in_soil)

# Replace the original means with the rescaled ones
means$fraction_of_sand_in_soil <- soil_rescaled$sand
means$fraction_of_clay_in_soil <- soil_rescaled$clay
means$fraction_of_silt_in_soil <- soil_rescaled$silt

# Estimate Dirichlet parameters
alpha_est <- estimate_dirichlet_parameters(as.matrix(means), quantiles)

# Generate the ensemble soil texture data based on the ensemble size (ens_n) defined in the settings
samples <- MCMCpack::rdirichlet(ens_n, alpha_est)
colnames(samples) <-c("fraction_of_sand_in_soil","fraction_of_clay_in_soil","fraction_of_silt_in_soil")
samples <-list(samples) %>% setNames(depths)
samples_ens <- append(samples_ens, samples)
}

# Generate soil parameter file for each one in ensemble soil texture data
for (ens in 1:ens_n) {
# Choose one sample
samples_all_depth <- lapply(samples_ens, function(x) x[ens, ])
# Reformat the nested list as input to "soil2netcdf" function
reformatted_soil_list <- reformat_soil_list(samples_all_depth)
prefix <- paste0("Soil_params_", str_ns, "_", ens)
new.file <- file.path(outdir, siteid, paste0(prefix, ".nc"))
out.ense <- soil2netcdf(reformatted_soil_list, new.file)
paths <- c(new.file, paths)
}

# Write the parameter paths to settings
if (write_into_settings) {
ind <- which(settings_id == siteid)
settings[[ind]]$run$inputs$soilinitcond$source <- "SoilGrids"
settings[[ind]]$run$inputs$soilinitcond$output <- "soilinitcond"
settings[[ind]]$run$inputs$soilinitcond$ensemble <- ens_n
settings[[ind]]$run$inputs$soilinitcond$path <-create_mult_list(rep("path", ens_n), paths)
write.settings(settings,outputdir = settings$outdir,outputfile = "pecan.xml")
}
}
}

# A function to reformat the nested list as inputs to "soil2netcdf" function
reformat_soil_list <- function(samples_all_depth) {
# Define the fractions we want to extract
fractions <-
c("fraction_of_sand_in_soil",
"fraction_of_clay_in_soil",
"fraction_of_silt_in_soil")

# Initialize a new list to store reformatted data
reformatted <-setNames(vector("list", length(fractions)), fractions)

# Extract data for each fraction
for (fraction in fractions) {
reformatted[[fraction]] <-
unlist(lapply(samples_all_depth, function(depth_list) {
depth_list[[fraction]] # Extract the fraction value
})) %>% purrr::set_names(NULL)
}
# Combine depth into a single vector for readability
reformatted$soil_depth <- as.numeric(names(samples_all_depth))
return(reformatted)
}
Loading
Loading