diff --git a/pkg/proc.attr.hydfab/DESCRIPTION b/pkg/proc.attr.hydfab/DESCRIPTION
index 5b551e1..aa4fe51 100644
--- a/pkg/proc.attr.hydfab/DESCRIPTION
+++ b/pkg/proc.attr.hydfab/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: proc.attr.hydfab
 Title: Grab and process catchment attributes using the hydrofabric
-Version: 0.0.1.0013
+Version: 0.0.1.0014
 Authors@R:
     c(person("Guy", "Litt", , "guy.litt@noaa.gov", role = c("aut", "cre"),
              comment = c(ORCID = "https://orcid.org/0000-0003-1996-7468")),
diff --git a/pkg/proc.attr.hydfab/R/proc_attr_grabber.R b/pkg/proc.attr.hydfab/R/proc_attr_grabber.R
index 4b54ab9..373fa1a 100644
--- a/pkg/proc.attr.hydfab/R/proc_attr_grabber.R
+++ b/pkg/proc.attr.hydfab/R/proc_attr_grabber.R
@@ -375,6 +375,10 @@ proc_attr_wrap <- function(comid, Retr_Params, lyrs='network',overwrite=FALSE,hf
   path_attrs <- base::file.path(Retr_Params$paths$dir_db_attrs,
                                 base::paste0("comid_",comid,"_attrs.parquet"))
   vars_ls <- Retr_Params$vars
+  # ------- Retr_Params$vars format checker --------- #
+  # Run check on requested variables for retrieval:
+  proc.attr.hydfab:::wrap_check_vars(vars_ls)
+
   # ----------- existing dataset checker ----------- #
   ls_chck <- proc.attr.hydfab::proc_attr_exst_wrap(comid,path_attrs,
                                                    vars_ls,bucket_conn=NA)
@@ -513,8 +517,13 @@ proc_attr_gageids <- function(gage_ids,featureSource,featureID,Retr_Params,
       revisit the configuration yaml file that processes this dataset in fs_proc: \n
       {featureSource}, and featureID={featureID}"))
     } else if (!is.null(site_feature)){
-      comid <- site_feature['comid']$comid
-      ls_site_feat[[gage_id]] <- site_feature
+      if(!base::is.na(site_feature['comid']$comid)){
+        comid <- site_feature['comid']$comid
+      } else {
+        message(glue::glue("Could not retrieve comid for {nldi_feat$featureID}."))
+        comid <- nhdplusTools::discover_nhdplus_id(point=site_feature$geometry)
+        message(glue::glue("Geospatial search found a comid value of: {comid}"))
+      }
       ls_comid[[gage_id]] <- comid

       # Retrieve the variables corresponding to datasets of interest & update database
@@ -522,6 +531,10 @@ proc_attr_gageids <- function(gage_ids,featureSource,featureID,Retr_Params,
                                          Retr_Params=Retr_Params,
                                          lyrs=lyrs,overwrite=FALSE,
                                          hfab_retr=hfab_retr))
+      if(!("try-error" %in% class(loc_attrs))){ # guard: a try-error is atomic, so $<- would fail on it
+        loc_attrs$gage_id <- gage_id # Add the original identifier to dataset
+        ls_site_feat[[gage_id]] <- loc_attrs
+      }
       if("try-error" %in% class(loc_attrs)){
         message(glue::glue("Skipping gage_id {gage_id} corresponding to comid {comid}"))
       }
@@ -529,17 +542,16 @@ proc_attr_gageids <- function(gage_ids,featureSource,featureID,Retr_Params,
       message(glue::glue("Skipping {gage_id}"))
     }
   }
-  just_comids <- ls_comid %>% unname() %>% unlist()
+  just_comids <- ls_comid %>% base::unname() %>% base::unlist()
   if(any(is.na(just_comids))){
-    idxs_na_comids <- which(is.na(just_comids))
+    idxs_na_comids <- base::which(base::is.na(just_comids))
     gage_ids_missing <- paste0(names(ls_comid[idxs_na_comids]),
                                collapse = ", ")
     warning(glue::glue("The following gage_id values did not return a comid:\n
                        {gage_ids_missing}"))
   }
-  dt_site_feat <- data.table::rbindlist(ls_site_feat)
-  dt_site_feat$gage_id <- gage_ids # Add the original identifier to dataset
+  dt_site_feat <- data.table::rbindlist(ls_site_feat,fill = TRUE)

   return(dt_site_feat)
 }
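Note on the fallback added in the `proc_attr_gageids` hunk above: when NLDI returns a site feature whose comid is NA, the comid is now recovered geospatially from the feature's geometry via `nhdplusTools::discover_nhdplus_id()`. A minimal standalone sketch of that lookup (the point coordinates are hypothetical; `discover_nhdplus_id()` expects an sf/sfc POINT in EPSG:4326):

```r
library(sf)           # build the query point
library(glue)         # message formatting
library(nhdplusTools) # comid discovery

# Hypothetical gage location (lon/lat, EPSG:4326); swap in a real site's geometry.
pt <- sf::st_sfc(sf::st_point(c(-89.224, 42.825)), crs = 4326)

# Geospatial comid lookup, mirroring the else-branch in the hunk above
comid <- nhdplusTools::discover_nhdplus_id(point = pt)
message(glue::glue("Geospatial search found a comid value of: {comid}"))
```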
@@ -795,6 +807,48 @@ write_meta_nldi_feat <- function(dt_site_feat, path_meta){
   base::message(glue::glue("Wrote nldi location metadata to {path_meta}"))
 }

+wrap_check_vars <- function(vars_ls){
+  #' @title Internal wrapper to run checks on requested attribute variable names
+  #' @param vars_ls A named list from Retr_Params$vars in the standardized format
+  #' @description Given a list of variable categories, each containing vectors
+  #' of variable names, check the following:
+  #' 1) the variable category is a recognized category name (e.g. 'usgs_vars')
+  #' 2) the variable names inside each category are actual variable names
+  #' that can be used to retrieve attributes (e.g. 'TOT_TWI' as an nhdplus attribute)
+
+  # Get the accepted variable categories used in the proc.attr.hydfab R package
+  dir_pkg <- system.file("extdata",package="proc.attr.hydfab")
+  cfg_attr_src <- yaml::read_yaml(base::file.path(dir_pkg,"attr_source_types.yml"))
+  var_catgs <- base::lapply(cfg_attr_src,
+                            function(x) base::unlist(x)[['name']]) %>%
+    base::unlist() %>% base::unname()
+
+  # Check the variable categories the user provided in Retr_Params$vars
+  names_var_catg <- base::names(vars_ls)
+  if(base::any(base::is.null(names_var_catg))){
+    stop(glue::glue("Retr_Params$vars should be a sublist with sublist names ",
+                    "corresponding to\n standardized names in the proc.attr.hydfab package.",
+                    " These names include:\n{paste0(var_catgs,collapse='\n')}"))
+  }
+
+  # Test that each provided category name is one of the recognized categories
+  test_bool_var_catg <- base::lapply(names_var_catg,
+                                     function(x) x %in% var_catgs) %>% unlist()
+  if(base::any(!test_bool_var_catg)){
+    stop(glue::glue("Retr_Params$vars contains the following unrecognized ",
+                    "variable category name(s): ",
+                    "{paste0(names_var_catg[!test_bool_var_catg],collapse='\n')}",
+                    "\nAcceptable names include:\n",
+                    "{paste0(var_catgs,collapse='\n')}"
+    ))
+  }
+
+  # ------------------ RUN CHECK ON INDIVIDUAL VARIABLE NAMES -------------- #
+  for(var_group_name in names(vars_ls)){
+    sub_vars <- vars_ls[[var_group_name]]
+    proc.attr.hydfab::check_attr_selection(vars=sub_vars)
+  }
+}
+
 check_attr_selection <- function(attr_cfg_path = NULL, vars = NULL, verbose = TRUE){
   #' @title Check that attributes selected by user are available
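A minimal usage sketch of the new checker (this assumes the category names shipped in `inst/extdata/attr_source_types.yml` include `'usgs_vars'`, consistent with the configs below; the misspelled category is deliberate to show the failure mode):

```r
library(proc.attr.hydfab)

# Retr_Params$vars in the standardized format: named sublists of variable
# names, keyed by recognized category names.
vars_ls <- list(usgs_vars = c("TOT_TWI", "TOT_BASIN_AREA"))
proc.attr.hydfab:::wrap_check_vars(vars_ls)  # passes silently when names check out

# An unrecognized category name ('usgs_varz', intentionally wrong) stops
# with an informative error listing the acceptable names:
bad_vars_ls <- list(usgs_varz = c("TOT_TWI"))
try(proc.attr.hydfab:::wrap_check_vars(bad_vars_ls))
```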
diff --git a/scripts/config/attr_gen_camels.R b/scripts/config/attr_gen_camels.R
new file mode 100644
index 0000000..92a031b
--- /dev/null
+++ b/scripts/config/attr_gen_camels.R
@@ -0,0 +1,61 @@
+#' @title Generate attributes for CAMELS basins
+#' @description This script uses the proc.attr.hydfab package to acquire
+#' attributes of interest.
+
+library(dplyr)
+library(glue)
+library(tidyr)
+library(yaml)
+library(proc.attr.hydfab)
+
+main <- function(){
+  # Base directory used to build the file paths below
+  home_dir <- Sys.getenv("HOME")
+
+  ############################ BEGIN CUSTOM MUNGING ############################
+
+  # ------------------------- Read in CAMELS gage ids ------------------------ #
+  path_gages_ii <- glue::glue("{home_dir}/noaa/camels/gagesII_wood/gages_list.txt")
+  dat_gages_ii <- read.csv(path_gages_ii)
+  gage_ids <- base::lapply(1:nrow(dat_gages_ii), function(i)
+    tail(strsplit(dat_gages_ii[i,],split = ' ',fixed = TRUE)[[1]],n=1)) |>
+    unlist() |>
+    lapply(function(x) gsub(pattern=".gpkg",replacement = "",x = x)) |>
+    unlist() |>
+    lapply(function(x) gsub(pattern = "Gage_", replacement = "",x=x)) |>
+    unlist()
+
+  utils::write.table(gage_ids,
+                     glue::glue('{home_dir}/noaa/camels/gagesII_wood/camels_ii_gage_ids.txt'),
+                     row.names = FALSE, col.names = FALSE)
+
+  # --------------------- Read in usgs NHD attribute IDs --------------------- #
+  # Read desired usgs nhdplus attributes, stored in NOAA shared drive here:
+  # https://docs.google.com/spreadsheets/d/1h-630L2ChH5zlQIcWJHVaxY9YXtGowcCqakQEAXgRrY/edit?usp=sharing
+  attrs_nhd_df <- read.csv(glue::glue("{home_dir}/noaa/regionalization/processing/usgs_nhdplus_attrs.csv"))
+  attrs_nhd <- attrs_nhd_df$ID
+
+  Retr_Params <- list(paths = list(dir_db_attrs = glue::glue("{home_dir}/noaa/regionalization/data/input/attributes/"),
+                                   dir_std_base = glue::glue("{home_dir}/noaa/regionalization/data/input/user_data_std")),
+                      vars = list(usgs_vars = attrs_nhd),
+                      datasets = "camelsii_nhdp_grab_nov24",
+                      xtra_hfab = list(hfab_retr=FALSE))
+
+  ############################ END CUSTOM MUNGING ##############################
+
+  # ----------------------- Grab all needed attributes ----------------------- #
+  # Now acquire the attributes:
+  ls_comids <- proc.attr.hydfab::proc_attr_gageids(gage_ids=gage_ids,
+                                                   featureSource='nwissite',
+                                                   featureID='USGS-{gage_id}',
+                                                   Retr_Params=Retr_Params,
+                                                   overwrite=FALSE)
+
+  message(glue::glue("Completed attribute acquisition for {Retr_Params$paths$dir_db_attrs}"))
+}
+
+main()
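The gage-id munging in `attr_gen_camels.R` strips any path prefix, the `.gpkg` extension, and the `Gage_` prefix. A self-contained illustration on one hypothetical `gages_list.txt` entry (the real file's rows may differ):

```r
# Hypothetical entry; mirrors the strsplit/gsub chain in main() above.
entry <- "some/dir Gage_01013500.gpkg"

# Keep the last space-separated token, then drop the extension and prefix
gage_id <- base::tail(base::strsplit(entry, split = " ", fixed = TRUE)[[1]], n = 1)
gage_id <- base::gsub(".gpkg", "", gage_id, fixed = TRUE)
gage_id <- base::gsub("Gage_", "", gage_id, fixed = TRUE)
gage_id
#> [1] "01013500"
```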
diff --git a/scripts/config/camels_attr_config.yaml b/scripts/config/camels_attr_config.yaml
new file mode 100644
index 0000000..d23aa8f
--- /dev/null
+++ b/scripts/config/camels_attr_config.yaml
@@ -0,0 +1,54 @@
+# Config for grabbing catchment attributes corresponding to standard-named locations
+# Two options exist for defining locations that need attributes. At least one must be used; both may be used.
+#  1. Refer to a file/dataset {loc_id_filepath} with a column identifier {loc_id} representing a standardized location identifier.
+#  2. Refer to a dataset processed by the fs_proc python package and point to its location, {dir_std_base}/{datasets}, where {datasets} is a specific subdirectory name(s) or simply 'all'
+
+col_schema: # required column mappings in the evaluation metrics dataset (if read in)
+  - 'featureID': 'USGS-{gage_id}' # python f-string / R glue() format; converts the 'gage_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{gage_id}', e.g. 'USGS-{gage_id}'
+  - 'featureSource': 'nwissite' # The standardized nhdplusTools featureSource. Possible featureSources might be 'nwissite', 'comid'.
+loc_id_read: # This section is only required for locations NOT read in under a standardized dataset location (dir_std_base). May be used for additional prediction locations. MUST leave each item name inside the list with an empty assignment if no datasets are desired.
+  - 'gage_id': 'gage_id' # expects a tabular dataset with this column name representing the location id.
+  - 'loc_id_filepath': '' # Required. filepath. Allows reading a .csv or a dataset accessible using arrow::open_dataset() in lieu of reading a dataset generated by fs_proc.
+  - 'featureID_loc' : 'USGS-{gage_id}' # python f-string / R glue() format; converts the 'loc_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{loc_id}', e.g. 'USGS-{loc_id}'.
+  - 'featureSource_loc': 'nwissite' # The standardized nhdplusTools featureSource.
+file_io: # May define {home_dir} for python's '{home_dir}/string_path'.format(home_dir=str(Path.home())) functionality
+  - 'save_loc': 'local' # TODO implement once s3 becomes a capability. Use 'local' for saving to a local path via dir_save. Future work will create an approach for 'aws' or other cloud saving methods
+  - 'dir_base' : '{home_dir}/noaa/regionalization/data/input' # Required. The save location of standardized output
+  - 'dir_std_base' : '{dir_base}/user_data_std' # Required. The location of standardized data generated by the fs_proc python package
+  - 'dir_db_hydfab' : '{dir_base}/hydrofabric' # Required. The local dir where hydrofabric data are stored (limits the total s3 connections)
+  - 'dir_db_attrs' : '{dir_base}/attributes' # Required. The parent dir where each comid's attribute parquet file is stored in the subdirectory 'comid/', and each dataset's aggregated parquet attributes are stored in the subdirectory '{dataset_name}/'
+formulation_metadata:
+  - 'datasets': # Required. Must match a directory name inside dir_std_base. May be a list of items, or simply the sublist 'all' to select everything inside dir_std_base for attribute grabbing.
+    - 'juliemai-xSSA' # Required. In this example case, it's a sublist of just one thing.
+  - 'formulation_base': 'Raven_blended' # Informational. Unique name of formulation. Optional.
+hydfab_config: # Required section describing hydrofabric connection details and objects of interest
+  - 's3_base' : "s3://lynker-spatial/tabular-resources" # Required. s3 path containing hydrofabric-formatted attribute datasets
+  - 's3_bucket' : 'lynker-spatial' # Required. s3 bucket containing hydrofabric data
+  - 'ext' : 'gpkg' # Required. file extension of the hydrofabric data. Default 'gpkg'.
+  - 'hf_cat_sel': "total" # Required. Options include 'total' or 'all'; total: interested in the single location's aggregated catchment data; all: all subcatchments of interest
+attr_select: # Required. The names of variable sublistings are standardized, e.g. ha_vars, usgs_vars, sc_vars
+  - 's3_path_hydatl' : '{s3_base}/hydroATLAS/hydroatlas_vars.parquet' # path to hydroatlas data formatted for hydrofabric. Required only if hydroatlas variables are desired.
+  - 'ha_vars': # hydroatlas variables. Must specify s3_path_hydatl if desired.
+    - 'pet_mm_s01'
+    - 'cly_pc_sav'
+    - 'cly_pc_uav'
+    - 'ari_ix_sav'
+  - 'usgs_vars': # list of variables retrievable using nhdplusTools::get_characteristics_metadata().
+    - 'TOT_TWI'
+    - 'TOT_PRSNOW'
+    - 'TOT_POPDENS90'
+    - 'TOT_EWT'
+    - 'TOT_RECHG'
+    - 'TOT_PPT7100_ANN'
+    - 'TOT_AET'
+    - 'TOT_PET'
+    - 'TOT_SILTAVE'
+    - 'TOT_BASIN_AREA'
+    - 'TOT_BASIN_SLOPE'
+    - 'TOT_ELEV_MEAN'
+    - 'TOT_ELEV_MAX'
+    - 'TOT_Intensity'
+    - 'TOT_Wet'
+    - 'TOT_Dry'
+  - 'sc_vars': # Streamcat variables of interest. #TODO add streamcat grabber capability to proc.attr.hydfab
+    - # In this example case, no streamcat variables are selected
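For orientation, the `attr_select` and `file_io` entries in this config line up with the `Retr_Params` list that `proc.attr.hydfab::proc_attr_gageids()` consumes (see `attr_gen_camels.R` above). A hand-built sketch of that correspondence (illustrative only; a yaml-to-`Retr_Params` reader is not part of this diff, and the usgs_vars vector is abbreviated):

```r
library(glue)

# Illustrative only: Retr_Params assembled from camels_attr_config.yaml values.
home_dir <- Sys.getenv("HOME")
dir_base <- glue::glue("{home_dir}/noaa/regionalization/data/input")

Retr_Params <- list(
  paths = list(dir_db_attrs = glue::glue("{dir_base}/attributes"),      # file_io: dir_db_attrs
               dir_std_base = glue::glue("{dir_base}/user_data_std")),  # file_io: dir_std_base
  vars = list(ha_vars   = c("pet_mm_s01", "cly_pc_sav",
                            "cly_pc_uav", "ari_ix_sav"),                # attr_select: ha_vars
              usgs_vars = c("TOT_TWI", "TOT_PRSNOW")),                  # attr_select: usgs_vars (abbreviated)
  datasets = "juliemai-xSSA",                                           # formulation_metadata: datasets
  xtra_hfab = list(hfab_retr = FALSE))
```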