diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..9b5b641 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,4 @@ +^Meta$ +^doc$ ^.*\.Rproj$ ^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index 09a72cb..33cd43d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +Meta +doc .Rproj.user .Rhistory .RData diff --git a/DESCRIPTION b/DESCRIPTION index d550da0..1c97265 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,4 +1,4 @@ -Package: forestSampling +Package: forestsamplr Title: Standard Forest Sampling Design Workups Version: 0.0.0.9000 Authors@R: c( @@ -11,9 +11,9 @@ Depends: License: MIT Encoding: UTF-8 LazyData: true -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 Suggests: testthat, knitr, rmarkdown VignetteBuilder: knitr Imports: dplyr diff --git a/R/all_cluster.R b/R/all_cluster.R index acc4b1e..79a0d6d 100644 --- a/R/all_cluster.R +++ b/R/all_cluster.R @@ -1,10 +1,10 @@ #' @title Summarize all cluster sample #' @description Summarizes population-level statistics for -#' cluster sample data. This function has two options: (1) -#' Cluster sample with a normal distribution and (2) Cluster +#' cluster sample data. This function has two options: (1) +#' Cluster sample with a normal distribution and (2) Cluster #' sample with a Bernoulli distribution. -#' @usage summarize_all_cluster(data, attribute = NA, element = TRUE, -#' plotTot = NA, +#' @usage summarize_all_cluster(data, attribute = NA, element = TRUE, +#' plotTot = NA, #' desiredConfidence = 0.95, #' bernoulli = F) #' @param data data frame containing observations of variable of @@ -12,10 +12,10 @@ #' @param attribute character name of attribute to be summarized. #' @param element logical true if parameter data is plot-level, false if #' parameter data is cluster-level. Default is True. -#' @param plotTot numeric population size. Equivalent to the +#' @param plotTot numeric population size. Equivalent to the #' total number of possible elements in the population. 
#' @param desiredConfidence numeric desired confidence level (e.g. 0.9). -#' @param bernoulli logical TRUE if data fitting the Bernoulli +#' @param bernoulli logical TRUE if data fitting the Bernoulli #' distribution is used. #' @return data frame of stand-level statistics including #' standard error and confidence interval limits. @@ -28,55 +28,66 @@ #' #' # Plot level data can be expressed as: #' -#' plotLevelDataExample <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, -#' 2, 3, 4, 4, 4, 4, -#' 4, 4, 5, 5, 5, 5, -#' 5), -#' attr = c(1000, 1250, 950, 900, -#' 1005, 1000, 1250, 950, -#' 900, 1005, 1000, 1250, -#' 950, 900, 1005, 1000, -#' 1250, 950, 900), -#' isUsed = c(T, T, T, T, T, T, T, -#' T, T, T, T, T, T, T, -#' F, F, F, F, F)) +#' plotLevelDataExample <- data.frame( +#' clusterID = c( +#' 1, 1, 1, 1, 1, 2, +#' 2, 3, 4, 4, 4, 4, +#' 4, 4, 5, 5, 5, 5, +#' 5 +#' ), +#' attr = c( +#' 1000, 1250, 950, 900, +#' 1005, 1000, 1250, 950, +#' 900, 1005, 1000, 1250, +#' 950, 900, 1005, 1000, +#' 1250, 950, 900 +#' ), +#' isUsed = c( +#' T, T, T, T, T, T, T, +#' T, T, T, T, T, T, T, +#' F, F, F, F, F +#' ) +#' ) #' -#' # Cluster level data can be expressed as: +#' # Cluster level data can be expressed as: #' -#' clusterLevelDataExample <- data.frame(clusterID = c(1, 2, 3, 4, 5), -#' clusterElements = c(4, 2, 9, -#' 4, 10), -#' sumAttr = c(1000, 1250, 950, -#' 900, 1005), -#' isUsed = c(T, T, F, T, T)) +#' clusterLevelDataExample <- data.frame( +#' clusterID = c(1, 2, 3, 4, 5), +#' clusterElements = c( +#' 4, 2, 9, +#' 4, 10 +#' ), +#' sumAttr = c( +#' 1000, 1250, 950, +#' 900, 1005 +#' ), +#' isUsed = c(T, T, F, T, T) +#' ) #' # Set element = FALSE #' #' -#' # Bernoulli data can be expressed as: +#' # Bernoulli data can be expressed as: #' -#' bernoulliData <- data.frame(plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), -#' propAlive = c(0.75, 0.80, 0.80, 0.85, -#' 0.70, 0.90, 0.70, 0.75, -#' 0.80, 0.65)) +#' bernoulliData <- data.frame( +#' plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 
10), +#' propAlive = c( +#' 0.75, 0.80, 0.80, 0.85, +#' 0.70, 0.90, 0.70, 0.75, +#' 0.80, 0.65 +#' ) +#' ) #' # Set parameter bernoulli = TRUE -#' #' } #' @export -summarize_all_cluster <- function(data, attribute = NA, element = TRUE, - plotTot = NA, desiredConfidence = 0.95, bernoulli = F) { - +summarize_all_cluster <- function(data, attribute = NA, element = TRUE, + plotTot = NA, desiredConfidence = 0.95, bernoulli = F) { if (bernoulli == F) { - out <- summarize_cluster(data, element, attribute, desiredConfidence) - } else { - out <- summarize_cluster_discrete(data, attribute, plotTot, desiredConfidence) - } - + return(out) - } diff --git a/R/all_srs.R b/R/all_srs.R index fae425b..a00ab97 100644 --- a/R/all_srs.R +++ b/R/all_srs.R @@ -1,11 +1,11 @@ #' @title Summarize all simple random sample #' @description Summarizes population-level statistics for -#' simple random sample data. This function has three options: (1) SRS +#' simple random sample data. This function has three options: (1) SRS #' of a finite population or sampled without replacement, #' (2) SRS of an infinite population or sampled with replacement, #' and (3) SRS with a Bernoulli distribution. #' @usage summarize_all_srs(data, attribute = 'attr', -#' popSize = NA, desiredConfidence = 0.95, +#' popSize = NA, desiredConfidence = 0.95, #' infiniteReplacement = F, bernoulli = F) #' @param data data frame or vector containing observations of #' variable of interest. Variable of interest must already be expanded @@ -17,7 +17,7 @@ #' @param infiniteReplacement logical true if sample was done with replacement #' or from an infinite population. False if sampled without replacement, #' from a finite population. Defaults to False. -#' @param bernoulli logical TRUE if data fitting the Bernoulli +#' @param bernoulli logical TRUE if data fitting the Bernoulli #' distribution is used. #' @return a data frame of population mean, variance, standard error, and #' high and low confidence limits. 
@@ -35,35 +35,34 @@ #' #' # Data frame data example: #' -#' data <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), -#' plots = c(1, 2, 3, 4, 5, 6)) -#' attribute <- 'bapa' +#' data <- data.frame( +#' bapa = c(120, 140, 160, 110, 100, 90), +#' plots = c(1, 2, 3, 4, 5, 6) +#' ) +#' attribute <- "bapa" #' #' #' # Bernoulli data example: #' -#' data <- data.frame(alive = c(T, T, F, T, F, F), -#' plots = c(1, 2, 3, 4, 5, 6)) -#' attribute <- 'alive' -#' +#' data <- data.frame( +#' alive = c(T, T, F, T, F, F), +#' plots = c(1, 2, 3, 4, 5, 6) +#' ) +#' attribute <- "alive" #' } #' @export -summarize_all_srs <- function(data, attribute = 'attr', popSize = NA, - desiredConfidence = 0.95, infiniteReplacement = F, bernoulli = F) { - +summarize_all_srs <- function(data, attribute = "attr", popSize = NA, + desiredConfidence = 0.95, infiniteReplacement = F, bernoulli = F) { if (bernoulli == F) { - - out <- summarize_simple_random(data, attribute, popSize, - desiredConfidence, infiniteReplacement) - + out <- summarize_simple_random( + data, attribute, popSize, + desiredConfidence, infiniteReplacement + ) } else { - out <- summarize_simple_random_discrete(data, attribute, popTot = popSize, desiredConfidence) - } - + return(out) - } diff --git a/R/cluster.R b/R/cluster.R index db4e166..2573a98 100644 --- a/R/cluster.R +++ b/R/cluster.R @@ -5,7 +5,7 @@ #' variance terms refer to the variance of the mean. #' @param data data frame containing observations of variable of #' interest for either cluster-level or element-level data. -#' @param element logical true if parameter data is element-level +#' @param element logical true if parameter data is element-level #' (plot-level), false if parameter data is cluster-level. Default is True. #' @param attribute character name of attribute to be summarized. #' @param desiredConfidence numeric desired confidence level (e.g. 0.9). 
@@ -15,12 +15,16 @@ #' @import dplyr #' @examples #' \dontrun{ -#' dataPlot <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), -#' attr = c(1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, 1000, -#' 1250, 950, 900, 1005, 1000, 1250, 950, 900), -#' isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F)) -#' element = TRUE -#' attribute = 'attr' +#' dataPlot <- data.frame( +#' clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), +#' attr = c( +#' 1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, 1000, +#' 1250, 950, 900, 1005, 1000, 1250, 950, 900 +#' ), +#' isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F) +#' ) +#' element <- TRUE +#' attribute <- "attr" #' } #' @export @@ -39,36 +43,31 @@ summarize_cluster <- function(data, element = TRUE, attribute = NA, desiredConfi if (element) { # change variable of interest name to attr, unsummarized data$attr <- attrTemp - } else { # change variable of interest name to sumAttr, summarized by element data$sumAttr <- attrTemp - } - } -if (element) { - - # calculates cluster values from element data + if (element) { - cluster <- data %>% - group_by(clusterID) %>% - summarize(sumAttr = sum(attr), # sum attributes by cluster - clusterElements = n()) %>% # tally of elements in each cluster - left_join(distinct(data, clusterID, .keep_all = TRUE)) # maintain isUsed for each cluster + # calculates cluster values from element data -} else { + cluster <- data %>% + group_by(clusterID) %>% + summarize( + sumAttr = sum(attr), # sum attributes by cluster + clusterElements = n() + ) %>% # tally of elements in each cluster + left_join(distinct(data, clusterID, .keep_all = TRUE)) # maintain isUsed for each cluster + } else { - # reassigns data as cluster, if input data is cluster-level data - cluster <- data - -} + # reassigns data as cluster, if input data is cluster-level data + cluster <- data + } if (length(cluster$clusterID) == 1) { - 
stop("Must have multiple clusters. Consider other analysis.") - } # basic values: sample-level @@ -77,38 +76,42 @@ if (element) { mutate(nSamp = n()) %>% # num clusters mutate(mSampBar = sum(clusterElements) / nSamp) %>% # avg num elements in a cluster mutate(df = sum(clusterElements) - 1) - + # basic values: population-level popValues <- cluster %>% - summarize(mPop = sum(clusterElements), - nPop = n(), # num clusters - mPopBar = mPop / nPop) + summarize( + mPop = sum(clusterElements), + nPop = n(), # num clusters + mPopBar = mPop / nPop + ) if (is.na(popValues$mPopBar) | popValues$mPopBar == sampValues$mSampBar[[1]]) { # if Mbar (pop) is unknown, approximate it with mbar (samp) popValues$mPopBar <- sum(sampValues$mSampBar[[1]]) - } - + clusterSummary <- sampValues %>% mutate(yBar = sum(sumAttr) / sum(clusterElements)) %>% - mutate(ySETempNum = (sumAttr - yBar * clusterElements) ^ 2) %>% - summarize(ySE = sqrt(((popValues$nPop - nSamp[[1]]) / - (popValues$nPop * nSamp[[1]] * (popValues$mPopBar ^ 2))) - * (sum(ySETempNum) / (nSamp[[1]] - 1))), - yBar = mean(yBar), - nSamp = mean(nSamp), - mSampBar = mean(mSampBar), - df = df[[1]]) %>% + mutate(ySETempNum = (sumAttr - yBar * clusterElements)^2) %>% + summarize( + ySE = sqrt( + ((popValues$nPop - nSamp[[1]]) / (popValues$nPop * nSamp[[1]] * + (popValues$mPopBar^2))) * (sum(ySETempNum) / (nSamp[[1]] - 1)) + ), + yBar = mean(yBar), + nSamp = mean(nSamp), + mSampBar = mean(mSampBar), + df = df[[1]] + ) %>% mutate(highCL = yBar + qt(1 - ((1 - desiredConfidence) / 2), df) * ySE) %>% mutate(lowCL = yBar - qt(1 - ((1 - desiredConfidence) / 2), df) * ySE) %>% - select(standardError = ySE, lowerLimitCI = lowCL, upperLimitCI = highCL, - mean = yBar, nSamp, mSampBar) %>% + select( + standardError = ySE, lowerLimitCI = lowCL, upperLimitCI = highCL, + mean = yBar, nSamp, mSampBar + ) %>% bind_cols(popValues) # return data frame of stand-level statistics return(clusterSummary) - } - diff --git a/R/cluster_discrete.R 
b/R/cluster_discrete.R index 3e39232..b4db6bd 100644 --- a/R/cluster_discrete.R +++ b/R/cluster_discrete.R @@ -2,53 +2,58 @@ #' @description Summarizes population-level statistics for #' cluster sample for attribute data. The calculations are #' derived from Chapter 3 in Avery and Burkhart's (1967) -#' Forest Measurements, Fifth Edition. The variance terms +#' Forest Measurements, Fifth Edition. The variance terms #' refer to the variance of the mean. #' @param data data frame containing observations of variable #' of interest. Attribute (variable of interest) must be the #' proportion alive in the associated plot. #' @param attribute character name of attribute to be summarized. -#' @param plotTot numeric population size. Equivalent to the +#' @param plotTot numeric population size. Equivalent to the #' total number of possible plots in the population. -#' @param desiredConfidence numeric desired confidence level +#' @param desiredConfidence numeric desired confidence level #' (e.g. 0.9). -#' @return data frame of stand-level statistics. Includes +#' @return data frame of stand-level statistics. Includes #' standard error and confidence interval. 
#' @author Karin Wolken #' @import dplyr #' @examples #' \dontrun{ -#' data <- data.frame(plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), -#' propAlive = c(0.75, 0.80, 0.80, 0.85, 0.70, -#' 0.90, 0.70, 0.75, 0.80, 0.65)) -#' attribute = 'propAlive' -#' plotTot = 250 -#' desiredConfidence = 0.95 +#' data <- data.frame( +#' plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), +#' propAlive = c( +#' 0.75, 0.80, 0.80, 0.85, 0.70, +#' 0.90, 0.70, 0.75, 0.80, 0.65 +#' ) +#' ) +#' attribute <- "propAlive" +#' plotTot <- 250 +#' desiredConfidence <- 0.95 #' } #' @export summarize_cluster_discrete <- function(data, attribute, plotTot = NA, desiredConfidence = 0.95) { - attrTemp <- unlist(data %>% dplyr::select(one_of(attribute))) data$attr <- attrTemp - + if (is.na(plotTot)) { stop("Total number of units is required as input.") } - + calculations <- data %>% - mutate(attrSq = attr ^ 2) %>% - summarize(sampleSize = length(attr), - mean = sum(attr) / sampleSize, - variance = (sum(attrSq) - sum(attr) ^ 2 / sampleSize) / (sampleSize - 1), - standardError = sqrt((variance / sampleSize) * (1 - sampleSize / plotTot)), - # 2-tailed - upperLimitCI = mean + qt(1 - ((1 - desiredConfidence) / 2), sampleSize - 1) * standardError, - lowerLimitCI = mean - qt(1 - ((1 - desiredConfidence) / 2), sampleSize - 1) * standardError - ) - + mutate(attrSq = attr^2) %>% + summarize( + sampleSize = length(attr), + mean = sum(attr) / sampleSize, + variance = (sum(attrSq) - sum(attr)^2 / sampleSize) / (sampleSize - 1), + standardError = sqrt((variance / sampleSize) * (1 - sampleSize / plotTot)), + # 2-tailed + tScore = qt(1 - ((1 - desiredConfidence) / 2), sampleSize - 1), + upperLimitCI = mean + tScore * standardError, + lowerLimitCI = mean - tScore * standardError + ) %>% + select(-tScore) + return(calculations) - } diff --git a/R/lazy-data.R b/R/lazy-data.R new file mode 100644 index 0000000..00c02e5 --- /dev/null +++ b/R/lazy-data.R @@ -0,0 +1,54 @@ +#' @title a cluster sample data object +#' @description A simple 
data example of a cluster sample, drawn from +#' Avery and Burkhart Forest Measurements text. +#' @docType data +#' @format an object of class dataframe +#' \itemize{ +#' \item{clusterID}{cluster} +#' \item{volume}{sampled volume - attribute of interest} +#' \item{isUsed}{TRUE/FALSE} +#' } +#' @name dataPlot +#' @usage dataPlot +NULL + +#' @title a cluster sample data object +#' @description A more complex data example of a cluster sample +#' @docType data +#' @format an object of class dataframe +#' \itemize{ +#' \item{clusterID}{cluster} +#' \item{bapa}{basal area per acre - attribute of interest} +#' \item{isUsed}{TRUE/FALSE} +#' } +#' @name clusterBaData +#' @usage clusterBaData +NULL + +#' @title a two-stage cluster sample data object +#' @description Dataset from Avery and Burkhart Forest Measurements text, Fifth edition +#' @docType data +#' @format an object of class dataframe +#' \itemize{ +#' \item{clusterID}{cluster} +#' \item{volume}{plot level volume - attribute of interest} +#' \item{isUsed}{TRUE/FALSE} +#' } +#' @name redData +#' @usage redData +NULL + +#' @title a simple random sample data object +#' @description A simple random sample object, representing typical forest cruise data +#' @docType data +#' @format an object of class dataframe +#' \itemize{ +#' \item{plot}{plot ID} +#' \item{tree}{tree ID - unique within plot} +#' \item{BAF}{basal area factor for plot} +#' \item{DBH}{tree diameter in inches} +#' \item{Height}{tree height in feet} +#' } +#' @name simpleRandom +#' @usage simpleRandom +NULL diff --git a/R/poisson.R b/R/poisson.R index 74de6df..b6d5754 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -1,13 +1,13 @@ #' @title Summarize Poisson sample #' @description Summarizes population-level statistics for -#' Poisson sample data. The calculations are derived from open +#' Poisson-distributed sample data. The calculations are derived from open #' resources provided by Penn State Eberly College. 
#' @usage summarize_poisson(data, desiredConfidence = 0.95) -#' @param data vector containing number of instances per desired +#' @param data vector containing number of instances per desired #' unit (e.g. 6 trees were alive at plot 10 -> c(6)). -#' @param desiredConfidence numeric desired confidence level +#' @param desiredConfidence numeric desired confidence level #' (e.g. 0.9). -#' @return data frame of statistics including standard error and +#' @return data frame of statistics including standard error and #' confidence interval limits. #' @author Karin Wolken #' @import dplyr @@ -19,49 +19,41 @@ #' # Data can be expressed as: #' #' data <- c(2, 3, 4, 3, 4, 5, 2, 7) -#' #' } #' @export summarize_poisson <- function(data, desiredConfidence = 0.95) { - + # Calculate frequency of the input values, and convert to a dataframe tableFreq <- table(data) - dataFrame <- data.frame(values = as.numeric(names(tableFreq)), - frequency = as.vector(tableFreq) - ) - + dataFrame <- data.frame( + values = as.numeric(names(tableFreq)), + frequency = as.vector(tableFreq) + ) + summary <- dataFrame %>% mutate(xi = values * frequency) %>% - summarize(sampleSize = length(data), - mean = 1 / sampleSize * sum(xi), - lambdaHat = mean, - se = sqrt(mean / sampleSize), - lowerBoundCI = lambdaHat - qnorm(1 - (1 - desiredConfidence) - / 2) * se, - upperBoundCI = lambdaHat + qnorm(1 - (1 - desiredConfidence) - / 2) * se - ) - - overdispersionLimit <- summary$mean * (summary$sampleSize - summary$mean) / - summary$sampleSize - + summarize( + sampleSize = length(data), + mean = 1 / sampleSize * sum(xi), + lambdaHat = mean, + se = sqrt(mean / sampleSize), + lowerBoundCI = lambdaHat - + qnorm(1 - (1 - desiredConfidence) / 2) * se, + upperBoundCI = lambdaHat + + qnorm(1 - (1 - desiredConfidence) / 2) * se + ) + + overdispersionLimit <- summary$mean * (summary$sampleSize - summary$mean) / + summary$sampleSize + # Check for overdispersion if (var(data) > overdispersionLimit) { - warning('Variance 
significantly higher than the mean. The data is influenced - by overdispersion.') + warning( + "Variance significantly higher than the mean. The data is influenced by overdispersion." + ) } -return(summary) - + return(summary) } - - - - - - - - - diff --git a/R/simple_random.R b/R/simple_random.R index 0b0aa71..81bc9f3 100644 --- a/R/simple_random.R +++ b/R/simple_random.R @@ -20,23 +20,24 @@ #' @import dplyr #' @examples #' \dontrun{ -#' #trainingDataFrame -#' data <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), -#' plots = c(1, 2, 3, 4, 5, 6)) -#' #trainingVector +#' # trainingDataFrame +#' data <- data.frame( +#' bapa = c(120, 140, 160, 110, 100, 90), +#' plots = c(1, 2, 3, 4, 5, 6) +#' ) +#' # trainingVector #' data <- c(120, 140, 160, 110, 100, 90) -#' attribute <- 'bapa' +#' attribute <- "bapa" #' desiredConfidence <- 0.9 #' } #' @export -summarize_simple_random <- function(data, attribute = 'attr', popSize = NA, +summarize_simple_random <- function(data, attribute = "attr", popSize = NA, desiredConfidence = 0.95, infiniteReplacement = F) { - type <- class(data) - + # converts variable of interest into a vector with a generic name - if (any(type == 'numeric')) { + if (any(type == "numeric")) { attr <- data } else { # makes sure data is expressed as a numeric vector @@ -48,7 +49,7 @@ summarize_simple_random <- function(data, attribute = 'attr', popSize = NA, stop("Population size must be greater than sample size.") } - if(any(is.na(attr))) { + if (any(is.na(attr))) { stop("NA values are present in the input data.") } @@ -60,18 +61,19 @@ summarize_simple_random <- function(data, attribute = 'attr', popSize = NA, simpRandomSummary <- data.frame(attr) %>% summarize( mean = mean(attr), - variance = (sum(attr ^ 2) - (sum(attr) ^ 2 / sampleSize)) / (sampleSize - 1), - standardError = ifelse(test, sqrt((variance / sampleSize) * ((popSize - sampleSize) / popSize)), - # without replacement, finite population - sqrt(variance / sampleSize)), # with replacement, 
infinite population - upperLimitCI = mean(attr) + qt(1 - ((1 - desiredConfidence) / 2), sampleSize - 1) * standardError, # 2-tailed - lowerLimitCI = mean(attr) - qt(1 - ((1 - desiredConfidence) / 2), sampleSize - 1) * standardError - ) + variance = (sum(attr^2) - (sum(attr)^2 / sampleSize)) / (sampleSize - 1), + standardError = ifelse( + test, + sqrt((variance / sampleSize) * ((popSize - sampleSize) / popSize)), + # without replacement, finite population + sqrt(variance / sampleSize) + ), # with replacement, infinite population + tScore = qt(1 - ((1 - desiredConfidence) / 2), sampleSize - 1), + upperLimitCI = mean(attr) + tScore * standardError, # 2-tailed + lowerLimitCI = mean(attr) - tScore * standardError + ) %>% + select(-tScore) # return data frame of key values return(simpRandomSummary) - } - - - diff --git a/R/srs_discrete.R b/R/srs_discrete.R index 9910fe3..56c8391 100644 --- a/R/srs_discrete.R +++ b/R/srs_discrete.R @@ -8,7 +8,7 @@ #' interest. Attribute must be coded as either TRUE and FALSE or #' 1 and 0. #' @param attribute character name of attribute to be summarized. -#' @param popTot numeric population size. Equivalent to total +#' @param popTot numeric population size. Equivalent to total #' number of individuals. #' @param desiredConfidence numeric desired confidence level (e.g. 0.9). #' @return data frame of stand-level statistics. 
Includes standard error and @@ -17,16 +17,17 @@ #' @import dplyr #' @examples #' \dontrun{ -#' data <- data.frame(alive = c(T, T, F, T, F, F), -#' tree = c(1, 2, 3, 4, 5, 6)) -#' attribute = 'alive' -#' popTot = 50 +#' data <- data.frame( +#' alive = c(T, T, F, T, F, F), +#' tree = c(1, 2, 3, 4, 5, 6) +#' ) +#' attribute <- "alive" +#' popTot <- 50 #' } #' @export -summarize_simple_random_discrete <- function(data, attribute = 'attr', +summarize_simple_random_discrete <- function(data, attribute = "attr", popTot = NA, desiredConfidence = 0.95) { - attrTemp <- unlist(data %>% dplyr::select(one_of(attribute))) data$attr <- attrTemp @@ -34,18 +35,21 @@ summarize_simple_random_discrete <- function(data, attribute = 'attr', stop("Total number of units is required as input.") } - calculations = data.frame(sampTot = length(data$attr), - sampAttr = sum(data$attr)) %>% - summarize(totalSample = sampTot, - attribute = sampAttr, - totalPopulation = popTot, - P = sampAttr / sampTot, # proportion - SE = sqrt(((P * (1 - P)) / (sampTot - 1)) * (1 - (sampTot / popTot))), - lowerLimitCI = P - (qt(1 - ((1 - desiredConfidence) / 2), sampAttr - 1) - * SE + 1 / (2 * sampTot)), - upperLimitCI = P + (qt(1 - ((1 - desiredConfidence) / 2), sampAttr - 1) - * SE + 1 / (2 * sampTot))) + calculations <- data.frame( + sampTot = length(data$attr), + sampAttr = sum(data$attr) + ) %>% + summarize( + totalSample = sampTot, + attribute = sampAttr, + totalPopulation = popTot, + P = sampAttr / sampTot, # proportion + SE = sqrt(((P * (1 - P)) / (sampTot - 1)) * (1 - (sampTot / popTot))), + lowerLimitCI = P - (qt(1 - ((1 - desiredConfidence) / 2), sampTot - 1) + * SE + 1 / (2 * sampTot)), + upperLimitCI = P + (qt(1 - ((1 - desiredConfidence) / 2), sampTot - 1) + * SE + 1 / (2 * sampTot)) + ) return(calculations) - } diff --git a/R/stratified.R b/R/stratified.R index ed46411..20d541c 100644 --- a/R/stratified.R +++ b/R/stratified.R @@ -3,9 +3,9 @@ #' stratified sample data. 
The calculations are derived from Chapter 5 in #' Gregoire and Valentine's (2008) Sampling Strategies for Natural Resources #' and the Environment. The variance terms refer to the variance of the mean, -#' hence the \code{n} terms in the denominators. This function has two +#' hence the \code{n} terms in the denominators. This function has two #' options: (1) stratified and (2) post-stratified. -#' @usage summarize_stratified(trainingData, attribute, stratumTab, +#' @usage summarize_stratified(trainingData, attribute, stratumTab, #' desiredConfidence = 0.95, post = T) #' @param trainingData data frame containing observations of variable of #' interest, and stratum assignment for each plot @@ -24,71 +24,82 @@ #' #' # Data can be expressed as: #' -#' trainingData <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), -#' stratum = c(1, 1, 1, 2, 2, 2)) +#' trainingData <- data.frame( +#' bapa = c(120, 140, 160, 110, 100, 90), +#' stratum = c(1, 1, 1, 2, 2, 2) +#' ) #' stratumTab <- data.frame(stratum = c(1, 2), acres = c(200, 50)) -#' attribute = 'bapa' -#' desiredConfidence = 0.9 -#' +#' attribute <- "bapa" +#' desiredConfidence <- 0.9 #' } #' @export summarize_stratified <- function(trainingData, attribute, - stratumTab, desiredConfidence = 0.95, post = T) { - - # give the variable of interest a generic name - attrTemp <- unlist(trainingData %>% dplyr::select(one_of(attribute))) - trainingData$attr <- attrTemp - - # summarize strata - if(!post) { - # summarize strata - stratumSummaries <- trainingData %>% - left_join(stratumTab) %>% - mutate(attrExpanded = attr * acres) %>% - group_by(stratum) %>% - summarize(stratMeanTot = mean(attr), - stratVarTot = var(attrExpanded) / n(), - stratVarMean = stratVarTot / mean(acres) ^ 2, - stratSE = sqrt(stratVarMean), - stratPlots = n(), - stratAcres = mean(acres)) + stratumTab, desiredConfidence = 0.95, post = T) { - totalSummary <- stratumSummaries %>% - left_join(stratumTab) %>% - summarize(popMean = weighted.mean(stratMeanTot, 
w = acres), - popVar = sum(stratVarTot) / (sum(acres) ^ 2), - popSE = sqrt(popVar), - popCIhalf = popSE * qt(1 - (1 - desiredConfidence) / 2, - df = sum(stratPlots - 1))) %>% - mutate(ciPct = 100 * popCIhalf / popMean) %>% - select(popMean, popSE, popCIhalf, ciPct) + # give the variable of interest a generic name + attrTemp <- unlist(trainingData %>% dplyr::select(one_of(attribute))) + trainingData$attr <- attrTemp - } else { # summarize (post-stratification, in progress) - stratumSummaries <- trainingData %>% - left_join(stratumTab) %>% - group_by(stratum) %>% - summarize(stratMean = mean(attr), - stratVarMean = var(attr) / n(), - stratSE = sqrt(stratVarMean), - stratPlots = n(), - stratAcres = mean(acres)) + # summarize strata + if (!post) { + # summarize strata + stratumSummaries <- trainingData %>% + left_join(stratumTab) %>% + mutate(attrExpanded = attr * acres) %>% + group_by(stratum) %>% + summarize( + stratMeanTot = mean(attr), + stratVarTot = var(attrExpanded) / n(), + stratVarMean = stratVarTot / mean(acres)^2, + stratSE = sqrt(stratVarMean), + stratPlots = n(), + stratAcres = mean(acres) + ) - totalSummary <- stratumSummaries %>% - left_join(stratumTab) %>% - summarize(popMean = weighted.mean(stratMean, w = acres), - popMeanVar = sum((acres / sum(acres)) ^ 2 * stratVarMean), - popSE = sqrt(popMeanVar), - popCIhalf = popSE * qt(1 - (1 - desiredConfidence) / 2, - df = sum(stratPlots - 1))) %>% - mutate(ciPct = 100 * popCIhalf / popMean) %>% - select(popMean, popSE, popCIhalf, ciPct) - } + totalSummary <- stratumSummaries %>% + left_join(stratumTab) %>% + summarize( + popMean = weighted.mean(stratMeanTot, w = acres), + popVar = sum(stratVarTot) / (sum(acres)^2), + popSE = sqrt(popVar), + popCIhalf = popSE * qt(1 - (1 - desiredConfidence) / 2, + df = sum(stratPlots - 1) + ) + ) %>% + mutate(ciPct = 100 * popCIhalf / popMean) %>% + select(popMean, popSE, popCIhalf, ciPct) + } else { # summarize (post-stratification, in progress) + stratumSummaries <- 
trainingData %>% + left_join(stratumTab) %>% + group_by(stratum) %>% + summarize( + stratMean = mean(attr), + stratVarMean = var(attr) / n(), + stratSE = sqrt(stratVarMean), + stratPlots = n(), + stratAcres = mean(acres) + ) - # return list of key values - outList <- list(stratumSummaries = data.frame(stratumSummaries), - totalSummary = data.frame(totalSummary)) + totalSummary <- stratumSummaries %>% + left_join(stratumTab) %>% + summarize( + popMean = weighted.mean(stratMean, w = acres), + popMeanVar = sum((acres / sum(acres))^2 * stratVarMean), + popSE = sqrt(popMeanVar), + popCIhalf = popSE * qt(1 - (1 - desiredConfidence) / 2, + df = sum(stratPlots - 1) + ) + ) %>% + mutate(ciPct = 100 * popCIhalf / popMean) %>% + select(popMean, popSE, popCIhalf, ciPct) + } - return(outList) + # return list of key values + outList <- list( + stratumSummaries = data.frame(stratumSummaries), + totalSummary = data.frame(totalSummary) + ) + return(outList) } diff --git a/R/systematic.R b/R/systematic.R index e827374..201d64a 100644 --- a/R/systematic.R +++ b/R/systematic.R @@ -4,7 +4,7 @@ #' Avery and Burkhart's (1967) Forest Measurements, Fifth Edition. The #' variance terms refer to the variance of the mean, hence the #' \code{sampleSize} terms in the denominators. -#' @usage summarize_systematic(data, attribute = 'attr', +#' @usage summarize_systematic(data, attribute = 'attr', #' popSize = NA, desiredConfidence = 0.95) #' @param data data frame or vector containing observations of #' variable of interest. 
Variable of interest must already be expanded @@ -24,27 +24,28 @@ #' #' # Data frame input data can be expressed as: #' -#' trainingData <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), -#' plots = c(1, 2, 3, 4, 5, 6)) -#' attribute = 'bapa' -#' desiredConfidence = 0.9 +#' trainingData <- data.frame( +#' bapa = c(120, 140, 160, 110, 100, 90), +#' plots = c(1, 2, 3, 4, 5, 6) +#' ) +#' attribute <- "bapa" +#' desiredConfidence <- 0.9 #' #' # Vector input data can be expressed as: #' #' vector <- c(120, 140, 160, 110, 100, 90) -#' #' } #' @export -summarize_systematic <- function(data, attribute = 'attr', popSize = NA, desiredConfidence = 0.95) { +summarize_systematic <- function(data, attribute = "attr", + popSize = NA, desiredConfidence = 0.95) { # return data frame of key values - output <- summarize_simple_random(data = data, attribute, popSize, desiredConfidence, FALSE) - - - return(output) - + summarize_simple_random( + data = data, + attribute, + popSize, + desiredConfidence, + FALSE + ) } - - - diff --git a/R/threeP.R b/R/threeP.R index 35ec5d1..3a4342f 100644 --- a/R/threeP.R +++ b/R/threeP.R @@ -2,114 +2,129 @@ #' @description Summarizes population-level statistics for #' 3P sample data. The calculations are derived from Kim Iles #' 'A Sampler of Inventory Topics' (2003). -#' @usage summarize_threeP(data, sampSize, treeCount, BAF, cvPercent, -#' trueNetVBAR, height = FALSE, +#' @usage summarize_threeP(data, sampSize, treeCount, BAF, cvPercent, +#' trueNetVBAR, height = FALSE, #' desiredConfidence = 0.95) #' @param data data frame containing key tree- or plot-level information. -#' @param cvPercent numeric average Coefficient of Variation expressed +#' @param cvPercent numeric average Coefficient of Variation expressed #' as a percent (e.g. 50). -#' @param trueNetVBAR numeric measured net VBAR (volume to basal area +#' @param trueNetVBAR numeric measured net VBAR (volume to basal area #' ratio). 
-#' @param height logical TRUE if data input contains height values. +#' @param height logical TRUE if data input contains height values. #' Otherwise, FALSE assumes data contains estimateVBAR. -#' @param plot logical TRUE if the input data is averaged by plot, +#' @param plot logical TRUE if the input data is averaged by plot, #' FALSE if data is tree-level. #' @param desiredConfidence numeric desired confidence level (e.g. 0.9). -#' @return data frame of statistics including standard error and +#' @return data frame of statistics including standard error and #' confidence interval limits. #' @author Karin Wolken #' @import dplyr #' @examples #' \dontrun{ -#' +#' #' # This function has a few options for data input: -#' -#' +#' +#' #' # 1. Height is included, data is plot-level. -#' -#' dataHeight <- data.frame(plotNum = c(5, 4, 3, 5, 4), -#' treeCount = c(4, 5, 6, 3, 4), -#' BAF = c(10, 10, 10, 10, 10), -#' avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), -#' treeHeight = c(1, 2, 3, 4, 5)) -#' -#' # Example: -#' summarize_threeP(dataHeight, cvPercent = 50, trueNetVBAR = 10, -#' height = T, plot = T) -#' -#' -#' # 2. estimateVBAR is included (height data is not used, -#' and so is not included), data is plot-level. -#' -#' dataEstimateVBAR <- data.frame(plotNum = c(1, 2, 3, 4, 5), -#' treeCount = c(4, 5, 6, 3, 4), -#' BAF = c(10, 10, 10, 10, 10), -#' avgTreeVBARSperPlot = -#' c(9, 8, 7, 8, 2), -#' estimateVBAR = c(1, 2, 3, 4, 5)) -#' -#' # Example: -#' summarize_threeP(dataEstimateVBAR, cvPercent = 50, -#' trueNetVBAR = 10, height = F, plot = T) -#' -#' +#' +#' dataHeight <- data.frame( +#' plotNum = c(5, 4, 3, 5, 4), +#' treeCount = c(4, 5, 6, 3, 4), +#' BAF = c(10, 10, 10, 10, 10), +#' avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), +#' treeHeight = c(1, 2, 3, 4, 5) +#' ) +#' +#' # Example: +#' summarize_threeP(dataHeight, +#' cvPercent = 50, trueNetVBAR = 10, +#' height = T, plot = T +#' ) +#' +#' +#' # 2. 
estimateVBAR is included (height data is not used, so is not included), +#' data is plot-level. +#' +#' +#' dataEstimateVBAR <- data.frame( +#' plotNum = c(1, 2, 3, 4, 5), +#' treeCount = c(4, 5, 6, 3, 4), +#' BAF = c(10, 10, 10, 10, 10), +#' avgTreeVBARSperPlot = +#' c(9, 8, 7, 8, 2), +#' estimateVBAR = c(1, 2, 3, 4, 5) +#' ) +#' +#' # Example: +#' summarize_threeP(dataEstimateVBAR, +#' cvPercent = 50, +#' trueNetVBAR = 10, height = F, plot = T +#' ) +#' +#' #' # 3. estimateVBAR is included, data is tree-level. -#' -#' dataNotPlot <- data.frame(plotNum = c(1, 1, 2, 2, 3), -#' BAF = c(10, 10, 10, 10, 10), -#' VBAR = c(1, 2, 3, 4, 3), -#' estimateVBAR = c(1, 2, 3, 4, 5)) -#' +#' +#' dataNotPlot <- data.frame( +#' plotNum = c(1, 1, 2, 2, 3), +#' BAF = c(10, 10, 10, 10, 10), +#' VBAR = c(1, 2, 3, 4, 3), +#' estimateVBAR = c(1, 2, 3, 4, 5) +#' ) +#' #' # Example: -#' summarize_threeP(dataNotPlot, cvPercent = 50, trueNetVBAR = 10, -#' height = F, plot = F) -#' +#' summarize_threeP(dataNotPlot, +#' cvPercent = 50, trueNetVBAR = 10, +#' height = F, plot = F +#' ) #' } #' @export #' summarize_threeP <- function(data, cvPercent, trueNetVBAR, - height = FALSE, plot = FALSE, desiredConfidence = 0.95) { - - # if the data input is height, VBAR will be estimated by 2 * tree height - if (height) { - origData <- data %>% - mutate(estimateVBAR = treeHeight * 2) - } else { - origData <- data - } - + height = FALSE, plot = FALSE, + desiredConfidence = 0.95) { + + # if the data input is height, VBAR will be estimated by 2 * tree height + if (height) { + origData <- data %>% + mutate(estimateVBAR = treeHeight * 2) + } else { + origData <- data + } + if (!plot) { avgVBAR <- origData %>% group_by(plotNum) %>% - summarize(avgTreeVBARSperPlot = mean(VBAR), - treeCount = length(plotNum), - BAF = mean(BAF), - estimateVBAR = mean(estimateVBAR)) + summarize( + avgTreeVBARSperPlot = mean(VBAR), + treeCount = length(plotNum), + BAF = mean(BAF), + estimateVBAR = mean(estimateVBAR) + ) } - + if 
(plot) { fullData <- origData } else { fullData <- data.frame(avgVBAR) } - - + + # merge data and avgVBAR summary <- fullData %>% mutate(estPlotVol = treeCount * BAF * avgTreeVBARSperPlot) %>% - summarize(sampSize = length(plotNum), # number of plots - avgVol = mean(estPlotVol), - ratio = trueNetVBAR / mean(estimateVBAR), - netVolume = avgVol * ratio, - sePercent = cvPercent / sqrt(sampSize), - se = sePercent * netVolume, # 'multiply SE% by volume to get SE units' - lowerBoundCI = avgVol - qnorm(1 - (1 - desiredConfidence) / 2) - * sqrt(avgVol / sampSize), - upperBoundCI = avgVol + qnorm(1 - (1 - desiredConfidence) / 2) - * sqrt(avgVol / sampSize) - ) - - return(summary) + summarize( + sampSize = length(plotNum), # number of plots + avgVol = mean(estPlotVol), + ratio = trueNetVBAR / mean(estimateVBAR), + netVolume = avgVol * ratio, + sePercent = cvPercent / sqrt(sampSize), + se = sePercent * netVolume, # 'multiply SE% by volume to get SE units' + lowerBoundCI = avgVol - + qnorm(1 - (1 - desiredConfidence) / 2) * sqrt(avgVol / sampSize), + upperBoundCI = avgVol + + qnorm(1 - (1 - desiredConfidence) / 2) * sqrt(avgVol / sampSize) + ) + return(summary) } diff --git a/R/twostage.R b/R/twostage.R index 98a75d4..008395e 100644 --- a/R/twostage.R +++ b/R/twostage.R @@ -12,7 +12,7 @@ #' @param plot logical TRUE if parameter data is plot-level, FALSE if #' parameter data is cluster-level. Default is TRUE. #' @param attribute character name of attribute to be summarized. -#' @param populationClusters numeric total number of clusters in the +#' @param populationClusters numeric total number of clusters in the #' population. #' @param populationElementsPerCluster numeric total number of #' elements in the population. 
@@ -33,55 +33,62 @@ #' #' # Cluster level data can be expressed as: #' -#' dataCluster <- data.frame(clusterID = c(1, 2, 3, 4, 5), -#' totClusterElements = c(5, 2, 1, 6, 5), -#' sampledElements = c(2, 2, 2, 2, 2), -#' isUsed = c(T, T, T, T, F), -#' attrSumCluster = c(1000, 1250, 950, -#' 900, 1005)) +#' dataCluster <- data.frame( +#' clusterID = c(1, 2, 3, 4, 5), +#' totClusterElements = c(5, 2, 1, 6, 5), +#' sampledElements = c(2, 2, 2, 2, 2), +#' isUsed = c(T, T, T, T, F), +#' attrSumCluster = c( +#' 1000, 1250, 950, +#' 900, 1005 +#' ) +#' ) #' # Example: -#' summarize_two_stage(dataCluster, F, 'attr') -#' -#' +#' summarize_two_stage(dataCluster, F, "attr") +#' +#' #' # Plot level data can be expressed as: #' -#' dataPlot <- data.framedata.frame(clusterID = c(1, 1, 1, 2, 2, 2, -#' 3, 3, 3, 4, 4, 4, -#' 5, 5, 5, 6, 6, 6), -#' volume = c(500, 650, 610, 490, -#' 475, 505, 940, 825, -#' 915, 210, 185, 170, -#' 450, 300, 500, 960, -#' 975, 890), -#' isUsed = c(T, T, T, T, T, T, T, -#' T, T, T, T, T, T, T, -#' T, T, T, T)) +#' dataPlot <- data.frame( +#' clusterID = c( +#' 1, 1, 1, 2, 2, 2, +#' 3, 3, 3, 4, 4, 4, +#' 5, 5, 5, 6, 6, 6 +#' ), +#' volume = c( +#' 500, 650, 610, 490, +#' 475, 505, 940, 825, +#' 915, 210, 185, 170, +#' 450, 300, 500, 960, +#' 975, 890 +#' ), +#' isUsed = c( +#' T, T, T, T, T, T, T, +#' T, T, T, T, T, T, T, +#' T, T, T, T +#' ) +#' ) #' # Example: -#' summarize_two_stage(redData, T, 'volume', populationClusters = 16, -#' populationElementsPerCluster = 160) -#' +#' summarize_two_stage(dataPlot, T, "volume", +#' populationClusters = 16, +#' populationElementsPerCluster = 160 +#' ) #' } #' @export summarize_two_stage <- function(data, plot = TRUE, attribute = NA, - populationClusters = 0, populationElementsPerCluster = 0, + populationClusters = 0, + populationElementsPerCluster = 0, desiredConfidence = 0.95) { - if (!is.na(attribute) && (attribute %in% colnames(data))) { - attrTemp <- unlist(data %>% dplyr::select(one_of(attribute))) 
if (plot) { - data$attr <- attrTemp - } else { - data$attrSumCluster <- attrTemp - } - } @@ -90,40 +97,55 @@ summarize_two_stage <- function(data, plot = TRUE, attribute = NA, # calculates cluster values from plot data temp <- data %>% mutate(attr = ifelse(is.na(attr), 0, attr)) %>% - mutate(attrSq = attr ^ 2) + mutate(attrSq = attr^2) # sum attributes by cluster - attrSum <- aggregate(temp$attr, by = list(clusterID = temp$clusterID), FUN = sum) - attrSqSum <- aggregate(temp$attrSq, by = list(clusterID = temp$clusterID), FUN = sum) + attrSum <- aggregate( + temp$attr, + by = list(clusterID = temp$clusterID), FUN = sum + ) + attrSqSum <- aggregate( + temp$attrSq, + by = list(clusterID = temp$clusterID), FUN = sum + ) # checks if any secondary is used in any primary/cluster - clusterUsed <- aggregate(data$isUsed, by = list(clusterID = data$clusterID), FUN = sum) + clusterUsed <- aggregate( + data$isUsed, + by = list(clusterID = data$clusterID), FUN = sum + ) clusterUsed$x <- ifelse(clusterUsed$x > 0, T, F) # converts above to T/F totalElementsInCluster <- count(data, clusterID) # tally of all elements in each cluster - sampledElementsInCluster <- count(data[data$isUsed,], clusterID) # tally of elements sampled in each cluster + sampledElementsInCluster <- count(data[data$isUsed, ], clusterID) # tally of elements sampled in each cluster # attach the sum of attributes and tally of elements to unique cluster # produce table with key values - cluster <- full_join(clusterUsed, attrSum, by = 'clusterID') %>% - full_join(totalElementsInCluster, by = 'clusterID') %>% + cluster <- full_join(clusterUsed, attrSum, by = "clusterID") %>% + full_join(totalElementsInCluster, by = "clusterID") %>% rename(totClusterElements = n, isUsed = x.x, attrSumCluster = x.y) %>% - full_join(sampledElementsInCluster, by = 'clusterID') %>% - select(clusterID, totClusterElements, sampledElements = n, isUsed, attrSumCluster) %>% - mutate(sampledElements = ifelse(is.na(sampledElements), 0, 
sampledElements)) %>% + full_join(sampledElementsInCluster, by = "clusterID") %>% + select( + clusterID, + totClusterElements, + sampledElements = n, + isUsed, + attrSumCluster + ) %>% + mutate( + sampledElements = ifelse(is.na(sampledElements), 0, sampledElements) + ) %>% mutate(attrSqSumCluster = attrSqSum$x) - } else { # reassigns data as cluster, if input data is cluster-level data # data must include unique ID ('clusterID'), number of cluster elements - # in each cluster ('totClusterElements'), number of cluster elements - # sampled in each cluster ('sampledElements'), logical true if the - # cluster has any sampled elements ('isUsed'), sum of attributes in - # each cluster ('attrSumCluster') + # in each cluster ('totClusterElements'), number of cluster elements + # sampled in each cluster ('sampledElements'), logical true if the + # cluster has any sampled elements ('isUsed'), sum of attributes in + # each cluster ('attrSumCluster') cluster <- data %>% - mutate(attrSqSumCluster = attrSumCluster ^ 2 ) - + mutate(attrSqSumCluster = attrSumCluster^2) } @@ -135,60 +157,75 @@ summarize_two_stage <- function(data, plot = TRUE, attribute = NA, stop("Must have multiple clusters. 
Consider other analysis.") } - n = sum(cluster$isUsed) # number of sampled clusters - m = sum(cluster$sampledElements) / n # average number of sampled plots among sampled clusters - EN = ifelse(populationClusters != 0, populationClusters, length(cluster$isUsed)) # total number of clusters - EM = ifelse(populationElementsPerCluster != 0, populationElementsPerCluster, - sum(cluster$totClusterElements)) # total number of total plots among all clusters - + n <- sum(cluster$isUsed) # number of sampled clusters + m <- sum(cluster$sampledElements) / n # average number of sampled plots among sampled clusters + EN <- ifelse( # total number of clusters + populationClusters != 0, + populationClusters, + length(cluster$isUsed) + ) + + EM <- ifelse( # total number of total plots among all clusters + populationElementsPerCluster != 0, + populationElementsPerCluster, + sum(cluster$totClusterElements) + ) + finalCalc <- cluster %>% summarize( # yBar denominator written for clarity: average m per cluster * n - yBar = sum(attrSumCluster) / (m * n), - s2b = ((sum(attrSumCluster ^ 2) / (m)) - - (sum(attrSumCluster) ^ 2 / (m * n))) / (n - 1), - s2w = (sum(attrSqSumCluster) - sum(attrSumCluster ^ 2) / (m)) / (n * (m - 1)), + yBar = sum(attrSumCluster) / (m * n), + s2b = ((sum(attrSumCluster^2) / (m)) - + (sum(attrSumCluster)^2 / (m * n))) / (n - 1), + s2w = (sum(attrSqSumCluster) - sum(attrSumCluster^2) / (m)) / + (n * (m - 1)), df = sum(sampledElements) - ) %>% - mutate(ySE = ifelse(length(unique(cluster$totClusterElements)) == 1, - #if clusters are equal in size, num of elements per cluster is equal: - sqrt((1 / (m * n)) * ((s2b * (1 - n / EN) + n * s2w / EN * (1 - m / EM))[[1]])), - - ifelse((n / EN < 0.2), - #if n is a small fraction of N, SE is simplified to: - sqrt(s2b ^ 2 / (m * n)), - - ifelse((m / EM < 0.2), - # when n/N is fairly large, but num of secondaries (m) - # sampled in each selected primary is only a fraction - # of the total secondaries (M) in each primary (n) - 
sqrt(1 / (m * n) * (s2b * (1 - n / EN) + (n * s2w) / EN)), - - # SE calculated below is approximated based on available - # methods and may not be accurate - # These statements are intended to catch all other samples - if (n / EN < m / EM) { - - sqrt(s2b ^ 2 / (m * n)) - - } else { - - sqrt(1 / (m * n) * (s2b * (1 - n / EN) + (n * s2w) / EN)) - - } - ) - ) - ) ) %>% - mutate(highCL = yBar + qt(1 - ((1 - desiredConfidence) / 2), df) * ySE) %>% - mutate(lowCL = yBar - qt(1 - ((1 - desiredConfidence) / 2), df) * ySE) + mutate( + ySE = ifelse( + length(unique(cluster$totClusterElements)) == 1, + # if clusters are equal in size, num of elements per cluster is equal: + sqrt( + (1 / (m * n)) * ((s2b * (1 - n / EN) + + n * s2w / EN * (1 - m / EM))[[1]]) + ), + ifelse(n / EN < 0.2, + # if n is a small fraction of N, SE is simplified to: + sqrt(s2b^2 / (m * n)), + ifelse(m / EM < 0.2, + # when n/N is fairly large, but num of secondaries (m) + # sampled in each selected primary is only a fraction + # of the total secondaries (M) in each primary (n) + sqrt(1 / (m * n) * (s2b * (1 - n / EN) + (n * s2w) / EN)), + # SE calculated below is approximated based on available + # methods and may not be accurate + # These statements are intended to catch all other samples + if (n / EN < m / EM) { + sqrt(s2b^2 / (m * n)) + } else { + sqrt(1 / (m * n) * (s2b * (1 - n / EN) + (n * s2w) / EN)) + } + ) + ) + ) + ) %>% + mutate( + tScore = qt(1 - ((1 - desiredConfidence) / 2), df), + highCL = yBar + tScore * ySE, + lowCL = yBar - tScore * ySE + ) %>% + select(-tScore) clusterSummary <- finalCalc %>% - summarize(mean = yBar, varianceB = s2b, varianceW = s2w, standardError = ySE, - upperLimitCI = highCL, lowerLimitCI = lowCL) + summarize( + mean = yBar, + varianceB = s2b, + varianceW = s2w, + standardError = ySE, + upperLimitCI = highCL, + lowerLimitCI = lowCL + ) # return data frame of stand-level statistics return(clusterSummary) - } - diff --git a/README.md b/README.md index d30adc6..9ac5f1e 
100644 --- a/README.md +++ b/README.md @@ -5,38 +5,36 @@ A library of functions for processing natural resource sample data from a variet Pretty Tree

-## Source Repository -Source-code repository is hosted on GitHub, [here](https://github.com/SilviaTerra/forest_sampling). +## Source Code +The source code is hosted on GitHub, [here](https://github.com/SilviaTerra/forest_sampling). ## How to Contribute -We welcome your contributions! This package is a work in progress, and expansion is welcome. -Please take a look at the [contribution guidelines](https://github.com/SilviaTerra/forest_sampling/wiki/Contribution-Guidelines). +We welcome your contributions! This package is a work in progress, and expansion is welcome. +Please take a look at the [contribution guidelines](https://github.com/SilviaTerra/forest_sampling/wiki/Contribution-Guidelines). -Wish-list areas include: +Wish-list areas include: * Test existing functions with data you have access to, suggest feature improvements * Add additional sampling design workup functions * Edit existing functions to be more useful in practice ## How to Download from GitHub -Download the package: - - library(devtools) - install_github("SilviaTerra/forest_sampling") - +Download the package: +```r +devtools::install_github("SilviaTerra/forest_sampling") +``` Helpful link: [Install R Packages Hosted on GitHub](https://cran.r-project.org/web/packages/githubinstall/vignettes/githubinstall.html) ## How to Run Tests After downloading from github, run this: - - library(devtools) - build() - install() - test() - +```r +devtools::build() +devtools::install() +devtools::test() +``` ## Sponsors -The forest_sampling package was first created by [SilviaTerra](https://silviaterra.com/bark/index.html), with the intent of building an open source tool for the field of forestry. +The `forestsamplr` package was first created by [SilviaTerra](https://silviaterra.com/bark/index.html), with the intent of building an open source tool for the field of forestry. 
The functions contained in the package are based on the equations and concepts outlined in several commonly referenced forest inventory textbooks. -In collaboration with: +In collaboration with: * [SAF A1 Working Group](https://www.eforester.org/Main/Community/Join_a_Working_Group/Main/About/Working_Groups.aspx?hkey=415c5b8e-28b9-4376-b23f-ad89a158adc8) diff --git a/data/clusterBAData.rda b/data/clusterBAData.rda new file mode 100644 index 0000000..6f0b3fd Binary files /dev/null and b/data/clusterBAData.rda differ diff --git a/data/dataPlot.rda b/data/dataPlot.rda new file mode 100644 index 0000000..8ba0a89 Binary files /dev/null and b/data/dataPlot.rda differ diff --git a/data/redData.rda b/data/redData.rda new file mode 100644 index 0000000..1de5559 Binary files /dev/null and b/data/redData.rda differ diff --git a/man/clusterBAData.Rd b/man/clusterBAData.Rd new file mode 100644 index 0000000..ec16146 --- /dev/null +++ b/man/clusterBAData.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazy-data.R +\docType{data} +\name{clusterBaData} +\alias{clusterBaData} +\title{a cluster sample data object} +\format{an object of class dataframe +\itemize{ + \item{clusterID}{cluster} + \item{bapa}{basal area per acre - attribute of interest} + \item{isUsed}{TRUE/FALSE} +}} +\usage{ +clusterBaData +} +\description{ +A more complex data example of a cluster sample +} diff --git a/man/dataPlot.Rd b/man/dataPlot.Rd new file mode 100644 index 0000000..c15b134 --- /dev/null +++ b/man/dataPlot.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazy-data.R +\docType{data} +\name{dataPlot} +\alias{dataPlot} +\title{a cluster sample data object} +\format{an object of class dataframe +\itemize{ + \item{clusterID}{cluster} + \item{volume}{sampled volume - attribute of interest} + \item{isUsed}{TRUE/FALSE} +}} +\usage{ +dataPlot +} +\description{ +A simple data example of a cluster sample, 
drawn from +Avery and Burkhart Forest Measurements text. +} diff --git a/man/summarize_all_cluster.Rd b/man/summarize_all_cluster.Rd index 727f28a..062459b 100644 --- a/man/summarize_all_cluster.Rd +++ b/man/summarize_all_cluster.Rd @@ -34,7 +34,7 @@ standard error and confidence interval limits. Summarizes population-level statistics for cluster sample data. This function has two options: (1) Cluster sample with a normal distribution and (2) Cluster -sample with a bernoulli distribution. +sample with a Bernoulli distribution. } \examples{ \dontrun{ diff --git a/man/summarize_all_srs.Rd b/man/summarize_all_srs.Rd index e07afeb..177bf24 100644 --- a/man/summarize_all_srs.Rd +++ b/man/summarize_all_srs.Rd @@ -36,7 +36,7 @@ Summarizes population-level statistics for simple random sample data. This function has three options: (1) SRS of a finite population or sampled without replacement, (2) SRS of an infinite population or sampled with replacement, -and (3) SRS with a bernoulli distribution. +and (3) SRS with a Bernoulli distribution. } \examples{ \dontrun{ diff --git a/man/summarize_poisson.Rd b/man/summarize_poisson.Rd index 33d15fe..21be832 100644 --- a/man/summarize_poisson.Rd +++ b/man/summarize_poisson.Rd @@ -19,7 +19,7 @@ confidence interval limits. } \description{ Summarizes population-level statistics for -Poisson sample data. The calculations are derived from open +Poisson-distributed sample data. The calculations are derived from open resources provided by Penn State Eberly College. 
} \examples{ diff --git a/tests/testthat.R b/tests/testthat.R index fdd6b3d..1810eb5 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) -library(forestSampling) +library(forestsamplr) -test_check("forestSampling") +test_check("forestsamplr") diff --git a/tests/testthat/test_all_cluster.R b/tests/testthat/test_all_cluster.R index 73c4103..8baca0e 100644 --- a/tests/testthat/test_all_cluster.R +++ b/tests/testthat/test_all_cluster.R @@ -1,26 +1,39 @@ context("Forest sampling statistics calculations: all cluster samples") -dataPlot <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), - attr = c(1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, 1000, - 1250, 950, 900, 1005, 1000, 1250, 950, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F)) - test_that("all cluster function handles simple cluster sample", { - - expect_equal(summarize_all_cluster(dataPlot, element = TRUE, bernoulli = F), - summarize_cluster(dataPlot, element = TRUE), tolerance = 0.1) - + expect_equal( + summarize_all_cluster( + redData, + attribute = "volume", element = TRUE, bernoulli = F + ), + summarize_cluster( + redData, + attribute = "volume", element = TRUE + ), + tolerance = 0.1 + ) }) test_that("all cluster function handles cluster sample for discrete attribute, bernoulli", { - - data <- data.frame(plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), - propAlive = c(0.75, 0.80, 0.80, 0.85, 0.70, - 0.90, 0.70, 0.75, 0.80, 0.65)) - - expect_equal(summarize_all_cluster(data, attribute = 'propAlive', plotTot = 250, bernoulli = T), - summarize_cluster_discrete(data, attribute = 'propAlive', plotTot = 250), tolerance = 0.01) - + data <- data.frame( + plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + propAlive = c( + 0.75, 0.80, 0.80, 0.85, 0.70, + 0.90, 0.70, 0.75, 0.80, 0.65 + ) + ) + + expect_equal( + summarize_all_cluster( + data, + attribute = "propAlive", plotTot = 250, bernoulli = T + ), + 
summarize_cluster_discrete( + data, + attribute = "propAlive", plotTot = 250 + ), + tolerance = 0.01 + ) }) diff --git a/tests/testthat/test_all_srs.R b/tests/testthat/test_all_srs.R index 0058cbb..ac46bac 100644 --- a/tests/testthat/test_all_srs.R +++ b/tests/testthat/test_all_srs.R @@ -2,28 +2,38 @@ context("Forest sampling statistics calculations: all simple random samples") test_that("all srs function handles basic simple random sample", { - - data <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), - plots = c(1, 2, 3, 4, 5, 6)) - - expect_equal(summarize_all_srs(data, attribute = 'bapa', popSize = 40, - infiniteReplacement = F), - summarize_simple_random(data, attribute = 'bapa', popSize = 40, - infiniteReplacement = F), - tolerance = 0.1) - + data <- data.frame( + bapa = c(120, 140, 160, 110, 100, 90), + plots = c(1, 2, 3, 4, 5, 6) + ) + + expect_equal(summarize_all_srs(data, + attribute = "bapa", popSize = 40, + infiniteReplacement = F + ), + summarize_simple_random(data, + attribute = "bapa", popSize = 40, + infiniteReplacement = F + ), + tolerance = 0.1 + ) }) test_that("all srs function handles simple random sample for discrete attribute, Bernoulli", { - - data <- data.frame(alive = c(T, T, F, T, F, F), - tree = c(1, 2, 3, 4, 5, 6)) - - expect_equal(summarize_all_srs(data, attribute = 'alive', popSize = 50, - desiredConfidence = 0.95, infiniteReplacement = F, bernoulli = T), - summarize_simple_random_discrete(data, attribute = 'alive', popTot = 50, - desiredConfidence = 0.95), - tolerance = 0.1) - + data <- data.frame( + alive = c(T, T, F, T, F, F), + tree = c(1, 2, 3, 4, 5, 6) + ) + + expect_equal(summarize_all_srs(data, + attribute = "alive", popSize = 50, + desiredConfidence = 0.95, infiniteReplacement = F, bernoulli = T + ), + summarize_simple_random_discrete(data, + attribute = "alive", popTot = 50, + desiredConfidence = 0.95 + ), + tolerance = 0.1 + ) }) diff --git a/tests/testthat/test_cluster.R b/tests/testthat/test_cluster.R index 
15e1b15..7b1cdc6 100644 --- a/tests/testthat/test_cluster.R +++ b/tests/testthat/test_cluster.R @@ -1,78 +1,65 @@ context("Forest sampling statistics calculations: cluster sample") - +clusterClean <- clusterBaData %>% + filter(!is.na(bapa)) # test clusters -clusterFalse = data.frame(clusterID = c(1, 2, 3, 4, 5), clusterElements = c(4, 2, 9, 4, 10), - sumAttr = c(1000, 1250, 950, 900, 1005), isUsed = c(T, T, F, T, T)) - -clusterTrue = data.frame(clusterID = c(1, 2, 3, 4, 5), clusterElements = c(4, 2, 9, 4, 10), - sumAttr = c(1000, 1250, 950, 900, 1005), isUsed = c(T, T, T, T, T)) - -clusterNone = data.frame(clusterID = c(NA), clusterElements = c(NA), - sumAttr = c(NA), isUsed = c(NA)) - -clusterOne = data.frame(clusterID = c(1), clusterElements = c(4), - sumAttr = c(1000), isUsed = c(T)) - -clusterTwo = data.frame(clusterID = c(1, 2), clusterElements = c(4, 2), - sumAttr = c(1000, 1250), isUsed = c(T, T)) +clusterFalse <- data.frame( + clusterID = c(1, 2, 3, 4, 5), clusterElements = c(4, 2, 1, 5, 4), + sumAttr = c(1039, 1125, 950, 1001, 1039), isUsed = c(T, T, T, T, F) +) + +clusterTrue <- data.frame( + clusterID = c(1, 2, 3, 4, 5), clusterElements = c(4, 2, 1, 5, 4), + sumAttr = c(1000, 1250, 950, 900, 1005), isUsed = c(T, T, T, T, T) +) + +clusterNone <- data.frame( + clusterID = c(NA), clusterElements = c(NA), + sumAttr = c(NA), isUsed = c(NA) +) + +clusterOne <- data.frame( + clusterID = c(1), clusterElements = c(4), + sumAttr = c(1000), isUsed = c(T) +) + +clusterTwo <- data.frame( + clusterID = c(1, 2), clusterElements = c(4, 2), + sumAttr = c(1000, 1250), isUsed = c(T, T) +) # test plots -plotLevelFalse <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), - attr = c(1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, - 1000, 1250, 950, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F)) - -plotLevelTrue <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 
4, 5, 5, 5, 5, 5), - attr = c(1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, - 1000, 1250, 950, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)) - -plotLevelNAOne <- data.frame(clusterID = c(NA), attr = c(NA), isUsed = c(NA)) - -plotLevelNAMore <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), - attr = c(1000, 1250, 950, 900, 1005, 1000, 1250, NA, 900, 1005, 1000, 1250, 950, 900, 1005, - 1000, 1250, 950, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)) - -plotLevelAttribute <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), - kittens = c(1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, 1000, 1250, 950, 900, 1005, - 1000, 1250, 950, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)) - +plotLevelFalse <- clusterClean +plotLevelTrue <- clusterClean %>% mutate(isUsed = TRUE) test_that("cluster sample isUsed functions correctly", { - - expect_equal(summarize_cluster(clusterFalse, F)[[1]], c(30.51), tolerance = 0.1) - expect_false(identical(summarize_cluster(clusterTrue, F), - summarize_cluster(clusterFalse, F))) - expect_false(identical(summarize_cluster(plotLevelTrue, T), - summarize_cluster(plotLevelFalse, T))) - + expect_equal(summarize_cluster(clusterFalse, F)[[1]], c(43.8), tolerance = 0.1) + expect_false(identical( + summarize_cluster(clusterTrue, F), + summarize_cluster(clusterFalse, F) + )) + expect_false(identical( + summarize_cluster(plotLevelTrue, T, attr = "bapa"), + summarize_cluster(plotLevelFalse, T, attr = "bapa") + )) }) test_that("cluster sample handles input with no, one, or two clusters correctly", { - expect_error(summarize_cluster(clusterNone, F)) expect_error(summarize_cluster(clusterOne, F)) expect_false(any(is.na(summarize_cluster(clusterTwo, F)))) - }) test_that("cluster sample handles input data with NA values", { - - 
expect_error(summarize_cluster(plotLevelNAOne, T)) - expect_error(summarize_cluster(plotLevelNAMore, T)) - + expect_error(summarize_cluster(clusterBaData, T)) }) test_that("cluster sample handles attribute", { - - expect_equal(summarize_cluster(plotLevelAttribute, T, 'kittens'), - summarize_cluster(plotLevelTrue, T)) - + expect_equal( + summarize_cluster(clusterClean, T, "bapa"), + summarize_cluster(clusterClean %>% rename(attr = bapa), T) + ) }) - diff --git a/tests/testthat/test_cluster_discrete.R b/tests/testthat/test_cluster_discrete.R index 8017747..fb37f17 100644 --- a/tests/testthat/test_cluster_discrete.R +++ b/tests/testthat/test_cluster_discrete.R @@ -1,20 +1,26 @@ context("Forest sampling statistics calculations: cluster sample for attributes, discrete variables") # dataset is from the example for this sampling method in Avery and Burkhart -data <- data.frame(plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), - propAlive = c(0.75, 0.80, 0.80, 0.85, 0.70, - 0.90, 0.70, 0.75, 0.80, 0.65)) +data <- data.frame( + plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + propAlive = c( + 0.75, 0.80, 0.80, 0.85, 0.70, + 0.90, 0.70, 0.75, 0.80, 0.65 + ) +) test_that("cluster discrete calculates values correctly", { - - expect_equal(summarize_cluster_discrete(data, attribute = 'propAlive', plotTot = 250)$upperLimitCI, - 0.82275, tolerance = 0.001) - + expect_equal( + summarize_cluster_discrete( + data, + attribute = "propAlive", plotTot = 250 + )$upperLimitCI, + 0.82275, + tolerance = 0.001 + ) }) test_that("cluster discrete requires a population total value", { - - expect_error(summarize_cluster_discrete(data, attribute = 'propAlive')) - + expect_error(summarize_cluster_discrete(data, attribute = "propAlive")) }) diff --git a/tests/testthat/test_poisson.R b/tests/testthat/test_poisson.R index b1c8a9e..72c739a 100644 --- a/tests/testthat/test_poisson.R +++ b/tests/testthat/test_poisson.R @@ -3,18 +3,21 @@ context("Forest sampling statistics calculations: poisson sample") data <- c(2, 
3, 4, 3, 4, 5, 2, 7) test_that("Poisson produces output", { - - expect_equal(suppressWarnings(summarize_poisson(data)), data.frame('sampleSize' = 8, 'mean' = 3.75, - 'lambdaHat' = 3.75, - 'se' = 0.68, - 'lowerBoundCI' = 2.40, - 'upperBoundCI' = 5.09), - tolerance = 0.1) - + expect_equal( + suppressWarnings( + summarize_poisson(data) + ), + data.frame( + "sampleSize" = 8, "mean" = 3.75, + "lambdaHat" = 3.75, + "se" = 0.68, + "lowerBoundCI" = 2.40, + "upperBoundCI" = 5.09 + ), + tolerance = 0.1 + ) }) test_that("Poisson warns against overdispersion", { - expect_warning(summarize_poisson(data)) - }) diff --git a/tests/testthat/test_simple_random.R b/tests/testthat/test_simple_random.R index 9424116..a1b1336 100644 --- a/tests/testthat/test_simple_random.R +++ b/tests/testthat/test_simple_random.R @@ -1,76 +1,97 @@ context("Forest sampling statistics calculations: simple random sample") -trainingData <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), - plots = c(1, 2, 3, 4, 5, 6)) -attribute <- 'bapa' +trainingData <- simpleRandom %>% + group_by(plot) %>% + summarize(bapa = sum(BAF)) +attribute <- "bapa" test_that("simple random function handles missing arguments correctly", { - - sampT <- summarize_simple_random(data = trainingData, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T) - sampNA <- summarize_simple_random(data = trainingData, attribute, popSize = NA, - desiredConfidence = 0.9, infiniteReplacement = T) - sampNoN <- summarize_simple_random(data = trainingData, attribute, desiredConfidence = 0.9, - infiniteReplacement = T) - sampNoReplacement <- summarize_simple_random(trainingData, attribute, popSize = 50, - desiredConfidence = 0.9) - sampF <- summarize_simple_random(trainingData, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = F) - sampReplacementNA <- summarize_simple_random(trainingData, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = NA) + sampT <- summarize_simple_random( + 
data = trainingData, attribute, popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + ) + sampNA <- summarize_simple_random( + data = trainingData, attribute, popSize = NA, + desiredConfidence = 0.9, infiniteReplacement = T + ) + sampNoN <- summarize_simple_random( + data = trainingData, attribute, desiredConfidence = 0.9, + infiniteReplacement = T + ) + sampNoReplacement <- summarize_simple_random(trainingData, attribute, + popSize = 50, + desiredConfidence = 0.9 + ) + sampF <- summarize_simple_random(trainingData, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = F + ) + sampReplacementNA <- summarize_simple_random(trainingData, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = NA + ) expect_equal(sampNA, sampT) expect_equal(sampNA, sampNoN) expect_equal(sampF, sampNoReplacement) expect_equal(sampF, sampReplacementNA) - expect_error(summarize_simple_random(trainingData, attribute, popSize = 0, - desiredConfidence = 0.9, infiniteReplacement = T)) - - expect_equal(sampF, data.frame('mean' = 120, 'variance' = 680, 'standardError' = 9.98, - 'upperLimitCI' = 140.12, 'lowerLimitCI' = 99.87), tolerance = 0.1) - + expect_error(summarize_simple_random(trainingData, attribute, + popSize = 0, + desiredConfidence = 0.9, infiniteReplacement = T + )) + + expect_equal(sampF, data.frame( + "mean" = 135, "variance" = 1172, "standardError" = 8.6, + "upperLimitCI" = 150.5, "lowerLimitCI" = 120 + ), tolerance = 1) }) test_that("simple random throws errors for data with few or no entries, or missing values", { - - trainingDataOne <- data.frame('bapa' = c(4), 'plot' = c(10)) - trainingDataThree <- data.frame('bapa' = c(4, 5, NA), 'plot' = c(10, 11, 12)) - trainingDataNone <- data.frame('bapa' = c(NA), 'plot' = c(10)) - - expect_warning(summarize_simple_random(trainingDataOne, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T)) - expect_error(summarize_simple_random(trainingDataNone, attribute, 
popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T)) - expect_error(summarize_simple_random(trainingDataThree, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T)) - + trainingDataOne <- data.frame("bapa" = c(4), "plot" = c(10)) + trainingDataThree <- data.frame("bapa" = c(4, 5, NA), "plot" = c(10, 11, 12)) + trainingDataNone <- data.frame("bapa" = c(NA), "plot" = c(10)) + + expect_warning(summarize_simple_random(trainingDataOne, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + )) + expect_error(summarize_simple_random(trainingDataNone, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + )) + expect_error(summarize_simple_random(trainingDataThree, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + )) }) test_that("simple random processes data with at least two", { + trainingDataTwo <- data.frame("bapa" = c(4, 5), "plot" = c(10, 11)) - trainingDataTwo <- data.frame('bapa' = c(4, 5), 'plot' = c(10, 11)) - - sampTwo <- summarize_simple_random(trainingDataTwo, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T) - - expect_equal(sampTwo$mean, mean(trainingDataTwo[[1]])) + sampTwo <- summarize_simple_random(trainingDataTwo, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + ) + expect_equal(sampTwo$mean, mean(trainingDataTwo[[1]])) }) test_that("simple random accepts and processes vectors and dataframes equally", { - - dataframe <- summarize_simple_random(trainingData, attribute, popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T) - vector <- summarize_simple_random(c(120, 140, 160, 110, 100, 90), popSize = 50, - desiredConfidence = 0.9, infiniteReplacement = T) - + dataframe <- summarize_simple_random(trainingData, attribute, + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + ) + vector <- summarize_simple_random(c( + 140, 140, 180, 140, 180, 120, + 120, 80, 
80, 120, 180, 140 + ), + popSize = 50, + desiredConfidence = 0.9, infiniteReplacement = T + ) expect_equal(dataframe, vector) - }) - - diff --git a/tests/testthat/test_srs_discrete.R b/tests/testthat/test_srs_discrete.R index aa64a62..5bf143a 100644 --- a/tests/testthat/test_srs_discrete.R +++ b/tests/testthat/test_srs_discrete.R @@ -1,33 +1,35 @@ context("Forest sampling statistics calculations: simple random sample for attributes, discrete variables") -data <- data.frame(alive = c(T, T, F, T, F, F), - plots = c(1, 2, 3, 4, 5, 6)) +data <- data.frame( + alive = c(T, T, F, T, F, F), + plots = c(1, 2, 3, 4, 5, 6) +) -attribute <- 'alive' +attribute <- "alive" test_that("srs discrete calculates values correctly", { - expect_equal(summarize_simple_random_discrete(data, attribute, popTot = 50)$upperLimitCI, - 1.4858, tolerance = 0.001) - + 1.4858, + tolerance = 0.001 + ) }) test_that("srs discrete requires a population total value", { - expect_error(summarize_simple_random_discrete(data, attribute)) - }) test_that("srs discrete generalizes attribute", { - dataGeneral <- rename(data, attr = alive) - - expect_equal(summarize_simple_random_discrete(dataGeneral, 'attr', popTot = 50), - summarize_simple_random_discrete(data, 'alive', popTot = 50)) - expect_equal(summarize_simple_random_discrete(dataGeneral, 'attr', popTot = 50), - summarize_simple_random_discrete(dataGeneral, popTot = 50)) - + + expect_equal( + summarize_simple_random_discrete(dataGeneral, "attr", popTot = 50), + summarize_simple_random_discrete(data, "alive", popTot = 50) + ) + expect_equal( + summarize_simple_random_discrete(dataGeneral, "attr", popTot = 50), + summarize_simple_random_discrete(dataGeneral, popTot = 50) + ) }) diff --git a/tests/testthat/test_stratified.R b/tests/testthat/test_stratified.R index 8732bab..f4e321d 100644 --- a/tests/testthat/test_stratified.R +++ b/tests/testthat/test_stratified.R @@ -1,33 +1,55 @@ context("Forest sampling statistics calculations: stratified sample") 
+trainingData <- clusterBaData %>% + filter(!is.na(bapa)) %>% + filter(clusterID %in% c(1, 2)) %>% + rename(stratum = clusterID) - -trainingData <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), - stratum = c(1, 1, 1, 2, 2, 2)) stratumTab <- data.frame(stratum = c(1, 2), acres = c(200, 50)) -attribute = 'bapa' -desiredConfidence = 0.9 -stratSum = data.frame("stratum" = c(1, 2), "stratMean" = c(140, 100), - #"stratVarTot" = c(5333333.33, 83333.33), - "stratVarMean" = c(133.33, 33.33), #I don't understand how this is in the output - "stratSE" = c(11.547, 5.773), "stratPlots" = c(3, 3), - "stratAcres" = c(200, 50)) -totSum = data.frame("popMean" = c(132), "popSE" = c(9.30), - "popCIhalf" = c(19.84), "ciPct" = c(15.03)) -finalSum = list('stratumSummaries' = data.frame(stratSum), - 'totalSummary' = data.frame(totSum)) +stratSum <- data.frame( + "stratum" = c(1, 2), + "stratMean" = c(1038.75, 1125), + "stratVarMean" = c(5543.229, 15625), + "stratSE" = c(74.45, 125), + "stratPlots" = c(4, 2), + "stratAcres" = c(200, 50) +) +totSum <- data.frame( + "popMean" = c(1056), + "popSE" = c(64.60), + "popCIhalf" = c(137.7), + "ciPct" = c(13.04) +) +finalSum <- list( + "stratumSummaries" = data.frame(stratSum), + "totalSummary" = data.frame(totSum) +) test_that("stratified functions correctly", { - strat <- summarize_stratified(trainingData, attribute,stratumTab, - desiredConfidence = 0.9, post = T) - expect_equal(strat, - finalSum, tolerance = 0.1) - expect_equal(strat$stratumSummaries, - stratSum, tolerance = 0.1) - expect_equal(strat$totalSummary, - totSum, tolerance = 0.1) + strat <- summarize_stratified( + trainingData, + attribute = "bapa", + stratumTab, + desiredConfidence = 0.9, + post = T + ) + expect_equal( + strat, + finalSum, + tolerance = 0.1 + ) + expect_equal( + strat$stratumSummaries, + stratSum, + tolerance = 0.1 + ) + expect_equal( + strat$totalSummary, + totSum, + tolerance = 0.1 + ) }) diff --git a/tests/testthat/test_systematic.R 
b/tests/testthat/test_systematic.R index dba1ab0..b538bbf 100644 --- a/tests/testthat/test_systematic.R +++ b/tests/testthat/test_systematic.R @@ -1,26 +1,43 @@ context("Forest sampling statistics calculations: systematic sample") -trainingData <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), - plots = c(1, 2, 3, 4, 5, 6)) - +trainingData <- clusterBaData %>% + filter(!is.na(bapa)) %>% + group_by(clusterID) %>% + sample_n(1) %>% + ungroup() %>% + rename(plots = clusterID) test_that("systematic functions correctly with infiniteReplacement default", { - - systematic <- summarize_systematic(trainingData, attribute = 'bapa', popSize = 50, desiredConfidence = 0.9) - simple <- summarize_simple_random(trainingData, attribute = 'bapa', popSize = 50, desiredConfidence = 0.9, - infiniteReplacement = F) + systematic <- summarize_systematic( + trainingData, + attribute = "bapa", + popSize = 50, + desiredConfidence = 0.9 + ) + simple <- summarize_simple_random( + trainingData, + attribute = "bapa", + popSize = 50, + desiredConfidence = 0.9, + infiniteReplacement = F + ) expect_equal(systematic, simple) - }) test_that("systematic functions correctly with vector and data frame input", { - - dataframe <- summarize_systematic(trainingData, attribute = 'bapa', popSize = 50, - desiredConfidence = 0.9) - vector <- summarize_systematic(c(120, 140, 160, 110, 100, 90), popSize = 50, - desiredConfidence = 0.9) + dataframe <- summarize_systematic( + trainingData, + attribute = "bapa", + popSize = 50, + desiredConfidence = 0.9 + ) + + vector <- summarize_systematic( + trainingData$bapa, + popSize = 50, + desiredConfidence = 0.9 + ) expect_equal(dataframe, vector) - }) diff --git a/tests/testthat/test_threeP.R b/tests/testthat/test_threeP.R index 8b8eb5d..080d915 100644 --- a/tests/testthat/test_threeP.R +++ b/tests/testthat/test_threeP.R @@ -1,14 +1,22 @@ context("Forest sampling statistics calculations: 3P sample") -dataEstimateVBAR <- data.frame(plotNum = c(1, 2, 3, 4, 5), - treeCount 
= c(4, 5, 6, 3, 4), - BAF = c(10, 10, 10, 10, 10), - avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), - estimateVBAR = c(1, 2, 3, 4, 5)) +dataEstimateVBAR <- data.frame( + plotNum = c(1, 2, 3, 4, 5), + treeCount = c(4, 5, 6, 3, 4), + BAF = c(10, 10, 10, 10, 10), + avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), + estimateVBAR = c(1, 2, 3, 4, 5) +) test_that("3P produces output", { - - expect_equal(summarize_threeP(dataEstimateVBAR, cvPercent = 50, trueNetVBAR = 10, height = F, plot = T)[1], - data.frame('sampSize' = 5)) - + expect_equal( + summarize_threeP( + dataEstimateVBAR, + cvPercent = 50, + trueNetVBAR = 10, + height = F, + plot = T + )[1], + data.frame("sampSize" = 5) + ) }) diff --git a/tests/testthat/test_twostage.R b/tests/testthat/test_twostage.R index 242edd4..4c1e619 100644 --- a/tests/testthat/test_twostage.R +++ b/tests/testthat/test_twostage.R @@ -1,70 +1,83 @@ context("Forest sampling statistics calculations: two stage sample") -trainingData = data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), - bapa = c(1000, 1250, NA, 900, 1005, 1000, 1250, 950, 900, 1005, NA, 1250, 950, 900, 1005, - 1000, 1250, NA, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, F, F, F, F)) - -# data from Avery and Burkhart -redData = data.frame(clusterID = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6), - volume = c(500, 650, 610, 490, 475, 505, 940, 825, 915, 210, 185, - 170, 450, 300, 500, 960, 975, 890), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)) - - test_that("two stage functions correctly with warning", { - - expect_equal(suppressWarnings(summarize_two_stage(trainingData, TRUE, 'bapa', desiredConfidence = 0.95))$mean, - 1179.64, tolerance = 0.1) - + expect_equal( + suppressWarnings( + summarize_two_stage( + clusterBaData, + TRUE, + "bapa", + desiredConfidence = 0.95 + ) + )$mean, + 1179.64, + tolerance = 0.1 + ) }) test_that("two stage attribute name functions correctly", { - redDataAttr <- rename(redData, 
attr = volume) - expect_equal(summarize_two_stage(redDataAttr, T, attribute = 'attr', - populationClusters = 16, - populationElementsPerCluster = 160), - summarize_two_stage(redData, T, attribute = 'volume', - populationClusters = 16, - populationElementsPerCluster = 160)) - expect_equal(summarize_two_stage(redDataAttr, T, attribute = 'attr', - populationClusters = 16, - populationElementsPerCluster = 160), - summarize_two_stage(redDataAttr, T, - populationClusters = 16, - populationElementsPerCluster = 160)) - + expect_equal( + summarize_two_stage(redDataAttr, T, + attribute = "attr", + populationClusters = 16, + populationElementsPerCluster = 160 + ), + summarize_two_stage(redData, T, + attribute = "volume", + populationClusters = 16, + populationElementsPerCluster = 160 + ) + ) + expect_equal( + summarize_two_stage(redDataAttr, T, + attribute = "attr", + populationClusters = 16, + populationElementsPerCluster = 160 + ), + summarize_two_stage(redDataAttr, T, + populationClusters = 16, + populationElementsPerCluster = 160 + ) + ) }) test_that("two stage cluster input data functions correctly", { - - trainingDataCluster <- data.frame(clusterID = c(1, 2, 3, 4, 5), - totClusterElements = c(5, 2, 1, 6, 5), - sampledElements = c(2, 2, 2, 2, 2), - isUsed = c(T, T, T, T, F), - attrSumCluster = c(1000, 1250, 950, 900, 1005)) - - expect_equal(summarize_two_stage(trainingDataCluster, F)$lowerLimitCI, - 70.5, tolerance = 0.1) - + clusterBaDataCluster <- data.frame( + clusterID = c(1, 2, 3, 4, 5), + totClusterElements = c(5, 2, 1, 6, 5), + sampledElements = c(2, 2, 2, 2, 2), + isUsed = c(T, T, T, T, F), + attrSumCluster = c(1000, 1250, 950, 900, 1005) + ) + + expect_equal(summarize_two_stage(clusterBaDataCluster, F)$lowerLimitCI, + 70.5, + tolerance = 0.1 + ) }) test_that("two stage calculates values correctly", { - # checks the function against the values produced in Avery and Burkhart's (1967) - # Forest Measurements, Fifth Edition - # CI limits are different than the 
answers expressed in the textbook, because - # the true t-score was calculated in this script - expect_equal(summarize_two_stage(redData, T, 'volume', populationClusters = 16, - populationElementsPerCluster = 160), - data.frame(mean = 586.1, varianceB = 250188.9, varianceW = 3869.4, - standardError = 93.628, upperLimitCI = 782.81, - lowerLimitCI = 389.40), tolerance = 0.01) - + # checks the function against the values produced in Avery and Burkhart's (1967) + # Forest Measurements, Fifth Edition + # CI limits are different than the answers expressed in the textbook, because + # the true t-score was calculated in this script + expect_equal( + summarize_two_stage( + redData, T, "volume", + populationClusters = 16, + populationElementsPerCluster = 160 + ), + data.frame( + mean = 586.1, varianceB = 250188.9, varianceW = 3869.4, + standardError = 93.628, upperLimitCI = 782.81, + lowerLimitCI = 389.40 + ), + tolerance = 0.01 + ) }) - diff --git a/vignettes/examplePackageProcedure.Rmd b/vignettes/examplePackageProcedure.Rmd index e8f3348..d7f9290 100644 --- a/vignettes/examplePackageProcedure.Rmd +++ b/vignettes/examplePackageProcedure.Rmd @@ -9,14 +9,14 @@ vignette: > %\VignetteEncoding{UTF-8} --- -Morals: +Imperatives: 1. Data input must be expanded prior to the function call. 2. Data frame input needs to have the same column names as expressed in the example data (except for the attribute of interest, the name of which can be set in the function call). -The setting: +The setting: Wilma Antall wanted to analyze data. She'd spent time and resources carefully planning a survey. She had decided to use the simple random sampling design, because it fit her goals best - to assess the _Euphorbia obesa_ in a long-standing, worldwide study on succulents. (Or, she was scrounging a database and decided to see what statistics this stray database would produce if it were analyzed like a simple random sample.) - She wanted to be smart about this. 
She'd expended a lot of effort collecting the data, managing, and storing it, being careful to have a truly random, unbiased dataset. She wanted to do analyze the data correctly. She also wanted to be transparent, and be sure that other statisticians would support her findings. They would have access to the easily repeatable R code that explicitly stated her steps of analysis - if they had any questions, it would be easy to prove her process. So, she decided to use the Forest Sampling R package. + She wanted to be smart about this. She'd expended a lot of effort collecting the data, managing, and storing it, being careful to have a truly random, unbiased dataset. She wanted to analyze the data correctly. She also wanted to be transparent, and be sure that other statisticians would support her findings. They would have access to the easily repeatable R code that explicitly stated her steps of analysis - if they had any questions, it would be easy to prove her process. So, she decided to use the Forest Sampling R package. ```{r include = TRUE} library(forestSampling) @@ -25,32 +25,32 @@ library(dplyr) She wasn't exactly sure how to start, so she took a few minutes to look at her data. -Her data may have looked something like this: +Her data may have looked something like this: ```{r include = TRUE} -head(simpleRandom, n=10) +head(simpleRandom, n = 10) ``` But this didn't quite fit the data format she read about in the Forest Sampling Vignette. -Moral 1: -First, she had to expand her data how she wanted and provide her attribute of interest. She wanted basal area per acre (given the incredibly rare, monstrously tall, secret population she sampled, this was a reasonable attribute to use). So, she ran this little R code: +Imperative 1: +First, she had to expand her data how she wanted and provide her attribute of interest. She wanted basal area per acre (given the incredibly rare, monstrously tall, secret population she sampled, this was a reasonable attribute to use). 
So, she ran this little chunk of R code: -```{r include = FALSE} +```{r, include = TRUE} expanded <- simpleRandom %>% - mutate(bapaTree = pi * (DBH / 2) ^ 2 / 144) %>% + mutate(bapaTree = pi * (DBH / 2)^2 / 144) %>% group_by(plot) %>% summarize(bapa = sum(bapaTree) / sum(BAF)) ``` -Moral 2: +Imperative 2: Now, she checked the Forest Sampling Vignette to make sure that her data was on the right track. ```{r include = TRUE} -head(expanded, n=12) +head(expanded, n = 12) ``` -She was thrilled. Her data had a column she could identify as her attribute of interest in a call similar to `attribute <- 'bapa`. She did jumping jacks for joy. She was now ready to run the package's function. +She was thrilled. Her data had a column she could identify as her attribute of interest in a call similar to `attribute <- 'bapa' `. She did jumping jacks for joy. She was now ready to run the package's function. She looked at the Forest Sampling Vignette. The simple random sample function had three options: 1. For a finite population or sampled without replacement @@ -62,10 +62,16 @@ She'd sampled a finite population without replacement, so she decided Option #1 Looking at the example function call, she knew it had to look something like this: `summarize_all_srs(data, attribute, type, popSize = NA, desiredConfidence = 0.95, infiniteReplacement = F)` -She modified this to match her data's needs, ran the code, and got: +She modified this to match her data structure, ran the code, and got: ```{r include = TRUE} -summarize_all_srs(expanded, attribute = 'bapa', popSize = NA, desiredConfidence = 0.90, infiniteReplacement = F) +summarize_all_srs( + expanded, + attribute = "bapa", + popSize = NA, + desiredConfidence = 0.90, + infiniteReplacement = F +) ``` The output she got was the calculated mean, variance, standard error, and the upper and lower limits of the 90% confidence interval. 
@@ -83,7 +89,13 @@ stratumTab <- data.frame(stratum = c(1, 2), acres = c(200, 50)) She looked in the Forest Sampling Vignette and figured out the call: ```{r include = TRUE} -summarize_stratified(stratumData, attribute = 'bapa', stratumTab, desiredConfidence = 0.90, post = T) +summarize_stratified( + stratumData, + attribute = "bapa", + stratumTab, + desiredConfidence = 0.90, + post = T +) ``` She felt accomplished - she now had summary statistics for her data! Congratulations, Wilma! diff --git a/vignettes/forestSampling.Rmd b/vignettes/forestSampling.Rmd index d151705..c32a449 100644 --- a/vignettes/forestSampling.Rmd +++ b/vignettes/forestSampling.Rmd @@ -10,29 +10,31 @@ vignette: > --- -# Forest Sampling Package +# Forest Sampling Package -The Forest Sampling is a package of functions that calculates workups for sampling designs commonly used in forestry. It is an open source opportunity for collaboration and transparency. +The Forest Sampling is a package of functions that calculates workups for sampling designs commonly used in forestry. It is an open source opportunity for collaboration and transparency. -These sampling designs include: -* Simple Random Sampling -* Cluster Sampling -* Stratified Sampling -* Systematic Sampling -* Two-Stage Sampling -* 3P Sampling - -Variations are included, described in detail below. +These sampling designs include: +* Simple Random Sampling +* Cluster Sampling +* Stratified Sampling +* Systematic Sampling +* Two-Stage Sampling +* 3P Sampling + +Variations are included, described in detail below. # General Information -* All functions require data input. For most functions, this data must be a data frame formatted in the generalized structure. Select functions can act on vectors (`summarize_simple_random` and `summarize_systematic`). -* In most cases, additional parameters are necessary. Defaults have been provided to help minimize this. +* All functions require data input. 
For most functions, this data must be a +data frame or tibble formatted in the generalized structure. Select functions +can act on vectors (`summarize_simple_random` and `summarize_systematic`). +* In most cases, additional parameters are necessary. Defaults have been provided to help minimize this. -## Common parameters, usage information: -* `attribute`: assumes target attribute is called `attr`. Set `attribute` to the column header used for the variable of interest. Not required for vector input. -* `___Tot`: total number of the ___ unit in the sampling design. Commonly the total number of clusters or plots in the sampling frame. -* `desiredConfidence`: Percent confidence level (two-sided) expressed as decimal. Default is set to `0.95`. +## Common parameters, usage information: +* `attribute`: assumes target attribute is called `attr`. Set `attribute` to the column header used for the variable of interest. Not required for vector input. +* `___Tot`: total number of the ___ unit in the sampling design. Commonly the total number of clusters or plots in the sampling frame. +* `desiredConfidence`: Percent confidence level (two-sided) expressed as decimal. Default is set to `0.95`. ## Output: @@ -43,37 +45,37 @@ The outputs of the functions vary, but all produce a data frame containing: * upper limit for the confidence interval * lower limit for the confidence interval -# Functions: -The following are the basic functions and their associated sampling strategy. +# Functions: +The following are the basic functions and their associated sampling strategy. 
Function | Sampling Strategy -------------------------------------|--------------------------------------------------------- `summarize_all_cluster()` | cluster sampling -`summarize_all_simple_random()` | simple random sampling -`summarize_stratified()` | stratified sampling -`summarize_systematic()` | systematic sampling -`summarize_two_stage()` | two-stage sampling +`summarize_all_simple_random()` | simple random sampling +`summarize_stratified()` | stratified sampling +`summarize_systematic()` | systematic sampling +`summarize_two_stage()` | two-stage sampling `summarize_threeP()` | 3P sampling `summarize_poisson()` | Poisson sampling -## Cluster Sample: +## Cluster Sample: -### Sampling Strategy Definition: +### Sampling Strategy Definition: Each sampling unit is represented as a cluster, a group of several plots. -### Function Definition: +### Function Definition: `summarize_all_cluster(data, attribute = NA, element = TRUE, plotTot = NA, desiredConfidence = 0.95, bernoulli = F)` ### Options: -Cluster sample has 2 options: +Cluster sample has 2 options: 1. Cluster sample with a normal distribution - * Set `bernoulli = FALSE` or do not include the parameter in the function call - * Ex. `summarize_all_cluster(dataPlot, attribute, element = TRUE, bernoulli = F)` -2. Cluster sample with a bernoulli distribution - * Set `bernoulli = TRUE` + * Set `bernoulli = FALSE` or do not include the parameter in the function call + * Ex. `summarize_all_cluster(dataPlot, attribute, element = TRUE, bernoulli = F)` +2. Cluster sample with a bernoulli distribution + * Set `bernoulli = TRUE` * Ex. `summarize_all_cluster(data, attribute, plotTot = 250, bernoulli = T)` Parameter | Include parameter in option #: | Description @@ -87,55 +89,69 @@ bernoulli | 1, 2 | logical TRUE if data fitti ### Data Input: -Option 1 can take in element (plot) data or stand data. +Option 1 can take in element (plot) data or stand data. 
```{r include = FALSE} -plotLevelDataExample <- data.frame(clusterID = c(1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5), - attr = c(1000, 1250, 950, 900, 1005, 1000, 1250, 950, - 900, 1005, 1000, 1250, 950, 900, 1005, 1000, - 1250, 950, 900), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, - F, F, F, F)) +plotLevelDataExample <- data.frame( + clusterID = c( + 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5 + ), + attr = c( + 1000, 1250, 950, 900, 1005, 1000, 1250, 950, + 900, 1005, 1000, 1250, 950, 900, 1005, 1000, + 1250, 950, 900 + ), + isUsed = c( + T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, + F, F, F, F + ) +) ``` ```{r include = FALSE} -clusterLevelDataExample <- data.frame(clusterID = c(1, 2, 3, 4, 5), - clusterElements = c(4, 2, 9, 4, 10), - sumAttr = c(1000, 1250, 950, 900, 1005), - isUsed = c(T, T, F, T, T)) +clusterLevelDataExample <- data.frame( + clusterID = c(1, 2, 3, 4, 5), + clusterElements = c(4, 2, 9, 4, 10), + sumAttr = c(1000, 1250, 950, 900, 1005), + isUsed = c(T, T, F, T, T) +) ``` -Option 2 is the option for the Bernoulli distribution. For this distribution, the total number of plots in the sampling frame must be indicated, and data must be input as follows (where attr is the value of the attribute for each plot): +Option 2 is the option for the Bernoulli distribution. 
For this distribution, the total number of plots in the sampling frame must be indicated, and data must be input as follows (where attr is the value of the attribute for each plot): ```{r include = FALSE} -data <- data.frame(plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), - propAlive = c(0.75, 0.80, 0.80, 0.85, 0.70, - 0.90, 0.70, 0.75, 0.80, 0.65)) +data <- data.frame( + plots = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), + propAlive = c( + 0.75, 0.80, 0.80, 0.85, 0.70, + 0.90, 0.70, 0.75, 0.80, 0.65 + ) +) ``` -## Simple Random Sample +## Simple Random Sample -### Sampling Strategy Definition: +### Sampling Strategy Definition: *n* sampling units are selected. Each sampling unit is selected independently from the selection of other sampling units. -### Function Definition: +### Function Definition: `summarize_all_srs(data, attribute = 'attr', popSize = NA, desiredConfidence = 0.95, infiniteReplacement = F, bernoulli = F)` ### Options: -Simple random sample (SRS) has 3 options: +Simple random sample (SRS) has 3 options: 1. SRS of a finite population or sampled without replacement - * Set `infiniteReplacement = FALSE` or do not include the parameter in the function call - * Data input can be as a vector or data frame - * Ex. `summarize_all_srs(data, attribute, popSize = NA, desiredConfidence = 0.95, infiniteReplacement = F)` + * Set `infiniteReplacement = FALSE` or do not include the parameter in the function call + * Data input can be as a vector or data frame + * Ex. `summarize_all_srs(data, attribute, popSize = NA, desiredConfidence = 0.95, infiniteReplacement = F)` 2. SRS of an infinite population or sampled with replacement - * Set `infiniteReplacement = TRUE` - * Data input can be as a vector or data frame - * Ex. `summarize_all_srs(data, attribute, popSize = NA, desiredConfidence = 0.95, infiniteReplacement = T)` + * Set `infiniteReplacement = TRUE` + * Data input can be as a vector or data frame + * Ex. 
`summarize_all_srs(data, attribute, popSize = NA, desiredConfidence = 0.95, infiniteReplacement = T)` 3. SRS with a Bernoulli distribution * Set `bernoulli = TRUE` * Data input can only be as a data frame @@ -152,36 +168,39 @@ infiniteReplacement | 1, 2 | logical true if sample wa bernoulli | 1, 2, 3 | logical TRUE if data fitting the Bernoulli distribution is used -### Data Input: +### Data Input: - 1. Vector (Options 1, 2) + 1. Vector (Options 1, 2) ```{r include = FALSE} data <- c(120, 140, 160, 110, 100, 90) ``` - 2. Data frame (Options 1, 2) + 2. Data frame (Options 1, 2) ```{r include = FALSE} -data <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), plots = c(1, 2, 3, 4, 5, 6)) -attribute <- 'bapa' +data <- data.frame( + bapa = c(120, 140, 160, 110, 100, 90), + plots = c(1, 2, 3, 4, 5, 6) +) +attribute <- "bapa" ``` - 3. For Bernoulli Distribution - data frame input only (Option 3) + 3. For Bernoulli Distribution - data frame input only (Option 3) ```{r include = FALSE} data <- data.frame(alive = c(T, T, F, T, F, F), plots = c(1, 2, 3, 4, 5, 6)) -attribute <- 'alive' +attribute <- "alive" ``` ## Stratified Sample: -### Sampling Strategy Definition: +### Sampling Strategy Definition: A simple random sample is performed on at least two subpopulations of known size. The potential subpopulations are formed from dividing the population. -### Function Definition: +### Function Definition: `summarize_stratified(trainingData, attribute, stratumTab, desiredConfidence = 0.95, post = T)` @@ -196,32 +215,34 @@ post | logical true if post-stratification was used ### Options: Stratified sample currently has two options: -1. Stratified - * Set `post = FALSE` -2. Post-Stratified - * Set `post = TRUE` +1. Stratified + * Set `post = FALSE` +2. Post-Stratified + * Set `post = TRUE` ### Data Input: Note: The stratum columns MUST be named 'stratum' and the string you set to equal `attribute`. 
```{r include = FALSE} -trainingData <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), - stratum = c(1, 1, 1, 2, 2, 2)) +trainingData <- data.frame( + bapa = c(120, 140, 160, 110, 100, 90), + stratum = c(1, 1, 1, 2, 2, 2) +) stratumTab <- data.frame(stratum = c(1, 2), acres = c(200, 50)) -attribute = 'bapa' -desiredConfidence = 0.9 +attribute <- "bapa" +desiredConfidence <- 0.9 ``` -## Systematic Sample: +## Systematic Sample: -### Sampling Strategy Definition: +### Sampling Strategy Definition: The first sampling unit is arbitrarily established in the field or randomly selected. Subsequent sampling units are placed at uniform intervals through the target area. -### Function Definition: +### Function Definition: -`summarize_systematic(data, attribute = 'attr', popSize = NA, desiredConfidence = 0.95)` +`summarize_systematic(data, attribute = 'attr', popSize = NA, desiredConfidence = 0.95)` Parameter | Description ------------------|----------------------------------------- @@ -232,34 +253,37 @@ desiredConfidence | numeric desired confidence level (e.g. 0.9) ### Data Input: -Data input can be either as a vector or data frame. +Data input can be either as a vector or data frame. ```{r include = FALSE} -dataframe <- data.frame(bapa = c(120, 140, 160, 110, 100, 90), plots = c(1, 2, 3, 4, 5, 6)) -attribute <- 'bapa' +dataframe <- data.frame( + bapa = c(120, 140, 160, 110, 100, 90), + plots = c(1, 2, 3, 4, 5, 6) +) +attribute <- "bapa" ``` ```{r include = FALSE} vector <- c(120, 140, 160, 110, 100, 90) ``` -## Two-Stage Sample: +## Two-Stage Sample: -### Sampling Strategy Definition: +### Sampling Strategy Definition: A variation of cluster sampling. A simple random sample of clusters is selected, then a simple random sample of elements within each cluster is selected and sampled. 
-### Function Definition: +### Function Definition: `summarize_two_stage(data, plot = TRUE, attribute = NA, populationClusters = 0, populationElementsPerCluster = 0, desiredConfidence = 0.95)` ### Options: Two-stage sample currently has two data input options: -1. Plot-level data - * Set `post = FALSE` -2. Cluster-level data - * Set `post = TRUE` +1. Plot-level data + * Set `post = FALSE` +2. Cluster-level data + * Set `post = TRUE` Parameter | Include parameter in option #: | Description -----------------------------|--------------------------------|--------------------------------------------- @@ -272,16 +296,20 @@ desiredConfidence | 1, 2 | numeric desired ### Data Input: -Data can be organized by either by plot (`plot = TRUE`) or the total sum of the attribute for each cluster (`plot = FALSE`). +Data can be organized by either by plot (`plot = TRUE`) or the total sum of the attribute for each cluster (`plot = FALSE`). ```{r include = FALSE} -data <- data.frame(clusterID = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6), - volume = c(500, 650, 610, 490, 475, 505, 940, 825, 915, 210, 185, 170, 450, - 300, 500, 960, 975, 890), - isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T)) +data <- data.frame( + clusterID = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6), + volume = c( + 500, 650, 610, 490, 475, 505, 940, 825, 915, 210, 185, 170, 450, + 300, 500, 960, 975, 890 + ), + isUsed = c(T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T) +) ``` -Example: `summarize_two_stage(data, plot = T, 'volume', populationClusters = 16, populationElementsPerCluster = 160)` +Example: `summarize_two_stage(data, plot = T, 'volume', populationClusters = 16, populationElementsPerCluster = 160)` ## 3P Sample: @@ -306,8 +334,8 @@ desiredConfidence | numeric desired confidence level (e.g. 0.9) 3P sample currently has two key options: 1. height - a. Height data is included - * An estimate VBAR will be roughly calculated + a. 
Height data is included + * An estimate VBAR will be roughly calculated * Set `height = TRUE` b. Estimate VBARs are included * An estimate VBAR is included in the input dat @@ -315,53 +343,77 @@ desiredConfidence | numeric desired confidence level (e.g. 0.9) 2. plot a. Plot-level Data is Input * Data is set up as plot-level data, defined below - * Set `plot = TRUE` + * Set `plot = TRUE` b. Tree-level Data is Input * Data is set up as tree-level data, defined below - * Set `plot = FALSE` + * Set `plot = FALSE` ### Data Input: 1. Height is included, data is plot-level. ```{r include = FALSE} - dataHeight <- data.frame(plotNum = c(5, 4, 3, 5, 4), - treeCount = c(4, 5, 6, 3, 4), - BAF = c(10, 10, 10, 10, 10), - avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), - treeHeight = c(1, 2, 3, 4, 5)) +dataHeight <- data.frame( + plotNum = c(5, 4, 3, 5, 4), + treeCount = c(4, 5, 6, 3, 4), + BAF = c(10, 10, 10, 10, 10), + avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), + treeHeight = c(1, 2, 3, 4, 5) +) ``` Example: `summarize_threeP(dataHeight, cvPercent = 50, trueNetVBAR = 10, height = T, plot = T)` 2. estimateVBAR is included (height data is not used, and so is not included), data is plot-level. ```{r include = FALSE} - dataEstimateVBAR <- data.frame(plotNum = c(1, 2, 3, 4, 5), - treeCount = c(4, 5, 6, 3, 4), - BAF = c(10, 10, 10, 10, 10), - avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), - estimateVBAR = c(1, 2, 3, 4, 5)) +dataEstimateVBAR <- data.frame( + plotNum = c(1, 2, 3, 4, 5), + treeCount = c(4, 5, 6, 3, 4), + BAF = c(10, 10, 10, 10, 10), + avgTreeVBARSperPlot = c(9, 8, 7, 8, 2), + estimateVBAR = c(1, 2, 3, 4, 5) +) +``` +Example: +```r +summarize_threeP( + dataEstimateVBAR, + cvPercent = 50, + trueNetVBAR = 10, + height = F, + plot = T +) ``` -Example: `summarize_threeP(dataEstimateVBAR, cvPercent = 50, trueNetVBAR = 10, height = F, plot = T)` 3. estimateVBAR is included, data is tree-level. 
```{r include = FALSE} - dataNotPlot <- data.frame(plotNum = c(1, 1, 2, 2, 3), - BAF = c(10, 10, 10, 10, 10), - VBAR = c(1, 2, 3, 4, 3), - estimateVBAR = c(1, 2, 3, 4, 5)) +dataNotPlot <- data.frame( + plotNum = c(1, 1, 2, 2, 3), + BAF = c(10, 10, 10, 10, 10), + VBAR = c(1, 2, 3, 4, 3), + estimateVBAR = c(1, 2, 3, 4, 5) +) +``` +Example: +```r +summarize_threeP( + dataNotPlot, + cvPercent = 50, + trueNetVBAR = 10, + height = F, + plot = F +) ``` -Example: `summarize_threeP(dataNotPlot, cvPercent = 50, trueNetVBAR = 10, height = F, plot = F)` ## Poisson: -### Sampling Strategy Definition: +### Sampling Strategy Definition: -An assessment of count data that follows the Poisson distribution. +An assessment of count data that follows the Poisson distribution. Note: The term 'Poisson sampling' was also used to refer to a precursor to 3P sampling (Iles, 2003). The preliminary 3P version is discussed in Gregoire and Valentine's sampling strategies book (2008). However, in this package the clearest distinction is that Poisson sampling analyzes to count data and 3P considers field estimations. -### Function Definition: +### Function Definition: -`summarize_poisson(data, desiredConfidence = 0.95)` +`summarize_poisson(data, desiredConfidence = 0.95)` Parameter | Description ------------------|----------------------------------------- @@ -374,13 +426,12 @@ desiredConfidence | numeric desired confidence level (e.g. 0.9) data <- c(2, 3, 4, 3, 4, 5, 2, 7) ``` -## Sources Used +## Sources Used -Avery, T. E., & Burkhart, H., E. (1975). _Forest Measurements_ (5th ed.). New York, NY: McGraw-Hill. +Avery, T. E., & Burkhart, H., E. (1975). _Forest Measurements_ (5th ed.). New York, NY: McGraw-Hill. -Gregoire, T. G., & Valentine, H. T. (2008). _Sampling Strategies for Natural Resources and the Environment_. Boca Raton, FL: CRC Press. +Gregoire, T. G., & Valentine, H. T. (2008). _Sampling Strategies for Natural Resources and the Environment_. Boca Raton, FL: CRC Press. Iles, K. (2003). 
_A Sampler of Inventory Topics_. Canada: Kim Iles & Associates Ltd. The Pennsylvania State University. (n.d.). _Poisson Sampling_. Retrieved from https://onlinecourses.science.psu.edu/stat504/node/57 -