From 5a271597c49f771a3f8c5c52cd2ffaccae639c60 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 26 Feb 2024 11:24:31 -0500 Subject: [PATCH 1/6] allow ... args to toJSON --- R/methods-VariableMetadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-VariableMetadata.R b/R/methods-VariableMetadata.R index c9c18eb..8707aed 100644 --- a/R/methods-VariableMetadata.R +++ b/R/methods-VariableMetadata.R @@ -31,7 +31,7 @@ setMethod("merge", signature("VariableSpecList", "VariableSpecList"), function(x #' @return character vector of length 1 containing JSON string #' @export setGeneric("toJSON", - function(object, named = c(TRUE, FALSE)) standardGeneric("toJSON"), + function(object, named = c(TRUE, FALSE), ...) standardGeneric("toJSON"), signature = "object" ) From 461fcc50261438943337ed05aca6b47a9d75a778 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 26 Feb 2024 11:25:19 -0500 Subject: [PATCH 2/6] add helper for formatting compute result columns headers --- R/utils.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/R/utils.R b/R/utils.R index bf45752..4715196 100644 --- a/R/utils.R +++ b/R/utils.R @@ -104,4 +104,19 @@ setroworder <- function(x, neworder) { shiftToNonNeg <- function(x) { x <- x - min(x, na.rm = TRUE) return(x) +} + +#' Strip Entity ID from Column Header +#' +#' This function strips entity ID from column headers for EDA formatted tabular data. +#' @param columnNames character vector +#' @export +stripEntityIdFromColumnHeader <- function(columnNames) { + columnsToFix <- grepl(".", columnNames, fixed=T) + + if (sum(columnsToFix) > 0) { + columnNames[columnsToFix] <- veupathUtils::strSplit(columnNames[columnsToFix], ".", index=2) + } + + return(columnNames) } \ No newline at end of file From 3ad09b38d6ad63e7bba9ed187676289989e6624e Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 26 Feb 2024 11:26:11 -0500 Subject: [PATCH 3/6] move some general compute stuff out of mbio computes package --- R/class-ComputeResult.R | 90 +++++++++++++++++ R/methods-ComputeResult.R | 135 +++++++++++++++++++++++++ tests/testthat/test-ComputeResult.R | 146 ++++++++++++++++++++++++++++ 3 files changed, 371 insertions(+) create mode 100644 R/class-ComputeResult.R create mode 100644 R/methods-ComputeResult.R create mode 100644 tests/testthat/test-ComputeResult.R diff --git a/R/class-ComputeResult.R b/R/class-ComputeResult.R new file mode 100644 index 0000000..267e7d3 --- /dev/null +++ b/R/class-ComputeResult.R @@ -0,0 +1,90 @@ +# so S4 will recognize data.table class as inheriting from data.frame +setOldClass(c("data.table", "data.frame")) + +check_compute_result <- function(object) { + errors <- character() + + if (is.na(object@name)) { + msg <- "Compute result must have a name." + errors <- c(errors, msg) + } else if (length(object@name) != 1) { + msg <- "Compute result name must have a single value." + errors <- c(errors, msg) + } + + # If computedVariableMetadata is supplied, check that it contains + # metadata for variables actually in the result data, and that + # the variable classes are correct. + if (!!length(object@computedVariableMetadata)) { + variables <- object@computedVariableMetadata + col_names <- stripEntityIdFromColumnHeader(veupathUtils::findAllColNames(variables)) + + if (!all(col_names %in% names(object@data))) { + msg <- paste("Some specified computed variables are not present in compute result data.frame") + errors <- c(errors, msg) + } + + var_classes <- unlist(lapply(as.list(variables), function(x) {x@variableClass@value})) + if (!all(var_classes %in% 'computed')) { + msg <- paste("Some specified computed variables have the wrong variable class.") + errors <- c(errors, msg) + } + + # if we have computed variable metadata, we should have data and vice versa + if (!length(object@data) || nrow(object@data) == 0) { + msg <- "Compute result must include computed variable metadata or data." + errors <- c(errors, msg) + } else { + if (any(grepl(".", names(object@data), fixed = TRUE))) { + msg <- paste("Column headers appear to be in dot notation [entityId.variableId]. They should be the raw variableId.") + errors <- c(errors, msg) + } + + expectedOutputIdColHeaders <- stripEntityIdFromColumnHeader(c(object@recordIdColumn, object@ancestorIdColumns)) + actualOutputIdColHeaders <- names(object@data)[1:length(expectedOutputIdColHeaders)] + if (all(expectedOutputIdColHeaders != actualOutputIdColHeaders)) { + msg <- paste("Columns must be ordered by recordIdColumn, ancestorIdColumns, and then data columns.") + errors <- c(errors, msg) + } + } + } else if (!length(object@statistics)) { + msg <- "Compute result must include computed variable metadata or statistics." + errors <- c(errors, msg) + } + + return(if (length(errors) == 0) TRUE else errors) +} + +#' Compute Result +#' +#' A class for consistently representing the results of computes. +#' This includes their representation in R, as JSON and how they are written to files. +#' +#' @slot data A data.frame of values where computed variables are columns and samples rows. +#' @slot name The name of the compute, ex: 'alphaDiv'. +#' @slot recordIdColumn The name of the column containing IDs for the samples. All other columns will be treated as computed values. +#' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn. +#' @slot computedVariableMetadata veupathUtils::VariableMetadataList detailing the computed variables. +#' @slot statistics An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples. +#' @slot computationDetails An optional message about the computed results. +#' @slot parameters A record of the input parameters used to generate the computed results. +#' @slot droppedColumns A character vector of column names that have been dropped for being unsuitable for the computation. +#' @name ComputeResult-class +#' @rdname ComputeResult-class +#' @export +ComputeResult <- setClass("ComputeResult", representation( + name = 'character', + data = 'data.frame', + recordIdColumn = 'character', + ancestorIdColumns = 'character', + computedVariableMetadata = 'VariableMetadataList', + statistics = 'ANY', + computationDetails = 'character', + parameters = 'character', + droppedColumns = 'character' +), prototype = prototype( + name = NA_character_, + recordIdColumn = NA_character_, + computationDetails = NA_character_, + parameters = NA_character_ +), validity = check_compute_result) \ No newline at end of file diff --git a/R/methods-ComputeResult.R b/R/methods-ComputeResult.R new file mode 100644 index 0000000..620d47a --- /dev/null +++ b/R/methods-ComputeResult.R @@ -0,0 +1,135 @@ +#' Write Computed Variable Metadata +#' +#' This function, for any given ComputeResult, +#' will return a filehandle where ComputedVariableMetadata +#' has been written in JSON format. +#' +#' @param object ComputeResult +#' @param pattern string to incorporate into tmp file name +#' @param verbose boolean indicating if timed logging is desired +#' @return filehandle where JSON representation of ComputedVariableMetadata can be found +#' @export +setGeneric("writeMeta", + function(object, pattern = NULL, verbose = c(TRUE, FALSE)) standardGeneric("writeMeta"), + signature = c("object") +) + +#'@export +setMethod("writeMeta", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) { + verbose <- veupathUtils::matchArg(verbose) + + outJson <- veupathUtils::toJSON(object@computedVariableMetadata) + + if (is.null(pattern)) { + pattern <- object@name + if (is.null(pattern)) { + pattern <- 'file' + } + } + pattern <- paste0(pattern, '-meta-') + + outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".json")) + write(outJson, outFileName) + veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose) + + return(outFileName) +}) + +#' Write Compute Result Data +#' +#' This function, for any given ComputeResult, +#' will return a filehandle where result data +#' has been written in tab delimited format. +#' @param object ComputeResult +#' @param pattern string to incorporate into tmp file name +#' @param verbose boolean indicating if timed logging is desired +#' @return filehandle where tab delimited representation of result data can be found +#' @export +setGeneric("writeData", + function(object, pattern = NULL, verbose = c(TRUE, FALSE)) standardGeneric("writeData"), + signature = c("object") +) + +#'@export +setMethod("writeData", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) { + verbose <- veupathUtils::matchArg(verbose) + + if (is.null(pattern)) { + pattern <- object@name + if (is.null(pattern)) { + pattern <- 'file' + } + } + pattern <- paste0(pattern, '-data-') + + outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".tab")) + data.table::fwrite(object@data, outFileName, sep = '\t', quote = FALSE) + veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose) + + return(outFileName) +}) + + + + +#' Write Computed Variable Statistics +#' +#' This function, for any given ComputeResult, +#' will return a filehandle where the statistics table +#' has been written in JSON format. +#' +#' @param object ComputeResult with statistics +#' @param pattern string to incorporate into tmp file name +#' @param verbose boolean indicating if timed logging is desired +#' @return filehandle where JSON representation of ComputeResult statistics can be found +#' @export +setGeneric("writeStatistics", + function(object, pattern = NULL, verbose = c(TRUE, FALSE)) standardGeneric("writeStatistics"), + signature = c("object") +) + +#'@export +setMethod("writeStatistics", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) { + verbose <- veupathUtils::matchArg(verbose) + + # Convert all to character but maintain structure + if (inherits(object@statistics, 'data.frame')) { + outObject <- data.frame(lapply(object@statistics, as.character)) + } else { + outObject <- object@statistics + } + + outJson <- jsonlite::toJSON(outObject) + + if (is.null(pattern)) { + pattern <- object@name + if (is.null(pattern)) { + pattern <- 'file' + } + } + pattern <- paste0(pattern, '-statistics-') + + outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".json")) + write(outJson, outFileName) + veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose) + + return(outFileName) +}) + +setMethod("toJSON", signature("CorrelationResult"), function(object, ...) { + tmp <- character() + + # The outputted CorrelationResult json object should have properties data1Metadata, data2Metadata, and statistics. + # All values in statistics should be strings. + tmp <- paste0(tmp, '"data1Metadata": ', jsonlite::toJSON(jsonlite::unbox(object@data1Metadata)), ',') + tmp <- paste0(tmp, '"data2Metadata": ', jsonlite::toJSON(jsonlite::unbox(object@data2Metadata)), ',') + outObject <- data.frame(lapply(object@statistics, as.character)) + tmp <- paste0(tmp, paste0('"statistics": ', jsonlite::toJSON(outObject))) + + tmp <- paste0("{", tmp, "}") + return(tmp) +}) + +# these let jsonlite::toJSON work by using the custom toJSON method for our custom result class +asJSONGeneric <- getGeneric("asJSON", package = "jsonlite") +setMethod(asJSONGeneric, "CorrelationResult", function(x, ...) toJSON(x)) \ No newline at end of file diff --git a/tests/testthat/test-ComputeResult.R b/tests/testthat/test-ComputeResult.R new file mode 100644 index 0000000..958c26c --- /dev/null +++ b/tests/testthat/test-ComputeResult.R @@ -0,0 +1,146 @@ +test_that('ComputeResult validation works', { + + nSamples <- 200 + df <- data.table::data.table( + 'entity.SampleID' = paste0('sample_', 1:nSamples), + 'entity.contA' = rnorm(nSamples) + ) + names(df) <- stripEntityIdFromColumnHeader(names(df)) + df$alphaDiversity <- .1 + + computedVariableMetadata <- veupathUtils::VariableMetadata( + variableClass = veupathUtils::VariableClass(value = "computed"), + variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), + plotReference = veupathUtils::PlotReference(value = "yAxis"), + displayName = "Alpha Diversity", + displayRangeMin = 0, + displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1), + dataType = veupathUtils::DataType(value = "NUMBER"), + dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + ) + + expect_error(new("ComputeResult", + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + data = df)) + expect_error(new("ComputeResult", + name = c('alphaDiv', 'test'), + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + data = df)) + expect_error(new("ComputeResult", + name = NULL, + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = NULL, + data = df)) + expect_error(new("ComputeResult", + name = NA, + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = NULL, + data = df)) + expect_error(new("ComputeResult", + name = 'alphaDiv', + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = NULL, + data = df)) + expect_error(new("ComputeResult", + name = 'alphaDiv', + recordIdColumn = 'entity.SampleID', + data = df)) + expect_error(new("ComputeResult", + name = 'alphaDiv', + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = NA, + data = df)) + + df$alphaDiversity <- NULL + expect_error(new("ComputeResult", + name = c('alphaDiv'), + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + data = df)) + + df$alphaDiversity <- .1 + computedVariableMetadata <- veupathUtils::VariableMetadata( + variableClass = veupathUtils::VariableClass(value = "native"), + variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), + plotReference = veupathUtils::PlotReference(value = "yAxis"), + displayName = "Alpha Diversity", + displayRangeMin = 0, + displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1), + dataType = veupathUtils::DataType(value = "NUMBER"), + dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + ) + + expect_error(new("ComputeResult", + name = c('alphaDiv'), + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + data = df)) + + # TODO test that extra cols in data will err, dot notation of output cols or not, column order +}) + +test_that("ComputeResult writeMeta method returns well formatted json", { + nSamples <- 200 + df <- data.table::data.table( + 'entity.SampleID' = paste0('sample_', 1:nSamples), + 'entity.contA' = rnorm(nSamples) + ) + names(df) <- stripEntityIdFromColumnHeader(names(df)) + df$alphaDiversity <- .1 + + computedVariableMetadata <- veupathUtils::VariableMetadata( + variableClass = veupathUtils::VariableClass(value = "computed"), + variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'), + plotReference = veupathUtils::PlotReference(value = "yAxis"), + displayName = "Alpha Diversity", + displayRangeMin = 0, + displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1), + dataType = veupathUtils::DataType(value = "NUMBER"), + dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + ) + + result <- new("ComputeResult", + name = "alphaDiv", + recordIdColumn = 'entity.SampleID', + computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)), + data = df) + + jsonlist <- jsonlite::fromJSON(veupathUtils::toJSON(result@computedVariableMetadata)) + + expect_equal(names(jsonlist), 'variables') + expect_equal(names(jsonlist$variables), c("variableClass","variableSpec","plotReference","displayName","displayRangeMin","displayRangeMax","dataType","dataShape","isCollection","imputeZero","hasStudyDependentVocabulary")) +}) + + +test_that("ComputeResult writeStatistics returns formatted json.", { + + nStats <- 100 + nSamples <- 200 + df <- data.table::data.table( + 'entity.SampleID' = paste0('sample_', 1:nSamples), + 'entity.contA' = rnorm(nSamples) + ) + names(df) <- stripEntityIdFromColumnHeader(names(df)) + result <- new("ComputeResult", + name = "differentialAbundance", + recordIdColumn = 'entity.SampleID', + statistics = data.frame(list("stat_float" = rnorm(nStats, sd=5), + "stat_char" = sample(letters,nStats,replace=T), + "stat_with_missing" = sample(c(1.2, 3, NA), nStats, replace=T) + )), + data = df) + + # transform to character, just like in the real method + charObject <- data.frame(lapply(result@statistics, as.character)) + outJson <- jsonlite::toJSON(charObject) + jsonList <- jsonlite::fromJSON(outJson) + + expect_equal(length(jsonList$stat_float), nStats) # Check we still have enough stats + expect_equal(names(jsonList), c('stat_float','stat_char','stat_with_missing')) + expect_equal(unname(unlist(lapply(jsonList,class))), c('character','character','character')) + expect_equal(jsonList$stat_with_missing, as.character(result@statistics$stat_with_missing)) + expect_equal(jsonList$stat_float, as.character(result@statistics$stat_float)) + +}) From a50dee2efdc0e0402c6261f70be4e6ca5b8389c2 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 26 Feb 2024 11:26:25 -0500 Subject: [PATCH 4/6] move some general correlation stuff out of mbio computes package --- R/class-CorrelationResult.R | 34 ++++ R/method-correlation.R | 291 ++++++++++++++++++++++++++++++ tests/testthat/test-correlation.R | 100 ++++++++++ 3 files changed, 425 insertions(+) create mode 100644 R/class-CorrelationResult.R create mode 100644 R/method-correlation.R create mode 100644 tests/testthat/test-correlation.R diff --git a/R/class-CorrelationResult.R b/R/class-CorrelationResult.R new file mode 100644 index 0000000..75465ad --- /dev/null +++ b/R/class-CorrelationResult.R @@ -0,0 +1,34 @@ +check_correlation_result <- function(object) { + errors <- character() + + if (!length(object@data1Metadata) > 0) { + msg <- "Compute result must include information about data1 in the data1Metadata slot" + errors <- c(errors, msg) + } else if (!length(object@data2Metadata) > 0) { + msg <- "Compute result must include information about data2 in the data2Metadata slot" + errors <- c(errors, msg) + } + + return(if (length(errors) == 0) TRUE else errors) +} + + +#' Corrleation Result +#' +#' A class to represent the computed results from a correlation calculation. +#' This includes their representation in R, as JSON, and how they are written to files. +#' +#' @slot statistics An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples. +#' @slot data1Metadata A character describing data1. Should at least describe the name of data1. +#' @slot data2Metadata A character describing data2. Should at least describe the name of data2. +#' @name CorrelationResult-class +#' @rdname CorrelationResult-class +#' @export +CorrelationResult <- setClass("CorrelationResult", representation( + data1Metadata = "character", + data2Metadata = "character", + statistics = "data.frame" +), prototype = prototype( + data1Metadata = NA_character_, + data2Metadata = NA_character_ +), validity = check_correlation_result) \ No newline at end of file diff --git a/R/method-correlation.R b/R/method-correlation.R new file mode 100644 index 0000000..10ee496 --- /dev/null +++ b/R/method-correlation.R @@ -0,0 +1,291 @@ +#' Predicate Factory +#' +#' This function creates a predicate function based on a string defining the type of predicate to run and a numeric value. +#' The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated +#' with each predicate type is the threshold for the predicate to be true. +#' +#' @param predicateType string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd' +#' @param threshold numeric value associated with the predicate type +#' @return Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE) +#' @export +setGeneric("predicateFactory", + function(predicateType, threshold) standardGeneric("predicateFactory"), + signature = c("predicateType", "threshold") +) + + +#' Predicate Factory +#' +#' This function creates a predicate function based on a string defining the type of predicate to run and a numeric value. +#' The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated +#' with each predicate type is the threshold for the predicate to be true. +#' +#' @param predicateType string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd' +#' @param threshold numeric value associated with the predicate type +#' @return Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE) +#' @export +setMethod("predicateFactory", signature("character", "numeric"), function(predicateType = c('proportionNonZero', 'variance', 'sd'), threshold = 0.5) { + predicateType <- veupathUtils::matchArg(predicateType) + + if (predicateType == 'proportionNonZero') { + if (threshold < 0 | threshold > 1) { + stop('threshold must be between 0 and 1 for proportionNonZero') + } + return(function(x){sum(x > 0) >= length(x) * threshold}) + + } else if (predicateType == 'variance') { + if (threshold < 0) { + stop('threshold must be greater than 0 for variance') + } + return(function(x){var(x) > threshold}) + + } else if (predicateType == 'sd') { + if (threshold < 0) { + stop('threshold must be greater than 0 for sd') + } + return(function(x){sd(x) > threshold}) + } + +}) + +#' Correlation +#' +#' This function returns correlation coefficients for variables in one dataset against variables in a second dataset +#' +#' @param data1 first dataset. An AbundanceData object or data.table +#' @param data2 second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table +#' @param method string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2. +#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. +#' @param verbose boolean indicating if timed logging is desired +#' @return data.frame with correlation coefficients or a ComputeResult object +#' @import veupathUtils +#' @import data.table +#' @useDynLib microbiomeComputations +#' @export +setGeneric("correlation", + function(data1, data2, method, format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE), ...) standardGeneric("correlation"), + signature = c("data1","data2") +) + +############ These methods should be used with caution. It is recommended to build methods for specific classes instead. ############ +############ That allows for proper validation of inputs (sparcc is for compositional data for example). ############ +############ See microbiomeComputations for an example. ############ + +#' Correlation +#' +#' This function returns correlation coefficients for all columns in one data table with all columns in a second data table. +#' +#' @param data1 data.table with columns as variables. All columns must be numeric. One row per sample. +#' @param data2 data.table with columns as variables. All columns must be numeric. One row per sample. Will correlate all columns of data2 with all columns of data1. +#' @param method string defining the type of correlation to run. The currently supported values are 'spearman' and 'pearson' +#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. +#' @param verbose boolean indicating if timed logging is desired +#' @importFrom Hmisc rcorr +#' @return data.frame with correlation coefficients +setMethod("correlation", signature("data.table", "data.table"), +function(data1, data2, method = c('spearman','pearson'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE)) { + + format <- veupathUtils::matchArg(format) + method <- veupathUtils::matchArg(method) + verbose <- veupathUtils::matchArg(verbose) + + # Check that the number of rows match. + if (!identical(nrow(data1), nrow(data2))) { + stop("data1 and data2 must have the same number of rows.") + } + + # Check that all values are numeric + if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { stop("All columns in data1 must be numeric.")} + if (!identical(veupathUtils::findNumericCols(data2), names(data2))) { stop("All columns in data2 must be numeric.")} + + + ## Compute correlation + #corrResult <- data.table::as.data.table(cor(data1, data2, method = method, use='na.or.complete'), keep.rownames = T) + lastData1ColIndex <- length(data1) + firstData2ColIndex <- length(data1) + 1 + corrResult <- Hmisc::rcorr(as.matrix(data1), as.matrix(data2), type = method) + # this bc Hmisc::rcorr cbinds the two data.tables and runs the correlation + # so we need to extract only the relevant values + # of note, seems subsetting a matrix to have a single row/ column drops rownames by default. adding drop=F to prevent this. + pVals <- data.table::as.data.table(corrResult$P[1:lastData1ColIndex, firstData2ColIndex:length(colnames(corrResult$P)), drop = F], keep.rownames = T) + corrResult <- data.table::as.data.table(corrResult$r[1:lastData1ColIndex, firstData2ColIndex:length(colnames(corrResult$r)), drop = F], keep.rownames = T) + + veupathUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose) + + + ## Format results + meltedCorrResult <- melt(corrResult, id.vars=c('rn')) + meltedPVals <- melt(pVals, id.vars=c('rn')) + formattedCorrResult <- data.frame( + data1 = meltedCorrResult[['rn']], + data2 = meltedCorrResult[['variable']], + correlationCoef = meltedCorrResult[['value']], + # should we do a merge just to be sure? + pValue = meltedPVals[['value']] + ) + + if (format == 'data.table') { + return(formattedCorrResult) + } else { + result <- buildCorrelationComputeResult(formattedCorrResult, data1, data2, method, verbose) + return(result) + } +}) + +#' Correlation +#' +#' This function returns correlation coefficients for all columns in one data table against themselves. +#' +#' @param data1 data.table with columns as variables. All columns must be numeric. One row per sample. +#' @param method string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc' +#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. +#' @param verbose boolean indicating if timed logging is desired +#' @return data.frame with correlation coefficients +#' @importFrom Hmisc rcorr +#' @importFrom SpiecEasi pval.sparccboot +#' @importFrom SpiecEasi sparccboot +#' @importFrom SpiecEasi sparcc +setMethod("correlation", signature("data.table", "missing"), +function(data1, data2, method = c('spearman','pearson','sparcc'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE)) { + + format <- veupathUtils::matchArg(format) + method <- veupathUtils::matchArg(method) + verbose <- veupathUtils::matchArg(verbose) + + # Check that all values are numeric + if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { stop("All columns in data1 must be numeric.")} + + ## Compute correlation + # rownames and colnames should be the same in this case + # keep matrix for now so we can use lower.tri later, expand.grid will give us the needed data.frame + if (method == 'sparcc') { + + # this is a local alias for the sparcc function from the SpiecEasi namespace that do.call can find + # if we need to customize how we call sparcc in the future, we'll need to do something like the following + # except the empty list would have args for the sparcc function + #sparcc <- get("sparcc", asNamespace("SpiecEasi")) + #statisticperm=function(data, indices) do.call("sparcc", c(list(apply(data[indices,], 2, sample)), list()))$Cor + #statisticboot=function(data, indices) do.call("sparcc", c(list(data[indices,,drop=FALSE]), list()))$Cor + + # sub-sampled `statistic` functions to pass to `boot::boot` that we can use to find pvalues + # statisticboot = function which takes data and bootstrap sample indices of the bootstapped correlation matrix + # statisticperm = function which takes data and permutated sample indices of the null correlation matrix + # the SpiecEasi defaults for these functions return the upper triangle of the correlation matrix. We do that manually later instead. + statisticperm = function(data, indices) SpiecEasi::sparcc(apply(data[indices,], 2, sample))$Cor + statisticboot = function(data, indices) SpiecEasi::sparcc(data[indices,,drop=FALSE])$Cor + + # calling the bootstrap version of sparcc and finding pvalues + result <- SpiecEasi::pval.sparccboot(SpiecEasi::sparccboot(data1, statisticboot = statisticboot, statisticperm = statisticperm, R = 100)) + # making sure results are formatted correctly for downstream use + pVals <- matrix(result$pvals, nrow = ncol(data1), ncol = ncol(data1), byrow = T) + corrResult <- result$cors + rownames(corrResult) <- colnames(corrResult) <- colnames(data1) + + } else { + + corrResult <- Hmisc::rcorr(as.matrix(data1), type = method) + pVals <- corrResult$P + corrResult <- corrResult$r + + } + + veupathUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose) + + ## Format results + rowAndColNames <- expand.grid(rownames(corrResult), colnames(corrResult)) + deDupedRowAndColNames <- rowAndColNames[as.vector(upper.tri(corrResult)),] + formattedCorrResult <- cbind(deDupedRowAndColNames, corrResult[upper.tri(corrResult)]) + formattedCorrResult <- cbind(formattedCorrResult, pVals[upper.tri(pVals)]) + colnames(formattedCorrResult) <- c("data1","data2","correlationCoef","pValue") + + if (format == 'data.table') { + return(formattedCorrResult) + } else { + result <- buildCorrelationComputeResult(formattedCorrResult, data1, data2, method, verbose) + return(result) + } +}) + +getDataMetadataType <- function(data) { + if (inherits(data, 'AbundanceData')) { + return('assay') + } else if (inherits(data, 'SampleMetadata')) { + return('sampleMetadata') + } else { + return('unknown') + } +} + +## Helper function +# should this be s4? +buildCorrelationComputeResult <- function(corrResult, data1, data2 = NULL, method = c('spearman','pearson','sparcc'), verbose = c(TRUE, FALSE)) { + method <- veupathUtils::matchArg(method) + verbose <- veupathUtils::matchArg(verbose) + + # both AbundanceData and SampleMetadata have these slots + recordIdColumn <- ifelse('recordIdColumn' %in% slotNames(data1), data1@recordIdColumn, NA_character_) + ancestorIdColumns <- ifelse('ancestorIdColumns' %in% slotNames(data1), data1@ancestorIdColumns, NA_character_) + allIdColumns <- c(recordIdColumn, ancestorIdColumns) + + ## Format results + # Construct the ComputeResult + result <- new("ComputeResult") + result@name <- 'correlation' + result@recordIdColumn <- recordIdColumn + result@ancestorIdColumns <- ancestorIdColumns + statistics <- CorrelationResult( + statistics = corrResult, + data1Metadata = getDataMetadataType(data1), + data2Metadata = ifelse(is.null(data2), getDataMetadataType(data1), getDataMetadataType(data2)) + ) + result@statistics <- statistics + result@parameters <- paste0('method = ', method) + + # The resulting data should contain only the samples actually used. + # this seems slightly complicated to generalize, and im not sure what we use it for anyhow + #result@data <- abundances[, ..allIdColumns] + #names(result@data) <- stripEntityIdFromColumnHeader(names(result@data)) + + validObject(result) + veupathUtils::logWithTime(paste('Correlation computation completed with parameters recordIdColumn=', recordIdColumn, ', method = ', method), verbose) + + return(result) +} + +#' Self Correlation +#' +#' This function returns correlation coefficients for variables in one dataset against itself +#' +#' @param data first dataset. An AbundanceData object or data.table +#' @param method string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc' +#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. +#' @param verbose boolean indicating if timed logging is desired +#' @return ComputeResult object +#' @import veupathUtils +#' @export +setGeneric("selfCorrelation", + function(data, method = c('spearman','pearson','sparcc'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE), ...) standardGeneric("selfCorrelation"), + signature = c("data") +) + +#' Self Correlation +#' +#' This function returns correlation coefficients for variables in one data.table against itself. +#' This is essentially an alias to the microbiomeComputations::correlation function. +#' +#' @param data a data.table +#' @param method string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc' +#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. +#' @param verbose boolean indicating if timed logging is desired +#' @return ComputeResult object +#' @import veupathUtils +#' @export +setMethod("selfCorrelation", signature("data.table"), +function(data, method = c('spearman','pearson','sparcc'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE)) { + + format <- veupathUtils::matchArg(format) + method <- veupathUtils::matchArg(method) + verbose <- veupathUtils::matchArg(verbose) + + correlation(data, method=method, format = format, verbose=verbose) +}) \ No newline at end of file diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R new file mode 100644 index 0000000..08f1026 --- /dev/null +++ b/tests/testthat/test-correlation.R @@ -0,0 +1,100 @@ +test_that('correlation works with two data tables', { + nSamples = 200 + + testData1 <- data.table( + "contA1" = rnorm(nSamples), + "contB1" = rnorm(nSamples), + "contC1" = rnorm(nSamples) + ) + + testData2 <- data.table( + "contA2" = rnorm(nSamples), + "contB2" = rnorm(nSamples), + "contC2" = rnorm(nSamples) + ) + + corrResult <- correlation(testData1, testData2, method = 'spearman', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), ncol(testData1) * ncol(testData2)) + expect_true(!all(is.na(corrResult$correlationCoef))) + + corrResult <- correlation(testData1, testData2, method = 'pearson', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), ncol(testData1) * ncol(testData2)) + expect_true(!all(is.na(corrResult$correlationCoef))) + + # Test with NAs + # Should compute using complete cases. + # This to try to catch regression back to the default behavior of cor, which returns results of NA if any value is NA + testData2$contA2[1] <- NA + corrResult <- correlation(testData1, testData2, method = 'pearson', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), ncol(testData1) * ncol(testData2)) + expect_true(!all(is.na(corrResult$correlationCoef))) + +}) + +test_that("correlation works with a single data table", { + nSamples = 200 + testData1 <- data.table( + "contA1" = rnorm(nSamples), + "contB1" = rnorm(nSamples), + "contC1" = rnorm(nSamples) + ) + corrResult <- correlation(testData1, method='spearman', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2) + expect_true(!all(is.na(corrResult$correlationCoef))) + + corrResult <- correlation(testData1, method='pearson', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2) + expect_true(!all(is.na(corrResult$correlationCoef))) + + corrResult <- correlation(abs(testData1), method='sparcc', format = 'data.table', verbose=F) #sparcc wont like negative values + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2) + expect_true(!all(is.na(corrResult$correlationCoef))) + + #test alias + corrResult <- selfCorrelation(testData1, method='pearson', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2) + expect_true(!all(is.na(corrResult$correlationCoef))) + + # Test with NAs + # Should compute using complete cases. + # This to try to catch regression back to the default behavior of cor, which returns results of NA if any value is NA + testData1$contA1[1] <- NA + corrResult <- correlation(testData1, method='pearson', format = 'data.table', verbose=F) + expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue")) + expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2) + expect_true(!all(is.na(corrResult$correlationCoef))) + +}) + +test_that("toJSON works as expected for the CorrelationResult class", { + + nSamples <- 200 + sampleMetadata <- data.table::data.table( + "entity.SampleID" = 1:nSamples, + "entity.contA" = rnorm(nSamples), + "entity.contB" = rnorm(nSamples), + "entity.contC" = rnorm(nSamples) + ) + + data <- data.table::data.table( + "entity.SampleID" = sampleMetadata$entity.SampleID, + "entity.cont1" = rnorm(nSamples), + "entity.cont2" = rnorm(nSamples), + "entity.cont3" = rnorm(nSamples) + ) + + result <- correlation(data, sampleMetadata, method='pearson', verbose = FALSE) + jsonList <- jsonlite::fromJSON(toJSON(result@statistics)) + expect_equal(names(jsonList), c('data1Metadata', 'data2Metadata', 'statistics')) + expect_equal(class(jsonList$data1Metadata), "character") + expect_equal(class(jsonList$data2Metadata), "character") + expect_equal(names(jsonList$statistics), c('data1', 'data2', 'correlationCoef', 'pValue')) + expect_equal(unname(unlist(lapply(jsonList$statistics, class))), c('character', 'character', 'character', 'character')) +}) From b00c2f7a39f33f1538366f51f7e861c2a179f273 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 26 Feb 2024 11:32:47 -0500 Subject: [PATCH 5/6] update generated docs --- DESCRIPTION | 6 +++- NAMESPACE | 21 ++++++++++++ R/class-ComputeResult.R | 1 + R/methods-ComputeResult.R | 1 + man/ComputeResult-class.Rd | 33 +++++++++++++++++++ man/CorrelationResult-class.Rd | 21 ++++++++++++ ...orrelation-data.table-data.table-method.Rd | 31 +++++++++++++++++ man/correlation-data.table-missing-method.Rd | 29 ++++++++++++++++ man/correlation.Rd | 32 ++++++++++++++++++ ...edicateFactory-character-numeric-method.Rd | 24 ++++++++++++++ man/predicateFactory.Rd | 21 ++++++++++++ man/selfCorrelation-data.table-method.Rd | 29 ++++++++++++++++ man/selfCorrelation.Rd | 29 ++++++++++++++++ man/stripEntityIdFromColumnHeader.Rd | 14 ++++++++ man/toJSON.Rd | 2 +- man/writeData.Rd | 23 +++++++++++++ man/writeMeta.Rd | 23 +++++++++++++ man/writeStatistics.Rd | 23 +++++++++++++ 18 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 man/ComputeResult-class.Rd create mode 100644 man/CorrelationResult-class.Rd create mode 100644 man/correlation-data.table-data.table-method.Rd create mode 100644 man/correlation-data.table-missing-method.Rd create mode 100644 man/correlation.Rd create mode 100644 man/predicateFactory-character-numeric-method.Rd create mode 100644 man/predicateFactory.Rd create mode 100644 man/selfCorrelation-data.table-method.Rd create mode 100644 man/selfCorrelation.Rd create mode 100644 man/stripEntityIdFromColumnHeader.Rd create mode 100644 man/writeData.Rd create mode 100644 man/writeMeta.Rd create mode 100644 man/writeStatistics.Rd diff --git a/DESCRIPTION b/DESCRIPTION index ef1cd00..943a5ca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,14 +31,18 @@ Config/testthat/edition: 3 Collate: 'class-Bin.R' 'class-VariableMetadata.R' + 'class-ComputeResult.R' + 'class-CorrelationResult.R' 'class-Megastudy.R' 'class-Range.R' 'class-Statistic.R' 'data.R' + 'method-correlation.R' 'methods-Bin.R' + 'methods-VariableMetadata.R' + 'methods-ComputeResult.R' 'methods-Megastudy.R' 'methods-Statistic.R' - 'methods-VariableMetadata.R' 'utils-classes.R' 'utils-cut.R' 'utils-numeric.R' diff --git a/NAMESPACE b/NAMESPACE index 3cafd57..70ef4cb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,8 @@ S3method(validateNumericCols,default) S3method(validateNumericCols,list) export(Bin) export(BinList) +export(ComputeResult) +export(CorrelationResult) export(DataShape) export(DataType) export(Megastudy) @@ -25,6 +27,7 @@ export(VariableMetadataList) export(VariableSpec) export(VariableSpecList) export(as.data.table) +export(correlation) export(cut_interval) export(cut_number) export(cut_width) @@ -62,12 +65,15 @@ export(merge) export(naToZero) export(new_data_frame) export(nonZeroRound) +export(predicateFactory) export(removeAttr) +export(selfCorrelation) export(setAttrFromList) export(setNaToZero) export(setroworder) export(shiftToNonNeg) export(strSplit) +export(stripEntityIdFromColumnHeader) export(toJSON) export(toStringOrNull) export(toStringOrPoint) @@ -76,8 +82,13 @@ export(updateAttrById) export(validateNumericCols) export(whichValuesInBin) export(whichValuesInBinList) +export(writeData) +export(writeMeta) +export(writeStatistics) exportClasses(Bin) exportClasses(BinList) +exportClasses(ComputeResult) +exportClasses(CorrelationResult) exportClasses(DataShape) exportClasses(DataType) exportClasses(Megastudy) @@ -118,13 +129,23 @@ exportMethods(getStudyIdColumnName) exportMethods(getVariableSpec) exportMethods(getVariableSpecColumnName) exportMethods(merge) +exportMethods(predicateFactory) +exportMethods(selfCorrelation) exportMethods(toJSON) exportMethods(whichValuesInBin) exportMethods(whichValuesInBinList) +exportMethods(writeData) +exportMethods(writeMeta) +exportMethods(writeStatistics) import(data.table) +importFrom(Hmisc,rcorr) importFrom(S4Vectors,SimpleList) +importFrom(SpiecEasi,pval.sparccboot) +importFrom(SpiecEasi,sparcc) +importFrom(SpiecEasi,sparccboot) importFrom(digest,digest) importFrom(microbenchmark,microbenchmark) importFrom(purrr,map) importFrom(purrr,map_lgl) importFrom(stringi,stri_detect_regex) +useDynLib(microbiomeComputations) diff --git a/R/class-ComputeResult.R b/R/class-ComputeResult.R index 267e7d3..38c7054 100644 --- a/R/class-ComputeResult.R +++ b/R/class-ComputeResult.R @@ -72,6 +72,7 @@ check_compute_result <- function(object) { #' @name ComputeResult-class #' @rdname ComputeResult-class #' @export +#' @include class-VariableMetadata.R ComputeResult <- setClass("ComputeResult", representation( name = 'character', data = 'data.frame', diff --git a/R/methods-ComputeResult.R b/R/methods-ComputeResult.R index 620d47a..4f0c0f8 100644 --- a/R/methods-ComputeResult.R +++ b/R/methods-ComputeResult.R @@ -116,6 +116,7 @@ setMethod("writeStatistics", signature("ComputeResult"), function(object, patter return(outFileName) }) +#' @include methods-VariableMetadata.R setMethod("toJSON", signature("CorrelationResult"), function(object, ...) { tmp <- character() diff --git a/man/ComputeResult-class.Rd b/man/ComputeResult-class.Rd new file mode 100644 index 0000000..1930df3 --- /dev/null +++ b/man/ComputeResult-class.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/class-ComputeResult.R +\docType{class} +\name{ComputeResult-class} +\alias{ComputeResult-class} +\alias{ComputeResult} +\title{Compute Result} +\description{ +A class for consistently representing the results of computes. +This includes their representation in R, as JSON and how they are written to files. +} +\section{Slots}{ + +\describe{ +\item{\code{data}}{A data.frame of values where computed variables are columns and samples rows.} + +\item{\code{name}}{The name of the compute, ex: 'alphaDiv'.} + +\item{\code{recordIdColumn}}{The name of the column containing IDs for the samples. All other columns will be treated as computed values.} + +\item{\code{ancestorIdColumns}}{A character vector of column names representing parent entities of the recordIdColumn.} + +\item{\code{computedVariableMetadata}}{veupathUtils::VariableMetadataList detailing the computed variables.} + +\item{\code{statistics}}{An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.} + +\item{\code{computationDetails}}{An optional message about the computed results.} + +\item{\code{parameters}}{A record of the input parameters used to generate the computed results.} + +\item{\code{droppedColumns}}{A character vector of column names that have been dropped for being unsuitable for the computation.} +}} + diff --git a/man/CorrelationResult-class.Rd b/man/CorrelationResult-class.Rd new file mode 100644 index 0000000..2fe0c98 --- /dev/null +++ b/man/CorrelationResult-class.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/class-CorrelationResult.R +\docType{class} +\name{CorrelationResult-class} +\alias{CorrelationResult-class} +\alias{CorrelationResult} +\title{Corrleation Result} +\description{ +A class to represent the computed results from a correlation calculation. +This includes their representation in R, as JSON, and how they are written to files. +} +\section{Slots}{ + +\describe{ +\item{\code{statistics}}{An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.} + +\item{\code{data1Metadata}}{A character describing data1. Should at least describe the name of data1.} + +\item{\code{data2Metadata}}{A character describing data2. Should at least describe the name of data2.} +}} + diff --git a/man/correlation-data.table-data.table-method.Rd b/man/correlation-data.table-data.table-method.Rd new file mode 100644 index 0000000..8b3d54b --- /dev/null +++ b/man/correlation-data.table-data.table-method.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{correlation,data.table,data.table-method} +\alias{correlation,data.table,data.table-method} +\title{Correlation} +\usage{ +\S4method{correlation}{data.table,data.table}( + data1, + data2, + method = c("spearman", "pearson"), + format = c("ComputeResult", "data.table"), + verbose = c(TRUE, FALSE) +) +} +\arguments{ +\item{data1}{data.table with columns as variables. All columns must be numeric. One row per sample.} + +\item{data2}{data.table with columns as variables. All columns must be numeric. One row per sample. Will correlate all columns of data2 with all columns of data1.} + +\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman' and 'pearson'} + +\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +data.frame with correlation coefficients +} +\description{ +This function returns correlation coefficients for all columns in one data table with all columns in a second data table. +} diff --git a/man/correlation-data.table-missing-method.Rd b/man/correlation-data.table-missing-method.Rd new file mode 100644 index 0000000..d3adbf8 --- /dev/null +++ b/man/correlation-data.table-missing-method.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{correlation,data.table,missing-method} +\alias{correlation,data.table,missing-method} +\title{Correlation} +\usage{ +\S4method{correlation}{data.table,missing}( + data1, + data2, + method = c("spearman", "pearson", "sparcc"), + format = c("ComputeResult", "data.table"), + verbose = c(TRUE, FALSE) +) +} +\arguments{ +\item{data1}{data.table with columns as variables. All columns must be numeric. One row per sample.} + +\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc'} + +\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +data.frame with correlation coefficients +} +\description{ +This function returns correlation coefficients for all columns in one data table against themselves. +} diff --git a/man/correlation.Rd b/man/correlation.Rd new file mode 100644 index 0000000..c832b46 --- /dev/null +++ b/man/correlation.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{correlation} +\alias{correlation} +\title{Correlation} +\usage{ +correlation( + data1, + data2, + method, + format = c("ComputeResult", "data.table"), + verbose = c(TRUE, FALSE), + ... +) +} +\arguments{ +\item{data1}{first dataset. An AbundanceData object or data.table} + +\item{data2}{second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table} + +\item{method}{string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2.} + +\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +data.frame with correlation coefficients or a ComputeResult object +} +\description{ +This function returns correlation coefficients for variables in one dataset against variables in a second dataset +} diff --git a/man/predicateFactory-character-numeric-method.Rd b/man/predicateFactory-character-numeric-method.Rd new file mode 100644 index 0000000..6819fd1 --- /dev/null +++ b/man/predicateFactory-character-numeric-method.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{predicateFactory,character,numeric-method} +\alias{predicateFactory,character,numeric-method} +\title{Predicate Factory} +\usage{ +\S4method{predicateFactory}{character,numeric}( + predicateType = c("proportionNonZero", "variance", "sd"), + threshold = 0.5 +) +} +\arguments{ +\item{predicateType}{string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd'} + +\item{threshold}{numeric value associated with the predicate type} +} +\value{ +Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE) +} +\description{ +This function creates a predicate function based on a string defining the type of predicate to run and a numeric value. +The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated +with each predicate type is the threshold for the predicate to be true. +} diff --git a/man/predicateFactory.Rd b/man/predicateFactory.Rd new file mode 100644 index 0000000..17648d0 --- /dev/null +++ b/man/predicateFactory.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{predicateFactory} +\alias{predicateFactory} +\title{Predicate Factory} +\usage{ +predicateFactory(predicateType, threshold) +} +\arguments{ +\item{predicateType}{string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd'} + +\item{threshold}{numeric value associated with the predicate type} +} +\value{ +Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE) +} +\description{ +This function creates a predicate function based on a string defining the type of predicate to run and a numeric value. +The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated +with each predicate type is the threshold for the predicate to be true. +} diff --git a/man/selfCorrelation-data.table-method.Rd b/man/selfCorrelation-data.table-method.Rd new file mode 100644 index 0000000..061c472 --- /dev/null +++ b/man/selfCorrelation-data.table-method.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{selfCorrelation,data.table-method} +\alias{selfCorrelation,data.table-method} +\title{Self Correlation} +\usage{ +\S4method{selfCorrelation}{data.table}( + data, + method = c("spearman", "pearson", "sparcc"), + format = c("ComputeResult", "data.table"), + verbose = c(TRUE, FALSE) +) +} +\arguments{ +\item{data}{a data.table} + +\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc'} + +\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +ComputeResult object +} +\description{ +This function returns correlation coefficients for variables in one data.table against itself. +This is essentially an alias to the microbiomeComputations::correlation function. +} diff --git a/man/selfCorrelation.Rd b/man/selfCorrelation.Rd new file mode 100644 index 0000000..a8168d0 --- /dev/null +++ b/man/selfCorrelation.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-correlation.R +\name{selfCorrelation} +\alias{selfCorrelation} +\title{Self Correlation} +\usage{ +selfCorrelation( + data, + method = c("spearman", "pearson", "sparcc"), + format = c("ComputeResult", "data.table"), + verbose = c(TRUE, FALSE), + ... +) +} +\arguments{ +\item{data}{first dataset. An AbundanceData object or data.table} + +\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc'} + +\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +ComputeResult object +} +\description{ +This function returns correlation coefficients for variables in one dataset against itself +} diff --git a/man/stripEntityIdFromColumnHeader.Rd b/man/stripEntityIdFromColumnHeader.Rd new file mode 100644 index 0000000..9ecd36f --- /dev/null +++ b/man/stripEntityIdFromColumnHeader.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{stripEntityIdFromColumnHeader} +\alias{stripEntityIdFromColumnHeader} +\title{Strip Entity ID from Column Header} +\usage{ +stripEntityIdFromColumnHeader(columnNames) +} +\arguments{ +\item{columnNames}{character vector} +} +\description{ +This function strips entity ID from column headers for EDA formatted tabular data. +} diff --git a/man/toJSON.Rd b/man/toJSON.Rd index 7c9af0c..9bbfb4d 100644 --- a/man/toJSON.Rd +++ b/man/toJSON.Rd @@ -4,7 +4,7 @@ \alias{toJSON} \title{R object as JSON string} \usage{ -toJSON(object, named = c(TRUE, FALSE)) +toJSON(object, named = c(TRUE, FALSE), ...) } \arguments{ \item{object}{object of a supported S4 class to convert to a JSON string representation} diff --git a/man/writeData.Rd b/man/writeData.Rd new file mode 100644 index 0000000..a50d53f --- /dev/null +++ b/man/writeData.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-ComputeResult.R +\name{writeData} +\alias{writeData} +\title{Write Compute Result Data} +\usage{ +writeData(object, pattern = NULL, verbose = c(TRUE, FALSE)) +} +\arguments{ +\item{object}{ComputeResult} + +\item{pattern}{string to incorporate into tmp file name} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +filehandle where tab delimited representation of result data can be found +} +\description{ +This function, for any given ComputeResult, +will return a filehandle where result data +has been written in tab delimited format. +} diff --git a/man/writeMeta.Rd b/man/writeMeta.Rd new file mode 100644 index 0000000..c4e1093 --- /dev/null +++ b/man/writeMeta.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-ComputeResult.R +\name{writeMeta} +\alias{writeMeta} +\title{Write Computed Variable Metadata} +\usage{ +writeMeta(object, pattern = NULL, verbose = c(TRUE, FALSE)) +} +\arguments{ +\item{object}{ComputeResult} + +\item{pattern}{string to incorporate into tmp file name} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +filehandle where JSON representation of ComputedVariableMetadata can be found +} +\description{ +This function, for any given ComputeResult, +will return a filehandle where ComputedVariableMetadata +has been written in JSON format. +} diff --git a/man/writeStatistics.Rd b/man/writeStatistics.Rd new file mode 100644 index 0000000..8ae195e --- /dev/null +++ b/man/writeStatistics.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-ComputeResult.R +\name{writeStatistics} +\alias{writeStatistics} +\title{Write Computed Variable Statistics} +\usage{ +writeStatistics(object, pattern = NULL, verbose = c(TRUE, FALSE)) +} +\arguments{ +\item{object}{ComputeResult with statistics} + +\item{pattern}{string to incorporate into tmp file name} + +\item{verbose}{boolean indicating if timed logging is desired} +} +\value{ +filehandle where JSON representation of ComputeResult statistics can be found +} +\description{ +This function, for any given ComputeResult, +will return a filehandle where the statistics table +has been written in JSON format. +} From e511d47eed2aca6e53e98093ac5dc95d72188e3d Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 28 Feb 2024 09:06:18 -0500 Subject: [PATCH 6/6] update documentation for correlation method --- R/method-correlation.R | 6 +++--- man/correlation.Rd | 4 ++-- man/selfCorrelation.Rd | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/method-correlation.R b/R/method-correlation.R index 10ee496..3546a69 100644 --- a/R/method-correlation.R +++ b/R/method-correlation.R @@ -52,8 +52,8 @@ setMethod("predicateFactory", signature("character", "numeric"), function(predic #' #' This function returns correlation coefficients for variables in one dataset against variables in a second dataset #' -#' @param data1 first dataset. An AbundanceData object or data.table -#' @param data2 second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table +#' @param data1 first dataset. A data.table +#' @param data2 second dataset. A data.table #' @param method string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2. #' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. #' @param verbose boolean indicating if timed logging is desired @@ -256,7 +256,7 @@ buildCorrelationComputeResult <- function(corrResult, data1, data2 = NULL, metho #' #' This function returns correlation coefficients for variables in one dataset against itself #' -#' @param data first dataset. An AbundanceData object or data.table +#' @param data first dataset. A data.table #' @param method string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc' #' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'. #' @param verbose boolean indicating if timed logging is desired diff --git a/man/correlation.Rd b/man/correlation.Rd index c832b46..9ae128a 100644 --- a/man/correlation.Rd +++ b/man/correlation.Rd @@ -14,9 +14,9 @@ correlation( ) } \arguments{ -\item{data1}{first dataset. An AbundanceData object or data.table} +\item{data1}{first dataset. A data.table} -\item{data2}{second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table} +\item{data2}{second dataset. A data.table} \item{method}{string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2.} diff --git a/man/selfCorrelation.Rd b/man/selfCorrelation.Rd index a8168d0..ebe00b5 100644 --- a/man/selfCorrelation.Rd +++ b/man/selfCorrelation.Rd @@ -13,7 +13,7 @@ selfCorrelation( ) } \arguments{ -\item{data}{first dataset. An AbundanceData object or data.table} +\item{data}{first dataset. A data.table} \item{method}{string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc'}