From 5a271597c49f771a3f8c5c52cd2ffaccae639c60 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 26 Feb 2024 11:24:31 -0500
Subject: [PATCH 1/6] allow ... args to toJSON

---
 R/methods-VariableMetadata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/methods-VariableMetadata.R b/R/methods-VariableMetadata.R
index c9c18eb..8707aed 100644
--- a/R/methods-VariableMetadata.R
+++ b/R/methods-VariableMetadata.R
@@ -31,7 +31,7 @@ setMethod("merge", signature("VariableSpecList", "VariableSpecList"), function(x
 #' @return character vector of length 1 containing JSON string
 #' @export
 setGeneric("toJSON", 
-  function(object, named = c(TRUE, FALSE)) standardGeneric("toJSON"),
+  function(object, named = c(TRUE, FALSE), ...) standardGeneric("toJSON"),
   signature = "object"
 )
 

From 461fcc50261438943337ed05aca6b47a9d75a778 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 26 Feb 2024 11:25:19 -0500
Subject: [PATCH 2/6] add helper for formatting compute result columns headers

---
 R/utils.R | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/R/utils.R b/R/utils.R
index bf45752..4715196 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -104,4 +104,19 @@ setroworder <- function(x, neworder) {
 shiftToNonNeg <- function(x) {
   x <- x - min(x, na.rm = TRUE)
   return(x)
+}
+
+#' Strip Entity ID from Column Header
+#'
+#' This function strips entity ID from column headers for EDA formatted tabular data.
+#' @param columnNames character vector
+#' @export
+stripEntityIdFromColumnHeader <- function(columnNames) {
+  columnsToFix <- grepl(".", columnNames, fixed=T)
+
+  if (sum(columnsToFix) > 0) {
+    columnNames[columnsToFix] <- veupathUtils::strSplit(columnNames[columnsToFix], ".", index=2)
+  }
+
+  return(columnNames)
 }
\ No newline at end of file

From 3ad09b38d6ad63e7bba9ed187676289989e6624e Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 26 Feb 2024 11:26:11 -0500
Subject: [PATCH 3/6] move some general compute stuff out of mbio computes
 package

---
 R/class-ComputeResult.R             |  90 +++++++++++++++++
 R/methods-ComputeResult.R           | 135 +++++++++++++++++++++++++
 tests/testthat/test-ComputeResult.R | 146 ++++++++++++++++++++++++++++
 3 files changed, 371 insertions(+)
 create mode 100644 R/class-ComputeResult.R
 create mode 100644 R/methods-ComputeResult.R
 create mode 100644 tests/testthat/test-ComputeResult.R

diff --git a/R/class-ComputeResult.R b/R/class-ComputeResult.R
new file mode 100644
index 0000000..267e7d3
--- /dev/null
+++ b/R/class-ComputeResult.R
@@ -0,0 +1,90 @@
+# so S4 will recognize data.table class as inheriting from data.frame
+setOldClass(c("data.table", "data.frame"))
+
+check_compute_result <- function(object) {
+    errors <- character()
+
+    if (is.na(object@name)) {
+      msg <- "Compute result must have a name."
+      errors <- c(errors, msg)
+    } else if (length(object@name) != 1) {
+      msg <- "Compute result name must have a single value."
+      errors <- c(errors, msg) 
+    }
+
+    # If computedVariableMetadata is supplied, check that it contains
+    # metadata for variables actually in the result data, and that
+    # the variable classes are correct.
+    if (!!length(object@computedVariableMetadata)) {
+      variables <- object@computedVariableMetadata
+      col_names <- stripEntityIdFromColumnHeader(veupathUtils::findAllColNames(variables))
+
+      if (!all(col_names %in% names(object@data))) {
+        msg <- paste("Some specified computed variables are not present in compute result data.frame")
+        errors <- c(errors, msg)
+      }
+
+      var_classes <- unlist(lapply(as.list(variables), function(x) {x@variableClass@value}))
+      if (!all(var_classes %in% 'computed')) {
+        msg <- paste("Some specified computed variables have the wrong variable class.")
+        errors <- c(errors, msg) 
+      }
+    
+      # if we have computed variable metadata, we should have data and vice versa
+      if (!length(object@data) || nrow(object@data) == 0) {
+        msg <- "Compute result must include computed variable metadata or data."
+        errors <- c(errors, msg)
+      } else {
+        if (any(grepl(".", names(object@data), fixed = TRUE))) {
+          msg <- paste("Column headers appear to be in dot notation [entityId.variableId]. They should be the raw variableId.")
+          errors <- c(errors, msg)
+        }
+    
+        expectedOutputIdColHeaders <- stripEntityIdFromColumnHeader(c(object@recordIdColumn, object@ancestorIdColumns))
+        actualOutputIdColHeaders <- names(object@data)[1:length(expectedOutputIdColHeaders)]
+        if (all(expectedOutputIdColHeaders != actualOutputIdColHeaders)) {
+          msg <- paste("Columns must be ordered by recordIdColumn, ancestorIdColumns, and then data columns.")
+          errors <- c(errors, msg) 
+        }
+      }      
+    } else if (!length(object@statistics)) {
+      msg <- "Compute result must include computed variable metadata or statistics."
+      errors <- c(errors, msg)
+    }
+   
+    return(if (length(errors) == 0) TRUE else errors)
+}
+
+#' Compute Result
+#' 
+#' A class for consistently representing the results of computes. 
+#' This includes their representation in R, as JSON and how they are written to files.
+#' 
+#' @slot data A data.frame of values where computed variables are columns and samples rows.
+#' @slot name The name of the compute, ex: 'alphaDiv'.
+#' @slot recordIdColumn The name of the column containing IDs for the samples. All other columns will be treated as computed values.
+#' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn.
+#' @slot computedVariableMetadata veupathUtils::VariableMetadataList detailing the computed variables.
+#' @slot statistics An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.
+#' @slot computationDetails An optional message about the computed results.
+#' @slot parameters A record of the input parameters used to generate the computed results.
+#' @slot droppedColumns A character vector of column names that have been dropped for being unsuitable for the computation.
+#' @name ComputeResult-class
+#' @rdname ComputeResult-class
+#' @export
+ComputeResult <- setClass("ComputeResult", representation(
+    name = 'character',
+    data = 'data.frame',
+    recordIdColumn = 'character',
+    ancestorIdColumns = 'character',
+    computedVariableMetadata = 'VariableMetadataList',
+    statistics = 'ANY',
+    computationDetails = 'character',
+    parameters = 'character',
+    droppedColumns = 'character'
+), prototype = prototype(
+    name = NA_character_,
+    recordIdColumn = NA_character_,
+    computationDetails = NA_character_,
+    parameters = NA_character_
+), validity = check_compute_result)
\ No newline at end of file
diff --git a/R/methods-ComputeResult.R b/R/methods-ComputeResult.R
new file mode 100644
index 0000000..620d47a
--- /dev/null
+++ b/R/methods-ComputeResult.R
@@ -0,0 +1,135 @@
+#' Write Computed Variable Metadata
+#'
+#' This function, for any given ComputeResult, 
+#' will return a filehandle where ComputedVariableMetadata
+#' has been written in JSON format.
+#' 
+#' @param object ComputeResult 
+#' @param pattern string to incorporate into tmp file name
+#' @param verbose boolean indicating if timed logging is desired
+#' @return filehandle where JSON representation of ComputedVariableMetadata can be found
+#' @export
+setGeneric("writeMeta",
+  function(object, pattern = NULL, verbose = c(TRUE, FALSE)) standardGeneric("writeMeta"),
+  signature = c("object")
+)
+
+#'@export 
+setMethod("writeMeta", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) {
+  verbose <- veupathUtils::matchArg(verbose)
+
+  outJson <- veupathUtils::toJSON(object@computedVariableMetadata)
+
+  if (is.null(pattern)) { 
+    pattern <- object@name
+    if (is.null(pattern)) {
+      pattern <- 'file'
+    } 
+  }
+  pattern <- paste0(pattern, '-meta-')
+
+  outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".json"))
+  write(outJson, outFileName)
+  veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose)
+
+  return(outFileName)
+})
+
+#' Write Compute Result Data
+#'
+#' This function, for any given ComputeResult, 
+#' will return a filehandle where result data
+#' has been written in tab delimited format.
+#' @param object ComputeResult 
+#' @param pattern string to incorporate into tmp file name
+#' @param verbose boolean indicating if timed logging is desired
+#' @return filehandle where tab delimited representation of result data can be found
+#' @export
+setGeneric("writeData",
+  function(object, pattern = NULL, verbose = c(TRUE, FALSE)) standardGeneric("writeData"),
+  signature = c("object")
+)
+
+#'@export 
+setMethod("writeData", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) {
+  verbose <- veupathUtils::matchArg(verbose)
+
+  if (is.null(pattern)) { 
+    pattern <- object@name
+    if (is.null(pattern)) {
+      pattern <- 'file'
+    } 
+  }
+  pattern <- paste0(pattern, '-data-')
+
+  outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".tab"))
+  data.table::fwrite(object@data, outFileName, sep = '\t', quote = FALSE)
+  veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose)
+
+  return(outFileName)
+})
+
+
+
+
+#' Write Computed Variable Statistics
+#'
+#' This function, for any given ComputeResult, 
+#' will return a filehandle where the statistics table
+#' has been written in JSON format.
+#' 
+#' @param object ComputeResult with statistics
+#' @param pattern string to incorporate into tmp file name
+#' @param verbose boolean indicating if timed logging is desired
+#' @return filehandle where JSON representation of ComputeResult statistics can be found
+#' @export
+setGeneric("writeStatistics",
+  function(object, pattern = NULL, verbose = c(TRUE, FALSE)) standardGeneric("writeStatistics"),
+  signature = c("object")
+)
+
+#'@export 
+setMethod("writeStatistics", signature("ComputeResult"), function(object, pattern = NULL, verbose = c(TRUE, FALSE)) {
+  verbose <- veupathUtils::matchArg(verbose)
+
+  # Convert all to character but maintain structure
+  if (inherits(object@statistics, 'data.frame')) {
+    outObject <- data.frame(lapply(object@statistics, as.character))
+  } else {
+    outObject <- object@statistics
+  }
+
+  outJson <- jsonlite::toJSON(outObject)
+
+  if (is.null(pattern)) { 
+    pattern <- object@name
+    if (is.null(pattern)) {
+      pattern <- 'file'
+    } 
+  }
+  pattern <- paste0(pattern, '-statistics-')
+
+  outFileName <- basename(tempfile(pattern = pattern, tmpdir = tempdir(), fileext = ".json"))
+  write(outJson, outFileName)
+  veupathUtils::logWithTime(paste('New output file written:', outFileName), verbose)
+
+  return(outFileName)
+})
+
+setMethod("toJSON", signature("CorrelationResult"), function(object, ...) {
+  tmp <- character()
+
+  # The outputted CorrelationResult json object should have properties data1Metadata, data2Metadata, and statistics.
+  # All values in statistics should be strings.
+  tmp <- paste0(tmp, '"data1Metadata": ', jsonlite::toJSON(jsonlite::unbox(object@data1Metadata)), ',')
+  tmp <- paste0(tmp, '"data2Metadata": ', jsonlite::toJSON(jsonlite::unbox(object@data2Metadata)), ',')
+  outObject <- data.frame(lapply(object@statistics, as.character))
+  tmp <- paste0(tmp, paste0('"statistics": ', jsonlite::toJSON(outObject)))
+
+  tmp <- paste0("{", tmp, "}")
+  return(tmp)
+})
+
+# these let jsonlite::toJSON work by using the custom toJSON method for our custom result class
+asJSONGeneric <- getGeneric("asJSON", package = "jsonlite")
+setMethod(asJSONGeneric, "CorrelationResult", function(x, ...) toJSON(x))
\ No newline at end of file
diff --git a/tests/testthat/test-ComputeResult.R b/tests/testthat/test-ComputeResult.R
new file mode 100644
index 0000000..958c26c
--- /dev/null
+++ b/tests/testthat/test-ComputeResult.R
@@ -0,0 +1,146 @@
+test_that('ComputeResult validation works', {
+  
+  nSamples <- 200
+  df <- data.table::data.table(
+    'entity.SampleID' = paste0('sample_', 1:nSamples),
+    'entity.contA' = rnorm(nSamples)
+  )
+  names(df) <- stripEntityIdFromColumnHeader(names(df))
+  df$alphaDiversity <- .1
+
+  computedVariableMetadata <- veupathUtils::VariableMetadata(
+                 variableClass = veupathUtils::VariableClass(value = "computed"),
+                 variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'),
+                 plotReference = veupathUtils::PlotReference(value = "yAxis"),
+                 displayName = "Alpha Diversity",
+                 displayRangeMin = 0,
+                 displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1),
+                 dataType = veupathUtils::DataType(value = "NUMBER"),
+                 dataShape = veupathUtils::DataShape(value = "CONTINUOUS")
+   )
+
+  expect_error(new("ComputeResult",
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)),
+                data = df))
+  expect_error(new("ComputeResult",
+                name = c('alphaDiv', 'test'),
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)),
+                data = df))
+  expect_error(new("ComputeResult",
+                name = NULL,
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = NULL,
+                data = df))
+  expect_error(new("ComputeResult",
+                name = NA,
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = NULL,
+                data = df))
+  expect_error(new("ComputeResult",
+                name = 'alphaDiv',
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = NULL,
+                data = df))
+  expect_error(new("ComputeResult",
+                name = 'alphaDiv',
+                recordIdColumn = 'entity.SampleID',
+                data = df))
+  expect_error(new("ComputeResult",
+                name = 'alphaDiv',
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = NA,
+                data = df))
+
+  df$alphaDiversity <- NULL
+  expect_error(new("ComputeResult",
+                name = c('alphaDiv'),
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)),
+                data = df))
+
+  df$alphaDiversity <- .1
+  computedVariableMetadata <- veupathUtils::VariableMetadata(
+                 variableClass = veupathUtils::VariableClass(value = "native"),
+                 variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'),
+                 plotReference = veupathUtils::PlotReference(value = "yAxis"),
+                 displayName = "Alpha Diversity",
+                 displayRangeMin = 0,
+                 displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1),
+                 dataType = veupathUtils::DataType(value = "NUMBER"),
+                 dataShape = veupathUtils::DataShape(value = "CONTINUOUS")
+   )
+
+   expect_error(new("ComputeResult",
+                name = c('alphaDiv'),
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)),
+                data = df))
+
+  # TODO test that extra cols in data will err, dot notation of output cols or not, column order
+})
+
+test_that("ComputeResult writeMeta method returns well formatted json", {
+  nSamples <- 200
+  df <- data.table::data.table(
+    'entity.SampleID' = paste0('sample_', 1:nSamples),
+    'entity.contA' = rnorm(nSamples)
+  )
+  names(df) <- stripEntityIdFromColumnHeader(names(df))
+  df$alphaDiversity <- .1
+
+  computedVariableMetadata <- veupathUtils::VariableMetadata(
+                 variableClass = veupathUtils::VariableClass(value = "computed"),
+                 variableSpec = veupathUtils::VariableSpec(variableId = 'alphaDiversity', entityId = 'entity'),
+                 plotReference = veupathUtils::PlotReference(value = "yAxis"),
+                 displayName = "Alpha Diversity",
+                 displayRangeMin = 0,
+                 displayRangeMax = max(max(df$alphaDiversity, na.rm = TRUE),1),
+                 dataType = veupathUtils::DataType(value = "NUMBER"),
+                 dataShape = veupathUtils::DataShape(value = "CONTINUOUS")
+   )
+
+  result <- new("ComputeResult",
+                name = "alphaDiv",
+                recordIdColumn = 'entity.SampleID',
+                computedVariableMetadata = veupathUtils::VariableMetadataList(S4Vectors::SimpleList(computedVariableMetadata)),
+                data = df)
+
+  jsonlist <- jsonlite::fromJSON(veupathUtils::toJSON(result@computedVariableMetadata))
+
+  expect_equal(names(jsonlist), 'variables')
+  expect_equal(names(jsonlist$variables), c("variableClass","variableSpec","plotReference","displayName","displayRangeMin","displayRangeMax","dataType","dataShape","isCollection","imputeZero","hasStudyDependentVocabulary"))
+})
+
+
+test_that("ComputeResult writeStatistics returns formatted json.", {
+
+  nStats <- 100
+  nSamples <- 200
+  df <- data.table::data.table(
+    'entity.SampleID' = paste0('sample_', 1:nSamples),
+    'entity.contA' = rnorm(nSamples)
+  )
+  names(df) <- stripEntityIdFromColumnHeader(names(df))
+  result <- new("ComputeResult",
+              name = "differentialAbundance",
+              recordIdColumn = 'entity.SampleID',
+              statistics = data.frame(list("stat_float" = rnorm(nStats, sd=5),
+                                           "stat_char" = sample(letters,nStats,replace=T),
+                                           "stat_with_missing" = sample(c(1.2, 3, NA), nStats, replace=T)
+                                           )),
+              data = df)
+
+  # transform to character, just like in the real method
+  charObject <- data.frame(lapply(result@statistics, as.character))
+  outJson <- jsonlite::toJSON(charObject)
+  jsonList <- jsonlite::fromJSON(outJson)
+  
+  expect_equal(length(jsonList$stat_float), nStats) # Check we still have enough stats
+  expect_equal(names(jsonList), c('stat_float','stat_char','stat_with_missing'))
+  expect_equal(unname(unlist(lapply(jsonList,class))), c('character','character','character'))
+  expect_equal(jsonList$stat_with_missing, as.character(result@statistics$stat_with_missing))
+  expect_equal(jsonList$stat_float, as.character(result@statistics$stat_float))
+
+})

From a50dee2efdc0e0402c6261f70be4e6ca5b8389c2 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 26 Feb 2024 11:26:25 -0500
Subject: [PATCH 4/6] move some general correlation stuff out of mbio computes
 package

---
 R/class-CorrelationResult.R       |  34 ++++
 R/method-correlation.R            | 291 ++++++++++++++++++++++++++++++
 tests/testthat/test-correlation.R | 100 ++++++++++
 3 files changed, 425 insertions(+)
 create mode 100644 R/class-CorrelationResult.R
 create mode 100644 R/method-correlation.R
 create mode 100644 tests/testthat/test-correlation.R

diff --git a/R/class-CorrelationResult.R b/R/class-CorrelationResult.R
new file mode 100644
index 0000000..75465ad
--- /dev/null
+++ b/R/class-CorrelationResult.R
@@ -0,0 +1,34 @@
+check_correlation_result <- function(object) {
+    errors <- character()
+
+    if (!length(object@data1Metadata) > 0) {
+      msg <- "Compute result must include information about data1 in the data1Metadata slot"
+      errors <- c(errors, msg)
+    } else if (!length(object@data2Metadata) > 0) {
+      msg <- "Compute result must include information about data2 in the data2Metadata slot"
+      errors <- c(errors, msg) 
+    }
+
+    return(if (length(errors) == 0) TRUE else errors)
+}
+
+
+#' Corrleation Result
+#' 
+#' A class to represent the computed results from a correlation calculation.
+#' This includes their representation in R, as JSON, and how they are written to files.
+#' 
+#' @slot statistics An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.
+#' @slot data1Metadata A character describing data1. Should at least describe the name of data1.
+#' @slot data2Metadata A character describing data2. Should at least describe the name of data2.
+#' @name CorrelationResult-class
+#' @rdname CorrelationResult-class
+#' @export
+CorrelationResult <- setClass("CorrelationResult", representation(
+    data1Metadata = "character",
+    data2Metadata = "character",
+    statistics = "data.frame"
+), prototype = prototype(
+    data1Metadata = NA_character_,
+    data2Metadata = NA_character_
+), validity = check_correlation_result)
\ No newline at end of file
diff --git a/R/method-correlation.R b/R/method-correlation.R
new file mode 100644
index 0000000..10ee496
--- /dev/null
+++ b/R/method-correlation.R
@@ -0,0 +1,291 @@
+#' Predicate Factory
+#' 
+#' This function creates a predicate function based on a string defining the type of predicate to run and a numeric value. 
+#' The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated
+#' with each predicate type is the threshold for the predicate to be true.
+#' 
+#' @param predicateType string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd'
+#' @param threshold numeric value associated with the predicate type
+#' @return Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE)
+#' @export
+setGeneric("predicateFactory",
+  function(predicateType, threshold) standardGeneric("predicateFactory"),
+  signature = c("predicateType", "threshold")
+)
+
+
+#' Predicate Factory
+#'
+#' This function creates a predicate function based on a string defining the type of predicate to run and a numeric value. 
+#' The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated
+#' with each predicate type is the threshold for the predicate to be true.
+#' 
+#' @param predicateType string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd'
+#' @param threshold numeric value associated with the predicate type
+#' @return Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE)
+#' @export
+setMethod("predicateFactory", signature("character", "numeric"), function(predicateType = c('proportionNonZero', 'variance', 'sd'), threshold = 0.5) {
+  predicateType <- veupathUtils::matchArg(predicateType)
+
+  if (predicateType == 'proportionNonZero') {
+    if (threshold < 0 | threshold > 1) {
+      stop('threshold must be between 0 and 1 for proportionNonZero')
+    }
+    return(function(x){sum(x > 0) >= length(x) * threshold})
+
+  } else if (predicateType == 'variance') {
+    if (threshold < 0) {
+      stop('threshold must be greater than 0 for variance')
+    }
+    return(function(x){var(x) > threshold})
+
+  } else if (predicateType == 'sd') {
+    if (threshold < 0) {
+      stop('threshold must be greater than 0 for sd')
+    }
+    return(function(x){sd(x) > threshold})
+  }
+
+})
+
+#' Correlation
+#'
+#' This function returns correlation coefficients for variables in one dataset against variables in a second dataset
+#' 
+#' @param data1 first dataset. An AbundanceData object or data.table
+#' @param data2 second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table
+#' @param method string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2.
+#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
+#' @param verbose boolean indicating if timed logging is desired
+#' @return data.frame with correlation coefficients or a ComputeResult object
+#' @import veupathUtils
+#' @import data.table
+#' @useDynLib microbiomeComputations
+#' @export
+setGeneric("correlation",
+  function(data1, data2, method, format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE), ...) standardGeneric("correlation"),
+  signature = c("data1","data2")
+)
+
+############ These methods should be used with caution. It is recommended to build methods for specific classes instead. ############
+############          That allows for proper validation of inputs (sparcc is for compositional data for example).        ############
+############                                 See microbiomeComputations for an example.                                  ############
+
+#' Correlation
+#'
+#' This function returns correlation coefficients for all columns in one data table with all columns in a second data table.
+#' 
+#' @param data1 data.table with columns as variables. All columns must be numeric. One row per sample.
+#' @param data2 data.table with columns as variables. All columns must be numeric. One row per sample. Will correlate all columns of data2 with all columns of data1.
+#' @param method string defining the type of correlation to run. The currently supported values are 'spearman' and 'pearson'
+#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
+#' @param verbose boolean indicating if timed logging is desired
+#' @importFrom Hmisc rcorr
+#' @return data.frame with correlation coefficients
+setMethod("correlation", signature("data.table", "data.table"), 
+function(data1, data2, method = c('spearman','pearson'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE)) {
+
+  format <- veupathUtils::matchArg(format)
+  method <- veupathUtils::matchArg(method)
+  verbose <- veupathUtils::matchArg(verbose)
+
+  # Check that the number of rows match.
+  if (!identical(nrow(data1), nrow(data2))) {
+    stop("data1 and data2 must have the same number of rows.")
+  }
+
+  # Check that all values are numeric
+  if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { stop("All columns in data1 must be numeric.")}
+  if (!identical(veupathUtils::findNumericCols(data2), names(data2))) { stop("All columns in data2 must be numeric.")}
+
+
+  ## Compute correlation
+  #corrResult <- data.table::as.data.table(cor(data1, data2, method = method, use='na.or.complete'), keep.rownames = T)
+  lastData1ColIndex <- length(data1)
+  firstData2ColIndex <- length(data1) + 1
+  corrResult <- Hmisc::rcorr(as.matrix(data1), as.matrix(data2), type = method)
+  # this bc Hmisc::rcorr cbinds the two data.tables and runs the correlation
+  # so we need to extract only the relevant values
+  # of note, seems subsetting a matrix to have a single row/ column drops rownames by default. adding drop=F to prevent this.
+  pVals <- data.table::as.data.table(corrResult$P[1:lastData1ColIndex, firstData2ColIndex:length(colnames(corrResult$P)), drop = F], keep.rownames = T)
+  corrResult <- data.table::as.data.table(corrResult$r[1:lastData1ColIndex, firstData2ColIndex:length(colnames(corrResult$r)), drop = F], keep.rownames = T)
+
+  veupathUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose)
+
+
+  ## Format results
+  meltedCorrResult <- melt(corrResult, id.vars=c('rn'))
+  meltedPVals <- melt(pVals, id.vars=c('rn'))
+  formattedCorrResult <- data.frame(
+    data1 = meltedCorrResult[['rn']],
+    data2 = meltedCorrResult[['variable']],
+    correlationCoef = meltedCorrResult[['value']],
+    # should we do a merge just to be sure?
+    pValue = meltedPVals[['value']]
+  )
+
+  if (format == 'data.table') {
+    return(formattedCorrResult)
+  } else {
+    result <- buildCorrelationComputeResult(formattedCorrResult, data1, data2, method, verbose)
+    return(result)
+  }
+})
+
+#' Correlation
+#'
+#' This function returns correlation coefficients for all columns in one data table against themselves.
+#' 
+#' @param data1 data.table with columns as variables. All columns must be numeric. One row per sample.
+#' @param method string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc'
+#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
+#' @param verbose boolean indicating if timed logging is desired
+#' @return data.frame with correlation coefficients
+#' @importFrom Hmisc rcorr
+#' @importFrom SpiecEasi pval.sparccboot
+#' @importFrom SpiecEasi sparccboot
+#' @importFrom SpiecEasi sparcc
+setMethod("correlation", signature("data.table", "missing"), 
+function(data1, data2, method = c('spearman','pearson','sparcc'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE)) {
+
+  format <- veupathUtils::matchArg(format)
+  method <- veupathUtils::matchArg(method)
+  verbose <- veupathUtils::matchArg(verbose)
+
+  # Check that all values are numeric
+  if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { stop("All columns in data1 must be numeric.")}
+
+  ## Compute correlation
+  # rownames and colnames should be the same in this case
+  # keep matrix for now so we can use lower.tri later, expand.grid will give us the needed data.frame
+  if (method == 'sparcc') {
+
+    # this is a local alias for the sparcc function from the SpiecEasi namespace that do.call can find
+    # if we need to customize how we call sparcc in the future, we'll need to do something like the following
+    # except the empty list would have args for the sparcc function
+    #sparcc <- get("sparcc", asNamespace("SpiecEasi"))
+    #statisticperm=function(data, indices) do.call("sparcc", c(list(apply(data[indices,], 2, sample)), list()))$Cor
+    #statisticboot=function(data, indices) do.call("sparcc", c(list(data[indices,,drop=FALSE]), list()))$Cor
+
+    # sub-sampled `statistic` functions to pass to `boot::boot` that we can use to find pvalues
+    # statisticboot = function which takes data and bootstrap sample indices of the bootstapped correlation matrix
+    # statisticperm = function which takes data and permutated sample indices of the null correlation matrix
+    # the SpiecEasi defaults for these functions return the upper triangle of the correlation matrix. We do that manually later instead.
+    statisticperm = function(data, indices) SpiecEasi::sparcc(apply(data[indices,], 2, sample))$Cor
+    statisticboot = function(data, indices) SpiecEasi::sparcc(data[indices,,drop=FALSE])$Cor
+
+    # calling the bootstrap version of sparcc and finding pvalues
+    result <- SpiecEasi::pval.sparccboot(SpiecEasi::sparccboot(data1, statisticboot = statisticboot, statisticperm = statisticperm, R = 100))
+    # making sure results are formatted correctly for downstream use
+    pVals <- matrix(result$pvals, nrow = ncol(data1), ncol = ncol(data1), byrow = T)
+    corrResult <- result$cors
+    rownames(corrResult) <- colnames(corrResult) <- colnames(data1)
+  
+  } else {
+  
+    corrResult <- Hmisc::rcorr(as.matrix(data1), type = method)
+    pVals <- corrResult$P
+    corrResult <- corrResult$r
+  
+  }
+
+  veupathUtils::logWithTime(paste0('Completed correlation with method=', method,'. Formatting results.'), verbose)
+
+  ## Format results
+  rowAndColNames <- expand.grid(rownames(corrResult), colnames(corrResult))
+  deDupedRowAndColNames <- rowAndColNames[as.vector(upper.tri(corrResult)),]
+  formattedCorrResult <- cbind(deDupedRowAndColNames, corrResult[upper.tri(corrResult)])
+  formattedCorrResult <- cbind(formattedCorrResult, pVals[upper.tri(pVals)])
+  colnames(formattedCorrResult) <- c("data1","data2","correlationCoef","pValue")
+
+  if (format == 'data.table') {
+    return(formattedCorrResult)
+  } else {
+    result <- buildCorrelationComputeResult(formattedCorrResult, data1, data2, method, verbose)
+    return(result)
+  }
+})
+
+getDataMetadataType <- function(data) {
+  if (inherits(data, 'AbundanceData')) {
+    return('assay')
+  } else if (inherits(data, 'SampleMetadata')) {
+    return('sampleMetadata')
+  } else {
+    return('unknown')
+  }
+}
+
+## Helper function
+# should this be s4?
+buildCorrelationComputeResult <- function(corrResult, data1, data2 = NULL, method = c('spearman','pearson','sparcc'), verbose = c(TRUE, FALSE)) {
+  method <- veupathUtils::matchArg(method)
+  verbose <- veupathUtils::matchArg(verbose)
+
+  # both AbundanceData and SampleMetadata have these slots
+  recordIdColumn <- ifelse('recordIdColumn' %in% slotNames(data1), data1@recordIdColumn, NA_character_)
+  ancestorIdColumns <- ifelse('ancestorIdColumns' %in% slotNames(data1), data1@ancestorIdColumns, NA_character_)
+  allIdColumns <- c(recordIdColumn, ancestorIdColumns)
+
+  ## Format results
+  # Construct the ComputeResult
+  result <- new("ComputeResult")
+  result@name <- 'correlation'
+  result@recordIdColumn <- recordIdColumn
+  result@ancestorIdColumns <- ancestorIdColumns
+  statistics <- CorrelationResult(
+    statistics = corrResult,
+    data1Metadata = getDataMetadataType(data1),
+    data2Metadata = ifelse(is.null(data2), getDataMetadataType(data1), getDataMetadataType(data2))
+  )
+  result@statistics <- statistics
+  result@parameters <- paste0('method = ', method)
+
+  # The resulting data should contain only the samples actually used.
+  # this seems slightly complicated to generalize, and im not sure what we use it for anyhow
+  #result@data <- abundances[, ..allIdColumns]
+  #names(result@data) <- stripEntityIdFromColumnHeader(names(result@data))
+
+  validObject(result)
+  veupathUtils::logWithTime(paste('Correlation computation completed with parameters recordIdColumn=', recordIdColumn, ', method = ', method), verbose)
+  
+  return(result)
+}
+
+#' Self Correlation
+#'
+#' This function returns correlation coefficients for variables in one dataset against itself
+#' 
+#' @param data first dataset. An AbundanceData object or data.table
+#' @param method string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc'
+#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
+#' @param verbose boolean indicating if timed logging is desired
+#' @return ComputeResult object
+#' @import veupathUtils
+#' @export
+setGeneric("selfCorrelation",
+  function(data, method = c('spearman','pearson','sparcc'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE), ...) standardGeneric("selfCorrelation"),
+  signature = c("data")
+)
+
+#' Self Correlation
+#'
+#' This function returns correlation coefficients for variables in one data.table against itself.
+#' This is essentially an alias to the microbiomeComputations::correlation function.
+#' 
+#' @param data a data.table
+#' @param method string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc'
+#' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
+#' @param verbose boolean indicating if timed logging is desired
+#' @return ComputeResult object
+#' @import veupathUtils
+#' @export
+setMethod("selfCorrelation", signature("data.table"), 
+function(data, method = c('spearman','pearson','sparcc'), format = c('ComputeResult', 'data.table'), verbose = c(TRUE, FALSE)) {
+
+  format <- veupathUtils::matchArg(format)
+  method <- veupathUtils::matchArg(method)
+  verbose <- veupathUtils::matchArg(verbose)
+  
+  correlation(data, method=method, format = format, verbose=verbose)
+})
\ No newline at end of file
diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R
new file mode 100644
index 0000000..08f1026
--- /dev/null
+++ b/tests/testthat/test-correlation.R
@@ -0,0 +1,100 @@
+test_that('correlation works with two data tables', {
+  nSamples = 200
+
+  testData1 <- data.table(
+    "contA1" = rnorm(nSamples),
+    "contB1" = rnorm(nSamples),
+    "contC1" = rnorm(nSamples)
+  )
+
+  testData2 <- data.table(
+    "contA2" = rnorm(nSamples),
+    "contB2" = rnorm(nSamples),
+    "contC2" = rnorm(nSamples)
+  )
+
+  corrResult <- correlation(testData1, testData2, method = 'spearman', format = 'data.table', verbose=F)
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), ncol(testData1) * ncol(testData2))
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+  corrResult <- correlation(testData1, testData2, method = 'pearson', format = 'data.table', verbose=F)
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), ncol(testData1) * ncol(testData2))
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+  # Test with NAs
+  # Should compute using complete cases. 
+  # This to try to catch regression back to the default behavior of cor, which returns results of NA if any value is NA
+  testData2$contA2[1] <- NA
+  corrResult <- correlation(testData1, testData2, method = 'pearson', format = 'data.table', verbose=F)
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), ncol(testData1) * ncol(testData2))
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+})
+
+test_that("correlation works with a single data table", {
+  nSamples = 200
+  testData1 <- data.table(
+    "contA1" = rnorm(nSamples),
+    "contB1" = rnorm(nSamples),
+    "contC1" = rnorm(nSamples)
+  )
+  corrResult <- correlation(testData1, method='spearman', format = 'data.table', verbose=F)
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2)
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+  corrResult <- correlation(testData1, method='pearson', format = 'data.table', verbose=F)
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2)
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+  corrResult <- correlation(abs(testData1), method='sparcc', format = 'data.table', verbose=F) #sparcc wont like negative values
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2)
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+  #test alias
+  corrResult <- selfCorrelation(testData1, method='pearson', format = 'data.table', verbose=F) 
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2)
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+  # Test with NAs
+  # Should compute using complete cases. 
+  # This to try to catch regression back to the default behavior of cor, which returns results of NA if any value is NA
+  testData1$contA1[1] <- NA
+  corrResult <- correlation(testData1, method='pearson', format = 'data.table', verbose=F)
+  expect_equal(names(corrResult), c("data1","data2","correlationCoef","pValue"))
+  expect_equal(nrow(corrResult), (ncol(testData1) * ncol(testData1) - 3)/2)
+  expect_true(!all(is.na(corrResult$correlationCoef)))
+
+})
+
+test_that("toJSON works as expected for the CorrelationResult class", {
+  
+  nSamples <- 200
+  sampleMetadata <- data.table::data.table(
+      "entity.SampleID" = 1:nSamples,
+      "entity.contA" = rnorm(nSamples),
+      "entity.contB" = rnorm(nSamples),
+      "entity.contC" = rnorm(nSamples)
+  )
+
+  data <- data.table::data.table(
+    "entity.SampleID" = sampleMetadata$entity.SampleID,
+    "entity.cont1" = rnorm(nSamples),
+    "entity.cont2" = rnorm(nSamples),
+    "entity.cont3" = rnorm(nSamples)
+  )
+
+  result <- correlation(data, sampleMetadata, method='pearson', verbose = FALSE)
+  jsonList <- jsonlite::fromJSON(toJSON(result@statistics))
+  expect_equal(names(jsonList), c('data1Metadata', 'data2Metadata', 'statistics'))
+  expect_equal(class(jsonList$data1Metadata), "character")
+  expect_equal(class(jsonList$data2Metadata), "character")
+  expect_equal(names(jsonList$statistics), c('data1', 'data2', 'correlationCoef', 'pValue'))
+  expect_equal(unname(unlist(lapply(jsonList$statistics, class))), c('character', 'character', 'character', 'character'))
+})

From b00c2f7a39f33f1538366f51f7e861c2a179f273 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 26 Feb 2024 11:32:47 -0500
Subject: [PATCH 5/6] update generated docs

---
 DESCRIPTION                                   |  6 +++-
 NAMESPACE                                     | 21 ++++++++++++
 R/class-ComputeResult.R                       |  1 +
 R/methods-ComputeResult.R                     |  1 +
 man/ComputeResult-class.Rd                    | 33 +++++++++++++++++++
 man/CorrelationResult-class.Rd                | 21 ++++++++++++
 ...orrelation-data.table-data.table-method.Rd | 31 +++++++++++++++++
 man/correlation-data.table-missing-method.Rd  | 29 ++++++++++++++++
 man/correlation.Rd                            | 32 ++++++++++++++++++
 ...edicateFactory-character-numeric-method.Rd | 24 ++++++++++++++
 man/predicateFactory.Rd                       | 21 ++++++++++++
 man/selfCorrelation-data.table-method.Rd      | 29 ++++++++++++++++
 man/selfCorrelation.Rd                        | 29 ++++++++++++++++
 man/stripEntityIdFromColumnHeader.Rd          | 14 ++++++++
 man/toJSON.Rd                                 |  2 +-
 man/writeData.Rd                              | 23 +++++++++++++
 man/writeMeta.Rd                              | 23 +++++++++++++
 man/writeStatistics.Rd                        | 23 +++++++++++++
 18 files changed, 361 insertions(+), 2 deletions(-)
 create mode 100644 man/ComputeResult-class.Rd
 create mode 100644 man/CorrelationResult-class.Rd
 create mode 100644 man/correlation-data.table-data.table-method.Rd
 create mode 100644 man/correlation-data.table-missing-method.Rd
 create mode 100644 man/correlation.Rd
 create mode 100644 man/predicateFactory-character-numeric-method.Rd
 create mode 100644 man/predicateFactory.Rd
 create mode 100644 man/selfCorrelation-data.table-method.Rd
 create mode 100644 man/selfCorrelation.Rd
 create mode 100644 man/stripEntityIdFromColumnHeader.Rd
 create mode 100644 man/writeData.Rd
 create mode 100644 man/writeMeta.Rd
 create mode 100644 man/writeStatistics.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index ef1cd00..943a5ca 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -31,14 +31,18 @@ Config/testthat/edition: 3
 Collate: 
     'class-Bin.R'
     'class-VariableMetadata.R'
+    'class-ComputeResult.R'
+    'class-CorrelationResult.R'
     'class-Megastudy.R'
     'class-Range.R'
     'class-Statistic.R'
     'data.R'
+    'method-correlation.R'
     'methods-Bin.R'
+    'methods-VariableMetadata.R'
+    'methods-ComputeResult.R'
     'methods-Megastudy.R'
     'methods-Statistic.R'
-    'methods-VariableMetadata.R'
     'utils-classes.R'
     'utils-cut.R'
     'utils-numeric.R'
diff --git a/NAMESPACE b/NAMESPACE
index 3cafd57..70ef4cb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,6 +9,8 @@ S3method(validateNumericCols,default)
 S3method(validateNumericCols,list)
 export(Bin)
 export(BinList)
+export(ComputeResult)
+export(CorrelationResult)
 export(DataShape)
 export(DataType)
 export(Megastudy)
@@ -25,6 +27,7 @@ export(VariableMetadataList)
 export(VariableSpec)
 export(VariableSpecList)
 export(as.data.table)
+export(correlation)
 export(cut_interval)
 export(cut_number)
 export(cut_width)
@@ -62,12 +65,15 @@ export(merge)
 export(naToZero)
 export(new_data_frame)
 export(nonZeroRound)
+export(predicateFactory)
 export(removeAttr)
+export(selfCorrelation)
 export(setAttrFromList)
 export(setNaToZero)
 export(setroworder)
 export(shiftToNonNeg)
 export(strSplit)
+export(stripEntityIdFromColumnHeader)
 export(toJSON)
 export(toStringOrNull)
 export(toStringOrPoint)
@@ -76,8 +82,13 @@ export(updateAttrById)
 export(validateNumericCols)
 export(whichValuesInBin)
 export(whichValuesInBinList)
+export(writeData)
+export(writeMeta)
+export(writeStatistics)
 exportClasses(Bin)
 exportClasses(BinList)
+exportClasses(ComputeResult)
+exportClasses(CorrelationResult)
 exportClasses(DataShape)
 exportClasses(DataType)
 exportClasses(Megastudy)
@@ -118,13 +129,23 @@ exportMethods(getStudyIdColumnName)
 exportMethods(getVariableSpec)
 exportMethods(getVariableSpecColumnName)
 exportMethods(merge)
+exportMethods(predicateFactory)
+exportMethods(selfCorrelation)
 exportMethods(toJSON)
 exportMethods(whichValuesInBin)
 exportMethods(whichValuesInBinList)
+exportMethods(writeData)
+exportMethods(writeMeta)
+exportMethods(writeStatistics)
 import(data.table)
+importFrom(Hmisc,rcorr)
 importFrom(S4Vectors,SimpleList)
+importFrom(SpiecEasi,pval.sparccboot)
+importFrom(SpiecEasi,sparcc)
+importFrom(SpiecEasi,sparccboot)
 importFrom(digest,digest)
 importFrom(microbenchmark,microbenchmark)
 importFrom(purrr,map)
 importFrom(purrr,map_lgl)
 importFrom(stringi,stri_detect_regex)
+useDynLib(microbiomeComputations)
diff --git a/R/class-ComputeResult.R b/R/class-ComputeResult.R
index 267e7d3..38c7054 100644
--- a/R/class-ComputeResult.R
+++ b/R/class-ComputeResult.R
@@ -72,6 +72,7 @@ check_compute_result <- function(object) {
 #' @name ComputeResult-class
 #' @rdname ComputeResult-class
 #' @export
+#' @include class-VariableMetadata.R
 ComputeResult <- setClass("ComputeResult", representation(
     name = 'character',
     data = 'data.frame',
diff --git a/R/methods-ComputeResult.R b/R/methods-ComputeResult.R
index 620d47a..4f0c0f8 100644
--- a/R/methods-ComputeResult.R
+++ b/R/methods-ComputeResult.R
@@ -116,6 +116,7 @@ setMethod("writeStatistics", signature("ComputeResult"), function(object, patter
   return(outFileName)
 })
 
+#' @include methods-VariableMetadata.R
 setMethod("toJSON", signature("CorrelationResult"), function(object, ...) {
   tmp <- character()
 
diff --git a/man/ComputeResult-class.Rd b/man/ComputeResult-class.Rd
new file mode 100644
index 0000000..1930df3
--- /dev/null
+++ b/man/ComputeResult-class.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/class-ComputeResult.R
+\docType{class}
+\name{ComputeResult-class}
+\alias{ComputeResult-class}
+\alias{ComputeResult}
+\title{Compute Result}
+\description{
+A class for consistently representing the results of computes.
+This includes their representation in R, as JSON and how they are written to files.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{data}}{A data.frame of values where computed variables are columns and samples rows.}
+
+\item{\code{name}}{The name of the compute, ex: 'alphaDiv'.}
+
+\item{\code{recordIdColumn}}{The name of the column containing IDs for the samples. All other columns will be treated as computed values.}
+
+\item{\code{ancestorIdColumns}}{A character vector of column names representing parent entities of the recordIdColumn.}
+
+\item{\code{computedVariableMetadata}}{veupathUtils::VariableMetadataList detailing the computed variables.}
+
+\item{\code{statistics}}{An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.}
+
+\item{\code{computationDetails}}{An optional message about the computed results.}
+
+\item{\code{parameters}}{A record of the input parameters used to generate the computed results.}
+
+\item{\code{droppedColumns}}{A character vector of column names that have been dropped for being unsuitable for the computation.}
+}}
+
diff --git a/man/CorrelationResult-class.Rd b/man/CorrelationResult-class.Rd
new file mode 100644
index 0000000..2fe0c98
--- /dev/null
+++ b/man/CorrelationResult-class.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/class-CorrelationResult.R
+\docType{class}
+\name{CorrelationResult-class}
+\alias{CorrelationResult-class}
+\alias{CorrelationResult}
+\title{Corrleation Result}
+\description{
+A class to represent the computed results from a correlation calculation.
+This includes their representation in R, as JSON, and how they are written to files.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{statistics}}{An optional slot of any values. List or data.frame are recommended. It is not required to have rows or cols map to samples.}
+
+\item{\code{data1Metadata}}{A character describing data1. Should at least describe the name of data1.}
+
+\item{\code{data2Metadata}}{A character describing data2. Should at least describe the name of data2.}
+}}
+
diff --git a/man/correlation-data.table-data.table-method.Rd b/man/correlation-data.table-data.table-method.Rd
new file mode 100644
index 0000000..8b3d54b
--- /dev/null
+++ b/man/correlation-data.table-data.table-method.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{correlation,data.table,data.table-method}
+\alias{correlation,data.table,data.table-method}
+\title{Correlation}
+\usage{
+\S4method{correlation}{data.table,data.table}(
+  data1,
+  data2,
+  method = c("spearman", "pearson"),
+  format = c("ComputeResult", "data.table"),
+  verbose = c(TRUE, FALSE)
+)
+}
+\arguments{
+\item{data1}{data.table with columns as variables. All columns must be numeric. One row per sample.}
+
+\item{data2}{data.table with columns as variables. All columns must be numeric. One row per sample. Will correlate all columns of data2 with all columns of data1.}
+
+\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman' and 'pearson'}
+
+\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+data.frame with correlation coefficients
+}
+\description{
+This function returns correlation coefficients for all columns in one data table with all columns in a second data table.
+}
diff --git a/man/correlation-data.table-missing-method.Rd b/man/correlation-data.table-missing-method.Rd
new file mode 100644
index 0000000..d3adbf8
--- /dev/null
+++ b/man/correlation-data.table-missing-method.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{correlation,data.table,missing-method}
+\alias{correlation,data.table,missing-method}
+\title{Correlation}
+\usage{
+\S4method{correlation}{data.table,missing}(
+  data1,
+  data2,
+  method = c("spearman", "pearson", "sparcc"),
+  format = c("ComputeResult", "data.table"),
+  verbose = c(TRUE, FALSE)
+)
+}
+\arguments{
+\item{data1}{data.table with columns as variables. All columns must be numeric. One row per sample.}
+
+\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc'}
+
+\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+data.frame with correlation coefficients
+}
+\description{
+This function returns correlation coefficients for all columns in one data table against themselves.
+}
diff --git a/man/correlation.Rd b/man/correlation.Rd
new file mode 100644
index 0000000..c832b46
--- /dev/null
+++ b/man/correlation.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{correlation}
+\alias{correlation}
+\title{Correlation}
+\usage{
+correlation(
+  data1,
+  data2,
+  method,
+  format = c("ComputeResult", "data.table"),
+  verbose = c(TRUE, FALSE),
+  ...
+)
+}
+\arguments{
+\item{data1}{first dataset. An AbundanceData object or data.table}
+
+\item{data2}{second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table}
+
+\item{method}{string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2.}
+
+\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+data.frame with correlation coefficients or a ComputeResult object
+}
+\description{
+This function returns correlation coefficients for variables in one dataset against variables in a second dataset
+}
diff --git a/man/predicateFactory-character-numeric-method.Rd b/man/predicateFactory-character-numeric-method.Rd
new file mode 100644
index 0000000..6819fd1
--- /dev/null
+++ b/man/predicateFactory-character-numeric-method.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{predicateFactory,character,numeric-method}
+\alias{predicateFactory,character,numeric-method}
+\title{Predicate Factory}
+\usage{
+\S4method{predicateFactory}{character,numeric}(
+  predicateType = c("proportionNonZero", "variance", "sd"),
+  threshold = 0.5
+)
+}
+\arguments{
+\item{predicateType}{string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd'}
+
+\item{threshold}{numeric value associated with the predicate type}
+}
+\value{
+Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE)
+}
+\description{
+This function creates a predicate function based on a string defining the type of predicate to run and a numeric value.
+The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated
+with each predicate type is the threshold for the predicate to be true.
+}
diff --git a/man/predicateFactory.Rd b/man/predicateFactory.Rd
new file mode 100644
index 0000000..17648d0
--- /dev/null
+++ b/man/predicateFactory.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{predicateFactory}
+\alias{predicateFactory}
+\title{Predicate Factory}
+\usage{
+predicateFactory(predicateType, threshold)
+}
+\arguments{
+\item{predicateType}{string defining the type of predicate to run. The currently supported values are 'proportionNonZero', 'variance' and 'sd'}
+
+\item{threshold}{numeric value associated with the predicate type}
+}
+\value{
+Function returning a boolean indicating if a feature should be included (TRUE) or excluded (FALSE)
+}
+\description{
+This function creates a predicate function based on a string defining the type of predicate to run and a numeric value.
+The currently supported types are 'proportionNonZero', 'variance' and 'sd'. The numeric value associated
+with each predicate type is the threshold for the predicate to be true.
+}
diff --git a/man/selfCorrelation-data.table-method.Rd b/man/selfCorrelation-data.table-method.Rd
new file mode 100644
index 0000000..061c472
--- /dev/null
+++ b/man/selfCorrelation-data.table-method.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{selfCorrelation,data.table-method}
+\alias{selfCorrelation,data.table-method}
+\title{Self Correlation}
+\usage{
+\S4method{selfCorrelation}{data.table}(
+  data,
+  method = c("spearman", "pearson", "sparcc"),
+  format = c("ComputeResult", "data.table"),
+  verbose = c(TRUE, FALSE)
+)
+}
+\arguments{
+\item{data}{a data.table}
+
+\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman', 'pearson' and 'sparcc'}
+
+\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+ComputeResult object
+}
+\description{
+This function returns correlation coefficients for variables in one data.table against itself.
+This is essentially an alias to the microbiomeComputations::correlation function.
+}
diff --git a/man/selfCorrelation.Rd b/man/selfCorrelation.Rd
new file mode 100644
index 0000000..a8168d0
--- /dev/null
+++ b/man/selfCorrelation.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method-correlation.R
+\name{selfCorrelation}
+\alias{selfCorrelation}
+\title{Self Correlation}
+\usage{
+selfCorrelation(
+  data,
+  method = c("spearman", "pearson", "sparcc"),
+  format = c("ComputeResult", "data.table"),
+  verbose = c(TRUE, FALSE),
+  ...
+)
+}
+\arguments{
+\item{data}{first dataset. An AbundanceData object or data.table}
+
+\item{method}{string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc'}
+
+\item{format}{string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+ComputeResult object
+}
+\description{
+This function returns correlation coefficients for variables in one dataset against itself
+}
diff --git a/man/stripEntityIdFromColumnHeader.Rd b/man/stripEntityIdFromColumnHeader.Rd
new file mode 100644
index 0000000..9ecd36f
--- /dev/null
+++ b/man/stripEntityIdFromColumnHeader.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{stripEntityIdFromColumnHeader}
+\alias{stripEntityIdFromColumnHeader}
+\title{Strip Entity ID from Column Header}
+\usage{
+stripEntityIdFromColumnHeader(columnNames)
+}
+\arguments{
+\item{columnNames}{character vector}
+}
+\description{
+This function strips entity ID from column headers for EDA formatted tabular data.
+}
diff --git a/man/toJSON.Rd b/man/toJSON.Rd
index 7c9af0c..9bbfb4d 100644
--- a/man/toJSON.Rd
+++ b/man/toJSON.Rd
@@ -4,7 +4,7 @@
 \alias{toJSON}
 \title{R object as JSON string}
 \usage{
-toJSON(object, named = c(TRUE, FALSE))
+toJSON(object, named = c(TRUE, FALSE), ...)
 }
 \arguments{
 \item{object}{object of a supported S4 class to convert to a JSON string representation}
diff --git a/man/writeData.Rd b/man/writeData.Rd
new file mode 100644
index 0000000..a50d53f
--- /dev/null
+++ b/man/writeData.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-ComputeResult.R
+\name{writeData}
+\alias{writeData}
+\title{Write Compute Result Data}
+\usage{
+writeData(object, pattern = NULL, verbose = c(TRUE, FALSE))
+}
+\arguments{
+\item{object}{ComputeResult}
+
+\item{pattern}{string to incorporate into tmp file name}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+filehandle where tab delimited representation of result data can be found
+}
+\description{
+This function, for any given ComputeResult,
+will return a filehandle where result data
+has been written in tab delimited format.
+}
diff --git a/man/writeMeta.Rd b/man/writeMeta.Rd
new file mode 100644
index 0000000..c4e1093
--- /dev/null
+++ b/man/writeMeta.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-ComputeResult.R
+\name{writeMeta}
+\alias{writeMeta}
+\title{Write Computed Variable Metadata}
+\usage{
+writeMeta(object, pattern = NULL, verbose = c(TRUE, FALSE))
+}
+\arguments{
+\item{object}{ComputeResult}
+
+\item{pattern}{string to incorporate into tmp file name}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+filehandle where JSON representation of ComputedVariableMetadata can be found
+}
+\description{
+This function, for any given ComputeResult,
+will return a filehandle where ComputedVariableMetadata
+has been written in JSON format.
+}
diff --git a/man/writeStatistics.Rd b/man/writeStatistics.Rd
new file mode 100644
index 0000000..8ae195e
--- /dev/null
+++ b/man/writeStatistics.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-ComputeResult.R
+\name{writeStatistics}
+\alias{writeStatistics}
+\title{Write Computed Variable Statistics}
+\usage{
+writeStatistics(object, pattern = NULL, verbose = c(TRUE, FALSE))
+}
+\arguments{
+\item{object}{ComputeResult with statistics}
+
+\item{pattern}{string to incorporate into tmp file name}
+
+\item{verbose}{boolean indicating if timed logging is desired}
+}
+\value{
+filehandle where JSON representation of ComputeResult statistics can be found
+}
+\description{
+This function, for any given ComputeResult,
+will return a filehandle where the statistics table
+has been written in JSON format.
+}

From e511d47eed2aca6e53e98093ac5dc95d72188e3d Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 28 Feb 2024 09:06:18 -0500
Subject: [PATCH 6/6] update documentation for correlation method

---
 R/method-correlation.R | 6 +++---
 man/correlation.Rd     | 4 ++--
 man/selfCorrelation.Rd | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/method-correlation.R b/R/method-correlation.R
index 10ee496..3546a69 100644
--- a/R/method-correlation.R
+++ b/R/method-correlation.R
@@ -52,8 +52,8 @@ setMethod("predicateFactory", signature("character", "numeric"), function(predic
 #'
 #' This function returns correlation coefficients for variables in one dataset against variables in a second dataset
 #' 
-#' @param data1 first dataset. An AbundanceData object or data.table
-#' @param data2 second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table
+#' @param data1 first dataset. A data.table
+#' @param data2 second dataset. A data.table
 #' @param method string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2.
 #' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
 #' @param verbose boolean indicating if timed logging is desired
@@ -256,7 +256,7 @@ buildCorrelationComputeResult <- function(corrResult, data1, data2 = NULL, metho
 #'
 #' This function returns correlation coefficients for variables in one dataset against itself
 #' 
-#' @param data first dataset. An AbundanceData object or data.table
+#' @param data first dataset. A data.table
 #' @param method string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc'
 #' @param format string defining the desired format of the result. The currently supported values are 'data.table' and 'ComputeResult'.
 #' @param verbose boolean indicating if timed logging is desired
diff --git a/man/correlation.Rd b/man/correlation.Rd
index c832b46..9ae128a 100644
--- a/man/correlation.Rd
+++ b/man/correlation.Rd
@@ -14,9 +14,9 @@ correlation(
 )
 }
 \arguments{
-\item{data1}{first dataset. An AbundanceData object or data.table}
+\item{data1}{first dataset. A data.table}
 
-\item{data2}{second dataset. A SampleMetadata object (if data1 is class AbundanceData) or a data.table}
+\item{data2}{second dataset. A data.table}
 
 \item{method}{string defining the type of correlation to run. The currently supported values are specific to the class of data1 and data2.}
 
diff --git a/man/selfCorrelation.Rd b/man/selfCorrelation.Rd
index a8168d0..ebe00b5 100644
--- a/man/selfCorrelation.Rd
+++ b/man/selfCorrelation.Rd
@@ -13,7 +13,7 @@ selfCorrelation(
 )
 }
 \arguments{
-\item{data}{first dataset. An AbundanceData object or data.table}
+\item{data}{first dataset. A data.table}
 
 \item{method}{string defining the type of correlation to run. The currently supported values are 'spearman','pearson' and 'sparcc'}