Merge pull request #246 from OHDSI/develop

Release v3.5.0
OHDSI · Apr 18, 2024 · 4a10c69 · msuchard · Apr 20, 2024 · msuchard
2 parents 5049d54 + b8c964b
commit 4a10c69
Show file tree

Hide file tree

Showing 71 changed files with 410 additions and 80 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: FeatureExtraction
 Type: Package
 Title: Generating Features for a Cohort
-Version: 3.4.1
-Date: 2024-03-28
+Version: 3.5.0
+Date: 2024-04-18
 Authors@R: c(
  person("Martijn", "Schuemie", , "schuemie@ohdsi.org", role = c("aut")),
  person("Marc", "Suchard", role = c("aut")),
@@ -44,6 +44,6 @@ VignetteBuilder: knitr
 URL: https://github.com/OHDSI/FeatureExtraction
 BugReports: https://github.com/OHDSI/FeatureExtraction/issues
 NeedsCompilation: no
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Encoding: UTF-8
 Language: en-US
diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,20 @@
+FeatureExtraction 3.5.0
+=======================
+
+New Features:
+
+- Adds the ability to filter covariates by setting a minimum threshold for covariate mean (#174)
+
+Bug Fixes:
+
+- Table 1 does not report the correct subject count (#237)
+
 FeatureExtraction 3.4.1
 =======================
 
 Bug Fixes:
 
-- Weely R-check fails (#239)
+- Weekly R-check fails (#239)
 - BigQuery error (#208)
 - Error when specifying 1 temporal window in temporalCovariateSettings (#200)
 - metaData aggregation issue (#195)

diff --git a/R/GetCovariates.R b/R/GetCovariates.R
@@ -59,6 +59,9 @@
 #' of the createCovariate functions, or a list of such objects.
 #' @param aggregated Should aggregate statistics be computed instead of covariates per
 #' cohort entry?
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' will help reduce the file size of the characterization output, but will remove information
+#' on covariates that have very low values. The default is 0.
 #'
 #' @return
 #' Returns an object of type \code{covariateData}, containing information on the covariates.
@@ -101,7 +104,8 @@ getDbCovariateData <- function(connectionDetails = NULL,
  cohortIds = c(-1),
  rowIdField = "subject_id",
  covariateSettings,
- aggregated = FALSE) {
+ aggregated = FALSE,
+ minCharacterizationMean = 0) {
  if (is.null(connectionDetails) && is.null(connection)) {
  stop("Need to provide either connectionDetails or connection")
  }
@@ -115,6 +119,10 @@ getDbCovariateData <- function(connectionDetails = NULL,
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }
+ errorMessages <- checkmate::makeAssertCollection()
+ minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
+ checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+ checkmate::reportAssertions(collection = errorMessages)
  if (!is.null(connectionDetails)) {
  connection <- DatabaseConnector::connect(connectionDetails)
  on.exit(DatabaseConnector::disconnect(connection))
@@ -164,7 +172,8 @@ getDbCovariateData <- function(connectionDetails = NULL,
  cdmVersion = cdmVersion,
  rowIdField = rowIdField,
  covariateSettings = covariateSettings[[i]],
- aggregated = aggregated)
+ aggregated = aggregated,
+ minCharacterizationMean = minCharacterizationMean)
  tempCovariateData <- do.call(eval(parse(text = fun)), args)
  if (is.null(covariateData)) {
  covariateData <- tempCovariateData

diff --git a/R/GetCovariatesFromOtherCohorts.R b/R/GetCovariatesFromOtherCohorts.R
@@ -22,7 +22,9 @@
 #' @param covariateSettings An object of type \code{covariateSettings} as created using the
 #' \code{\link{createCohortBasedCovariateSettings}} or
 #' \code{\link{createCohortBasedTemporalCovariateSettings}} functions.
-#'
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' will help reduce the file size of the characterization output, but will remove information
+#' on covariates that have very low values. The default is 0.
 #' @template GetCovarParams
 #'
 #' @export
@@ -35,7 +37,8 @@ getDbCohortBasedCovariatesData <- function(connection,
  cdmVersion = "5",
  rowIdField = "subject_id",
  covariateSettings,
- aggregated = FALSE) {
+ aggregated = FALSE,
+ minCharacterizationMean = 0) {
  errorMessages <- checkmate::makeAssertCollection()
  checkmate::assertClass(connection, "DatabaseConnectorConnection", add = errorMessages)
  checkmate::assertCharacter(oracleTempSchema, len = 1, null.ok = TRUE, add = errorMessages)
@@ -46,6 +49,8 @@ getDbCohortBasedCovariatesData <- function(connection,
  checkmate::assertCharacter(rowIdField, len = 1, add = errorMessages)
  checkmate::assertClass(covariateSettings, "covariateSettings", add = errorMessages)
  checkmate::assertLogical(aggregated, len = 1, add = errorMessages)
+ minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
+ checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
  checkmate::reportAssertions(collection = errorMessages)
  if (!missing(cohortId)) { 
  warning("cohortId argument has been deprecated, please use cohortIds")
@@ -139,7 +144,8 @@ getDbCohortBasedCovariatesData <- function(connection,
  cdmVersion = cdmVersion,
  rowIdField = rowIdField,
  covariateSettings = detailledSettings,
- aggregated = aggregated
+ aggregated = aggregated,
+ minCharacterizationMean = minCharacterizationMean
  )
 
  sql <- "TRUNCATE TABLE #covariate_cohort_ref; DROP TABLE #covariate_cohort_ref;"

diff --git a/R/GetDefaultCovariates.R b/R/GetDefaultCovariates.R
@@ -31,6 +31,9 @@
 #' it is a temp table, do not specify \code{targetDatabaseSchema}.
 #' @param targetCovariateRefTable (Optional) The name of the table where the covariate reference will be stored.
 #' @param targetAnalysisRefTable (Optional) The name of the table where the analysis reference will be stored.
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' will help reduce the file size of the characterization output, but will remove information
+#' on covariates that have very low values. The default is 0.
 #'
 #' @template GetCovarParams
 #'
@@ -65,7 +68,8 @@ getDbDefaultCovariateData <- function(connection,
  targetCovariateTable,
  targetCovariateRefTable,
  targetAnalysisRefTable,
- aggregated = FALSE) {
+ aggregated = FALSE,
+ minCharacterizationMean = 0) {
  if (!is(covariateSettings, "covariateSettings")) {
  stop("Covariate settings object not of type covariateSettings")
  }
@@ -79,6 +83,11 @@ getDbDefaultCovariateData <- function(connection,
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }
+ errorMessages <- checkmate::makeAssertCollection()
+ minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
+ checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+ checkmate::reportAssertions(collection = errorMessages)
+
  settings <- .toJson(covariateSettings)
  rJava::J("org.ohdsi.featureExtraction.FeatureExtraction")$init(system.file("", package = "FeatureExtraction"))
  json <- rJava::J("org.ohdsi.featureExtraction.FeatureExtraction")$createSql(settings, aggregated, cohortTable, rowIdField, rJava::.jarray(as.character(cohortIds)), cdmDatabaseSchema)
@@ -126,6 +135,7 @@ getDbDefaultCovariateData <- function(connection,
  andromedaTableName = "covariates",
  snakeCaseToCamelCase = TRUE
  )
+ filterCovariateDataCovariates(covariateData, "covariates", minCharacterizationMean)
  }
 
  # Continuous aggregated features
@@ -142,6 +152,7 @@ getDbDefaultCovariateData <- function(connection,
  andromedaTableName = "covariatesContinuous",
  snakeCaseToCamelCase = TRUE
  )
+ filterCovariateDataCovariates(covariateData, "covariatesContinuous", minCharacterizationMean)
  }
 
  # Covariate reference
@@ -273,3 +284,17 @@ getDbDefaultCovariateData <- function(connection,
  return(covariateData)
  }
 }
+
+#' Filters the covariateData covariates based on the given characterization mean value.
+#'
+#' Rows of the named table whose \code{averageValue} is below \code{minCharacterizationMean}
+#' are removed. Nothing is done when the table has no \code{averageValue} column or
+#' when \code{minCharacterizationMean} is 0.
+#'
+#' @param covariateData The covariate data
+#' @param covariatesName The name of the covariates object inside the covariateData
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be
+#' cut off from output. This will help reduce the file size of the characterization output,
+#' but will remove information on covariates that have very low values. The default is 0.
+filterCovariateDataCovariates <- function(covariateData, covariatesName, minCharacterizationMean = 0) {
+  hasAverageValue <- "averageValue" %in% colnames(covariateData[[covariatesName]])
+  if (hasAverageValue && minCharacterizationMean != 0) {
+    # Use >= so that a value exactly equal to the documented minimum is kept; only
+    # values below the minimum are cut off.
+    # NOTE(review): the filtered table is not returned; this relies on the `[[<-`
+    # assignment updating the caller's covariateData in place -- confirm for the
+    # covariateData class.
+    covariateData[[covariatesName]] <- covariateData[[covariatesName]] %>%
+      dplyr::filter(.data$averageValue >= minCharacterizationMean)
+  }
+}
diff --git a/R/Table1.R b/R/Table1.R
@@ -562,18 +562,20 @@ createTable1 <- function(covariateData1,
 
  if (nrow(binaryTable) != 0) {
  if (comparison) {
+ populationSize1 <- getPopulationSize(covariateData1, cohortId1)
+ populationSize2 <- getPopulationSize(covariateData2, cohortId2)
  colnames(binaryTable) <- c(
  "Characteristic",
  "Count",
  paste0(
  "% (n = ",
- formatCount(attr(covariateData1, "metaData")$populationSize),
+ formatCount(populationSize1),
  ")"
  ),
  "Count",
  paste0(
  "% (n = ",
- formatCount(attr(covariateData2, "metaData")$populationSize),
+ formatCount(populationSize2),
  ")"
  ),
  "Std.Diff"
@@ -590,12 +592,13 @@ createTable1 <- function(covariateData1,
  binaryTable$count2 <- NULL
  binaryTable$percent2 <- NULL
  binaryTable$stdDiff <- NULL
+ populationSize1 <- getPopulationSize(covariateData1, cohortId1)
  colnames(binaryTable) <- c(
  "Characteristic",
  "Count",
  paste0(
  "% (n = ",
- formatCount(attr(covariateData1, "metaData")$populationSize),
+ formatCount(populationSize1),
  ")"
  )
  )
@@ -722,3 +725,11 @@ createTable1CovariateSettings <- function(specifications = getDefaultTable1Speci
  covariateSettings$analyses <- analyses
  return(covariateSettings)
 }
+
+# Returns the population size stored in the "metaData" attribute of covariateData.
+#
+# covariateData: object carrying a "metaData" attribute with a populationSize element.
+# cohortId:      cohort to report on, or NULL to return populationSize unchanged.
+#
+# When cohortId is given and populationSize has elements named after it, those
+# elements are returned. Indexing directly with a numeric cohort ID would treat it
+# as a position rather than as a cohort ID, giving NA for any ID larger than the
+# number of entries. If no matching name exists, the positional lookup is kept so
+# existing callers see the same result as before.
+# NOTE(review): presumably populationSize is named by cohort ID for aggregated
+# data -- confirm where the metaData is built.
+getPopulationSize <- function(covariateData, cohortId) {
+  result <- attr(covariateData, "metaData")$populationSize
+  if (is.null(cohortId)) {
+    return(result)
+  }
+  key <- as.character(cohortId)
+  if (all(key %in% names(result))) {
+    return(result[key])
+  }
+  result[cohortId]
+}
diff --git a/R/UnitTestHelperFunctions.R b/R/UnitTestHelperFunctions.R
@@ -59,7 +59,9 @@
 #' of the createCovariate functions, or a list of such objects.
 #' @param aggregated Should aggregate statistics be computed instead of covariates per
 #' cohort entry?
-#'
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' will help reduce the file size of the characterization output, but will remove information
+#' on covariates that have very low values. The default is 0.
 #' @return
 #' Returns an object of type \code{covariateData}, containing information on the covariates.
 #'
@@ -94,7 +96,8 @@
  cdmVersion = "5",
  rowIdField = "subject_id",
  covariateSettings,
- aggregated = FALSE) {
+ aggregated = FALSE,
+ minCharacterizationMean = 0) {
  writeLines("Constructing length of observation covariates")
  if (covariateSettings$useLengthOfObs == FALSE) {
  return(NULL)

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/articles/CreatingCovariatesBasedOnOtherCohorts.html b/docs/articles/CreatingCovariatesBasedOnOtherCohorts.html
diff --git a/docs/articles/CreatingCovariatesUsingCohortAttributes.html b/docs/articles/CreatingCovariatesUsingCohortAttributes.html
diff --git a/docs/articles/CreatingCustomCovariateBuilders.html b/docs/articles/CreatingCustomCovariateBuilders.html
diff --git a/docs/articles/CreatingCustomCovariateBuildersKorean.html b/docs/articles/CreatingCustomCovariateBuildersKorean.html
diff --git a/docs/articles/UsingFeatureExtraction.html b/docs/articles/UsingFeatureExtraction.html
diff --git a/docs/articles/UsingFeatureExtractionKorean.html b/docs/articles/UsingFeatureExtractionKorean.html
diff --git a/docs/articles/index.html b/docs/articles/index.html