Release v3.5.1

OHDSI · Apr 22, 2024 · a93992b · a93992b
2 parents 4a10c69 + d2aa800
commit a93992b
Show file tree

Hide file tree

Showing 87 changed files with 1,006 additions and 811 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: FeatureExtraction
 Type: Package
 Title: Generating Features for a Cohort
-Version: 3.5.0
-Date: 2024-04-18
+Version: 3.5.1
+Date: 2024-04-22
 Authors@R: c(
  person("Martijn", "Schuemie", , "schuemie@ohdsi.org", role = c("aut")),
  person("Marc", "Suchard", role = c("aut")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+FeatureExtraction 3.5.1
+=======================
+
+Bug Fixes:
+
+- Minimum threshold for covariate mean should be inclusive of the value parameter (#249)
+
 FeatureExtraction 3.5.0
 =======================
 

diff --git a/R/CovariateData.R b/R/CovariateData.R
@@ -279,20 +279,28 @@ createEmptyCovariateData <- function(cohortIds, aggregated, temporal) {
  if (!is.null(temporal) && temporal) {
  dummy$timeId <- 1
  }
- covariateData <- Andromeda::andromeda(covariates = dummy[!1, ],
- covariateRef = tibble(covariateId = 1, 
- covariateName = "", 
- analysisId = 1,
- conceptId = 1)[!1, ],
- analysisRef = tibble(analysisId = 1, 
- analysisName = "",
- domainId = "",
- startDay = 1, 
- endDay = 1, 
- isBinary = "", 
- missingMeansZero = "")[!1, ])
- attr(covariateData, "metaData") <- list(populationSize = 0,
- cohortIds = cohortIds)
+ covariateData <- Andromeda::andromeda(
+ covariates = dummy[!1, ],
+ covariateRef = tibble(
+ covariateId = 1,
+ covariateName = "",
+ analysisId = 1,
+ conceptId = 1
+ )[!1, ],
+ analysisRef = tibble(
+ analysisId = 1,
+ analysisName = "",
+ domainId = "",
+ startDay = 1,
+ endDay = 1,
+ isBinary = "",
+ missingMeansZero = ""
+ )[!1, ]
+ )
+ attr(covariateData, "metaData") <- list(
+ populationSize = 0,
+ cohortIds = cohortIds
+ )
  class(covariateData) <- "CovariateData"
  return(covariateData)
 }
diff --git a/R/GetCovariates.R b/R/GetCovariates.R
@@ -59,7 +59,7 @@
 #' of the createCovariate functions, or a list of such objects.
 #' @param aggregated Should aggregate statistics be computed instead of covariates per
 #' cohort entry?
-#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This
 #' will help reduce the file size of the characterization output, but will remove information
 #' on covariates that have very low values. The default is 0.
 #'
@@ -115,13 +115,13 @@ getDbCovariateData <- function(connectionDetails = NULL,
  if (cdmVersion == "4") {
  stop("CDM version 4 is not supported any more")
  }
- if (!missing(cohortId)) { 
+ if (!missing(cohortId)) {
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }
  errorMessages <- checkmate::makeAssertCollection()
  minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
- checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+ checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, upper = 1, add = errorMessages)
  checkmate::reportAssertions(collection = errorMessages)
  if (!is.null(connectionDetails)) {
  connection <- DatabaseConnector::connect(connectionDetails)
@@ -137,12 +137,16 @@ getDbCovariateData <- function(connectionDetails = NULL,
  cohortDatabaseSchemaTable <- paste(cohortDatabaseSchema, cohortTable, sep = ".")
  }
  sql <- "SELECT cohort_definition_id, COUNT_BIG(*) AS population_size FROM @cohort_database_schema_table {@cohort_ids != -1} ? {WHERE cohort_definition_id IN (@cohort_ids)} GROUP BY cohort_definition_id;"
- sql <- SqlRender::render(sql = sql,
- cohort_database_schema_table = cohortDatabaseSchemaTable,
- cohort_ids = cohortIds)
- sql <- SqlRender::translate(sql = sql,
- targetDialect = attr(connection, "dbms"),
- oracleTempSchema = oracleTempSchema)
+ sql <- SqlRender::render(
+ sql = sql,
+ cohort_database_schema_table = cohortDatabaseSchemaTable,
+ cohort_ids = cohortIds
+ )
+ sql <- SqlRender::translate(
+ sql = sql,
+ targetDialect = attr(connection, "dbms"),
+ oracleTempSchema = oracleTempSchema
+ )
  temp <- DatabaseConnector::querySql(connection, sql, snakeCaseToCamelCase = TRUE)
  if (aggregated) {
  populationSize <- temp$populationSize
@@ -164,16 +168,18 @@ getDbCovariateData <- function(connectionDetails = NULL,
  }
  for (i in 1:length(covariateSettings)) {
  fun <- attr(covariateSettings[[i]], "fun")
- args <- list(connection = connection,
- oracleTempSchema = oracleTempSchema,
- cdmDatabaseSchema = cdmDatabaseSchema,
- cohortTable = cohortDatabaseSchemaTable,
- cohortIds = cohortIds,
- cdmVersion = cdmVersion,
- rowIdField = rowIdField,
- covariateSettings = covariateSettings[[i]],
- aggregated = aggregated,
- minCharacterizationMean = minCharacterizationMean)
+ args <- list(
+ connection = connection,
+ oracleTempSchema = oracleTempSchema,
+ cdmDatabaseSchema = cdmDatabaseSchema,
+ cohortTable = cohortDatabaseSchemaTable,
+ cohortIds = cohortIds,
+ cdmVersion = cdmVersion,
+ rowIdField = rowIdField,
+ covariateSettings = covariateSettings[[i]],
+ aggregated = aggregated,
+ minCharacterizationMean = minCharacterizationMean
+ )
  tempCovariateData <- do.call(eval(parse(text = fun)), args)
  if (is.null(covariateData)) {
  covariateData <- tempCovariateData
@@ -199,8 +205,10 @@ getDbCovariateData <- function(connectionDetails = NULL,
  attr(covariateData, "metaData")[[name]] <- attr(tempCovariateData, "metaData")[[name]]
  } else {
  attr(covariateData, "metaData")[[name]] <- list(
- c(unlist(attr(covariateData, "metaData")[[name]]),
- attr(tempCovariateData, "metaData")[[name]])
+ c(
+ unlist(attr(covariateData, "metaData")[[name]]),
+ attr(tempCovariateData, "metaData")[[name]]
+ )
  )
  }
  }

diff --git a/R/GetCovariatesFromCohortAttributes.R b/R/GetCovariatesFromCohortAttributes.R
@@ -85,7 +85,7 @@ getDbCohortAttrCovariatesData <- function(connection,
  if (cdmVersion == "4") {
  stop("Common Data Model version 4 is not supported")
  }
- if (!missing(cohortId)) { 
+ if (!missing(cohortId)) {
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }
@@ -160,10 +160,10 @@ getDbCohortAttrCovariatesData <- function(connection,
  writeLines(paste("Loading took", signif(delta, 3), attr(delta, "units")))
 
  result <- createEmptyCovariateData(cohortIds, aggregated, covariateSettings$temporal)
- result$covariates = covariates
- result$covariateRef = covariateRef
- result$analysisRef = analysisRef
- 
+ result$covariates <- covariates
+ result$covariateRef <- covariateRef
+ result$analysisRef <- analysisRef
+
  return(result)
 }
 

diff --git a/R/GetCovariatesFromOtherCohorts.R b/R/GetCovariatesFromOtherCohorts.R
@@ -22,7 +22,7 @@
 #' @param covariateSettings An object of type \code{covariateSettings} as created using the
 #' \code{\link{createCohortBasedCovariateSettings}} or
 #' \code{\link{createCohortBasedTemporalCovariateSettings}} functions.
-#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' @param minCharacterizationMean The minimum mean value for binary characterization output. Values below this will be cut off from output. This
 #' will help reduce the file size of the characterization output, but will remove information
 #' on covariates that have very low values. The default is 0.
 #' @template GetCovarParams
@@ -50,9 +50,9 @@ getDbCohortBasedCovariatesData <- function(connection,
  checkmate::assertClass(covariateSettings, "covariateSettings", add = errorMessages)
  checkmate::assertLogical(aggregated, len = 1, add = errorMessages)
  minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
- checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+ checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, upper = 1, add = errorMessages)
  checkmate::reportAssertions(collection = errorMessages)
- if (!missing(cohortId)) { 
+ if (!missing(cohortId)) {
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }

diff --git a/R/GetDefaultCovariates.R b/R/GetDefaultCovariates.R
@@ -31,7 +31,7 @@
 #' it is a temp table, do not specify \code{targetDatabaseSchema}.
 #' @param targetCovariateRefTable (Optional) The name of the table where the covariate reference will be stored.
 #' @param targetAnalysisRefTable (Optional) The name of the table where the analysis reference will be stored.
-#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' @param minCharacterizationMean The minimum mean value for binary characterization output. Values below this will be cut off from output. This
 #' will help reduce the file size of the characterization output, but will remove information
 #' on covariates that have very low values. The default is 0.
 #'
@@ -79,15 +79,15 @@ getDbDefaultCovariateData <- function(connection,
  if (!missing(targetCovariateTable) && !is.null(targetCovariateTable) && aggregated) {
  stop("Writing aggregated results to database is currently not supported")
  }
- if (!missing(cohortId)) { 
+ if (!missing(cohortId)) {
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }
  errorMessages <- checkmate::makeAssertCollection()
  minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
- checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+ checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, upper = 1, add = errorMessages)
  checkmate::reportAssertions(collection = errorMessages)
- 
+
  settings <- .toJson(covariateSettings)
  rJava::J("org.ohdsi.featureExtraction.FeatureExtraction")$init(system.file("", package = "FeatureExtraction"))
  json <- rJava::J("org.ohdsi.featureExtraction.FeatureExtraction")$createSql(settings, aggregated, cohortTable, rowIdField, rJava::.jarray(as.character(cohortIds)), cdmDatabaseSchema)
@@ -152,7 +152,6 @@ getDbDefaultCovariateData <- function(connection,
  andromedaTableName = "covariatesContinuous",
  snakeCaseToCamelCase = TRUE
  )
- filterCovariateDataCovariates(covariateData, "covariatesContinuous", minCharacterizationMean)
  }
 
  # Covariate reference
@@ -275,9 +274,11 @@ getDbDefaultCovariateData <- function(connection,
  attr(covariateData, "metaData") <- list()
  if (is.null(covariateData$covariates) && is.null(covariateData$covariatesContinuous)) {
  warning("No data found, probably because no covariates were specified.")
- covariateData <- createEmptyCovariateData(cohortIds = cohortIds,
- aggregated = aggregated,
- temporal = covariateSettings$temporal)
+ covariateData <- createEmptyCovariateData(
+ cohortIds = cohortIds,
+ aggregated = aggregated,
+ temporal = covariateSettings$temporal
+ )
  }
  class(covariateData) <- "CovariateData"
  attr(class(covariateData), "package") <- "FeatureExtraction"
@@ -289,12 +290,12 @@ getDbDefaultCovariateData <- function(connection,
 #'
 #' @param covariateData The covariate data
 #' @param covariatesName The name of the covariates object inside the covariateData
-#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This
 #' will help reduce the file size of the characterization output, but will remove information
 #' on covariates that have very low values. The default is 0.
 filterCovariateDataCovariates <- function(covariateData, covariatesName, minCharacterizationMean = 0) {
  if ("averageValue" %in% colnames(covariateData[[covariatesName]]) && minCharacterizationMean != 0) {
  covariateData[[covariatesName]] <- covariateData[[covariatesName]] %>%
- dplyr::filter(.data$averageValue > minCharacterizationMean)
+ dplyr::filter(.data$averageValue >= minCharacterizationMean)
  }
-}
+}
diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R
@@ -86,7 +86,7 @@ filterByRowId <- function(covariateData, rowIds) {
 #' }
 #'
 #' @export
-filterByCohortDefinitionId <- function(covariateData, 
+filterByCohortDefinitionId <- function(covariateData,
  cohortId = 1,
  cohortIds = c(1)) {
  if (!isCovariateData(covariateData)) {
@@ -98,11 +98,11 @@ filterByCohortDefinitionId <- function(covariateData,
  if (!isAggregatedCovariateData(covariateData)) {
  stop("Can only filter aggregated data by cohortIds")
  }
- if (!missing(cohortId)) { 
+ if (!missing(cohortId)) {
  warning("cohortId argument has been deprecated, please use cohortIds")
  cohortIds <- cohortId
  }
- 
+
  if (is.null(covariateData$covariates)) {
  covariates <- NULL
  } else {

diff --git a/R/UnitTestHelperFunctions.R b/R/UnitTestHelperFunctions.R
@@ -59,7 +59,7 @@
 #' of the createCovariate functions, or a list of such objects.
 #' @param aggregated Should aggregate statistics be computed instead of covariates per
 #' cohort entry?
-#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#' @param minCharacterizationMean The minimum mean value for binary characterization output. Values below this will be cut off from output. This
 #' will help reduce the file size of the characterization output, but will remove information
 #' on covariates that have very low values. The default is 0.
 #' @return
@@ -107,20 +107,23 @@
  }
 
  # Some SQL to construct the covariate:
- sql <- paste("SELECT @row_id_field AS row_id, 1 AS covariate_id,",
- "DATEDIFF(DAY, observation_period_start_date, cohort_start_date)",
- "AS covariate_value",
- "FROM @cohort_table c",
- "INNER JOIN @cdm_database_schema.observation_period op",
- "ON op.person_id = c.subject_id",
- "WHERE cohort_start_date >= observation_period_start_date",
- "AND cohort_start_date <= observation_period_end_date",
- "{@cohort_ids != -1} ? {AND cohort_definition_id IN @cohort_ids}")
+ sql <- paste(
+ "SELECT @row_id_field AS row_id, 1 AS covariate_id,",
+ "DATEDIFF(DAY, observation_period_start_date, cohort_start_date)",
+ "AS covariate_value",
+ "FROM @cohort_table c",
+ "INNER JOIN @cdm_database_schema.observation_period op",
+ "ON op.person_id = c.subject_id",
+ "WHERE cohort_start_date >= observation_period_start_date",
+ "AND cohort_start_date <= observation_period_end_date",
+ "{@cohort_ids != -1} ? {AND cohort_definition_id IN @cohort_ids}"
+ )
  sql <- SqlRender::render(sql,
- cohort_table = cohortTable,
- cohort_ids = cohortIds,
- row_id_field = rowIdField,
- cdm_database_schema = cdmDatabaseSchema)
+ cohort_table = cohortTable,
+ cohort_ids = cohortIds,
+ row_id_field = rowIdField,
+ cdm_database_schema = cdmDatabaseSchema
+ )
  sql <- SqlRender::translate(sql, targetDialect = attr(connection, "dbms"))
  # Retrieve the covariate:
  covariates <- DatabaseConnector::querySql(connection, sql, snakeCaseToCamelCase = TRUE)

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/articles/CreatingCovariatesBasedOnOtherCohorts.html b/docs/articles/CreatingCovariatesBasedOnOtherCohorts.html