-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1452 from dbetebenner/master
Fixing/creating createSuperCohort function
- Loading branch information
Showing
8 changed files
with
65 additions
and
69 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,62 +1,42 @@ | ||
`createSuperCohortData` <- | ||
`createSuperCohortData` <- | ||
function( | ||
dataForSuperCohort, | ||
base_data, | ||
sgp.config, | ||
simex.baseline.config, | ||
content_areas, | ||
years, | ||
grades, | ||
baseline.grade.sequences.lags, | ||
exclude.years) { | ||
|
||
VALID_CASE <- CONTENT_AREA <- YEAR <- GRADE <- YEAR_WITHIN <- COHORT_YEAR <- NULL | ||
|
||
|
||
### Utility functions | ||
|
||
test.year.sequence <- function(content_areas, years, grades, baseline.grade.sequences.lags=NULL) { | ||
grades <- type.convert(as.character(grades), as.is=TRUE) | ||
if (is.null(baseline.grade.sequences.lags)) baseline.grade.sequences.lags <- rep(1L, length(grades)-1L) | ||
tmp.years.sequence <- list() | ||
tmp.years.sequence <- lapply(years, function(x) yearIncrement(year=x, increment=c(0,cumsum(baseline.grade.sequences.lags)))) | ||
return(tmp.years.sequence[sapply(tmp.years.sequence, function(x) all(x %in% years))]) | ||
} ### END test.year.sequence | ||
|
||
variables.to.get=c("VALID_CASE", "YEAR", "CONTENT_AREA", "GRADE", "ID", "SCALE_SCORE", "ACHIEVEMENT_LEVEL", "YEAR_WITHIN", "FIRST_OBSERVATION", | ||
"LAST_OBSERVATION", simex.baseline.config$csem.data.vnames) | ||
tmp_sgp_data_for_analysis <- dataForSuperCohort[,intersect(names(dataForSuperCohort), variables.to.get), with=FALSE]["VALID_CASE"] | ||
|
||
if ("YEAR_WITHIN" %in% names(tmp_sgp_data_for_analysis)) { | ||
setkey(tmp_sgp_data_for_analysis, VALID_CASE, CONTENT_AREA, YEAR, GRADE, YEAR_WITHIN) | ||
year_within.tf <- TRUE | ||
} else { | ||
setkey(tmp_sgp_data_for_analysis, VALID_CASE, CONTENT_AREA, YEAR, GRADE) | ||
year_within.tf <- FALSE | ||
} | ||
|
||
tmp.year.sequence <- test.year.sequence(content_areas, years, grades, baseline.grade.sequences.lags) | ||
|
||
if (!is.null(exclude.years)) { | ||
tmp.year.sequence <- tmp.year.sequence[sapply(tmp.year.sequence, function(x) !tail(x, 1) %in% exclude.years)] | ||
} | ||
|
||
tmp.list <- list() | ||
for (cohort.iter in seq_along(tmp.year.sequence)) { | ||
tmp.sgp.iter <- sgp.config[[cohort.iter]] # Convert sgp.config into a valid sgp.iter for getPanelData | ||
names(tmp.sgp.iter) <- gsub('sgp.baseline.', 'sgp.', names(tmp.sgp.iter)) | ||
tmp.sgp.iter$sgp.panel.years <- tmp.year.sequence[[cohort.iter]] | ||
tmp.sgp.iter$sgp.grade.sequences <- tmp.sgp.iter$sgp.grade.sequences | ||
if (!is.null(tmp.sgp.iter$sgp.exclude.sequences)) tmp.sgp.iter$sgp.exclude.sequences <- tmp.sgp.iter$sgp.exclude.sequences[COHORT_YEAR %in% tail(tmp.sgp.iter$sgp.panel.years, 1L)] | ||
if (!is.null(simex.baseline.config)) sgp.csem <- simex.baseline.config$csem.data.vnames else sgp.csem <- NULL | ||
tmp.list[[cohort.iter]] <- getPanelData(tmp_sgp_data_for_analysis, "sgp.percentiles", sgp.iter = tmp.sgp.iter, sgp.csem=sgp.csem) | ||
setnames(tmp.list[[cohort.iter]], c("ID", | ||
paste("GRADE", rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"), | ||
paste("SCALE_SCORE", rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"), | ||
if(year_within.tf) paste("YEAR_WITHIN", rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"), | ||
if(!is.null(sgp.csem)) paste(sgp.csem, rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"))) | ||
} | ||
tmp.dt <- rbindlist(tmp.list, fill=TRUE) | ||
if (year_within.tf) tmp.dt[, grep("YEAR_WITHIN", names(tmp.dt)) := NULL] # remove YEAR_WITHIN from Data where relevant | ||
setkey(tmp.dt) | ||
return(tmp.dt) | ||
} ### END createSuperCohortData | ||
target_years, ## Provide either target_years OR num_priors. length(target_years) = num_priors + 1 | ||
num_priors, ## Provide either num_priors OR target_years. num_priors = length(target_years) - 1 | ||
indicate_cohort=FALSE | ||
) { | ||
|
||
### Parameters | ||
data.years <- sort(unique(base_data$YEAR)) | ||
tmp.cohort.list <- list() | ||
|
||
### Test parameters | ||
if (missing(target_years) & missing(num_priors)) stop("Provide either targets years for super-cohort or number of priors of desired SGP analyses.") | ||
if (missing(target_years) & !missing(num_priors)) target_years <- tail(data.years, num_priors) | ||
if (length(target_years) >= length(data.years)) stop("Super-cohort construction is possible only when number of target_years is less than number of years in supplied data.") | ||
|
||
### Loop over configurations | ||
for (sgp.config.iter in sgp.config) { | ||
tmp.list <- list() | ||
for (data.years.iter in 1:(length(data.years) - sum(sgp.config.iter[['sgp.grade.sequences.lags']]))) { | ||
grade_year_content_area_map <- data.table( | ||
CONTENT_AREA = sgp.config.iter[['sgp.content.areas']], | ||
YEAR = data.years[c(data.years.iter, data.years.iter + (cumsum(sgp.config.iter[['sgp.grade.sequences.lags']])))], | ||
GRADE = sgp.config.iter[['sgp.grade.sequences']], | ||
YEAR_NEW = tail(data.years, length(sgp.config.iter[['sgp.content.areas']]))) | ||
|
||
tmp.list[[data.years.iter]] <- base_data[grade_year_content_area_map, on=list(CONTENT_AREA, YEAR, GRADE)][,YEAR:=YEAR_NEW][,YEAR_NEW:=NULL] | ||
|
||
if (indicate_cohort) { | ||
tmp.list[[data.years.iter]][,COHORT:=paste(unlist(grade_year_content_area_map[,1:3]), collapse="_")] | ||
} | ||
|
||
tmp.dt <- rbindlist(tmp.list) | ||
setkey(tmp.dt, YEAR, GRADE, ID) | ||
tmp.dt <- tmp.dt[unique(tmp.dt[,.(YEAR, GRADE, ID)]), mult="last"] ### Remove duplicates created by collapsing data into YEAR_NEW taking LAST (most recent) case | ||
} | ||
tmp.cohort.list[[paste(sgp.config.iter[['sgp.content.areas']][1], paste(sgp.config.iter[['sgp.grade.sequences']], collapse=""), sep="_")]] <- tmp.dt | ||
} | ||
return(rbindlist(tmp.cohort.list)) | ||
} ### END createSuperCohortData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters