Merge pull request #1452 from dbetebenner/master

Fixing/creating createSuperCohort function
CenterForAssessment · Oct 10, 2024 · 3bdc641 · 3bdc641
2 parents 35a08bf + cf1166c
commit 3bdc641
Show file tree

Hide file tree

Showing 8 changed files with 65 additions and 69 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -12,7 +12,7 @@ authors:
   given-names: "Shang"
 title: "SGP: Student Growth Percentiles & Percentile Growth Trajectories"
 <<<<<<< HEAD
-version: 2.2-0.0
+version: 2.2-0.1
 doi: 10.5281/zenodo.3634024
-date-released: 2024-10-6
+date-released: 2024-10-10
 url: "https://sgp.io"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: SGP
 Type: Package
 Title: Student Growth Percentiles & Percentile Growth Trajectories
-Version: 2.2-0.0
-Date: 2024-10-6
+Version: 2.2-0.1
+Date: 2024-10-10
 Authors@R: c(person(given=c("Damian", "W."), family="Betebenner", email="dbetebenner@nciea.org", role=c("aut", "cre")),
 		person(given=c("Adam", "R."), family="Van Iwaarden", email="avaniwaarden@nciea.org", role="aut"),
 		person(given="Ben", family="Domingue", email="ben.domingue@gmail.com", role="aut"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# SGP 2.2-0.0
+
+## User Visible Changes:
+
+* Alignment of scale score targets with time frame of projections
+* Alignment of scale score targets with cutscores
+* General bug fixes and meta-data updates to SGPstateData
+
 # SGP 2.1-0.0
 
 ## User Visible Changes:

diff --git a/R/createSuperCohortData.R b/R/createSuperCohortData.R
@@ -1,62 +1,42 @@
-`createSuperCohortData` <-
+`createSuperCohortData` <- 
 function(
-        dataForSuperCohort,
+        base_data, 
         sgp.config,
-	simex.baseline.config,
-        content_areas,
-        years,
-        grades,
-        baseline.grade.sequences.lags,
-        exclude.years) {
-
-        VALID_CASE <- CONTENT_AREA <- YEAR <- GRADE <- YEAR_WITHIN <- COHORT_YEAR <- NULL
-
-
-        ### Utility functions
-
-        test.year.sequence <- function(content_areas, years, grades, baseline.grade.sequences.lags=NULL) {
-            grades <- type.convert(as.character(grades), as.is=TRUE)
-            if (is.null(baseline.grade.sequences.lags)) baseline.grade.sequences.lags <- rep(1L, length(grades)-1L)
-            tmp.years.sequence <- list()
-            tmp.years.sequence <- lapply(years, function(x) yearIncrement(year=x, increment=c(0,cumsum(baseline.grade.sequences.lags))))
-            return(tmp.years.sequence[sapply(tmp.years.sequence, function(x) all(x %in% years))])
-        } ### END test.year.sequence
-
-        variables.to.get=c("VALID_CASE", "YEAR", "CONTENT_AREA", "GRADE", "ID", "SCALE_SCORE", "ACHIEVEMENT_LEVEL", "YEAR_WITHIN", "FIRST_OBSERVATION", 
-		"LAST_OBSERVATION", simex.baseline.config$csem.data.vnames)
-        tmp_sgp_data_for_analysis <- dataForSuperCohort[,intersect(names(dataForSuperCohort), variables.to.get), with=FALSE]["VALID_CASE"]
-
-        if ("YEAR_WITHIN" %in% names(tmp_sgp_data_for_analysis)) {
-            setkey(tmp_sgp_data_for_analysis, VALID_CASE, CONTENT_AREA, YEAR, GRADE, YEAR_WITHIN)
-            year_within.tf <- TRUE
-        } else {
-            setkey(tmp_sgp_data_for_analysis, VALID_CASE, CONTENT_AREA, YEAR, GRADE)
-            year_within.tf <- FALSE
-        }
-
-        tmp.year.sequence <- test.year.sequence(content_areas, years, grades, baseline.grade.sequences.lags)
-
-        if (!is.null(exclude.years)) {
-            tmp.year.sequence <- tmp.year.sequence[sapply(tmp.year.sequence, function(x) !tail(x, 1) %in% exclude.years)]
-        }
-
-        tmp.list <- list()
-        for (cohort.iter in seq_along(tmp.year.sequence)) {
-            tmp.sgp.iter <- sgp.config[[cohort.iter]] # Convert sgp.config into a valid sgp.iter for getPanelData
-            names(tmp.sgp.iter) <- gsub('sgp.baseline.', 'sgp.', names(tmp.sgp.iter))
-            tmp.sgp.iter$sgp.panel.years <- tmp.year.sequence[[cohort.iter]]
-            tmp.sgp.iter$sgp.grade.sequences <- tmp.sgp.iter$sgp.grade.sequences
-            if (!is.null(tmp.sgp.iter$sgp.exclude.sequences)) tmp.sgp.iter$sgp.exclude.sequences <- tmp.sgp.iter$sgp.exclude.sequences[COHORT_YEAR %in% tail(tmp.sgp.iter$sgp.panel.years, 1L)]
-            if (!is.null(simex.baseline.config))  sgp.csem <- simex.baseline.config$csem.data.vnames else sgp.csem <- NULL
-            tmp.list[[cohort.iter]] <- getPanelData(tmp_sgp_data_for_analysis, "sgp.percentiles", sgp.iter = tmp.sgp.iter, sgp.csem=sgp.csem)
-            setnames(tmp.list[[cohort.iter]], c("ID",
-                paste("GRADE", rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"),
-                paste("SCALE_SCORE", rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"),
-                if(year_within.tf) paste("YEAR_WITHIN", rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_"),
-                if(!is.null(sgp.csem)) paste(sgp.csem, rev(seq_along(tmp.year.sequence[[cohort.iter]])), sep="_")))
-        }
-        tmp.dt <- rbindlist(tmp.list, fill=TRUE)
-        if (year_within.tf) tmp.dt[, grep("YEAR_WITHIN", names(tmp.dt)) := NULL] # remove YEAR_WITHIN from Data where relevant
-        setkey(tmp.dt)
-        return(tmp.dt)
-} ### END createSuperCohortData
+        target_years, ## Provide either target_years OR num_priors. length(target_years) = num_priors + 1
+        num_priors, ## Provide either num_priors OR target_years. num_priors = length(target_years) - 1
+        indicate_cohort=FALSE
+) {
+
+  ### Parameters
+  data.years <- sort(unique(base_data$YEAR))
+  tmp.cohort.list <- list()
+
+  ### Test parameters
+  if (missing(target_years) & missing(num_priors)) stop("Provide either targets years for super-cohort or number of priors of desired SGP analyses.")
+  if (missing(target_years) & !missing(num_priors)) target_years <- tail(data.years, num_priors)
+  if (length(target_years) >= length(data.years)) stop("Super-cohort construction is possible only when number of target_years is less than number of years in supplied data.")
+
+  ### Loop over configurations
+  for (sgp.config.iter in sgp.config) {
+    tmp.list <- list()
+    for (data.years.iter in 1:(length(data.years) - sum(sgp.config.iter[['sgp.grade.sequences.lags']]))) {
+      grade_year_content_area_map <- data.table(
+        CONTENT_AREA = sgp.config.iter[['sgp.content.areas']],
+        YEAR = data.years[c(data.years.iter, data.years.iter + (cumsum(sgp.config.iter[['sgp.grade.sequences.lags']])))],
+        GRADE = sgp.config.iter[['sgp.grade.sequences']],
+        YEAR_NEW = tail(data.years, length(sgp.config.iter[['sgp.content.areas']])))
+
+      tmp.list[[data.years.iter]] <- base_data[grade_year_content_area_map, on=list(CONTENT_AREA, YEAR, GRADE)][,YEAR:=YEAR_NEW][,YEAR_NEW:=NULL]
+
+      if (indicate_cohort) {
+        tmp.list[[data.years.iter]][,COHORT:=paste(unlist(grade_year_content_area_map[,1:3]), collapse="_")]
+      }
+
+      tmp.dt <- rbindlist(tmp.list)
+      setkey(tmp.dt, YEAR, GRADE, ID)
+      tmp.dt <- tmp.dt[unique(tmp.dt[,.(YEAR, GRADE, ID)]), mult="last"] ### Remove duplicates created by collapsing data into YEAR_NEW taking LAST (most recent) case
+    }
+    tmp.cohort.list[[paste(sgp.config.iter[['sgp.content.areas']][1], paste(sgp.config.iter[['sgp.grade.sequences']], collapse=""), sep="_")]] <- tmp.dt
+  }
+  return(rbindlist(tmp.cohort.list))
+} ### END createSuperCohortData
diff --git a/R/zzz.R b/R/zzz.R
@@ -7,6 +7,6 @@ function(libname, pkgname) {
 `.onAttach` <-
 function(libname, pkgname) {
 	if (interactive()) {
-		packageStartupMessage(magenta$bold('SGP',paste(paste0(unlist(strsplit(as.character(packageVersion("SGP")), "[.]")), c(".", "-", ".", "")), collapse=""),' (10-6-2024). For help: >help("SGP") or visit sgp.io'))
+		packageStartupMessage(magenta$bold('SGP',paste(paste0(unlist(strsplit(as.character(packageVersion("SGP")), "[.]")), c(".", "-", ".", "")), collapse=""),' (10-10-2024). For help: >help("SGP") or visit sgp.io'))
 	}
 }
diff --git a/inst/CITATION b/inst/CITATION
@@ -9,12 +9,12 @@ bibentry(
                     person(given = c("Yi"), family = "Shang")
                    ),
     year         = "2024",
-    note         = "R package version 2.2-0.0",
+    note         = "R package version 2.2-0.1",
     url          = "https://sgp.io",
     textVersion  = paste(
                     "Damian W. Betebenner, Adam R. Van Iwaarden, Benjamin Domingue and Yi Shang (2024).",
                     "SGP: Student Growth Percentiles & Percentile Growth Trajectories.",
-                    "(R package version 2.2-0.0)",
+                    "(R package version 2.2-0.1)",
                     "URL: https://sgp.io"
                   )
 )
diff --git a/inst/NEWS b/inst/NEWS
@@ -1,3 +1,11 @@
+# SGP 2.2-0.0
+
+## User Visible Changes:
+
+* Alignment of scale score targets with time frame of projections
+* Alignment of scale score targets with cutscores
+* General bug fixes and meta-data updates to SGPstateData
+
 # SGP 2.1-0.0
 
 ## User Visible Changes:

diff --git a/man/SGP-package.Rd b/man/SGP-package.Rd
@@ -19,8 +19,8 @@ growth projections to be calculated across assessment transitions by equating th
 \tabular{ll}{
 Package: \tab SGP\cr
 Type: \tab Package\cr
-Version: \tab 2.2-0.0\cr
-Date: \tab 2024-10-6\cr
+Version: \tab 2.2-0.1\cr
+Date: \tab 2024-10-10\cr
 License: \tab GPL-3\cr
 LazyLoad: \tab yes\cr
 }