Merge pull request #164 from atorus-research/gh_issue_62

Move data from vignettes to package data
atorus-research · Dec 18, 2023 · 5e1dcf2 · 5e1dcf2
2 parents bb2dc13 + 7cf5e60
commit 5e1dcf2
Show file tree

Hide file tree

Showing 45 changed files with 492 additions and 234 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -22,3 +22,4 @@
 ^Makefile$
 ^Jenkinsfile$
 ^rsconnect$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -62,3 +62,4 @@ VignetteBuilder: knitr
 RoxygenNote: 7.2.3
 RdMacros: lifecycle
 Config/testthat/edition: 3
+LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
@@ -59,6 +59,7 @@ export(collapse_row_labels)
 export(f_str)
 export(get_by)
 export(get_count_layer_formats)
+export(get_data_labels)
 export(get_desc_layer_formats)
 export(get_layer_template)
 export(get_layer_templates)

diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,67 @@
+#' ADSL Data
+#'
+#' A subset of the PHUSE Test Data Factory ADSL data set.
+#'
+#' @format A data.frame with 254 rows and 49 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adsl"
+
+
+#' ADAE Data
+#'
+#' A subset of the PHUSE Test Data Factory ADAE data set.
+#'
+#' @format A data.frame with 276 rows and 55 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adae"
+
+#' ADAS Data
+#'
+#' A subset of the PHUSE Test Data Factory ADAS data set.
+#'
+#' @format A data.frame with 1,040 rows and 40 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adas"
+
+#' ADLB Data
+#'
+#' A subset of the PHUSE Test Data Factory ADLB data set.
+#'
+#' @format A data.frame with 311 rows and 46 columns.
+#'
+#' @seealso [get_data_labels()]
+#'
+#' @source https://github.com/phuse-org/TestDataFactory
+#'
+"tplyr_adlb"
+
+
+#' Get Data Labels
+#'
+#' Get labels for data sets included in Tplyr.
+#'
+#' @param data A Tplyr data set.
+#'
+#' @return A data.frame with columns `name` and `label` containing the names
+#'   and labels of each column. Columns that carry no `label` attribute are
+#'   reported with a label of `NA`.
+#'
+#' @examples
+#' get_data_labels(tplyr_adsl)
+#'
+#' @export
+get_data_labels <- function(data) {
+  map_dfr(
+    names(data),
+    function(name) {
+      # exact = TRUE prevents partial matching against a differently named
+      # attribute (e.g. `labels`) when no `label` attribute is present.
+      label <- attr(data[[name]], "label", exact = TRUE)
+      # A NULL element is dropped when the rows are bound together, which can
+      # leave the result without a `label` column; use NA to keep the
+      # documented two-column shape.
+      if (is.null(label)) {
+        label <- NA_character_
+      }
+      list(name = name, label = label)
+    }
+  )
+}
diff --git a/README.Rmd b/README.Rmd
@@ -17,8 +17,6 @@ library(tidyverse)
 library(magrittr)
 library(Tplyr)
 library(knitr)
-load("vignettes/adae.Rdata")
-load("vignettes/adsl.Rdata")
 ```
 
 # *Tplyr* <img src="man/figures/logo.png" align="right" alt="" width="120" />
@@ -76,11 +74,10 @@ When you look at this table, you can begin breaking this output down into smalle
 So we have one table, with 6 summaries (7 including the next page, not shown) - but only 2 different approaches to summaries being performed. 
 In the same way that [dplyr](https://dplyr.tidyverse.org/) is a grammar of data manipulation, **Tplyr** aims to be a grammar of data summary. The goal of **Tplyr** is to allow you to program a summary table like you see it on the page, by breaking a larger problem into smaller 'layers', and combining them together like you see on the page. 
 
-Enough talking - let's see some code. In these examples, we will be using data from the [PHUSE Test Data Factory]( https://advance.phuse.global/display/WEL/Test+Dataset+Factory) based on the [original pilot project submission package](https://github.com/atorus-research/CDISC_pilot_replication). Note: You can see our replication of the CDISC pilot using the PHUSE Test Data Factory data [here](https://github.com/atorus-research/CDISC_pilot_replication).
+Enough talking - let's see some code. In these examples, we will be using data from the [PHUSE Test Data Factory]( https://advance.phuse.global/display/WEL/Test+Dataset+Factory) based on the [original pilot project submission package](https://github.com/atorus-research/CDISC_pilot_replication). We've packaged some subsets of that data into **Tplyr**, which you can use to replicate our examples and run our vignette code yourself. Note: You can see our replication of the CDISC pilot using the PHUSE Test Data Factory data [here](https://github.com/atorus-research/CDISC_pilot_replication).
 
 ```{r initial_demo}
-
-tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% 
+tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") %>% 
  add_layer(
  group_desc(AGE, by = "Age (years)")
  ) %>% 
@@ -89,7 +86,6 @@ tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>%
  ) %>% 
  build() %>% 
  kable()
-
 ```
 
 ## *Tplyr* is Qualified

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 
-# Tplyr <img src="man/figures/logo.png" align="right" alt="" width="120" />
+# *Tplyr* <img src="man/figures/logo.png" align="right" alt="" width="120" />
 
 <!-- badges: start -->
 
@@ -42,7 +42,7 @@ install.packages("Tplyr")
 devtools::install_github("https://github.com/atorus-research/Tplyr.git", ref="devel")
 ```
 
-# What is Tplyr?
+# What is *Tplyr*?
 
 [dplyr](https://dplyr.tidyverse.org/) from tidyverse is a grammar of
 data manipulation. So what does that allow you to do? It gives you, as a
@@ -58,10 +58,10 @@ pharmaceutical industry, a great deal of the data presented in the
 outputs we create are very similar. For the most part, most of these
 tables can be broken down into a few categories:
 
--  Counting for event based variables or categories
--  Shifting, which is just counting a change in state with a ‘from’ and
-  a ‘to’
--  Generating descriptive statistics around some continuous variable.
+- Counting for event based variables or categories
+- Shifting, which is just counting a change in state with a ‘from’ and a
+ ‘to’
+- Generating descriptive statistics around some continuous variable.
 
 For many of the tables that go into a clinical submission, the tables
 are made up of a combination of these approaches. Consider a
@@ -81,15 +81,15 @@ into smaller, redundant, components. These components can be viewed as
 layers. The boxes in the image above represent how you can begin to
 conceptualize this.
 
--  First we have Sex, which is made up of n (%) counts.
--  Next we have Age as a continuous variable, where we have a number of
-  descriptive statistics, including n, mean, standard deviation,
-  median, quartile 1, quartile 3, min, max, and missing values.
--  After that we have age, but broken into categories - so this is once
-  again n (%) values.
--  Race - more counting,
--  Ethnicity - more counting
--  Weight - and we’re back to descriptive statistics.
+- First we have Sex, which is made up of n (%) counts.
+- Next we have Age as a continuous variable, where we have a number of
+ descriptive statistics, including n, mean, standard deviation, median,
+ quartile 1, quartile 3, min, max, and missing values.
+- After that we have age, but broken into categories - so this is once
+ again n (%) values.
+- Race - more counting,
+- Ethnicity - more counting
+- Weight - and we’re back to descriptive statistics.
 
 So we have one table, with 6 summaries (7 including the next page, not
 shown) - but only 2 different approaches to summaries being performed.
@@ -104,13 +104,14 @@ using data from the [PHUSE Test Data
 Factory](https://advance.phuse.global/display/WEL/Test+Dataset+Factory)
 based on the [original pilot project submission
 package](https://github.com/atorus-research/CDISC_pilot_replication).
-Note: You can see our replication of the CDISC pilot using the PHUSE
-Test Data Factory data
+We’ve packaged some subsets of that data into **Tplyr**, which you can
+use to replicate our examples and run our vignette code yourself. Note:
+You can see our replication of the CDISC pilot using the PHUSE Test Data
+Factory data
 [here](https://github.com/atorus-research/CDISC_pilot_replication).
 
 ``` r
-
-tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>% 
+tplyr_table(tplyr_adsl, TRT01P, where = SAFFL == "Y") %>% 
  add_layer(
  group_desc(AGE, by = "Age (years)")
  ) %>% 
@@ -133,7 +134,7 @@ tplyr_table(adsl, TRT01P, where = SAFFL == "Y") %>%
 | Age Categories n (%) | \>80 | 30 ( 34.9%) | 18 ( 21.4%) | 29 ( 34.5%) | 2 | 1 | 2 |
 | Age Categories n (%) | 65-80 | 42 ( 48.8%) | 55 ( 65.5%) | 47 ( 56.0%) | 2 | 1 | 3 |
 
-## Tplyr is Qualified
+## *Tplyr* is Qualified
 
 We understand how important documentation and testing is within the
 pharmaceutical world. This is why outside of unit testing **Tplyr**
@@ -153,38 +154,38 @@ this report.
 
 Here are some of the high level benefits of using **Tplyr**:
 
--  Easy construction of table data using an intuitive syntax
--  Smart string formatting for your numbers that’s easily specified by
-  the user
--  A great deal of flexibility in what is performed and how it’s
-  presented, without specifying hundreds of parameters
+- Easy construction of table data using an intuitive syntax
+- Smart string formatting for your numbers that’s easily specified by
+ the user
+- A great deal of flexibility in what is performed and how it’s
+ presented, without specifying hundreds of parameters
 
 # Where to go from here?
 
 There’s quite a bit more to learn! And we’ve prepared a number of other
 vignettes to help you get what you need out of **Tplyr**.
 
--  The best place to start is with our Getting Started vignette at
-  `vignette("Tplyr")`
--  Learn more about table level settings in `vignette("table")`
--  Learn more about descriptive statistics layers in `vignette("desc")`
--  Learn more about count layers in `vignette("count")`
--  Learn more about shift layers in `vignette("shift")`
--  Learn more about percentages in `vignette("denom")`
--  Learn more about calculating risk differences in
-  `vignette("riskdiff")`
--  Learn more about sorting **Tplyr** tables in `vignette("sort")`
--  Learn more about using **Tplyr** options in `vignette("options")`
--  And finally, learn more about producing and outputting styled tables
-  using **Tplyr** in `vignette("styled-table")`
+- The best place to start is with our Getting Started vignette at
+ `vignette("Tplyr")`
+- Learn more about table level settings in `vignette("table")`
+- Learn more about descriptive statistics layers in `vignette("desc")`
+- Learn more about count layers in `vignette("count")`
+- Learn more about shift layers in `vignette("shift")`
+- Learn more about percentages in `vignette("denom")`
+- Learn more about calculating risk differences in
+ `vignette("riskdiff")`
+- Learn more about sorting **Tplyr** tables in `vignette("sort")`
+- Learn more about using **Tplyr** options in `vignette("options")`
+- And finally, learn more about producing and outputting styled tables
+ using **Tplyr** in `vignette("styled-table")`
 
 In the **Tplyr** version 1.0.0, we’ve packed a number of new features
 in. For deeper dives on the largest new additions:
 
--  Learn about **Tplyr’s** traceability metadata in
-  `vignette("metadata")` and about how it can be extended in
-  `vignette("custom-metadata")`
--  Learn about layer templates in `vignette("layer_templates")`
+- Learn about **Tplyr**’s traceability metadata in
+ `vignette("metadata")` and about how it can be extended in
+ `vignette("custom-metadata")`
+- Learn about layer templates in `vignette("layer_templates")`
 
 # References
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -105,12 +105,13 @@ reference:
 - title: Post-processing
  desc: Post-processing functions
 - contents:
- - str_indent_wrap
- - apply_row_masks
  - apply_conditional_format
+ - apply_formats
+ - apply_row_masks
+ - collapse_row_labels
  - str_extract_fmt_group
  - str_extract_num
- - apply_formats
+ - str_indent_wrap
 - title: Helper functions
  desc: General helper functions
 - contents:
@@ -122,6 +123,14 @@ reference:
  - get_where.tplyr_layer
  - Tplyr
  - get_tplyr_regex
+- title: Data
+ desc: Tplyr Built-in Datasets
+- contents:
+ - tplyr_adae
+ - tplyr_adas
+ - tplyr_adlb
+ - tplyr_adsl
+ - get_data_labels
 
 articles:
 - title: Table Basics

diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R
@@ -0,0 +1,3 @@
+## code to prepare `DATASET` dataset goes here
+## NOTE(review): `DATASET` is never defined in this script, so the call below would fail if run as-is; this looks like an unmodified usethis template -- confirm whether the file should be removed.
+usethis::use_data(DATASET, overwrite = TRUE)
diff --git a/data-raw/adae.R b/data-raw/adae.R
@@ -0,0 +1,6 @@
+# note: adae.Rdata was copied over from vignettes/adae.Rdata
+# this is a copy of the PHUSE Test Data Factory data, trimmed down for size
+
+load("data-raw/adae.Rdata")
+tplyr_adae <- adae  # rename with the tplyr_ prefix used for the packaged data set
+usethis::use_data(tplyr_adae, overwrite = TRUE)
diff --git a/vignettes/adae.Rdata → data-raw/adae.Rdata b/vignettes/adae.Rdata → data-raw/adae.Rdata
diff --git a/data-raw/adas.R b/data-raw/adas.R
@@ -0,0 +1,6 @@
+# note: adas.Rdata was copied over from vignettes/adas.Rdata
+# this is a copy of the PHUSE Test Data Factory data, trimmed down for size
+
+load("data-raw/adas.Rdata")
+tplyr_adas <- adas  # rename with the tplyr_ prefix used for the packaged data set
+usethis::use_data(tplyr_adas, overwrite = TRUE)
diff --git a/vignettes/adas.Rdata → data-raw/adas.Rdata b/vignettes/adas.Rdata → data-raw/adas.Rdata
diff --git a/data-raw/adlb.R b/data-raw/adlb.R
@@ -0,0 +1,6 @@
+# note: adlb.Rdata was copied over from vignettes/adlb.Rdata
+# this is a copy of the PHUSE Test Data Factory data, trimmed down for size
+
+load("data-raw/adlb.Rdata")
+tplyr_adlb <- adlb  # rename with the tplyr_ prefix used for the packaged data set
+usethis::use_data(tplyr_adlb, overwrite = TRUE)
diff --git a/vignettes/adlb.Rdata → data-raw/adlb.Rdata b/vignettes/adlb.Rdata → data-raw/adlb.Rdata
diff --git a/data-raw/adsl.R b/data-raw/adsl.R
@@ -0,0 +1,6 @@
+# note: adsl.Rdata was copied over from vignettes/adsl.Rdata
+# this is a copy of the PHUSE Test Data Factory data, trimmed down for size
+
+load("data-raw/adsl.Rdata")
+tplyr_adsl <- adsl  # rename with the tplyr_ prefix used for the packaged data set
+usethis::use_data(tplyr_adsl, overwrite = TRUE)
diff --git a/vignettes/adsl.Rdata → data-raw/adsl.Rdata b/vignettes/adsl.Rdata → data-raw/adsl.Rdata
diff --git a/data/tplyr_adae.rda b/data/tplyr_adae.rda
diff --git a/data/tplyr_adas.rda b/data/tplyr_adas.rda
diff --git a/data/tplyr_adlb.rda b/data/tplyr_adlb.rda
diff --git a/data/tplyr_adsl.rda b/data/tplyr_adsl.rda
diff --git a/man/collapse_row_labels.Rd b/man/collapse_row_labels.Rd
diff --git a/man/get_data_labels.Rd b/man/get_data_labels.Rd
diff --git a/man/tplyr_adae.Rd b/man/tplyr_adae.Rd