From 1b14585858d173739ca914ad39d1bfdd2de29446 Mon Sep 17 00:00:00 2001
From: James McMahon <james.mcmahon@phs.scot>
Date: Thu, 20 Jan 2022 10:44:23 +0000
Subject: [PATCH] Version 0.8.0 (#31)

* improve test

* Increment minor version

* Update variable lists

* Increment version number

* shorten line length per `{lintr}`

* Remove unnecessary brackets.

* shorten line lengths per `{lintr}`

* Now need to get more rows

* Adding variable 'packs' (#28)

* Add a vector with demographic variables.

* Add a vector with LTC variables

* Add a vector with bedday variables

* Add a vector with cost variables.

* Add a vignette for variable packs

* Increment version number
---
 .gitignore                          |   1 +
 DESCRIPTION                         |   6 +-
 NEWS.md                             |   5 ++
 R/data.R                            |  45 +++++++++++-
 R/read_slf.R                        |  20 ++---
 data/demog_vars.rda                 | Bin 0 -> 319 bytes
 data/ep_file_bedday_vars.rda        | Bin 0 -> 172 bytes
 data/ep_file_cost_vars.rda          | Bin 0 -> 181 bytes
 data/ltc_vars.rda                   | Bin 0 -> 222 bytes
 inst/WORDLIST                       |  12 +--
 man/chi_cohort.Rd                   |  10 ++-
 man/demog_vars.Rd                   |  16 ++++
 man/ep_file_bedday_vars.Rd          |  18 +++++
 man/ep_file_cost_vars.Rd            |  18 +++++
 man/ltc_vars.Rd                     |  17 +++++
 tests/testthat/test-gen_file_path.R |  12 ++-
 vignettes/.gitignore                |   2 +
 vignettes/variable-packs.Rmd        | 109 ++++++++++++++++++++++++++++
 18 files changed, 267 insertions(+), 24 deletions(-)
 create mode 100644 data/demog_vars.rda
 create mode 100644 data/ep_file_bedday_vars.rda
 create mode 100644 data/ep_file_cost_vars.rda
 create mode 100644 data/ltc_vars.rda
 create mode 100644 man/demog_vars.Rd
 create mode 100644 man/ep_file_bedday_vars.Rd
 create mode 100644 man/ep_file_cost_vars.Rd
 create mode 100644 man/ltc_vars.Rd
 create mode 100644 vignettes/.gitignore
 create mode 100644 vignettes/variable-packs.Rmd

diff --git a/.gitignore b/.gitignore
index 234f028..0d7f03b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 .RData
 .Ruserdata
 docs
+inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
index b1571df..b4466ec 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: slfhelper
 Title: Useful functions for working with the Source Linkage Files
-Version: 0.7.1
+Version: 0.8.0
 Authors@R: 
     person("James", "McMahon", , "james.mcmahon@phs.scot", role = c("cre", "aut"),
            comment = c(ORCID = "0000-0002-5380-2029"))
@@ -30,9 +30,13 @@ Imports:
     tibble
 Suggests:
     covr,
+    knitr,
     phsmethods,
+    rmarkdown,
     spelling,
     testthat (>= 3.0.0)
+VignetteBuilder: 
+    knitr
 Remotes: 
     Public-Health-Scotland/phsmethods
 Config/testthat/edition: 3
diff --git a/NEWS.md b/NEWS.md
index e4e0e36..e006315 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# slfhelper 0.8.0
+
+* Add a number of 'variable packs' for easily selecting sets of variables: `demog_vars`, `ltc_vars`, `ep_file_bedday_vars` and `ep_file_cost_vars`.
+* Add a new vignette introducing variable packs.
+
 # slfhelper 0.7.1
 
 * Add check for server and hscdiip access on package load.
diff --git a/R/data.R b/R/data.R
index 215b5d6..6e836e4 100644
--- a/R/data.R
+++ b/R/data.R
@@ -18,6 +18,46 @@
 #' @keywords data
 "indiv_file_vars"
 
+#' Demographic variables
+#'
+#' A vector containing the names of the demographic variables.
+#'
+#' @name demog_vars
+#' @docType data
+#' @keywords data
+"demog_vars"
+
+#' LTC variables
+#'
+#' A vector containing the names of the
+#' Long Term Condition (LTC) variables.
+#'
+#' @name ltc_vars
+#' @docType data
+#' @keywords data
+"ltc_vars"
+
+#' Episode file bedday variables
+#'
+#' A vector containing the names of the
+#' bedday related variables in the
+#' episode file.
+#'
+#' @name ep_file_bedday_vars
+#' @docType data
+#' @keywords data
+"ep_file_bedday_vars"
+
+#' Episode file cost variables
+#'
+#' A vector containing the names of the
+#' cost related variables in the
+#' episode file.
+#'
+#' @name ep_file_cost_vars
+#' @docType data
+#' @keywords data
+"ep_file_cost_vars"
 
 #' HSCP name / code lookup
 #'
@@ -51,8 +91,9 @@
 #' @keywords data
 "recids"
 
-#' A [tibble][tibble::tibble-package] containing an example cohort of CHI numbers under different
-#' variable names. It is used for testing and to illustrate examples.
+#' A [tibble][tibble::tibble-package] containing an example
+#' cohort of CHI numbers under different variable names.
+#' It is used for testing and to illustrate examples.
 #' It is likely that many of the 'chi numbers' are not valid.
 #'
 #' @name chi_cohort
diff --git a/R/read_slf.R b/R/read_slf.R
index 4f6ac47..f9c9f76 100644
--- a/R/read_slf.R
+++ b/R/read_slf.R
@@ -27,13 +27,13 @@ read_slf <-
     # but the column wasn't selected we need to add it (and remove later)
     remove_partnership_var <- FALSE
     remove_recid_var <- FALSE
-    if (!(is.null(optional_params$columns))) {
-      if (!(is.null(partnerships)) &
+    if (!is.null(optional_params$columns)) {
+      if (!is.null(partnerships) &
         !("hscp2018" %in% optional_params$columns)) {
         optional_params$columns <- c(optional_params$columns, "hscp2018")
         remove_partnership_var <- TRUE
       }
-      if (!(is.null(recids)) & file_version == "episode" &
+      if (!is.null(recids) & file_version == "episode" &
         !("recid" %in% optional_params$columns)) {
         optional_params$columns <- c(optional_params$columns, "recid")
         remove_recid_var <- TRUE
@@ -66,7 +66,7 @@ read_slf <-
     # If a partnership is specified filter first;
     # With testing it seems to usually be faster if we do partnership
     # filtering before recid filtering
-    if (!(is.null(partnerships))) {
+    if (!is.null(partnerships)) {
       slfs_list <- purrr::map(
         slfs_list,
         ~ dplyr::filter(.x, .x$hscp2018 %in% partnerships)
@@ -74,7 +74,7 @@ read_slf <-
     }
 
     # If a recid is specified filter now
-    if (!(is.null(recids))) {
+    if (!is.null(recids)) {
       slfs_list <- purrr::map(
         slfs_list,
         ~ dplyr::filter(.x, .x$recid %in% recids)
@@ -134,9 +134,9 @@ read_slf_episode <-
       read_slf(
         year = year,
         file_version = "episode",
-        partnerships = partnerships,
-        recids = recids,
-        columns = columns,
+        partnerships = unique(partnerships),
+        recids = unique(recids),
+        columns = unique(columns),
         ...
       )
     )
@@ -174,8 +174,8 @@ read_slf_individual <-
       read_slf(
         year = year,
         file_version = "individual",
-        partnerships = partnerships,
-        columns = columns,
+        partnerships = unique(partnerships),
+        columns = unique(columns),
         ...
       )
     )
diff --git a/data/demog_vars.rda b/data/demog_vars.rda
new file mode 100644
index 0000000000000000000000000000000000000000..594a27c5b8e62686d3b3dd3a69fa63922df30255
GIT binary patch
literal 319
zcmV-F0l@x3T4*^jL0KkKSxILmLjVB5e}Mn=)l~ojd0+(q7C^r5|1banFaZ{9gGiY*
zJtl@D1T<tC4FlBCfHG4OW`Y?sWB|};8fY{C+KPsTgFqSp0imD)pu|&JRDcoX01?`<
zGaw)b+Tq`8D%D>-H0V<<II@bb8K9{bS|W%^UjSs1jh&DsX_<f&yPXF45({!5hmu~S
zY!1$wpg<O;3hxR{E2m)R)AK3j%`s0!k<cd!0LL-Os?bk~j(Uw0oGt!CcUpy%hSHjE
z8s57D0)X~SjVn1j&o&uEh=eW(mO~OH%?y%<5-}Z+ROp;KF$v<hh=~zDOp;4LH{p3i
z&?7`(QNa7AKw#z%2oM6@jUDwCfcOXvB`SC-M#1ucxox;#ZP&Yz(VJL=rNE&A30OpR
R%;k8Cxgwk>NF=k9A%L0Af$0DM

literal 0
HcmV?d00001

diff --git a/data/ep_file_bedday_vars.rda b/data/ep_file_bedday_vars.rda
new file mode 100644
index 0000000000000000000000000000000000000000..07818d8a7de73785e7e033d8340d781066bfd734
GIT binary patch
literal 172
zcmV;d08{@$T4*^jL0KkKSz~#qF8}~h|A7B_NB{r=2tWk@7C^tZ-yi@1AOMvJDWCui
z8fXJRX^=@0G?_4lfB*)D)SaPCFk~w5c7dIcjFjVZMIvd!w37f>&cGo75dZ*RV}c`T
zk8&EDPMufNeNiD#v7Qsl<LM?eAn(K}utIELgy>PtSLyCT@AitNL=(x$LCsbZ7qP0)
aQGYuvBdosY1eh0(;_gVN3K9%&H3i@h9Yq2F

literal 0
HcmV?d00001

diff --git a/data/ep_file_cost_vars.rda b/data/ep_file_cost_vars.rda
new file mode 100644
index 0000000000000000000000000000000000000000..ceae74da287d5285b716442812605de1b39156f1
GIT binary patch
literal 181
zcmV;m080NtT4*^jL0KkKS;g25;Q#<p|A7B-VE_OE2tWk@7C^tZ-yi@1AOMtxLZ+Ul
zsgM{(fHXZvg$609lzyUU(8$T405r&^csK@y@ei{Nh=Lw@M3JUwL=&Krfu%tb1CbDr
zc-o|1*I~k#-*z$iKi*QAXTpDAt)w9l%37>u&>8?9@L9Wcl9#yF9Qrv}w8a=JKQ%(3
jStii>gNoMWnxami44pu*oJzpV-ZB?*ML1B9#n=tuP6bd9

literal 0
HcmV?d00001

diff --git a/data/ltc_vars.rda b/data/ltc_vars.rda
new file mode 100644
index 0000000000000000000000000000000000000000..cda95d10a14315b13846fc805857e6d847052299
GIT binary patch
literal 222
zcmV<403rWET4*^jL0KkKS#@DlcK`ro|A7DSNB{r=2tWk@7C^u6-yi@1FaWt>Q)(WQ
z)HDqNrhsSwHBTh;frMyigCNKZ2_z&kG&BHarqml#K|?;8m>$U~=(Y&4fK=4tAAQ)-
zlyXjVU#hob2o;HM1TY*K%!IYD%z!WI<#X+iZ_<s@@rEsccYOI&v~IVRixvbB#hxT&
zkmzANee|OBMLGqoG66LrhnQ=kaERG#nv~|)N-nPd=Q$`Ly-%~#kE%=t;^;C+CGja$
Y5@9W?C7|8b1oyzZk}1N3gR2UoxS-)-cK`qY

literal 0
HcmV?d00001

diff --git a/inst/WORDLIST b/inst/WORDLIST
index 3b90050..542f4b2 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -1,14 +1,16 @@
-NSS
+Bedday
+HSCP
+LTC
 ORCID
-RStudio
+Recid
 Rmd
 SLFs
+bedday
+beddays
 dplyr
 fst
+hscdiip
 hscp
 recid
 recids
 tibble
-hscdiip
-PHS
-ORCID
diff --git a/man/chi_cohort.Rd b/man/chi_cohort.Rd
index 8ea36fe..0db4dec 100644
--- a/man/chi_cohort.Rd
+++ b/man/chi_cohort.Rd
@@ -3,8 +3,9 @@
 \docType{data}
 \name{chi_cohort}
 \alias{chi_cohort}
-\title{A \link[tibble:tibble-package]{tibble} containing an example cohort of CHI numbers under different
-variable names. It is used for testing and to illustrate examples.
+\title{A \link[tibble:tibble-package]{tibble} containing an example
+cohort of CHI numbers under different variable names.
+It is used for testing and to illustrate examples.
 It is likely that many of the 'chi numbers' are not valid.}
 \format{
 An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 100 rows and 2 columns.
@@ -13,8 +14,9 @@ An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) w
 chi_cohort
 }
 \description{
-A \link[tibble:tibble-package]{tibble} containing an example cohort of CHI numbers under different
-variable names. It is used for testing and to illustrate examples.
+A \link[tibble:tibble-package]{tibble} containing an example
+cohort of CHI numbers under different variable names.
+It is used for testing and to illustrate examples.
 It is likely that many of the 'chi numbers' are not valid.
 }
 \keyword{data}
diff --git a/man/demog_vars.Rd b/man/demog_vars.Rd
new file mode 100644
index 0000000..897906b
--- /dev/null
+++ b/man/demog_vars.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{demog_vars}
+\alias{demog_vars}
+\title{Demographic variables}
+\format{
+An object of class \code{character} of length 30.
+}
+\usage{
+demog_vars
+}
+\description{
+A vector containing the names of the demographic variables.
+}
+\keyword{data}
diff --git a/man/ep_file_bedday_vars.Rd b/man/ep_file_bedday_vars.Rd
new file mode 100644
index 0000000..1d6189a
--- /dev/null
+++ b/man/ep_file_bedday_vars.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{ep_file_bedday_vars}
+\alias{ep_file_bedday_vars}
+\title{Episode file bedday variables}
+\format{
+An object of class \code{character} of length 14.
+}
+\usage{
+ep_file_bedday_vars
+}
+\description{
+A vector containing the names of the
+bedday related variables in the
+episode file.
+}
+\keyword{data}
diff --git a/man/ep_file_cost_vars.Rd b/man/ep_file_cost_vars.Rd
new file mode 100644
index 0000000..3dc9687
--- /dev/null
+++ b/man/ep_file_cost_vars.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{ep_file_cost_vars}
+\alias{ep_file_cost_vars}
+\title{Episode file cost variables}
+\format{
+An object of class \code{character} of length 14.
+}
+\usage{
+ep_file_cost_vars
+}
+\description{
+A vector containing the names of the
+cost related variables in the
+episode file.
+}
+\keyword{data}
diff --git a/man/ltc_vars.Rd b/man/ltc_vars.Rd
new file mode 100644
index 0000000..bab9e09
--- /dev/null
+++ b/man/ltc_vars.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{ltc_vars}
+\alias{ltc_vars}
+\title{LTC variables}
+\format{
+An object of class \code{character} of length 19.
+}
+\usage{
+ltc_vars
+}
+\description{
+A vector containing the names of the
+Long Term Condition (LTC) variables.
+}
+\keyword{data}
diff --git a/tests/testthat/test-gen_file_path.R b/tests/testthat/test-gen_file_path.R
index 1272c4e..dd506ec 100644
--- a/tests/testthat/test-gen_file_path.R
+++ b/tests/testthat/test-gen_file_path.R
@@ -1,7 +1,11 @@
 test_that("Produces single episode file path", {
   path <- gen_file_path("1718", "episode")
 
-  expect_identical(path, fs::path("/conf/hscdiip/01-Source-linkage-files/source-episode-file-201718.fst"))
+  expect_identical(path, fs::path(
+    "/conf/hscdiip",
+    "01-Source-linkage-files",
+    "source-episode-file-201718.fst"
+  ))
 
   expect_identical(fs::path_ext(path), "fst")
 
@@ -12,7 +16,11 @@ test_that("Produces single episode file path", {
 test_that("Produces single individual file path", {
   path <- gen_file_path("1718", "individual")
 
-  expect_identical(path, fs::path("/conf/hscdiip/01-Source-linkage-files/source-individual-file-201718.fst"))
+  expect_identical(path, fs::path(
+    "/conf/hscdiip",
+    "01-Source-linkage-files",
+    "source-individual-file-201718.fst"
+  ))
 
   expect_identical(fs::path_ext(path), "fst")
 
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
new file mode 100644
index 0000000..097b241
--- /dev/null
+++ b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/variable-packs.Rmd b/vignettes/variable-packs.Rmd
new file mode 100644
index 0000000..61e0072
--- /dev/null
+++ b/vignettes/variable-packs.Rmd
@@ -0,0 +1,109 @@
+---
+title: "Using variable packs"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Using variable packs}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+## Selecting only specified variables
+
+It is recommended to only choose the variables you need when reading in a Source Linkage File. This can be achieved by passing a `columns` argument to the relevant `read_slf_*()` function.
+
+This results in the data being read in much faster, and makes it easier to work with. The full episode and individual files have 200+ and 100+ variables respectively!
+
+```{r load-package, include=FALSE}
+library(slfhelper)
+```
+
+```{r column-example, eval=FALSE}
+library(slfhelper)
+
+ep_data <- read_slf_episode(year = 1920, columns = c("year", "anon_chi", "recid"))
+
+indiv_data <- read_slf_individual(year = 1920, columns = c("year", "anon_chi", "nsu"))
+```
+
+## Looking up variable names
+
+To help with the task of picking which variables you might need for your analysis, as well as getting the spelling correct, we provide lists of the variable names in the package.
+
+```{r get-var-names}
+# Show the first few variables from the episode file
+head(ep_file_vars)
+
+# Do the same for the individual file
+head(indiv_file_vars)
+```
+
+
+## Variable packs
+
+This is great but it can still be a lot of effort and copy/pasting every time, especially if you need quite a few variables for your analysis.
+
+To assist with this, there are a number of 'variable packs': groups of variables that are commonly needed together and can be accessed with a simple name. Currently there are four packs: `demog_vars`, `ltc_vars`, `ep_file_bedday_vars` and `ep_file_cost_vars`. Let's see what they contain.
+
+### Demographic variables 
+These are demographic variables which are specific to CHI and can be used with either the episode or the individual file.
+
+```{r demog-pack}
+demog_vars
+```
+
+### Long Term Condition (LTC) variables 
+These are the Long Term Condition flag variables which are specific to CHI and can be used with either the episode or the individual file.
+
+```{r ltc-pack}
+ltc_vars
+```
+
+### Bedday variables
+These are variables detailing beddays. They are specific to an episode and can only be used with the episode file.
+```{r bedday-pack}
+ep_file_bedday_vars
+```
+
+### Cost variables
+These are variables detailing costs. They are specific to an episode and can only be used with the episode file.
+```{r cost-pack}
+ep_file_cost_vars
+```
+
+
+## Using variable packs
+These variable packs can be used in the column selection to simplify your code substantially.
+
+For example, to take some demographic data and LTC flags from the individual file:
+```{r use-ltc-indiv, eval=FALSE}
+library(slfhelper)
+
+indiv_ltc_data <- read_slf_individual(year = 1920, columns = c("year", demog_vars, ltc_vars))
+```
+
+
+Or, to get bedday information about Acute records from the episode file:
+```{r use-beddays, eval=FALSE}
+library(slfhelper)
+
+acute_beddays <- read_slf_episode(
+  year = 1920,
+  columns = c("year", "anon_chi", "hbtreatcode", "recid", ep_file_bedday_vars, "cij_pattype"),
+  recids = c("01B", "GLS")
+)
+```
+
+## Conclusion
+
+You should be using the `columns` argument when reading in data to increase the read speed, and reduce the amount of data you are loading into R. `slfhelper` provides a number of helpers to make picking and using the variables you need easier.
+
+If you would like any changes made to any existing packs, please [open an issue on GitHub](https://github.com/Public-Health-Scotland/slfhelper/issues).
+
+If you would like to suggest any additional variable packs, either [open an issue](https://github.com/Public-Health-Scotland/slfhelper/issues), or even [submit a pull request](https://usethis.r-lib.org/articles/pr-functions.html)!