From 1b14585858d173739ca914ad39d1bfdd2de29446 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Thu, 20 Jan 2022 10:44:23 +0000 Subject: [PATCH] Version 0.8.0 (#31) * improve test * Increment minor version * Update variable lists * Increment version number * shorten line length per `{lintr}` * Remove unnecessary brackets. * shorten line lengths per `{lintr}` * Now need to get more rows * Adding variable 'packs' (#28) * Add a vector with demographic variables. * Add a vector with LTC variables * Add a vector with bedday variables * Add a vector with cost variables. * Add a vignette for variable packs * Increment version number --- .gitignore | 1 + DESCRIPTION | 6 +- NEWS.md | 5 ++ R/data.R | 45 +++++++++++- R/read_slf.R | 20 ++--- data/demog_vars.rda | Bin 0 -> 319 bytes data/ep_file_bedday_vars.rda | Bin 0 -> 172 bytes data/ep_file_cost_vars.rda | Bin 0 -> 181 bytes data/ltc_vars.rda | Bin 0 -> 222 bytes inst/WORDLIST | 12 +-- man/chi_cohort.Rd | 10 ++- man/demog_vars.Rd | 16 ++++ man/ep_file_bedday_vars.Rd | 18 +++++ man/ep_file_cost_vars.Rd | 18 +++++ man/ltc_vars.Rd | 17 +++++ tests/testthat/test-gen_file_path.R | 12 ++- vignettes/.gitignore | 2 + vignettes/variable-packs.Rmd | 109 ++++++++++++++++++++++++++++ 18 files changed, 267 insertions(+), 24 deletions(-) create mode 100644 data/demog_vars.rda create mode 100644 data/ep_file_bedday_vars.rda create mode 100644 data/ep_file_cost_vars.rda create mode 100644 data/ltc_vars.rda create mode 100644 man/demog_vars.Rd create mode 100644 man/ep_file_bedday_vars.Rd create mode 100644 man/ep_file_cost_vars.Rd create mode 100644 man/ltc_vars.Rd create mode 100644 vignettes/.gitignore create mode 100644 vignettes/variable-packs.Rmd diff --git a/.gitignore b/.gitignore index 234f028..0d7f03b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata docs +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index b1571df..b4466ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package 
Package: slfhelper Title: Useful functions for working with the Source Linkage Files -Version: 0.7.1 +Version: 0.8.0 Authors@R: person("James", "McMahon", , "james.mcmahon@phs.scot", role = c("cre", "aut"), comment = c(ORCID = "0000-0002-5380-2029")) @@ -30,9 +30,13 @@ Imports: tibble Suggests: covr, + knitr, phsmethods, + rmarkdown, spelling, testthat (>= 3.0.0) +VignetteBuilder: + knitr Remotes: Public-Health-Scotland/phsmethods Config/testthat/edition: 3 diff --git a/NEWS.md b/NEWS.md index e4e0e36..e006315 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# slfhelper 0.8.0 + +* Add a number of 'variable packs' for easily selecting sets of variables: `demog_vars`, `ltc_vars`, `ep_file_bedday_vars` and `ep_file_cost_vars`. +* Add a new vignette introducing variable packs. + # slfhelper 0.7.1 * Add check for server and hscdiip access on package load. diff --git a/R/data.R b/R/data.R index 215b5d6..6e836e4 100644 --- a/R/data.R +++ b/R/data.R @@ -18,6 +18,46 @@ #' @keywords data "indiv_file_vars" +#' Demographic variables +#' +#' A vector containing the names of the demographic variables. +#' +#' @name demog_vars +#' @docType data +#' @keywords data +"demog_vars" + +#' LTC variables +#' +#' A vector containing the names of the +#' Long Term Condition (LTC) variables. +#' +#' @name ltc_vars +#' @docType data +#' @keywords data +"ltc_vars" + +#' Episode file bedday variables +#' +#' A vector containing the names of the +#' bedday related variables in the +#' episode file. +#' +#' @name ep_file_bedday_vars +#' @docType data +#' @keywords data +"ep_file_bedday_vars" + +#' Episode file cost variables +#' +#' A vector containing the names of the +#' cost related variables in the +#' episode file. 
+#' +#' @name ep_file_cost_vars +#' @docType data +#' @keywords data +"ep_file_cost_vars" #' HSCP name / code lookup #' @@ -51,8 +91,9 @@ #' @keywords data "recids" -#' A [tibble][tibble::tibble-package] containing an example cohort of CHI numbers under different -#' variable names. It is used for testing and to illustrate examples. +#' A [tibble][tibble::tibble-package] containing an example +#' cohort of CHI numbers under different variable names. +#' It is used for testing and to illustrate examples. #' It is likely that many of the 'chi numbers' are not valid. #' #' @name chi_cohort diff --git a/R/read_slf.R b/R/read_slf.R index 4f6ac47..f9c9f76 100644 --- a/R/read_slf.R +++ b/R/read_slf.R @@ -27,13 +27,13 @@ read_slf <- # but the column wasn't selected we need to add it (and remove later) remove_partnership_var <- FALSE remove_recid_var <- FALSE - if (!(is.null(optional_params$columns))) { - if (!(is.null(partnerships)) & + if (!is.null(optional_params$columns)) { + if (!is.null(partnerships) & !("hscp2018" %in% optional_params$columns)) { optional_params$columns <- c(optional_params$columns, "hscp2018") remove_partnership_var <- TRUE } - if (!(is.null(recids)) & file_version == "episode" & + if (!is.null(recids) & file_version == "episode" & !("recid" %in% optional_params$columns)) { optional_params$columns <- c(optional_params$columns, "recid") remove_recid_var <- TRUE @@ -66,7 +66,7 @@ read_slf <- # If a partnership is specified filter first; # With testing it seems to usually be faster if we do partnership # filtering before recid filtering - if (!(is.null(partnerships))) { + if (!is.null(partnerships)) { slfs_list <- purrr::map( slfs_list, ~ dplyr::filter(.x, .x$hscp2018 %in% partnerships) @@ -74,7 +74,7 @@ read_slf <- } # If a recid is specified filter now - if (!(is.null(recids))) { + if (!is.null(recids)) { slfs_list <- purrr::map( slfs_list, ~ dplyr::filter(.x, .x$recid %in% recids) @@ -134,9 +134,9 @@ read_slf_episode <- read_slf( year = year, 
file_version = "episode", - partnerships = partnerships, - recids = recids, - columns = columns, + partnerships = unique(partnerships), + recids = unique(recids), + columns = unique(columns), ... ) ) @@ -174,8 +174,8 @@ read_slf_individual <- read_slf( year = year, file_version = "individual", - partnerships = partnerships, - columns = columns, + partnerships = unique(partnerships), + columns = unique(columns), ... ) ) diff --git a/data/demog_vars.rda b/data/demog_vars.rda new file mode 100644 index 0000000000000000000000000000000000000000..594a27c5b8e62686d3b3dd3a69fa63922df30255 GIT binary patch literal 319 zcmV-F0l@x3T4*^jL0KkKSxILmLjVB5e}Mn=)l~ojd0+(q7C^r5|1banFaZ{9gGiY* zJtl@D1T-H0V<NF=k9A%L0Af$0DM literal 0 HcmV?d00001 diff --git a/data/ep_file_bedday_vars.rda b/data/ep_file_bedday_vars.rda new file mode 100644 index 0000000000000000000000000000000000000000..07818d8a7de73785e7e033d8340d781066bfd734 GIT binary patch literal 172 zcmV;d08{@$T4*^jL0KkKSz~#qF8}~h|A7B_NB{r=2tWk@7C^tZ-yi@1AOMvJDWCui z8fXJRX^=@0G?_4lfB*)D)SaPCFk~w5c7dIcjFjVZMIvd!w37f>&cGo75dZ*RV}c`T zk8&EDPMufNeNiD#v7QslPtSLyCT@AitNL=(x$LCsbZ7qP0) aQGYuvBdosY1eh0(;_gVN3K9%&H3i@h9Yq2F literal 0 HcmV?d00001 diff --git a/data/ep_file_cost_vars.rda b/data/ep_file_cost_vars.rda new file mode 100644 index 0000000000000000000000000000000000000000..ceae74da287d5285b716442812605de1b39156f1 GIT binary patch literal 181 zcmV;m080NtT4*^jL0KkKS;g25;Q#u&>8?9@L9Wcl9#yF9Qrv}w8a=JKQ%(3 jStii>gNoMWnxami44pu*oJzpV-ZB?*ML1B9#n=tuP6bd9 literal 0 HcmV?d00001 diff --git a/data/ltc_vars.rda b/data/ltc_vars.rda new file mode 100644 index 0000000000000000000000000000000000000000..cda95d10a14315b13846fc805857e6d847052299 GIT binary patch literal 222 zcmV<403rWET4*^jL0KkKS#@DlcK`ro|A7DSNB{r=2tWk@7C^u6-yi@1FaWt>Q)(WQ z)HDqNrhsSwHBTh;frMyigCNKZ2_z&kG&BHarqml#K|?;8m>$U~=(Y&4fK=4tAAQ)- zlyXjVU#hob2o;HM1TY*K%!IYD%z!WI<#X+iZ_ + %\VignetteIndexEntry{Using variable packs} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} 
+--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Selecting only specified variables + +It is recommended to only choose the variables you need when reading in a Source Linkage File. This can be achieved by specifying a `columns` argument to the relevant `read_slf_` function. + +This will result in the data being read in much faster as well as being easier to work with. The full episode and individual files have 200+ and 100+ variables respectively! + +```{r load-package, include=FALSE} +library(slfhelper) +``` + +```{r column-example, eval=FALSE} +library(slfhelper) + +ep_data <- read_slf_episode(year = 1920, columns = c("year", "anon_chi", "recid")) + +indiv_data <- read_slf_individual(year = 1920, columns = c("year", "anon_chi", "nsu")) +``` + +## Looking up variable names + +To help with the task of picking which variables you might need for your analysis, as well as getting the spelling correct, we provide lists of the variable names in the package. + +```{r get-var-names} +# Show the first few variables from the episode file +head(ep_file_vars) + +# Do the same for the individual file +head(indiv_file_vars) +``` + + +## Variable packs + +This is great but it can still be a lot of effort and copy/pasting every time, especially if you need quite a few variables for your analysis. + +To assist with this, there are a number of 'variable packs': groups of variables which would commonly be needed together and which can be accessed with a simple name. Currently there are four packs: `demog_vars`, `ltc_vars`, `ep_file_bedday_vars` and `ep_file_cost_vars`. Let's see what they contain. + +### Demographic variables +These are demographic variables which are specific to CHI and can be used with the episode or individual file. 
+ +```{r demog-pack} +demog_vars +``` + +### Long Term Condition (LTC) variables +These are the Long Term Condition flag variables which are specific to CHI and can be used with the episode or individual file. + +```{r ltc-pack} +ltc_vars +``` + +### Bedday variables +These are variables detailing beddays; they are specific to an episode and can only be used with the episode file. +```{r bedday-pack} +ep_file_bedday_vars +``` + +### Cost variables +These are variables detailing costs; they are specific to an episode and can only be used with the episode file. +```{r cost-pack} +ep_file_cost_vars +``` + + +## Using variable packs +These variable packs can be used in the column selection to simplify your code substantially. + +For example, to take some demographic data and LTC flags from the individual file: +```{r use-ltc-indiv, eval=FALSE} +library(slfhelper) + +indiv_ltc_data <- read_slf_individual(year = 1920, columns = c("year", demog_vars, ltc_vars)) +``` + + +Or to get bedday information about Acute records from the episode file: +```{r use-beddays, eval=FALSE} +library(slfhelper) + +acute_beddays <- read_slf_episode( + year = 1920, + columns = c("year", "anon_chi", "hbtreatcode", "recid", ep_file_bedday_vars, "cij_pattype"), + recid = c("01B", "GLS") +) +``` + +## Conclusion + +You should be using the `columns` argument when reading in data to increase the read speed, and reduce the amount of data you are loading into R. `slfhelper` provides a number of helpers to make picking and using the variables you need easier. + +If you would like any changes made to any existing packs, please [open an issue on GitHub](https://github.com/Public-Health-Scotland/slfhelper/issues). + +If you would like to suggest any additional variable packs, either [open an issue](https://github.com/Public-Health-Scotland/slfhelper/issues), or even [submit a pull request](https://usethis.r-lib.org/articles/pr-functions.html)!