add initial package files

ki-tools · Apr 19, 2022 · a3a6839 · a3a6839
1 parent e01f8bc
commit a3a6839
Show file tree

Hide file tree

Showing 12 changed files with 436 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.DS_Store
+_ignore
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,17 @@
+Package: ipumsvis
+Title: What the Package Does (One Line, Title Case)
+Version: 0.0.0.9000
+Authors@R: 
+    person("First", "Last", , "first.last@example.com", role = c("aut", "cre"),
+           comment = c(ORCID = "YOUR-ORCID-ID"))
+Description: What the package does (one paragraph).
+License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
+    license
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.1.2
+Imports: 
+    dplyr,
+    DT,
+    ipumsr,
+    mapboxer
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,18 @@
+# Generated by roxygen2: do not edit by hand
+
+export(calc_rates)
+export(preprocess_geo)
+export(preprocess_ipums)
+export(view_var_descs)
+importFrom(DT,datatable)
+importFrom(dplyr,"%>%")
+importFrom(dplyr,.data)
+importFrom(dplyr,filter)
+importFrom(dplyr,group_by)
+importFrom(dplyr,rename_all)
+importFrom(dplyr,tibble)
+importFrom(ipumsr,as_factor)
+importFrom(ipumsr,lbl_clean)
+importFrom(ipumsr,read_ipums_ddi)
+importFrom(ipumsr,read_ipums_micro)
+importFrom(utils,download.file)
diff --git a/R/calc.R b/R/calc.R
@@ -0,0 +1,155 @@
+#' Calculate rates from ipums data and associated geography
+#' @examples
+#' \dontrun{
+#' calc_rates(dd, geo_dir = "ext-data/idhs/geo")
+#' }
+#' @importFrom %>% dplyr group_by ungroup rename_all filter summarise n
+#' select all_of mutate
+#' @importFrom ipumsr read_ipums_sf as_factor lbl_clean zap_labels
+#' @export
+calc_rates <- function(x, geo_dir, num_var, denom_var, num_cond, denom_cond) {
+  check_ipums_data(x, "x")
+  x$country2 <- ipumsr::as_factor(ipumsr::lbl_clean(x$country))
+
+  if (!dir.exists(geo_dir))
+    stop("directory '", geo_dir, "' doesn't exist")
+
+  ff <- list.files(geo_dir, pattern = "geo_", full.names = TRUE)
+  if (length(ff) == 0) {
+    stop("no files starting with 'geo_' were found in the directory '",
+      geo_dir, "'")
+  }
+
+  ftbl <- dplyr::tibble(
+    f = ff,
+    country = substr(basename(ff), 5, 6),
+    start_yr = as.integer(substr(basename(ff), 7, 10)),
+    end_yr = as.integer(substr(basename(ff), 12, 15))
+  )
+
+  # there are duplicate shapefiles for many countries
+  # for now, choose those that have the latest data
+  # and also go back the farthest
+  # (could add an option later to do latest start and end date)
+  ftbl <- ftbl %>%
+    dplyr::group_by(.data$country) %>%
+    dplyr::filter(end_yr == max(end_yr)) %>%
+    dplyr::filter(start_yr == max(start_yr)) %>%
+    dplyr::ungroup()
+  ff <- ftbl$f
+
+  xnms <- names(x)
+
+  res <- list()
+  for (f in ff) {
+    geodf <- ipumsr::read_ipums_sf(f, verbose = FALSE) %>%
+      dplyr::rename_all(tolower)
+    names(geodf)[3] <- "dhscode"
+    geodf$dhscode <- as.integer(geodf$dhscode)
+    cur_var <- gsub(".*(geo_.*).zip", "\\1", f)
+    cur_cntry <- geodf$cntry_name[1]
+    message(cur_cntry, ": ", cur_var)
+
+    if (!cur_var %in% xnms) {
+      message("  not found...")
+      next
+    }
+
+    vars <- c("year", "perweight", num_var, denom_var)
+    # first get just the country
+    tmp <- x %>%
+      dplyr::filter(.data$country2 == cur_cntry)
+    # then get rid of years where n == nna for numerator variable
+    tmp2 <- tmp %>%
+      dplyr::group_by(.data$year) %>%
+      dplyr::summarise(
+        n = dplyr::n(),
+        nna = length(which(is.na(.data[[num_var]])))
+      ) %>%
+      dplyr::filter(.data$n == nna)
+    tmp <- tmp %>%
+      dplyr::filter(!.data$year %in% tmp2$year, ) %>%
+      dplyr::select(all_of(c(cur_var, vars))) %>%
+      dplyr::rename_all(function(x)
+        ifelse(x %in% vars, x, "dhscode")) %>%
+      dplyr::mutate(
+        region = ipumsr::as_factor(ipumsr::lbl_clean(.data$dhscode)),
+        dhscode = ipumsr::zap_labels(.data$dhscode)
+      )
+
+    if (nrow(tmp) == 0) {
+      message("  no data for '", num_var, "'...")
+      next
+    }
+    if (all(is.na(tmp[[denom_var]]))) {
+      message("  denominator variable '", denom_var, "' is all NA...")
+      next
+    }
+
+    if (
+      length(which(is.na(tmp$year))) != 0 ||
+      # length(which(is.na(tmp$sexactiv4wk))) != 0 ||
+      length(which(is.na(tmp[[num_var]]))) != 0
+    ) {
+      # browser()
+      stop("something isn't right...")
+    }
+
+    yrregstats <- tmp %>%
+      dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
+      dplyr::group_by(.data$year, .data$dhscode, .data$region) %>%
+      dplyr::summarise(
+        n = sum(.data$perweight),
+        n2 = dplyr::n(), # length(which(.data[[num_var]] %in% num_cond)),
+        nsti = sum((.data[[num_var]] == 1) * .data$perweight),
+        nsti2 = length(which(.data[[num_var]] == 1)),
+        pct = 100 * .data$nsti / .data$n,
+        .groups = "drop") %>%
+      dplyr::mutate(country = cur_cntry, geo_var = cur_var)
+
+    regstats <- tmp %>%
+      dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
+      dplyr::group_by(.data$dhscode, .data$region) %>%
+      dplyr::summarise(
+        n = sum(.data$perweight),
+        n2 = dplyr::n(), # length(which(.data[[num_var]] %in% num_cond)),
+        nsti = sum((.data[[num_var]] == 1) * .data$perweight),
+        nsti2 = length(which(.data[[num_var]] == 1)),
+        pct = 100 * .data$nsti / .data$n,
+        .groups = "drop") %>%
+      dplyr::mutate(country = cur_cntry, geo_var = cur_var)
+
+    yrstats <- tmp %>%
+      dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
+      dplyr::group_by(.data$year) %>%
+      dplyr::summarise(
+        n = sum(.data$perweight),
+        n2 = dplyr::n(), # length(which(.data[[num_var]] %in% num_cond)),
+        nsti = sum((.data[[num_var]] == 1) * .data$perweight),
+        nsti2 = length(which(.data[[num_var]] == 1)),
+        pct = 100 * .data$nsti / .data$n) %>%
+      dplyr::mutate(country = cur_cntry, geo_var = cur_var)
+
+    allstats <- tmp %>%
+      dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
+      dplyr::summarise(
+        n = sum(.data$perweight),
+        n2 = length(which(.data[[num_var]] %in% num_cond)),
+        nsti = sum((.data[[num_var]] == 1) * .data$perweight),
+        nsti2 = length(which(.data[[num_var]] == 1)),
+        pct = 100 * .data$nsti / .data$n) %>%
+      dplyr::mutate(country = cur_cntry, geo_var = cur_var)
+
+    res[[cur_cntry]] <- list(
+      country = cur_cntry,
+      geo_var = cur_var,
+      allstats = allstats,
+      yrstats = yrstats,
+      regstats = regstats,
+      yrregstats = yrregstats,
+      geodf = geodf
+    )
+  }
+  class(res) <- c("list", "ipums-rates")
+  res
+}
diff --git a/R/geo_vis.R b/R/geo_vis.R
@@ -0,0 +1,73 @@
+
+#' @importFrom dplyr bind_rows recode
+#' @importFrom mapboxer as_mapbox_source mapboxer add_fill_layer
+#' add_line_layer as_mapbox_source add_tooltips basemaps
+geo_vis_all <- function(
+  rates, geo_dir, bins = NULL, width = NULL, height = NULL
+) {
+  world <- ipumsr::read_ipums_sf(
+    file.path(geo_dir, "IPUMSI_world_release2020.zip"),
+    verbose = FALSE
+  ) %>%
+    dplyr::rename_all(tolower) %>%
+    # DRC is the only one coded differently here vs. in IPUMS data
+    # setdiff(sort(unique(x$country2)), world$cntry_name)
+    dplyr::mutate(country = dplyr::recode(.data$cntry_name,
+      "Congo, DRC" = "Congo Democratic Republic"
+    ))
+
+  world2 <- dplyr::filter(world, .data$country %in% names(rates))
+
+  alldat <- dplyr::bind_rows(lapply(rates, function(x) x$yrregstats)) %>%
+    dplyr::group_by(.data$country, .data$region, .data$dhscode) %>%
+    dplyr::filter(year == max(.data$year)
+  )
+
+  allgeo <- lapply(rates, function(x) x$geodf) %>%
+    dplyr::bind_rows() %>%
+    dplyr::rename(country = "cntry_name")
+
+  if (is.null(bins)) {
+    bins <- pretty(alldat$pct, n = 8)
+    # classInt::classIntervals(alldat$pct, n = 10)
+  }
+  cols <- viridisLite::viridis(length(bins) - 1) %>%
+    substr(1, 7)
+
+  leglbl <- paste0(bins[-length(bins)], "-", bins[-1], "%")
+
+  geodat <- ipumsr::ipums_shape_inner_join(alldat, allgeo,
+    by = c("country", "dhscode")) %>%
+    dplyr::mutate(
+      fill_color = cols[cut(.data$pct, bins, labels = FALSE)],
+      pct = round(.data$pct, 1)
+    )
+
+  # setdiff(names(sti_rates), geodat$country)
+
+  # legend: https://docs.mapbox.com/help/tutorials/choropleth-studio-gl-pt-2/
+  mapboxer::as_mapbox_source(geodat) %>%
+    mapboxer::mapboxer(
+      style = mapboxer::basemaps$Mapbox$light_v10,
+      bounds = sf::st_bbox(geodat),
+      fitBoundsOptions = list(padding = 20),
+      width = width,
+      height = height
+    ) %>%
+    mapboxer::add_fill_layer(
+      id = "map",
+      fill_color = c("get", "fill_color"),
+      fill_opacity = 0.6
+    ) %>%
+    mapboxer::add_line_layer(
+      source =  mapboxer::as_mapbox_source(world2),
+      line_color = "black",
+      line_width = 1
+    ) %>%
+    mapboxer::add_tooltips(
+      "map", paste0(
+        "Country: {{country}}<br>",
+        "Region: {{region}}<br>",
+        "Year: {{year}}<br>",
+        "Pct STI: {{pct}}%"))
+}
diff --git a/R/misc.R b/R/misc.R
@@ -0,0 +1,18 @@
+#' View html table of IPUMS extract variable descriptions
+#' @importFrom DT datatable
+#' @importFrom dplyr filter tibble .data
+#' @export
+view_var_descs <- function(dd, include_geo = FALSE) {
+  check_ipums_data(x, "x")
+
+  descs <- sapply(dd, function(x) attr(x, "var_desc"))
+  descs <- dplyr::tibble(
+    name = names(descs),
+    desc = unname(descs)
+  )
+  if (!include_geo) {
+    descs <- dplyr::filter(descs, !grepl("^geo_", .data$name))
+  }
+
+  DT::datatable(descs, options = list(paging = FALSE))
+}
diff --git a/R/preprocess.R b/R/preprocess.R
@@ -0,0 +1,81 @@
+#' Preprocess IPUMS extract
+#' @importFrom ipumsr read_ipums_ddi read_ipums_micro as_factor lbl_clean
+#' @importFrom dplyr rename_all
+#' @examples
+#' \dontrun{
+#' preprocess_ipums(
+#'   input_xml = "ext-data/idhs/extract2/idhs_00002.xml",
+#'   output_file = "ext-data/idhs/extract2/dd.rds"
+#' )
+#' }
+#' @export
+preprocess_ipums <- function(input_xml, output_file) {
+  message("reading xml")
+  ddi <- ipumsr::read_ipums_ddi(input_xml)
+  message("reading microdata")
+  dd <- ipumsr::read_ipums_micro(ddi)
+  dd <- dplyr::rename_all(dd, tolower)
+  attr(dd, "ipums_ddi") <- ddi
+
+  message("saving output")
+  saveRDS(dd, file = output_file)
+
+  invisible(dd)
+}
+
+# https://www.idhsdata.org/idhs/gis.shtml
+
+#' Preprocess (download) shapfiles associated with an IPUMS extract
+#' @examples
+#' \dontrun{
+#' dd <- preprocess_ipums(
+#'   input_xml = "ext-data/idhs/extract2/idhs_00002.xml",
+#'   output_file = "ext-data/idhs/extract2/dd.rds"
+#' )
+#' preprocess_geo(dd, output_dir = "ext-data/idhs/geo")
+#' }
+#' @export
+#' @importFrom utils download.file
+preprocess_geo <- function(x, output_dir) {
+  check_ipums_data(x, "x")
+
+  if (!dir.exists(output_dir))
+    dir.create(output_dir, recursive = TRUE)
+
+  prefix <- "https://www.idhsdata.org/idhs/resources/gis/"
+
+  nms <- names(x)
+  geonms <- sort(nms[grepl("geo_.*_.*", nms)])
+
+  if (length(geonms) == 0) {
+    message("no shapefiles found associated with this data")
+    return(invisible(FALSE))
+  }
+
+  for (nm in geonms) {
+    message("downloading shapefiles for ", nm)
+    tryres <- try(utils::download.file(
+      paste0(prefix, nm, ".zip"),
+      file.path(output_dir, paste0(nm, ".zip")),
+      quiet = TRUE
+    ), silent = TRUE)
+    if (inherits(tryres, "try-error")) {
+      message("  could not find shapefiles associated with this country")
+    }
+  }
+
+  message("downloading shapefiles for world countries")
+  tryres <- try(utils::download.file(
+    "https://international.ipums.org/international/resources/gis/IPUMSI_world_release2020.zip",
+    file.path(output_dir, "IPUMSI_world_release2020.zip"),
+    quiet = TRUE
+  ))
+  if (inherits(tryres, "try-error")) {
+    message("  could not find world shapefiles")
+  }
+
+  invisible(TRUE)
+}
+
+# https://spatialdata.dhsprogram.com/population-estimates/
+# https://dhs-sites.s3.amazonaws.com/SDR/boundaries/production/data/Population_Estimates_ALL.zip
diff --git a/R/utils.R b/R/utils.R
@@ -0,0 +1,6 @@
+
+check_ipums_data <- function(x, nm = "x") {
+  if (!"ipums_ddi" %in% names(attributes(x))) {
+    stop("Input '", nm, "' must come from preprocess_ipums()")
+  }
+}
diff --git a/man/calc_rates.Rd b/man/calc_rates.Rd