Skip to content

Commit

Permalink
add initial package files
Browse files Browse the repository at this point in the history
  • Loading branch information
hafen committed Apr 19, 2022
1 parent e01f8bc commit a3a6839
Show file tree
Hide file tree
Showing 12 changed files with 436 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.DS_Store
_ignore
17 changes: 17 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Package: ipumsvis
Title: What the Package Does (One Line, Title Case)
Version: 0.0.0.9000
Authors@R:
person("First", "Last", , "first.last@example.com", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: What the package does (one paragraph).
License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
license
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.2
Imports:
dplyr,
DT,
ipumsr,
mapboxer
18 changes: 18 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by roxygen2: do not edit by hand

export(calc_rates)
export(preprocess_geo)
export(preprocess_ipums)
export(view_var_descs)
importFrom(DT,datatable)
importFrom(dplyr,"%>%")
importFrom(dplyr,.data)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,rename_all)
importFrom(dplyr,tibble)
importFrom(ipumsr,as_factor)
importFrom(ipumsr,lbl_clean)
importFrom(ipumsr,read_ipums_ddi)
importFrom(ipumsr,read_ipums_micro)
importFrom(utils,download.file)
155 changes: 155 additions & 0 deletions R/calc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#' Calculate rates from ipums data and associated geography
#' @examples
#' \dontrun{
#' calc_rates(dd, geo_dir = "ext-data/idhs/geo")
#' }
#' @importFrom %>% dplyr group_by ungroup rename_all filter summarise n
#' select all_of mutate
#' @importFrom ipumsr read_ipums_sf as_factor lbl_clean zap_labels
#' @export
calc_rates <- function(x, geo_dir, num_var, denom_var, num_cond, denom_cond) {
check_ipums_data(x, "x")
x$country2 <- ipumsr::as_factor(ipumsr::lbl_clean(x$country))

if (!dir.exists(geo_dir))
stop("directory '", geo_dir, "' doesn't exist")

ff <- list.files(geo_dir, pattern = "geo_", full.names = TRUE)
if (length(ff) == 0) {
stop("no files starting with 'geo_' were found in the directory '",
geo_dir, "'")
}

ftbl <- dplyr::tibble(
f = ff,
country = substr(basename(ff), 5, 6),
start_yr = as.integer(substr(basename(ff), 7, 10)),
end_yr = as.integer(substr(basename(ff), 12, 15))
)

# there are duplicate shapefiles for many countries
# for now, choose those that have the latest data
# and also go back the farthest
# (could add an option later to do latest start and end date)
ftbl <- ftbl %>%
dplyr::group_by(.data$country) %>%
dplyr::filter(end_yr == max(end_yr)) %>%
dplyr::filter(start_yr == max(start_yr)) %>%
dplyr::ungroup()
ff <- ftbl$f

xnms <- names(x)

res <- list()
for (f in ff) {
geodf <- ipumsr::read_ipums_sf(f, verbose = FALSE) %>%
dplyr::rename_all(tolower)
names(geodf)[3] <- "dhscode"
geodf$dhscode <- as.integer(geodf$dhscode)
cur_var <- gsub(".*(geo_.*).zip", "\\1", f)
cur_cntry <- geodf$cntry_name[1]
message(cur_cntry, ": ", cur_var)

if (!cur_var %in% xnms) {
message(" not found...")
next
}

vars <- c("year", "perweight", num_var, denom_var)
# first get just the country
tmp <- x %>%
dplyr::filter(.data$country2 == cur_cntry)
# then get rid of years where n == nna for numerator variable
tmp2 <- tmp %>%
dplyr::group_by(.data$year) %>%
dplyr::summarise(
n = dplyr::n(),
nna = length(which(is.na(.data[[num_var]])))
) %>%
dplyr::filter(.data$n == nna)
tmp <- tmp %>%
dplyr::filter(!.data$year %in% tmp2$year, ) %>%
dplyr::select(all_of(c(cur_var, vars))) %>%
dplyr::rename_all(function(x)
ifelse(x %in% vars, x, "dhscode")) %>%
dplyr::mutate(
region = ipumsr::as_factor(ipumsr::lbl_clean(.data$dhscode)),
dhscode = ipumsr::zap_labels(.data$dhscode)
)

if (nrow(tmp) == 0) {
message(" no data for '", num_var, "'...")
next
}
if (all(is.na(tmp[[denom_var]]))) {
message(" denominator variable '", denom_var, "' is all NA...")
next
}

if (
length(which(is.na(tmp$year))) != 0 ||
# length(which(is.na(tmp$sexactiv4wk))) != 0 ||
length(which(is.na(tmp[[num_var]]))) != 0
) {
# browser()
stop("something isn't right...")
}

yrregstats <- tmp %>%
dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
dplyr::group_by(.data$year, .data$dhscode, .data$region) %>%
dplyr::summarise(
n = sum(.data$perweight),
n2 = dplyr::n(), # length(which(.data[[num_var]] %in% num_cond)),
nsti = sum((.data[[num_var]] == 1) * .data$perweight),
nsti2 = length(which(.data[[num_var]] == 1)),
pct = 100 * .data$nsti / .data$n,
.groups = "drop") %>%
dplyr::mutate(country = cur_cntry, geo_var = cur_var)

regstats <- tmp %>%
dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
dplyr::group_by(.data$dhscode, .data$region) %>%
dplyr::summarise(
n = sum(.data$perweight),
n2 = dplyr::n(), # length(which(.data[[num_var]] %in% num_cond)),
nsti = sum((.data[[num_var]] == 1) * .data$perweight),
nsti2 = length(which(.data[[num_var]] == 1)),
pct = 100 * .data$nsti / .data$n,
.groups = "drop") %>%
dplyr::mutate(country = cur_cntry, geo_var = cur_var)

yrstats <- tmp %>%
dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
dplyr::group_by(.data$year) %>%
dplyr::summarise(
n = sum(.data$perweight),
n2 = dplyr::n(), # length(which(.data[[num_var]] %in% num_cond)),
nsti = sum((.data[[num_var]] == 1) * .data$perweight),
nsti2 = length(which(.data[[num_var]] == 1)),
pct = 100 * .data$nsti / .data$n) %>%
dplyr::mutate(country = cur_cntry, geo_var = cur_var)

allstats <- tmp %>%
dplyr::filter(.data[[denom_var]] %in% denom_cond) %>%
dplyr::summarise(
n = sum(.data$perweight),
n2 = length(which(.data[[num_var]] %in% num_cond)),
nsti = sum((.data[[num_var]] == 1) * .data$perweight),
nsti2 = length(which(.data[[num_var]] == 1)),
pct = 100 * .data$nsti / .data$n) %>%
dplyr::mutate(country = cur_cntry, geo_var = cur_var)

res[[cur_cntry]] <- list(
country = cur_cntry,
geo_var = cur_var,
allstats = allstats,
yrstats = yrstats,
regstats = regstats,
yrregstats = yrregstats,
geodf = geodf
)
}
class(res) <- c("list", "ipums-rates")
res
}
73 changes: 73 additions & 0 deletions R/geo_vis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@

#' @importFrom dplyr bind_rows recode
#' @importFrom mapboxer as_mapbox_source mapboxer add_fill_layer
#' add_line_layer as_mapbox_source add_tooltips basemaps
geo_vis_all <- function(
rates, geo_dir, bins = NULL, width = NULL, height = NULL
) {
world <- ipumsr::read_ipums_sf(
file.path(geo_dir, "IPUMSI_world_release2020.zip"),
verbose = FALSE
) %>%
dplyr::rename_all(tolower) %>%
# DRC is the only one coded differently here vs. in IPUMS data
# setdiff(sort(unique(x$country2)), world$cntry_name)
dplyr::mutate(country = dplyr::recode(.data$cntry_name,
"Congo, DRC" = "Congo Democratic Republic"
))

world2 <- dplyr::filter(world, .data$country %in% names(rates))

alldat <- dplyr::bind_rows(lapply(rates, function(x) x$yrregstats)) %>%
dplyr::group_by(.data$country, .data$region, .data$dhscode) %>%
dplyr::filter(year == max(.data$year)
)

allgeo <- lapply(rates, function(x) x$geodf) %>%
dplyr::bind_rows() %>%
dplyr::rename(country = "cntry_name")

if (is.null(bins)) {
bins <- pretty(alldat$pct, n = 8)
# classInt::classIntervals(alldat$pct, n = 10)
}
cols <- viridisLite::viridis(length(bins) - 1) %>%
substr(1, 7)

leglbl <- paste0(bins[-length(bins)], "-", bins[-1], "%")

geodat <- ipumsr::ipums_shape_inner_join(alldat, allgeo,
by = c("country", "dhscode")) %>%
dplyr::mutate(
fill_color = cols[cut(.data$pct, bins, labels = FALSE)],
pct = round(.data$pct, 1)
)

# setdiff(names(sti_rates), geodat$country)

# legend: https://docs.mapbox.com/help/tutorials/choropleth-studio-gl-pt-2/
mapboxer::as_mapbox_source(geodat) %>%
mapboxer::mapboxer(
style = mapboxer::basemaps$Mapbox$light_v10,
bounds = sf::st_bbox(geodat),
fitBoundsOptions = list(padding = 20),
width = width,
height = height
) %>%
mapboxer::add_fill_layer(
id = "map",
fill_color = c("get", "fill_color"),
fill_opacity = 0.6
) %>%
mapboxer::add_line_layer(
source = mapboxer::as_mapbox_source(world2),
line_color = "black",
line_width = 1
) %>%
mapboxer::add_tooltips(
"map", paste0(
"Country: {{country}}<br>",
"Region: {{region}}<br>",
"Year: {{year}}<br>",
"Pct STI: {{pct}}%"))
}
18 changes: 18 additions & 0 deletions R/misc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' View html table of IPUMS extract variable descriptions
#' @importFrom DT datatable
#' @importFrom dplyr filter tibble .data
#' @export
view_var_descs <- function(dd, include_geo = FALSE) {
check_ipums_data(x, "x")

descs <- sapply(dd, function(x) attr(x, "var_desc"))
descs <- dplyr::tibble(
name = names(descs),
desc = unname(descs)
)
if (!include_geo) {
descs <- dplyr::filter(descs, !grepl("^geo_", .data$name))
}

DT::datatable(descs, options = list(paging = FALSE))
}
81 changes: 81 additions & 0 deletions R/preprocess.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#' Preprocess IPUMS extract
#' @importFrom ipumsr read_ipums_ddi read_ipums_micro as_factor lbl_clean
#' @importFrom dplyr rename_all
#' @examples
#' \dontrun{
#' preprocess_ipums(
#' input_xml = "ext-data/idhs/extract2/idhs_00002.xml",
#' output_file = "ext-data/idhs/extract2/dd.rds"
#' )
#' }
#' @export
preprocess_ipums <- function(input_xml, output_file) {
message("reading xml")
ddi <- ipumsr::read_ipums_ddi(input_xml)
message("reading microdata")
dd <- ipumsr::read_ipums_micro(ddi)
dd <- dplyr::rename_all(dd, tolower)
attr(dd, "ipums_ddi") <- ddi

message("saving output")
saveRDS(dd, file = output_file)

invisible(dd)
}

# https://www.idhsdata.org/idhs/gis.shtml

#' Preprocess (download) shapfiles associated with an IPUMS extract
#' @examples
#' \dontrun{
#' dd <- preprocess_ipums(
#' input_xml = "ext-data/idhs/extract2/idhs_00002.xml",
#' output_file = "ext-data/idhs/extract2/dd.rds"
#' )
#' preprocess_geo(dd, output_dir = "ext-data/idhs/geo")
#' }
#' @export
#' @importFrom utils download.file
preprocess_geo <- function(x, output_dir) {
check_ipums_data(x, "x")

if (!dir.exists(output_dir))
dir.create(output_dir, recursive = TRUE)

prefix <- "https://www.idhsdata.org/idhs/resources/gis/"

nms <- names(x)
geonms <- sort(nms[grepl("geo_.*_.*", nms)])

if (length(geonms) == 0) {
message("no shapefiles found associated with this data")
return(invisible(FALSE))
}

for (nm in geonms) {
message("downloading shapefiles for ", nm)
tryres <- try(utils::download.file(
paste0(prefix, nm, ".zip"),
file.path(output_dir, paste0(nm, ".zip")),
quiet = TRUE
), silent = TRUE)
if (inherits(tryres, "try-error")) {
message(" could not find shapefiles associated with this country")
}
}

message("downloading shapefiles for world countries")
tryres <- try(utils::download.file(
"https://international.ipums.org/international/resources/gis/IPUMSI_world_release2020.zip",
file.path(output_dir, "IPUMSI_world_release2020.zip"),
quiet = TRUE
))
if (inherits(tryres, "try-error")) {
message(" could not find world shapefiles")
}

invisible(TRUE)
}

# https://spatialdata.dhsprogram.com/population-estimates/
# https://dhs-sites.s3.amazonaws.com/SDR/boundaries/production/data/Population_Estimates_ALL.zip
6 changes: 6 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

check_ipums_data <- function(x, nm = "x") {
if (!"ipums_ddi" %in% names(attributes(x))) {
stop("Input '", nm, "' must come from preprocess_ipums()")
}
}
16 changes: 16 additions & 0 deletions man/calc_rates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit a3a6839

Please sign in to comment.