cmu-delphi · brookslogan · Dec 18, 2024 · Jan 11, 2025 · Jan 11, 2025 · nmdefries
diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
@@ -209,35 +209,142 @@ Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html)
 
 ```{r}
 library(epidatr)
-flu_data <- pub_flusurv(
+flu_data_api <- pub_flusurv(
   locations = "ca",
-  epiweeks = epirange(201801, 202001),
-) %>%
-  select(location, epiweek, issue, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
-  tidyr::pivot_longer(cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate")
+  epiweeks = epirange(201801, 202001)
+)
+```
+
+We're interested in the age-specific rates:
+```{r}
+flu_data <- flu_data_api %>%
+  select(location, epiweek, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
+  # Turn `rate_age_0`..`rate_age_4` columns into an `age_group` and `rate`
+  # column (with 5x as many rows):
+  tidyr::pivot_longer(
+    cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate",
+    # When converting column names to entries in `age_group`, remove this prefix:
+    names_prefix = "rate_age_",
+    # And add a better prefix:
+    names_transform = function(age_group) paste0("age_group_", age_group)
+  ) %>%
+  # Improve `age_group` labels a bit more:
+  mutate(
+    age_group = case_match(
+      age_group,
+      "age_group_0" ~ "0--4 yr",
+      "age_group_1" ~ "5--17 yr",
+      "age_group_2" ~ "18--49 yr",
+      "age_group_3" ~ "50--64 yr",
+      "age_group_4" ~ ">= 65 yr",
+      # Make this a factor with appropriate level ordering:
+      .ptype = factor(levels = c("0--4 yr", "5--17 yr", "18--49 yr", "50--64 yr", ">= 65 yr"))
+    )
+  ) %>%
+  # The API currently outputs `epiweek` in Date format (the constituent Sunday);
+  # rename it to remind us that it's not in YYYYww format:
+  rename(time_value = epiweek)
 flu_data
 ```
 
 We can now convert this data to an `epi_df` object and set the `age_group`
 column as an additional group key:
 
 ```{r}
-flu_data <- flu_data %>% as_epi_df(other_keys = "age_group", as_of = as.Date("2024-03-20"))
+flu_data <- flu_data %>% as_epi_df(other_keys = "age_group")
 flu_data
 ```
 
 Note that the `epi_df` object now has an additional key column `age_group`. This
 means that there should only be one row for each combination of `geo_value`,
-`time_value`, and `age_group` in the dataset (this is enforced at construction
+`age_group`, and `time_value` in the dataset (this is enforced at construction
 time).
 
-Now we can aggregate the data by `age_group`, if we want to compute the total:
+Now we can aggregate the data by `age_group`, if we want to compute the total.
+Since we are working with rates, we need to attach some population data in order
+to do this aggregation. It's somewhat ambiguous whether FluSurv-NET reporting
+uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or
+[Census](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html)
+populations for `time_value`s before 2020 included in reports from 2020 onward,
+but in this case, the two sources agreed exactly. FluSurv-NET also directly
+reports an overall rate, so we can check our work.
+```{r}
+# Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and
+# 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually
+# released by Census in June 2021 and by NCHS in September 2021), which is the
+# last available reporting found with population estimates for 2017 and 2018:
+pop <- tribble(
+  ~geo_value, ~age_group,  ~time_value,           ~pop,
+  "CA",       "0--4 yr",   as.Date("2017-07-01"), 203813,
+  "CA",       "5--17 yr",  as.Date("2017-07-01"), 521827,
+  "CA",       "18--49 yr", as.Date("2017-07-01"), 1722399,
+  "CA",       "50--64 yr", as.Date("2017-07-01"), 700090,
+  "CA",       ">= 65 yr",  as.Date("2017-07-01"), 534789,
+  "CA",       "0--4 yr",   as.Date("2018-07-01"), 201265,
+  "CA",       "5--17 yr",  as.Date("2018-07-01"), 520077,
+  "CA",       "18--49 yr", as.Date("2018-07-01"), 1725382,
+  "CA",       "50--64 yr", as.Date("2018-07-01"), 699145,
+  "CA",       ">= 65 yr",  as.Date("2018-07-01"), 551243,
+)
+# Calculate fraction of total population in each age group.
+pop <- pop %>%
+  group_by(geo_value, time_value) %>%
+  mutate(frac_pop = pop / sum(pop))
+```
+
+After joining population onto the rate data, we can calculate the
+population-weighted rate for each age group, that is, the portion of each
+`age_group`'s rate that it contributes to the overall rate.
+
+```{r}
+group_cols <- key_colnames(flu_data, exclude = "age_group")
+fractional_rate_by_age_group <-
+  flu_data %>%
+  inner_join(
+    pop,
+    # Simple population interpolation/extrapolation scheme: last observation
+    # carried forward. Use the estimated population on 2017-07-01 for
+    # time_values 2017-07-01 through 2018-06-30, and the estimated population on
+    # 2018-07-01 for all subsequent time_values:
+    join_by(geo_value, closest(y$time_value <= x$time_value), age_group),
+    suffix = c("", "_for_pop"),
+    relationship = "many-to-one", unmatched = "error"
+  ) %>%
+  mutate(frac_rate = rate * frac_pop)
+```
+
+We can then use `sum_groups_epi_df` to sum population-weighted rate across all
+age groups to get the overall rate.
+
+```{r}
+rate_overall_recalc_edf <-
+  fractional_rate_by_age_group %>%
+  sum_groups_epi_df("frac_rate", group_cols = group_cols) %>%
+  rename(rate_overall_recalc = frac_rate) %>%
+  # match rounding of original data:
+  mutate(rate_overall_recalc = round(rate_overall_recalc, 1))
+rate_overall_recalc_edf
+```
+
+Let's compare our calculated rate to the overall rate reported by FluSurv-NET.
 
 ```{r}
-group_cols <- key_colnames(exclude = "age_group")
-flu_data %>%
-  sum_groups_epi_df("rate", group_cols = group_cols)
+rate_overall_recalc_edf <-
+  rate_overall_recalc_edf %>%
+  # compare to published overall rates:
+  inner_join(
+    flu_data_api %>%
+      select(geo_value = location, time_value = epiweek, rate_overall),
+    by = c("geo_value", "time_value"),
+    relationship = "one-to-one", unmatched = "error"
+  )
+# What's our maximum error vs. the official overall estimates?
+max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc))
 ```
+This small amount of difference is expected, since all the age-specific rates
+were rounded to the first decimal place, and population data might have been
+interpolated and extrapolated a bit differently in the official source, limiting
+our ability to precisely recreate its estimates from an age group breakdown.
 
 ## Detecting and filling time gaps with `complete.epi_df`
 
@@ -515,4 +622,3 @@ Engineering. Copyright Johns Hopkins University 2020.
 API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
 These signals are taken directly from the JHU CSSE [COVID-19 GitHub
 repository](https://github.com/CSSEGISandData/COVID-19) without changes.
-