Add post_estimation.R

Add post-estimation analysis script and encapsulate more of the data-prep script so that they can be used together.
p00ya · Aug 14, 2019 · 912ddfd · 912ddfd
1 parent 573379e
commit 912ddfd
Show file tree

Hide file tree

Showing 2 changed files with 185 additions and 66 deletions.
diff --git a/data_prep.R b/data_prep.R
@@ -1,13 +1,13 @@
 # Normalize a table of ascents into ascent, page and route tables.
 
 # Copyright 2019 Dean Scarff
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,6 +16,8 @@
 
 library(dplyr)
 
+period_length <- 604800  # seconds in 1 week
+
 # Classifies tick types like "dog" into either 1 (clean), 0 (not clean) or NA.
 # Note that we have pessimistic interpretations of some tick types that are
 # ambiguous in practice, e.g. a plain "tick" is not counted as clean.
@@ -31,71 +33,90 @@ IsTickClean <- function(ticktype) {
   )
 }
 
+# Given a data frame containing "raw" ascents, normalize these to ascent,
+# page and route tables.
+#
+# df_raw is expected to have the columns "ascentId", "route", "climber",
+# "tick", "grade", "timestamp".
+NormalizeTables <- function(df_raw) {
+  df <- df_raw %>%
+    mutate(clean = IsTickClean(tick)) %>%
+    filter(!is.na(clean)) %>%
+    filter(!is.na(grade)) %>%
+    mutate(route = droplevels(route), climber = droplevels(climber))
+
+  # Summarize the tick counts.
+  print(paste(nrow(df), "ascents by",
+      nlevels(df$climber), "climbers over",
+      nlevels(df$route), "routes."))
+
+  # Find the most popular routes:
+  top_routes <- df %>% count(route, sort=TRUE)
+
+  # Make the route with the most ascents the "first" route.  This means it
+  # will be used as the reference route (rating of 1).  Having lots of ascents
+  # means it is (hopefully) a good reference point for comparing climbers.
+  df$route <- relevel(df$route, ref=as.character(top_routes[[1,1]]))
+
+  df_routes <- df %>%
+    group_by(route) %>%
+    summarise(ewbank=floor(median(grade)))
+
+  df_ascents <- df %>%
+    mutate(t = (timestamp - min(df$timestamp)) %/% period_length) %>%
+    arrange(climber, t)
+
+  df_pages <- df_ascents %>% group_by(climber, t) %>% summarise()
+  # Set first_page[c] to be the index in df_pages of the first page for climber c.
+  first_page <- (df_pages %>% group_by(climber) %>% summarise(n=n()) %>%
+      mutate(idx=head(cumsum(c(1, n)), -1)))$idx
+
+  # time relative to the first ascent from the same climber
+  df_pages$rel_t <- df_pages$t - df_pages$t[first_page[df_pages$climber]]
+  # time relative to the next page for the same climber (meaningless for last
+  # page of each climber).
+  df_pages$gap <- c(diff(df_pages$rel_t), 0)
+
+  df_pages <- df_pages %>%
+    select(climber, t, gap) %>%
+    ungroup() %>%
+    mutate(page=row_number())
+
+  df_ascents <- df_ascents %>%
+    inner_join(df_pages, by=c("climber", "t")) %>%
+    select("route", "climber", "clean", "page")
+
+  df_ascents <- as_tibble(df_ascents)
+
+  return(list(ascents=df_ascents, pages=df_pages, routes=df_routes))
+}
+
+# Write normalized tables to CSV files.
+#
+# The files are written with standard names to the directory specified by
+# "dir".
+#
+# "dfs" should be a list of data frames, with the tags "ascents", "routes" and
+# "pages"; like what NormalizeTables returns.
+WriteNormalizedTables <- function(dfs, dir) {
+  write.csv(dfs$ascents %>%
+      mutate(route=as.integer(route)-1, page=page-1) %>%
+      select(-climber),
+      file.path(dir, "ascents.csv"), row.names=FALSE)
+  write.csv(dfs$routes %>%
+      mutate(grade=pmax(1, 1 + ewbank - ewbank[[1]])) %>%
+      select(-ewbank),
+      file.path("data", "routes.csv"), row.names=FALSE)
+  write.csv(dfs$pages %>%
+      select(climber, gap) %>%
+      mutate(climber=as.integer(climber) - 1),
+      file.path("data", "pages.csv"), row.names=FALSE)
+}
+
 # Read in the ascents table.
 df_raw <- read.csv("data/raw_ascents.csv",
   comment.char = "#",
   colClasses=c("integer", "factor", "factor", "factor", "integer", "integer"))
 
-df <- df_raw %>%
-  mutate(clean = IsTickClean(tick)) %>%
-  filter(!is.na(clean)) %>%
-  filter(!is.na(grade)) %>%
-  mutate(route = droplevels(route), climber = droplevels(climber))
-
-# Summarize the tick counts.
-print(paste(nrow(df), "ascents by",
-    nlevels(df$climber), "climbers over",
-    nlevels(df$route), "routes."))
-
-# Find the most popular routes:
-top_routes <- df %>% count(route, sort=TRUE)
-
-# Make the route with the most ascents the "first" route.  This means it
-# will be used as the reference route (rating of 1).  Having lots of ascents
-# means it is (hopefully) a good reference point for comparing climbers.
-df$route <- relevel(df$route, ref=as.character(top_routes[[1,1]]))
-
-df_routes <- df %>%
-  group_by(route) %>%
-  summarise(ewbank=floor(median(grade)))
-
-period_length <- 604800  # seconds in 1 week
-
-df_ascents <- df %>%
-  mutate(t = (timestamp - min(df$timestamp)) %/% period_length) %>%
-  arrange(climber, t)
-
-df_pages <- df_ascents %>% group_by(climber, t) %>% summarise()
-# Set first_page[c] to be the index in df_pages of the first page for climber c.
-first_page <- (df_pages %>% group_by(climber) %>% summarise(n=n()) %>%
-    mutate(idx=head(cumsum(c(1, n)), -1)))$idx
-
-# time relative to the first ascent from the same climber
-df_pages$rel_t <- df_pages$t - df_pages$t[first_page[df_pages$climber]]
-# time relative to the next page for the same climber (meaningless for last
-# page of each climber).
-df_pages$gap <- c(diff(df_pages$rel_t), 0)
-
-df_pages <- df_pages %>%
-  select(climber, t, gap) %>%
-  ungroup() %>%
-  mutate(page=row_number())
-
-df_ascents <- df_ascents %>% 
-  inner_join(df_pages, by=c("climber", "t")) %>%
-  select("route", "climber", "clean", "page")
-
-df_ascents <- as_tibble(df_ascents)
-
-write.csv(df_ascents %>%
-    mutate(route=as.integer(route)-1, page=page-1) %>%
-    select(-climber),
-    "data/ascents.csv", row.names=FALSE)
-write.csv(df_routes %>%
-    mutate(grade=pmax(1, 1 + ewbank - ewbank[[1]])) %>%
-    select(-ewbank),
-    "data/routes.csv", row.names=FALSE)
-write.csv(df_pages %>%
-    select(climber, gap) %>%
-    mutate(climber=as.integer(climber) - 1),
-    "data/pages.csv", row.names=FALSE)
+dfs <- NormalizeTables(df_raw)
+WriteNormalizedTables(dfs, "data")
diff --git a/post_estimation.R b/post_estimation.R
@@ -0,0 +1,98 @@
+# Exploration of estimated ratings.
+
+# Copyright 2019 Dean Scarff
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library(dplyr)
+library(ggplot2)
+
+# Returns the probability an ascent is clean according to the Bradley-Terry
+# model.
+PredictBradleyTerry <- function(dfs) {
+    page_gamma <- dfs$pages$gamma[dfs$ascents$page]
+    route_gamma <- dfs$routes$gamma[dfs$ascents$route]
+    return(page_gamma / (page_gamma + route_gamma))
+}
+
+MergeWithRatings <- function(dfs, dir) {
+  df_page_ratings <- read.csv(file.path(dir, "page_ratings.csv"),
+      colClasses=c("integer", "numeric", "numeric"))
+
+  df_route_ratings <- read.csv(file.path(dir, "route_ratings.csv"),
+      colClasses=c("factor", "numeric"))
+
+  dfs$pages$gamma <- df_page_ratings$gamma
+  dfs$pages$var <- df_page_ratings$var
+  dfs$routes$gamma <- df_route_ratings$gamma
+
+  dfs$ascents$predicted <- PredictBradleyTerry(dfs)
+  dfs$routes$r <- log(dfs$routes$gamma)
+  dfs$pages$r <- log(dfs$pages$gamma)
+  return(dfs)
+}
+
+GetTimestampForPeriod <- function(t) {
+  as.POSIXct(t * period_length + min(df_raw$timestamp), origin="1970-01-01");
+}
+
+# Plots the timeseries of rating estimates for a set of climbers.  The "friends"
+# parameter should be a named vector where the names are the climber levels and
+# the values are corresponding labels to apply in the plot.
+PlotProgression <- function(df_pages, friends) {
+    df_friends <- df_pages %>%
+        filter(climber %in% names(friends)) %>%
+        mutate(date=GetTimestampForPeriod(t),
+               climber=recode(climber, !!!friends),
+               r=log(gamma),
+               r_upper=r+qnorm(0.25)*sqrt(var),
+               r_lower=r-qnorm(0.25)*sqrt(var))
+    ggplot(df_friends, aes(date, r, color=climber)) +
+        geom_smooth(aes(ymin=r_lower, ymax=r_upper, fill=climber), stat="identity")
+}
+
+# Generates outlier labels for the route plot.
+GetOutliers <- function(df_routes) {
+  m <- loess(r ~ ewbank, df_routes)
+  e <- residuals(m)
+  e <- e / sd(e)
+  return(ifelse(abs(e) > 1.65, as.character(df_routes$route), NA))
+}
+
+# Assume data_prep.R has already been sourced.
+dfs <- MergeWithRatings(dfs, "data")
+
+png(width=1024, res=120)
+
+# Plots the "predicted" probability of clean ascents vs the actual proportion
+# of clean ascents.  Ideally the fit follows the y=x line.
+ggplot(dfs$ascents, aes(predicted, clean)) + geom_smooth() +
+    geom_abline(slope=1)
+
+# Plots the residuals vs the conventional grade of routes.  Ideally the fit
+# follows the y=0 line.
+ggplot(dfs$ascents %>% inner_join(dfs$routes, by="route"),
+    aes(ewbank, predicted - clean)) + geom_smooth()
+
+# Plots the residuals vs the estimated natural route rating.  Ideally the fit
+# follows the y=0 line.
+ggplot(dfs$ascents %>% inner_join(dfs$routes, by="route"),
+    aes(r, predicted - clean)) + geom_smooth()
+
+# Plots conventional grades vs the estimated "natural rating" of routes.
+# Outliers are labeled.
+ggplot(dfs$routes %>% mutate(outlier=GetOutliers(dfs$routes)), aes(ewbank, r)) +
+    geom_point(alpha=0.1, size=0.5, color="red") +
+    geom_smooth() +
+    geom_text(aes(label=outlier), na.rm=TRUE, size=2, check_overlap=TRUE,
+        hjust=0, nudge_x=0.1, vjust="outward")