Skip to content

Commit

Permalink
Add post_estimation.R
Browse files Browse the repository at this point in the history
Add post-estimation analysis script and encapsulate more of the
data-prep script so that they can be used together.
  • Loading branch information
p00ya committed Aug 14, 2019
1 parent 573379e commit 912ddfd
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 66 deletions.
153 changes: 87 additions & 66 deletions data_prep.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Normalize a table of ascents into ascent, page and route tables.

# Copyright 2019 Dean Scarff
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -16,6 +16,8 @@

library(dplyr)

period_length <- 604800 # seconds in 1 week

# Classifies tick types like "dog" into either 1 (clean), 0 (not clean) or NA.
# Note that we have pessimistic interpretations of some tick types that are
# ambiguous in practice, e.g. a plain "tick" is not counted as clean.
Expand All @@ -31,71 +33,90 @@ IsTickClean <- function(ticktype) {
)
}

# Given a data frame containing "raw" ascents, normalize these to ascent,
# page and route tables.
#
# df_raw is expected to have the columns "ascentId", "route", "climber",
# "tick", "grade", "timestamp".
NormalizeTables <- function(df_raw) {
df <- df_raw %>%
mutate(clean = IsTickClean(tick)) %>%
filter(!is.na(clean)) %>%
filter(!is.na(grade)) %>%
mutate(route = droplevels(route), climber = droplevels(climber))

# Summarize the tick counts.
print(paste(nrow(df), "ascents by",
nlevels(df$climber), "climbers over",
nlevels(df$route), "routes."))

# Find the most popular routes:
top_routes <- df %>% count(route, sort=TRUE)

# Make the route with the most ascents the "first" route. This means it
# will be used as the reference route (rating of 1). Having lots of ascents
# means it is (hopefully) a good reference point for comparing climbers.
df$route <- relevel(df$route, ref=as.character(top_routes[[1,1]]))

df_routes <- df %>%
group_by(route) %>%
summarise(ewbank=floor(median(grade)))

df_ascents <- df %>%
mutate(t = (timestamp - min(df$timestamp)) %/% period_length) %>%
arrange(climber, t)

df_pages <- df_ascents %>% group_by(climber, t) %>% summarise()
# Set first_page[c] to be the index in df_pages of the first page for climber c.
first_page <- (df_pages %>% group_by(climber) %>% summarise(n=n()) %>%
mutate(idx=head(cumsum(c(1, n)), -1)))$idx

# time relative to the first ascent from the same climber
df_pages$rel_t <- df_pages$t - df_pages$t[first_page[df_pages$climber]]
# time relative to the next page for the same climber (meaningless for last
# page of each climber).
df_pages$gap <- c(diff(df_pages$rel_t), 0)

df_pages <- df_pages %>%
select(climber, t, gap) %>%
ungroup() %>%
mutate(page=row_number())

df_ascents <- df_ascents %>%
inner_join(df_pages, by=c("climber", "t")) %>%
select("route", "climber", "clean", "page")

df_ascents <- as_tibble(df_ascents)

return(list(ascents=df_ascents, pages=df_pages, routes=df_routes))
}

# Write normalized tables to CSV files.
#
# The files are written with standard names to the directory specified by
# "dir".
#
# "dfs" should be a list of data frames, with the tags "ascents", "routes" and
# "pages"; like what NormalizeTables returns.
WriteNormalizedTables <- function(dfs, dir) {
write.csv(dfs$ascents %>%
mutate(route=as.integer(route)-1, page=page-1) %>%
select(-climber),
file.path(dir, "ascents.csv"), row.names=FALSE)
write.csv(dfs$routes %>%
mutate(grade=pmax(1, 1 + ewbank - ewbank[[1]])) %>%
select(-ewbank),
file.path("data", "routes.csv"), row.names=FALSE)
write.csv(dfs$pages %>%
select(climber, gap) %>%
mutate(climber=as.integer(climber) - 1),
file.path("data", "pages.csv"), row.names=FALSE)
}

# Read in the ascents table.
df_raw <- read.csv("data/raw_ascents.csv",
comment.char = "#",
colClasses=c("integer", "factor", "factor", "factor", "integer", "integer"))

df <- df_raw %>%
mutate(clean = IsTickClean(tick)) %>%
filter(!is.na(clean)) %>%
filter(!is.na(grade)) %>%
mutate(route = droplevels(route), climber = droplevels(climber))

# Summarize the tick counts.
print(paste(nrow(df), "ascents by",
nlevels(df$climber), "climbers over",
nlevels(df$route), "routes."))

# Find the most popular routes:
top_routes <- df %>% count(route, sort=TRUE)

# Make the route with the most ascents the "first" route. This means it
# will be used as the reference route (rating of 1). Having lots of ascents
# means it is (hopefully) a good reference point for comparing climbers.
df$route <- relevel(df$route, ref=as.character(top_routes[[1,1]]))

df_routes <- df %>%
group_by(route) %>%
summarise(ewbank=floor(median(grade)))

period_length <- 604800 # seconds in 1 week

df_ascents <- df %>%
mutate(t = (timestamp - min(df$timestamp)) %/% period_length) %>%
arrange(climber, t)

df_pages <- df_ascents %>% group_by(climber, t) %>% summarise()
# Set first_page[c] to be the index in df_pages of the first page for climber c.
first_page <- (df_pages %>% group_by(climber) %>% summarise(n=n()) %>%
mutate(idx=head(cumsum(c(1, n)), -1)))$idx

# time relative to the first ascent from the same climber
df_pages$rel_t <- df_pages$t - df_pages$t[first_page[df_pages$climber]]
# time relative to the next page for the same climber (meaningless for last
# page of each climber).
df_pages$gap <- c(diff(df_pages$rel_t), 0)

df_pages <- df_pages %>%
select(climber, t, gap) %>%
ungroup() %>%
mutate(page=row_number())

df_ascents <- df_ascents %>%
inner_join(df_pages, by=c("climber", "t")) %>%
select("route", "climber", "clean", "page")

df_ascents <- as_tibble(df_ascents)

write.csv(df_ascents %>%
mutate(route=as.integer(route)-1, page=page-1) %>%
select(-climber),
"data/ascents.csv", row.names=FALSE)
write.csv(df_routes %>%
mutate(grade=pmax(1, 1 + ewbank - ewbank[[1]])) %>%
select(-ewbank),
"data/routes.csv", row.names=FALSE)
write.csv(df_pages %>%
select(climber, gap) %>%
mutate(climber=as.integer(climber) - 1),
"data/pages.csv", row.names=FALSE)
dfs <- NormalizeTables(df_raw)
WriteNormalizedTables(dfs, "data")
98 changes: 98 additions & 0 deletions post_estimation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Exploration of estimated ratings.

# Copyright 2019 Dean Scarff
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

library(dplyr)
library(ggplot2)

# Returns the probability an ascent is clean according to the Bradley-Terry
# model.
PredictBradleyTerry <- function(dfs) {
page_gamma <- dfs$pages$gamma[dfs$ascents$page]
route_gamma <- dfs$routes$gamma[dfs$ascents$route]
return(page_gamma / (page_gamma + route_gamma))
}

MergeWithRatings <- function(dfs, dir) {
df_page_ratings <- read.csv(file.path(dir, "page_ratings.csv"),
colClasses=c("integer", "numeric", "numeric"))

df_route_ratings <- read.csv(file.path(dir, "route_ratings.csv"),
colClasses=c("factor", "numeric"))

dfs$pages$gamma <- df_page_ratings$gamma
dfs$pages$var <- df_page_ratings$var
dfs$routes$gamma <- df_route_ratings$gamma

dfs$ascents$predicted <- PredictBradleyTerry(dfs)
dfs$routes$r <- log(dfs$routes$gamma)
dfs$pages$r <- log(dfs$pages$gamma)
return(dfs)
}

GetTimestampForPeriod <- function(t) {
as.POSIXct(t * period_length + min(df_raw$timestamp), origin="1970-01-01");
}

# Plots the timeseries of rating estimates for a set of climbers. The "friends"
# parameter should be a named vector where the names are the climber levels and
# the values are corresponding labels to apply in the plot.
PlotProgression <- function(df_pages, friends) {
df_friends <- df_pages %>%
filter(climber %in% names(friends)) %>%
mutate(date=GetTimestampForPeriod(t),
climber=recode(climber, !!!friends),
r=log(gamma),
r_upper=r+qnorm(0.25)*sqrt(var),
r_lower=r-qnorm(0.25)*sqrt(var))
ggplot(df_friends, aes(date, r, color=climber)) +
geom_smooth(aes(ymin=r_lower, ymax=r_upper, fill=climber), stat="identity")
}

# Generates outlier labels for the route plot.
GetOutliers <- function(df_routes) {
m <- loess(r ~ ewbank, df_routes)
e <- residuals(m)
e <- e / sd(e)
return(ifelse(abs(e) > 1.65, as.character(df_routes$route), NA))
}

# Assume data_prep.R has already been sourced.
dfs <- MergeWithRatings(dfs, "data")

png(width=1024, res=120)

# Plots the "predicted" probability of clean ascents vs the actual proportion
# of clean ascents. Ideally the fit follows the y=x line.
ggplot(dfs$ascents, aes(predicted, clean)) + geom_smooth() +
geom_abline(slope=1)

# Plots the residuals vs the conventional grade of routes. Ideally the fit
# follows the y=0 line.
ggplot(dfs$ascents %>% inner_join(dfs$routes, by="route"),
aes(ewbank, predicted - clean)) + geom_smooth()

# Plots the residuals vs the estimated natural route rating. Ideally the fit
# follows the y=0 line.
ggplot(dfs$ascents %>% inner_join(dfs$routes, by="route"),
aes(r, predicted - clean)) + geom_smooth()

# Plots conventional grades vs the estimated "natural rating" of routes.
# Outliers are labeled.
ggplot(dfs$routes %>% mutate(outlier=GetOutliers(dfs$routes)), aes(ewbank, r)) +
geom_point(alpha=0.1, size=0.5, color="red") +
geom_smooth() +
geom_text(aes(label=outlier), na.rm=TRUE, size=2, check_overlap=TRUE,
hjust=0, nudge_x=0.1, vjust="outward")

0 comments on commit 912ddfd

Please sign in to comment.