Merge pull request #5 from willtownes/main

make preview_datasets more flexible and add sprinters dataset
cmustatistics · Sep 10, 2024 · 0f1e55f · 0f1e55f
2 parents e7c0047 + 3523db7
commit 0f1e55f
Show file tree

Hide file tree

Showing 8 changed files with 2,167 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,6 @@ _site/
 
 # Ignore metadata, since it is auto-generated
 metadata.json
+
+# RStudio project data
+.Rproj.user
diff --git a/_freeze/sports/sprinters/execute-results/html.json b/_freeze/sports/sprinters/execute-results/html.json
diff --git a/cmu-statds-datasets.csv b/cmu-statds-datasets.csv
@@ -8,6 +8,7 @@ date,datayear,title,description,subject,categories,url
 2019-09-01,2019,Serving sizes and plate sizes,Do people eat more food if their meals are served on larger plates? A randomized experiment with demographic and psychological control variables.,psychology,"ANOVA, linear regression",https://cmustatistics.github.io/data-repository/psychology/plate-size.html
 2017-02-25,2017,Religion and analytic thinking,A controversial study found that encouraging analytic thinking also reduced religious belief. A replication attempt collected much more data to try to confirm the hypothesis; is it supported by the data? A simple randomized experiment with continuous outcome and demographic controls.,psychology,"ANOVA, linear regression",https://cmustatistics.github.io/data-repository/psychology/religion-analytic-thinking.html
 2017-02-24,2015,Resenting moral rebels,We usually applaud people who rebel against the status quo for moral reasons – but those involved in the status quo do not. An experiment attempts to understand why this is.,psychology,ANOVA,https://cmustatistics.github.io/data-repository/psychology/moral-rebels.html
+2024-08-30,2021,Fastest 100m sprint times,"Explore the 1,000 fastest times ever recorded for the 100m sprint, in men’s and women’s track, cleaning the data and conducting EDA.",sports,"data cleaning, EDA, split/apply/combine",https://cmustatistics.github.io/data-repository/sports/sprinters.html
 2023-06-20,2022,World Development Indicators,"The World Bank’s World Development Indicators (WDI) compile development information about countries around the world. Using ten years of data, study development, political stability, pollution, and other factors at the national and regional levels.",politics,"linear regression, ANOVA",https://cmustatistics.github.io/data-repository/politics/world-bank.html
 2023-09-07,2020,Fiscally standardized cities,Extensive financial data on over 200 of the largest cities in the United States for over 40 years. Which cities spend the most or the least on government services?,politics,"EDA, clustering",https://cmustatistics.github.io/data-repository/politics/standard-cities.html
 2017-02-22,2014,Science Forums,"A random sample of discussions at a large science discussion forum, with metadata about each.",social,"GLMs, classification, linear regression",https://cmustatistics.github.io/data-repository/social/science-forums.html

diff --git a/data-repository.Rproj b/data-repository.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/data/sprint_m.txt b/data/sprint_m.txt
diff --git a/data/sprint_w.txt b/data/sprint_w.txt
diff --git a/preview_dataset.R b/preview_dataset.R
@@ -1,4 +1,8 @@
-library(readr)
+
+#' Load a CSV file from the `data/` directory
+#'
+#' @param dataset_name Name of the CSV file, resolved relative to `../data`.
+#' @return Whatever `readr::read_csv()` returns for that file, with column
+#'   types guessed from all rows (`guess_max = Inf`) and the column-type
+#'   message suppressed.
+default_csv_loader <- function(dataset_name) {
+  csv_path <- file.path("../data", dataset_name)
+  readr::read_csv(csv_path, guess_max = Inf, show_col_types = FALSE)
+}
 
 #' Produce a nice preview table of the chosen dataset
 #'
@@ -8,23 +12,39 @@ library(readr)
 #' @param n Number of rows of data to include in the table. This should not be
 #'   so large that the web page contains megabytes of data, but should contain
 #'   enough rows that readers can understand the format of the data.
-#' @param dataset Name of file (in `data/` directory) to load and preview.
-#'   Defaults to the `datafile` attribute of the page metadata.
+#' @param dataset Either a dataframe or the name of a CSV file (in `data/`
+#'   directory) to load and preview. Defaults to the `datafile` attribute of the
+#'   page metadata.
 #' @return A `paged_df` object that is automatically formatted by R Markdown as
 #'   a nice table.
 preview_dataset <- function(n = 20, dataset = rmarkdown::metadata$datafile) {
-  read_csv(paste0("../data/", dataset),
-           guess_max = Inf,
-           show_col_types = FALSE) |>
+  # Resolve `dataset` to a data frame: data frames pass through unchanged,
+  # CSV file names are loaded from disk, and anything else is rejected.
+  if (is.data.frame(dataset)) {
+    df <- dataset
+  } else if (!is.character(dataset)) {
+    stop("invalid dataset type, must be a string or a data frame")
+  } else if (tolower(tools::file_ext(dataset)) != "csv") {
+    stop("preview by filename only implemented for csv files")
+  } else {
+    df <- default_csv_loader(dataset)
+  }
+  # Only the first n rows are turned into the paged table.
+  df |>
     head(n = n) |>
     rmarkdown::paged_table(options = list(rownames.print = FALSE))
 }
 
 preview_datasets <- function(n = 20, datasets = rmarkdown::metadata$data$files) {
-  for (dataset in datasets) {
-    url <- paste0("https://cmustatistics.github.io/data-repository/data/",
-                  dataset)
-    cat("<h4><a href=\"", url, "\">", dataset, "</a></h4>\n")
-    cat(rmarkdown:::print.paged_df(preview_dataset(n, dataset)))
+  # Print a linked heading and a preview table for each dataset.
+  #
+  # n:        number of rows to show per dataset (passed to preview_dataset()).
+  # datasets: either a character vector of CSV file names, or a named list
+  #           whose elements are data frames (or CSV file names). The names
+  #           are used as the link text and as the file name in the URL.
+  #
+  # Output is written with cat(), so call this from a results="asis" chunk.
+
+  # An unnamed character vector is labelled by its own file names.
+  if (is.character(datasets) && is.null(names(datasets))) {
+    names(datasets) <- datasets
+  }
+  # Without names there is nothing to link to; fail loudly instead of
+  # silently printing nothing.
+  if (length(datasets) > 0 && is.null(names(datasets))) {
+    stop("datasets must be a named list or a character vector of file names")
+  }
+
+  # Route every entry through preview_dataset() so that `n` and the paged
+  # table options also apply to data frames supplied by the caller.
+  tables <- lapply(datasets, function(d) preview_dataset(n, d))
+
+  # tables is now a named list of paged tables
+  for (d in names(tables)) {
+    url <- paste0("https://cmustatistics.github.io/data-repository/data/", d)
+    cat("<h4><a href=\"", url, "\">", d, "</a></h4>\n")
+    cat(rmarkdown:::print.paged_df(tables[[d]]))
   }
 }
diff --git a/sports/sprinters.qmd b/sports/sprinters.qmd
@@ -0,0 +1,98 @@
+---
+title: Fastest 100m sprint times
+author: Will Townes
+date: August 30, 2024
+description: Explore the 1,000 fastest times ever recorded for the 100m sprint, in men's and women's track, cleaning the data and conducting EDA.
+categories:
+  - data cleaning
+  - EDA
+  - split/apply/combine
+data:
+  year: 2021
+  files: 
+  - sprint_m.txt
+  - sprint_w.txt
+---
+
+## Motivation
+
+This data comes from a compilation of official times for runners in 100m sprint
+events around the world.
+
+## Data
+
+There are two files, one for men and one for women. Each contains the top 1,000
+times (as of 2021) for men or women in the 100m sprint. Each row represents one
+time.
+
+Both data files are in tab-separated format.
+
+### Data preview
+
+```{r, echo=FALSE, results="asis"}
+source("../preview_dataset.R")
+
+# The sprint files are tab-separated text rather than CSV, so they are read
+# here with quote handling disabled and passed on as data frames.
+rt <- function(x) {
+  tsv_path <- file.path("../data", x)
+  read.table(tsv_path, sep = "\t", quote = "", header = TRUE)
+}
+
+# Name each data frame after its file so the preview headings link to it.
+sprint_files <- c("sprint_m.txt", "sprint_w.txt")
+datasets <- lapply(setNames(sprint_files, sprint_files), rt)
+
+preview_datasets(datasets = datasets)
+```
+
+### Variable descriptions
+
+| Variable | Description |
+|----|-------------|
+| Rank | Ranking of the time. |
+| Time | Number of seconds to complete the 100m sprint. Times have an "A" suffix if the event was at an altitude greater than 1,000 meters above sea level (e.g. "10.36A"). |
+| Wind | Wind speed in meters per second. A positive value indicates a tailwind. |
+| Name | Name of the sprinter. |
+| Country | Country the sprinter represented. |
+| Birthdate | When the sprinter was born (format DD.MM.YY). |
+| City | Where the race took place. |
+| Date | When the race took place (format DD.MM.YYYY). |
+
+The format is the same in both data files.
+
+## Questions
+
+This dataset provides many opportunities for data cleaning, wrangling, and EDA.
+For instance, in data cleaning:
+
+1. The `Time` column contains a suffix to indicate if the event was at altitude.
+   Create a new `Altitude` column that is true if the event was at altitude, and
+   adjust `Time` to contain only the time as a number.
+2. Some of the text columns contain special characters encoded as [HTML
+   entities](https://developer.mozilla.org/en-US/docs/Glossary/Character_reference).
+   For instance, ü is written `&uuml;`. Use a package that can decode these to
+   convert the text back to a readable format.
+3. Convert the `Birthdate` and `Date` columns to your programming language's
+   date objects (such as `datetime.date` in Python or `Date` objects in R). How
+   do you handle two-digit years and determine which century they occurred in?
+
+
+For data wrangling and split-apply-combine operations:
+
+1. Find the fastest time achieved by runners of each country.
+2. Find the fastest time recorded each year. Is the yearly fastest time
+   increasing or decreasing over time?
+3. Which sprinters have the most times in the top 1,000?
+
+
+For EDA:
+
+1. Is the race time correlated with the wind speed? Do tailwinds make runners
+   faster?
+2. Do certain cities have faster average times than others?
+
+
+## References
+
+Data scraped from Peter Larsson's [Track and Field all-time Performances
+Homepage](http://www.alltime-athletics.com):
+
+- [Men's 100 meters](http://www.alltime-athletics.com/m_100ok.htm)
+- [Women's 100 meters](http://www.alltime-athletics.com/w_100ok.htm)
+
+Copyright held by Peter Larsson, email: `kl78vc` at `alltime-athletics.com`. 
+Approval to redistribute was granted in September 2024.