index.qmd

---
title: "Glittr stats"
format: 
  html
---

In this report you can find some general statistics about [Glittr.org](https://glittr.org). The plots and statistics created here are, among other things, used in the manuscript. Since [Glittr.org](https://glittr.org) is an ongoing project, these statistics are updated weekly. 

## Set up the environment

This is required if you run this notebook locally. Loading required packages.

```{r}
#| output: false
library(httr2)
library(ggplot2)
library(dplyr)
library(ggbreak)
library(cowplot)
library(downlit)
library(xml2)
```

To run locally, create a file named `.env` and add your GitHub PAT (variable named `PAT`), your Google API key (named `GOOGLE_API_KEY`) and your Matomo API token (named `MATOMO_API_KEY`) in there, e.g.:

```{bash, eval=FALSE}
# this is an example, store it as .env:
export PAT="ghp_aRSRESCTZII20Lklser3H"
export GOOGLE_API_KEY="AjKSLE5SklxuRsxwPP8s0"
```

Now use your UNIX terminal to source this file to get the keys as objects:

```{bash, eval=FALSE}
source .env
```

In R, get environment variables as objects:

```{r}
# Read the API credentials from the environment; Sys.getenv() returns an
# empty string for a variable that is not set.
matomo_api_key <- Sys.getenv("MATOMO_API_KEY")
google_api_key <- Sys.getenv("GOOGLE_API_KEY")
pat <- Sys.getenv("PAT")
```

Setting colors. These correspond to the category colours on glittr.org. 

```{r}
# Colour per category, as a character vector named by category. The order of
# the names is reused further down as the level order of the category factor.
glittr_cols <- setNames(
  c("#3a86ff", "#fb5607", "#ff006e", "#ffbe0b", "#8338ec", "#000000"),
  c(
    "Scripting and languages",
    "Computational methods and pipelines",
    "Omics analysis",
    "Reproducibility and data management",
    "Statistics and machine learning",
    "Others"
  )
)
```

## Parse repository data

Using the glittr.org REST API to get repository metadata, among which the stargazers, recency, category, license and tags. 

```{r}
#| code-fold: true

# Walk through the paginated endpoint until a page comes back without data
page_list <- list()
page <- 1
has_more_pages <- TRUE

while (has_more_pages) {
  # Build the request for the current page, then perform and parse it
  page_request <- req_url_query(
    request("https://glittr.org/api/repositories"),
    `page[size]` = 100,
    `page[number]` = page
  )
  response <- resp_body_json(req_perform(page_request))

  # Collect the entries of this page; an empty page adds nothing
  page_list <- c(page_list, response$data)

  # An empty `data` element means the previous page was the last one
  has_more_pages <- length(response$data) > 0
  page <- page + 1
}

# Flatten every API entry into a one-row data frame. Only the first tag of an
# entry is used; it provides the main tag and the main category. A missing
# license is recorded as "none".
repo_info_list <- lapply(page_list, function(entry) {
  first_tag <- entry$tags[[1]]
  license <- if (is.null(entry$license)) "none" else entry$license

  data.frame(
    repo = entry$name,
    author_name = entry$author$name,
    stargazers = entry$stargazers,
    recency = entry$days_since_last_push,
    url = entry$url,
    license = license,
    main_tag = first_tag$name,
    main_category = first_tag$category,
    website = entry$website,
    author_profile = entry$author$profile,
    author_website = entry$author$website
  )
})

# Stack the one-row data frames into a single table of repositories
repo_info <- do.call(rbind, repo_info_list)

# Hosting provider: a URL containing "github" is github, anything else gitlab
repo_info$provider <- c("gitlab", "github")[grepl("github", repo_info$url) + 1L]

# Category as a factor whose levels follow the order of the colour vector
repo_info$main_category <- factor(
  repo_info$main_category,
  levels = names(glittr_cols)
)

# Category counts sorted by frequency; the names give the plotting order
cat_table <- sort(table(category = repo_info$main_category))
```

Number of repositories: `r nrow(repo_info)`

## Get contributors info

Using the GitHub REST API to get the number of contributors for each repository on glittr.org. This takes a few minutes, so a cached version is used as long as the set of GitHub repositories has not changed. 

```{r}
#| code-fold: true

# take long time to run, so try to use cache results if no repos have been 
# added in the meantime

# Start from the cached contributor counts when a cache file is present
n_contributors <- if (file.exists("data/n_contributors.rds")) {
  readRDS("data/n_contributors.rds")
} else {
  NULL
}

# Contributor counts are only collected for repositories hosted on GitHub
is_github <- repo_info$provider == "github"
repo_info_gh <- repo_info[is_github, ]

# Refresh the contributor counts from the GitHub API only when the cached
# repository names no longer match the current GitHub repositories
if (!identical(sort(repo_info_gh$repo), sort(names(n_contributors)))) {
  dir.create("data", showWarnings = FALSE)

  # Count the contributors of one repository. `repo` is appended to the API
  # path. One contributor is requested per page, so the page number of the
  # "last" pagination link equals the number of contributors.
  count_contributors <- function(repo) {
    resp <- request("https://api.github.com/repos/") |>
      req_url_path_append(repo) |>
      req_url_path_append("contributors") |>
      req_url_query(per_page = 1) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28"
      ) |>
      req_perform()

    link_url <- resp_link_url(resp, "last")
    # No "last" link: everything fitted on a single page
    if (is.null(link_url)) {
      return(1)
    }

    # Read `page` from the parsed query rather than splitting the URL on
    # "&page=", so the result does not depend on the query parameter order
    last_page <- url_parse(link_url)$query$page
    if (is.null(last_page)) NA_real_ else as.numeric(last_page)
  }

  # vapply() guarantees a numeric vector named by repository, also when
  # there are no repositories at all
  n_contributors <- vapply(repo_info_gh$repo, count_contributors, numeric(1))

  # overwrite rds file
  saveRDS(n_contributors, "data/n_contributors.rds")
}

# Look the counts up by repository name
repo_info_gh$contributors <- n_contributors[repo_info_gh$repo]

```

## Get country information

Here we get country information for all authors and organizations. It uses the free text specified at 'location'. Since this can be anything, we use the google REST API to translate that into country. 

```{r}
#| warning: false
#| output: false
#| code-fold: true

# Reuse the cached author information when a cache file exists; the sorted,
# unique author names in it are compared against the current authors below
if (file.exists("data/author_info.rds")) {
  author_info <- readRDS("data/author_info.rds")
  author_info_authors <- sort(unique(author_info$author))
} else {
  author_info_authors <- NULL
}

# Sorted, unique author names of the repositories hosted on GitHub
is_github_repo <- repo_info$provider == "github"
gh_authors <- sort(unique(repo_info$author_name[is_github_repo]))

# if the author info is out of date, update it
if(!identical(gh_authors, author_info_authors)) {
  # Fetch the GitHub user record of every author and keep one single-row
  # data frame per author, stored under the author name
  author_info_list <- lapply(gh_authors, function(author) {
    user <- request("https://api.github.com/users/") |>
      req_url_path_append(author) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28"
      ) |>
      req_perform() |>
      resp_body_json()

    # Name and location are optional in the response; absent values become NA
    data.frame(
      author = user$login,
      type = user$type,
      name = if (is.null(user$name)) NA else user$name,
      location = if (is.null(user$location)) NA else user$location
    )
  })
  names(author_info_list) <- gh_authors

  author_info <- do.call(rbind, author_info_list)

  # Only authors with a location can be geocoded
  has_location <- !is.na(author_info$location)
  author_info_loc <- author_info[has_location, ]

  # Free-text locations, named by author
  author_loc <- setNames(author_info_loc$location, author_info_loc$author)

  ggmap::register_google(key = google_api_key)
  loc_info <- ggmap::geocode(author_loc, output = "all")
  
  # Extract the country name from one geocoding result.
  #
  # loc_results: one element of the list returned by
  #   ggmap::geocode(..., output = "all"); expected to hold a `results` list
  #   whose entries each carry `address_components`.
  # Returns the `long_name` of the first address component typed "country",
  #   or NA when there is no `results` element or no country component.
  get_country <- function(loc_results) {
    if (!"results" %in% names(loc_results)) {
      return(NA)
    }

    for (result in loc_results$results) {
      for (component in result$address_components) {
        # Return on the first hit, so that a later result without a country
        # cannot replace a country that was already found
        if ("country" %in% unlist(component$types)) {
          return(component$long_name)
        }
      }
    }

    NA
  }
  
  # Resolve one country per geocoded location and key the result by author
  countries <- setNames(sapply(loc_info, get_country), names(author_loc))

  # Authors that were not geocoded get NA through the lookup by name
  author_info[["country"]] <- countries[author_info[["author"]]]

  # Store the refreshed author information as the new cache
  saveRDS(author_info, file = "data/author_info.rds")
}

# Attach the author metadata to every repository. `all.x = TRUE` keeps
# repositories without a matching author record (author_info is only built
# for GitHub authors); an inner join would silently drop them from all counts
# and plots that follow.
repo_info <- merge(repo_info, author_info,
                   by.x = "author_name", by.y = "author",
                   all.x = TRUE)

# Repositories without a resolved country are labelled "undefined"
repo_info$country[is.na(repo_info$country)] <- "undefined"
```

- Number of authors: `r nrow(author_info)`
- Number of countries: `r unique(repo_info$country) |> length()`

## Parse tag data

Here, we create `tag_df` that contains information for each tag by using the glittr.org API. 

```{r}
# Fetch the tags from the glittr.org API; the response is a list with one
# element per category
parsed <- resp_body_json(req_perform(request("https://glittr.org/api/tags")))

# One data frame per category, stored under the category name
tag_dfs <- list()
for (entry in parsed) {
  category <- entry$category
  name <- sapply(entry$tags, function(tag) tag$name)
  repositories <- sapply(entry$tags, function(tag) tag$repositories)
  tag_dfs[[category]] <- data.frame(name, category, repositories)
}

# Stack the categories and order the tags by ascending repository count
tag_df <- arrange(do.call(rbind, tag_dfs), repositories)
```

Number of tags/topics: `r nrow(tag_df)`

## Parse outlink data

Here, we use the matomo API to retrieve outlinks to repositories. In this way, we can have an idea which repositories are popular on glittr.org and how often people click on a link. It is summarized over a year (from today). 

```{r}
#| code-fold: true

url <- "https://matomo.sib.swiss/?module=API"

# Request the outlink report from Matomo; all parameters, including the
# token, are sent in the form body
outlink_request <- req_body_form(
  request(url),
  method = "Actions.getOutlinks",
  idSite = 217,
  format = "json",
  date = "today",
  period = "year",
  expanded = 1,
  filter_limit = -1,
  token_auth = matomo_api_key
)

response <- resp_body_json(req_perform(outlink_request))

# One row per outlink: its URL, its visit count and the label of the domain it
# was listed under. Fields absent from the response become NA.
outlink_row <- function(entry, domain_label) {
  data.frame(
    url = if (is.null(entry$url)) NA else entry$url,
    nb_visits = if (is.null(entry$nb_visits)) NA else entry$nb_visits,
    domain = domain_label
  )
}

## Collect the outlinks of every domain, stored under the domain label
outlinks_list <- list()
for (domain in response) {
  label <- domain$label
  url_info <- lapply(domain$subtable, outlink_row, domain_label = label)
  outlinks_list[[label]] <- do.call(rbind, url_info)
}

# Stack all domains into one data frame and drop the generated row names
outlinks_df <- do.call(rbind, outlinks_list)
rownames(outlinks_df) <- NULL

# Normalise URLs for matching: strip surrounding whitespace, remove one
# trailing slash and convert to lower case. Vectorised over `url`.
clean_url <- function(url) {
  trimmed <- trimws(url)
  without_slash <- sub("/$", "", trimmed)
  tolower(without_slash)
}

# Flag which outlink URLs occur in one URL column of repo_info and record the
# repository they belong to.
#   outlinks_df: data frame with a `url` column
#   repo_info:   data frame with a `repo` column and the column named by
#                `column`
#   column:      name of the repo_info column to compare against
# Returns outlinks_df with two added columns: `is_<column>` (logical) and
# `ass_repo_<column>` (the matching repo, NA without a match). URLs are
# compared after clean_url(); with several matches the first repo is used.
match_url <- function(outlinks_df, repo_info, column = "repo_url") {
  outlink_urls <- clean_url(outlinks_df$url)
  repo_urls <- clean_url(repo_info[[column]])
  hit <- match(outlink_urls, repo_urls)

  outlinks_df[[paste0("is_", column)]] <- !is.na(hit)
  outlinks_df[[paste0("ass_repo_", column)]] <- repo_info$repo[hit]
  outlinks_df
}

# Match the outlinks against each URL-bearing column of a repository entry,
# feeding the result of one match into the next
outlinks_df <- Reduce(
  function(df, column) match_url(df, repo_info, column = column),
  c("url", "website", "author_profile", "author_website"),
  outlinks_df
)

# Derive the glittr entry an outlink belongs to from the repository URL match
# and the website match only (author links are ignored):
#   - both matches missing            -> NA
#   - one match, or twice the same    -> that repository
#   - two different repositories      -> "do not correspond"
# The two match columns are compared directly as vectors, so no row-wise
# apply() over the data frame is needed.
by_url <- outlinks_df$ass_repo_url
by_website <- outlinks_df$ass_repo_website

# FALSE whenever one of the two is NA, because FALSE & NA is FALSE
conflicting <- !is.na(by_url) & !is.na(by_website) & by_url != by_website

outlinks_df$associated_entry <- ifelse(conflicting,
                                       "do not correspond",
                                       coalesce(by_url, by_website))

# Total outlink visits per glittr entry. `nb_visits` is NA for outlinks whose
# API record had no visit count, so NAs are dropped to keep the totals (and
# the statistics derived from them) defined.
visits_by_entry <- outlinks_df |>
  select(url, nb_visits, associated_entry) |>
  filter(!is.na(associated_entry)) |>
  group_by(associated_entry) |>
  summarise(total_visits = sum(nb_visits, na.rm = TRUE))

# Add the repository metadata and rank by visits. merge() is an inner join
# here: entries labelled "do not correspond" match no repository and drop out.
visits_by_entry <- merge(visits_by_entry, repo_info,
                         by.x = "associated_entry",
                         by.y = "repo") |>
  arrange(desc(total_visits))

# Visits summed per namespace (author), most visited first
visits_by_namespace <- arrange(
  summarise(
    group_by(visits_by_entry, author_name),
    total_visits = sum(total_visits)
  ),
  desc(total_visits)
)

# Visits summed over all entries that share a main tag
visits_by_main_tag <- summarise(
  group_by(visits_by_entry, main_tag),
  total_visits = sum(total_visits)
)

# Join the tag table (category and repository count per tag), normalise the
# visits by the number of repositories and rank by that ratio
visits_by_main_tag <- visits_by_main_tag |>
  merge(tag_df, by.x = "main_tag", by.y = "name") |>
  mutate(visits_per_repo = total_visits / repositories) |>
  arrange(desc(visits_per_repo))
```

- The number of outlink visits in the last year: `r sum(visits_by_entry$total_visits)`
- Number of repositories found using Glittr.org: `r nrow(visits_by_entry)` 
- Most popular repo was `r visits_by_entry$associated_entry[1]` with `r visits_by_entry$total_visits[1]` outlinks. 
- Most popular namespace was `r visits_by_namespace$author_name[1]` with `r visits_by_namespace$total_visits[1]` outlinks.

## Number of repositories by category

This is figure 2A in the manuscript. 

```{r}
#| label: fig-categories
#| fig-cap: Number of repositories per category

# Repositories per main category
category_freq <- as.data.frame(table(category = repo_info$main_category))

# Horizontal bars, categories ordered by their count and filled with the
# category colour
cat_count_plot <- ggplot(
  category_freq,
  aes(x = reorder(category, Freq), y = Freq, fill = category)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Categories", y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())

print(cat_count_plot)
```

And a table with the actual numbers 

```{r}
#| label: tbl-categories
#| tbl-cap: Number of repositories per category

# Repository counts per main category, shown as a table
category_count <- as.data.frame(table(category = repo_info$main_category))
knitr::kable(category_count)
```

## Number of contributors per repository separated by category

This is figure 2B in the manuscript. 

```{r}
#| label: fig-contributors
#| fig-cap: Number of contributors per repository separated by category

# Order the categories as in cat_table (sorted by number of repositories)
repo_info_gh$main_category <- factor(
  repo_info_gh$main_category,
  levels = names(cat_table)
)

# Violin plus narrow boxplot per category, on a square-root count scale
contributors_plot <- ggplot(
  repo_info_gh,
  aes(x = main_category, y = contributors, fill = main_category)
) +
  geom_violin(scale = "width") +
  geom_boxplot(width = 0.1, colour = "darkgrey") +
  coord_flip() +
  scale_y_sqrt() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Contributors", y = "Number of contributors") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    plot.margin = margin(t = 5, r = 10, b = 5, l = 10)
  )

print(contributors_plot)
```

And some statistics of contributors. 

```{r}
# Contributor counts with missing values removed, so that a repository
# without a count does not turn every share below into NA
nna_contr <- repo_info_gh$contributors[!is.na(repo_info_gh$contributors)]

# Shares of repositories with more than 10, more than 1 and at most 5
# contributors (the mean of a logical vector is the fraction of TRUE)
param1 <- mean(nna_contr > 10)
param2 <- mean(nna_contr > 1)
param3 <- mean(nna_contr <= 5)
```

- More than 10 contributors: `r signif(param1*100, digits = 3) `%
- More than 1 contributor: `r signif(param2*100, digits = 3) `%
- Between 1 and 5 contributors: `r signif(param3*100, digits = 3) `%

## Number of repositories per tag

This is figure 2C in the manuscript.

```{r}
#| label: fig-tags
#| fig-cap: Number of repositories per tag, colored by category.

# Text shown inside the plot: the number of tags, including those not drawn
tag_total_label <- paste("Total number of tags: ", nrow(tag_df))

# Bars for the tags used by more than 10 repositories, ordered by count
tag_freq_plot <- tag_df |>
  filter(repositories > 10) |>
  ggplot(aes(
    x = reorder(name, repositories),
    y = repositories,
    fill = category
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Tags with > 10 repositories", y = "Number of repositories") +
  annotate(
    geom = "text", x = 2, y = 150,
    label = tag_total_label,
    color = "black"
  ) +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())

print(tag_freq_plot)
```

And a table with the actual numbers.

```{r}
#| label: tbl-tags
#| tbl-cap: Number of repositories per tag

# Tags with more than 10 repositories, most frequent first
knitr::kable(
  arrange(filter(tag_df, repositories > 10), desc(repositories)),
  row.names = FALSE
)
```

## Number of repositories by author

This is figure 2D in the manuscript.

```{r}
#| label: fig-author
#| fig-cap: Number of repositories per author colored by category

# Repositories per author and main category, one row per combination
author_freq <- as.data.frame(
  table(
    author_name = repo_info$author_name,
    main_category = repo_info$main_category
  )
)

# Order the categories as in cat_table
author_freq$main_category <- factor(
  author_freq$main_category,
  levels = names(cat_table)
)

# Authors with fewer than 5 repositories are not drawn, only counted
repos_per_author <- table(repo_info$author_name)
lf_authors <- names(repos_per_author[repos_per_author < 5])
lf_authors_label <- paste("Authors with < 5 repos: ", length(lf_authors))

# Stacked bars per remaining author, coloured by category
author_freq_plot <- author_freq |>
  filter(!author_name %in% lf_authors) |>
  arrange(Freq) |>
  ggplot(aes(
    x = reorder(author_name, Freq),
    y = Freq,
    fill = main_category
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Author or organization", y = "Number of repositories") +
  annotate(
    geom = "text", x = 2, y = 30,
    label = lf_authors_label,
    color = "black"
  ) +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())

print(author_freq_plot)
```

And a table with the actual numbers.

```{r}
#| label: tbl-author
#| tbl-cap: Number of repositories per author

# Authors with at least 5 repositories, most productive first
author_counts <- as.data.frame(table(repo_info$author_name))

author_counts |>
  filter(Freq >= 5) |>
  arrange(desc(Freq)) |>
  knitr::kable()

```

## Number of repositories per license

This is figure 2E in the manuscript.

```{r}
#| label: fig-license
#| fig-cap: Number of repositories per license

# Repositories per license and main category, one row per combination
lic_freq_data <- as.data.frame(
  table(
    license = repo_info$license,
    main_category = repo_info$main_category
  )
)

# Order the categories as in cat_table
lic_freq_data$main_category <- factor(
  lic_freq_data$main_category,
  levels = names(cat_table)
)

# Stacked bars per license, coloured by category
lic_freq_plot <- ggplot(
  lic_freq_data,
  aes(x = reorder(license, Freq), y = Freq, fill = main_category)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "License type", y = "Number of repositories") +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())

print(lic_freq_plot)
```

And a table with the actual numbers.

```{r}
#| label: tbl-license
#| tbl-cap: Number of repositories per license

# Repositories per license with their percentage of all repositories,
# most frequent license first
license_counts <- as.data.frame(table(repo_info$license))

license_counts |>
  mutate(perc = round(Freq / nrow(repo_info) * 100, 1)) |>
  arrange(desc(Freq)) |>
  knitr::kable()
```


## Number of repositories per country

This is figure 2F in the manuscript. 

```{r}
#| label: fig-country
#| fig-cap: Number of repositories per country colored by category
#| 
# Repositories per country and main category, one row per combination
country_freq <- as.data.frame(
  table(
    country = repo_info$country,
    main_category = repo_info$main_category
  )
)

# Order the categories as in cat_table
country_freq$main_category <- factor(
  country_freq$main_category,
  levels = names(cat_table)
)

# Repositories without a country are not drawn, only counted in the label
undefined_label <- paste(
  "Repos with undefined country: ",
  sum(repo_info$country == "undefined")
)

# Stacked bars per country, coloured by category
country_freq_plot <- country_freq |>
  filter(country != "undefined") |>
  ggplot(aes(
    x = reorder(country, Freq),
    y = Freq,
    fill = main_category
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Country", y = "Number of repositories") +
  annotate(
    geom = "text", x = 2, y = 70,
    label = undefined_label,
    color = "black"
  ) +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())

print(country_freq_plot)
```

And a table with the actual numbers.

```{r}
#| label: tbl-country
#| tbl-cap: Number of repositories per country

# Repositories per country (including "undefined"), most frequent first
country_counts <- as.data.frame(table(repo_info$country))

country_counts |>
  arrange(desc(Freq)) |>
  knitr::kable()
```

## Number of outlinks by repo

Here, we use the site data from matomo in order to investigate what visitors of Glittr.org have clicked on. This gives us an idea about which repositories and topics are popular among users. In @fig-outlinks we see that `r visits_by_entry$associated_entry[1]` was the most popular repository with `r visits_by_entry$total_visits[1]` clicks. 

```{r}
#| label: fig-outlinks
#| fig-cap: Top 30 of most visited repos from Glittr.org in the past year

# visits_by_entry is ranked by total visits, so the first 30 rows are the
# 30 most visited entries
top_entries <- slice(visits_by_entry, 1:30)

ggplot(
  top_entries,
  aes(
    x = reorder(associated_entry, total_visits),
    y = total_visits,
    fill = main_category
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Top 30 of most outlinked repos", y = "Number of outlinks") +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())
```

```{r}
#| label: tbl-visits-entry
#| tbl-cap: Number of visits per repository

# The 30 most visited entries with their visit totals; select() renames the
# entry column while picking it
visits_by_entry |>
  select(repository = associated_entry, total_visits) |>
  slice(1:30) |>
  knitr::kable()
```

It becomes more interesting when we aggregate the data per main tag (@fig-outlinks-tag). This gives us an idea of the topics people are looking for when visiting glittr.org. The most popular topic is `r visits_by_main_tag$main_tag[which.max(visits_by_main_tag$total_visits)]` with `r max(visits_by_main_tag$total_visits)` outlinks. 

```{r}
#| label: fig-outlinks-tag
#| fig-cap: Top 30 of most popular main tags from Glittr.org in the past year

# visits_by_main_tag is stored ordered by visits per repository, so rank by
# total visits first; otherwise the 30 rows kept are not the 30 most
# outlinked tags this figure is about
visits_by_main_tag |>
  arrange(desc(total_visits)) |>
  slice(1:30) |>
  ggplot(aes(x = reorder(main_tag, total_visits),
             y = total_visits, fill = category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  ggtitle("Top 30 of most outlinked main tags") +
  ylab("Number of outlinks") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())
```

When we normalize the visits by the number of repositories per tag (@fig-outlinks-tag-repo), we can get an idea of where training materials might be lacking. A high number here shows repositories that are quite unique in what they teach, but are still popular. Here, we see that there are `r visits_by_main_tag$repositories[1]` repositories with the main tag `r visits_by_main_tag$main_tag[1]` with an average of `r round(visits_by_main_tag$visits_per_repo[1], 1)` outlinks.   

```{r}
#| label: fig-outlinks-tag-repo
#| fig-cap: Top 30 of most popular main tags normalized per repo from Glittr.org in the past year

# The 30 main tags with the most visits per repository
top_tags_per_repo <- slice(
  arrange(visits_by_main_tag, desc(visits_per_repo)),
  1:30
)

ggplot(
  top_tags_per_repo,
  aes(
    x = reorder(main_tag, visits_per_repo),
    y = visits_per_repo,
    fill = category
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(
    title = "Top 30 of most outlinked main tags normalized by repo number",
    y = "Number of outlinks per tag/number of repositories having tag"
  ) +
  theme_classic() +
  theme(legend.position = "none", axis.title.y = element_blank())
```

```{r}
#| label: tbl-outlinks-tag
#| tbl-cap: Number of visits per tag

# First 30 rows of the tag table, in its stored order
knitr::kable(slice(visits_by_main_tag, 1:30))
```

We can also check popularity by namespace, i.e. author (@tbl-visits-namespace). 

```{r}
#| label: tbl-visits-namespace
#| tbl-cap: Number of visits per namespace

# The 30 namespaces with the most visits (the table is ranked by visits)
knitr::kable(slice(visits_by_namespace, 1:30))
```

## Summary plot

Full figure 2 of the manuscript. 

```{r}
# Arrange the six panels (labelled A-F) in two columns; the three rows get
# relative heights 2:3:3
p <- plot_grid(cat_count_plot, contributors_plot,
               tag_freq_plot, author_freq_plot,
               lic_freq_plot, country_freq_plot,
               ncol = 2, labels = LETTERS[1:6],
               rel_heights = c(2, 3, 3))

# Pass the grid explicitly: without `plot`, ggsave() saves last_plot(), which
# is not guaranteed to be the assembled grid
ggsave("grid_plot_fig2.pdf", plot = p, width = 10, height = 10)
ggsave("grid_plot_fig2.eps", plot = p, width = 10, height = 10)
```