# postprocessML.R

# load suite of packages to manage/visualise data
library(tidyverse)
library(sf)

# read in pix names w/ hand-made tags
# this is the truth
#
# The file is read without a header (col_names = FALSE): its first line is a
# record starting with an image path, not a row of column names. Reading it
# positionally gives the stable names X1, X2, ... and avoids depending on the
# names readr invents for empty header cells.
# Rows without a bounding box (X2 missing) are dropped.
# x, y, z, t are the box coordinates; compute_overlap() uses them as
# xmin, ymin, xmax, ymax respectively.
truth <- 
  read_csv('lynx/test.csv', col_names = FALSE) %>%
  filter(!is.na(X2)) %>%
  rename(name = X1,
         x = X2,
         y = X3,
         z = X4,
         t = X5,
         species = X6)
truth

# a picture appears once per tagged box, so some names are repeated;
# list the 25 most frequent ones
print(count(truth, name, sort = TRUE), n = 25)

# number of distinct pictures in the truth
length(unique(pull(truth, name)))

# number of tagged boxes per species, most frequent first
count(truth, species, sort = TRUE)

# # A tibble: 9 x 2
# species            n
# <chr>          <int>
# 1 sangliers         80
# 2 blaireaux         61
# 3 chamois           60
# 4 chevreuil         47
# 5 renard            43
# 6 chat_forestier    35
# 7 lynx              31
# 8 lièvre            24
# 9 cerf              10

# harmonise the species label: 'chat forestier' becomes 'chat_forestier'
# (same spelling as the one enforced on the predictions)
truth <- mutate(
  truth,
  species = str_replace(species, 'chat forestier', 'chat_forestier')
)

# read in tags generated by algo
# Each line is read as a single field X1 and then split on whitespace into
# name, species, confidence and the four box coordinates.
# NOTE(review): presumably a line looks like
#   <name> <species> <confidence> [x, y, z, t]   -- confirm against the file.
pred <- read_tsv('lynx/lynx_test.txt', col_names = FALSE)

# the space inside 'chat forestier' would otherwise be taken as a separator
pred <- mutate(pred, X1 = str_replace(X1, 'chat forestier', 'chat_forestier'))

pred <- separate(
  pred,
  X1,
  into = c("name", "species", "confidence", "x", "y", "z", "t"),
  sep = '\\s'
)

# strip the bracket/comma decoration around each coordinate, then convert;
# confidence is left as read (character)
pred <- mutate(
  pred,
  x = as.numeric(str_remove(str_remove(x, '\\['), ',')),
  y = as.numeric(str_remove(y, ',')),
  z = as.numeric(str_remove(z, ',')),
  t = as.numeric(str_remove(str_remove(t, '\\]'), ','))
)
pred

# a picture appears once per detection, so some names are repeated;
# list the 50 most frequent ones
print(count(pred, name, sort = TRUE), n = 50)

# number of distinct pictures with at least one detection
length(unique(pull(pred, name)))

# number of detections per predicted species, most frequent first
count(pred, species, sort = TRUE)

# # A tibble: 9 x 2
# species            n
# <chr>          <int>
# 1 sangliers         85
# 2 blaireaux         71
# 3 chamois           65
# 4 chevreuil         51
# 5 renard            50
# 6 chat_forestier    45
# 7 lynx              31
# 8 lièvre            25
# 9 cerf               6


# Intersection over union (IoU) of two bounding boxes.
#
# true_box, pred_box: one-row tables (or lists) with elements x, y, z, t,
#   used as xmin, ymin, xmax, ymax of the box.
# Returns the area of the intersection divided by the area of the union,
#   or 0 when the intersection is empty.
compute_overlap <- function(true_box, pred_box){
  # turn a box record into an sf polygon
  as_polygon <- function(box) {
    corners <- c(ymin = box$y,
                 ymax = box$t,
                 xmax = box$z,
                 xmin = box$x)
    st_as_sfc(st_bbox(corners))
  }

  truth_poly <- as_polygon(true_box)
  pred_poly <- as_polygon(pred_box)

  inter_area <- st_area(st_intersection(truth_poly, pred_poly))
  union_area <- st_area(st_union(truth_poly, pred_poly))

  # disjoint boxes give an empty intersection, hence a zero-length area
  if (length(inter_area) == 0) {
    return(0)
  }
  inter_area / union_area
}

# per-species tally of true positives (TP), false positives (FP) and
# false negatives (FN), one row per species found in the truth, all at zero
perf <- local({
  species_seen <- unique(truth$species)
  zeros <- numeric(length(species_seen))
  data.frame(species = species_seen,
             TP = zeros,
             FP = zeros,
             FN = zeros)
})

# following https://github.com/rafaelpadilla/Object-Detection-Metrics
# see also https://towardsdatascience.com/evaluating-performance-of-an-object-detection-model-137a349c517b

# Score the detections picture by picture and accumulate TP / FP / FN in perf.
#
# For each picture of the truth:
#   1. compute the IoU of every (detection, ground-truth box) pair;
#   2. each detection nominates the ground-truth box(es) it overlaps most
#      (pairs with zero overlap are ignored);
#   3. each ground-truth box keeps the nominating detection with the highest
#      IoU (the first one on ties), provided the IoU exceeds iou_threshold;
#   4. a matched box counts as TP when the species agree and as FP otherwise;
#      a box left without a match counts as FN.
# Scores are kept numeric throughout so that comparisons and maxima are
# numeric rather than lexicographic.
# NOTE(review): a wrong-species match is booked as FP under the ground-truth
# species, and detections that match no ground-truth box (or belong to
# pictures absent from truth) are not counted -- confirm this is intended.
iou_threshold <- 0.3

for (i in unique(truth$name)){

  current_truth <- truth[which(truth$name == i), ]
  current_pred <- pred[which(pred$name == i), ]
  ntruth <- nrow(current_truth)
  npred <- nrow(current_pred)

  # IoU matrix: one row per detection, one column per ground-truth box
  iou <- matrix(0, nrow = npred, ncol = ntruth)
  for (k in seq_len(npred)){
    for (l in seq_len(ntruth)){
      iou[k, l] <- as.numeric(compute_overlap(current_truth[l, ], current_pred[k, ]))
    }
  }

  # candidate[k, l] is TRUE when box l is the best non-zero overlap of detection k
  candidate <- matrix(FALSE, nrow = npred, ncol = ntruth)
  if (ntruth > 0) {
    for (k in seq_len(npred)){
      best_score <- max(iou[k, ])
      if (isTRUE(best_score > 0)) {
        candidate[k, which(iou[k, ] == best_score)] <- TRUE
      }
    }
  }

  # matched_det[l] is the detection retained for box l, NA when there is none
  # (always NA when the picture has no detection at all)
  matched_det <- rep(NA_integer_, ntruth)
  for (l in seq_len(ntruth)){
    nominees <- which(candidate[, l])
    if (length(nominees) == 0) {
      next
    }
    winner <- nominees[which.max(iou[nominees, l])]
    if (iou[winner, l] > iou_threshold) {
      matched_det[l] <- winner
    }
  }

  # one outcome per ground-truth box, booked under the box's species
  for (l in seq_len(ntruth)){
    true_species <- current_truth$species[l]
    if (is.na(matched_det[l])) {
      outcome <- "FN"
    } else if (current_pred$species[matched_det[l]] == true_species) {
      outcome <- "TP"
    } else {
      outcome <- "FP"
    }
    perf_row <- which(perf$species == true_species)
    perf[perf_row, outcome] <- perf[perf_row, outcome] + 1
  }

}

# show the tally, species with the most true positives first
# (the table pasted below comes from an earlier run)
arrange(perf, desc(TP))

# species TP FP FN
# 1         renard 41  2  0
# 2        chamois 56  2  2
# 3           lynx 29  2  0
# 4      blaireaux 61  0  0
# 5 chat_forestier 33  1  1
# 6      chevreuil 43  4  0
# 7           cerf  6  4  0
# 8         lièvre 20  1  3
# 9      sangliers 74  2  4