From fec818b68a88ff8f71b29e3438572ad300e6161c Mon Sep 17 00:00:00 2001 From: Francesco Pisu Date: Sat, 23 Sep 2023 19:00:42 +0200 Subject: [PATCH] docs: improve docstring docs #13 --- src/modelsight/_typing.py | 36 +++- src/modelsight/calibration/calib.py | 42 +++-- src/modelsight/curves/_delong.py | 259 ++++++++++++++++------------ src/modelsight/curves/compare.py | 245 ++++++++++++++++++++++---- 4 files changed, 429 insertions(+), 153 deletions(-) diff --git a/src/modelsight/_typing.py b/src/modelsight/_typing.py index e9ef2e4..a87c1ba 100644 --- a/src/modelsight/_typing.py +++ b/src/modelsight/_typing.py @@ -1,3 +1,7 @@ +""" +This file deals with the implementation of custom types. +""" + import sys import random import numpy as np @@ -24,7 +28,37 @@ @dataclass -class CVModellingOutput: +class CVModellingOutput: + """This class stores the data generated by a cross-validation + process for a single estimator. + + Arguments + --------- + gts_train: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing training ground-truth. + gts_val: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing validation ground-truth. + gts_train_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled training ground-truth. + gts_val_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled validation ground-truth. + probas_train: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing training predicted probabilities. + probas_val: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing validation predicted probabilities. + probas_train_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled training predicted probabilities. + probas_val_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled validation predicted probabilities. + models: List[Estimator] + A list of (n_repetitions * n_outer_splits) fitted estimators. + errors: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of validation prediction errors. + correct: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of validation correct predictions. + features: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of subsets of selected features. + """ gts_train: ArrayLike gts_val: ArrayLike gts_train_conc: ArrayLike diff --git a/src/modelsight/calibration/calib.py b/src/modelsight/calibration/calib.py index ba9cbd4..37f9a60 100644 --- a/src/modelsight/calibration/calib.py +++ b/src/modelsight/calibration/calib.py @@ -1,3 +1,8 @@ +""" +This file deals with the implementation of the Hosmer-Lemeshow plot for the +assessment of calibration of predicted probabilities. +""" + import numpy as np from typing import Tuple import matplotlib.pyplot as plt @@ -6,11 +11,14 @@ def ntile_name(n: int) -> str: - """Returns the ntile name corresponding to an ntile integer. + """ + Returns the ntile name corresponding to an ntile integer. + Parameters ---------- n : int An ntile integer. + Returns ------- ntile_name : str @@ -30,13 +38,16 @@ def ntile_name(n: int) -> str: def make_recarray(y_true: ArrayLike, y_pred: ArrayLike) -> np.recarray: - """Combines arrays into a recarray. + """ + Combines arrays into a recarray. + Parameters ---------- y_true : array Observed labels, either 0 or 1. y_pred : array Predicted probabilities, floats on [0, 1]. 
+ Returns ------- table : recarray @@ -53,7 +64,9 @@ def make_recarray(y_true: ArrayLike, def hosmer_lemeshow_table(y_true: ArrayLike, y_pred: ArrayLike, n_bins: int = 10) -> np.recarray: - """Constructs a Hosmer–Lemeshow table. + """ + Constructs a Hosmer–Lemeshow table. + Parameters ---------- y_true : array @@ -63,6 +76,7 @@ def hosmer_lemeshow_table(y_true: ArrayLike, n_bins : int, optional The number of groups to create. The default value is 10, which corresponds to deciles of predicted probabilities. + Returns ------- table : recarray @@ -100,26 +114,28 @@ def hosmer_lemeshow_plot(y_true: ArrayLike, Parameters ---------- - y_true: ArrayLike + y_true : ArrayLike (n_obs,) shaped array of ground-truth values - y_pred: ArrayLike + y_pred : ArrayLike (n_obs,) shaped array of predicted probabilities - n_bins: int + n_bins : int Number of bins to group observed and predicted probabilities into - colors: Tuple[str, str] + colors : Tuple[str, str] Pair of colors for observed (line) and predicted (vertical bars) probabilities. - annotate_bars: bool + annotate_bars : bool Whether bars should be annotated with the number of observed probabilities in each bin. - title: str + title : str Title to display on top of the calibration plot. - brier_score_annot: str + brier_score_annot : str Optional brier score (95% CI) annotation on the top-left corner. - ax: plt.Axes + ax : plt.Axes A matplotlib Axes object to draw the calibration plot into. If None, an Axes object is created by default. + Returns ------- - Tuple[plt.Figure, plt.Axes]: - Corresponding figure and Axes + f, ax : Tuple[plt.Figure, plt.Axes] + f: pyplot figure + ax: pyplot Axes """ table = hosmer_lemeshow_table(y_true, y_pred, n_bins) # transform observed and predicted frequencies in percentage relative to the bin dimension diff --git a/src/modelsight/curves/_delong.py b/src/modelsight/curves/_delong.py index 77dd1f3..bdffa34 100644 --- a/src/modelsight/curves/_delong.py +++ b/src/modelsight/curves/_delong.py @@ -1,124 +1,169 @@ +""" +This file deals with the implementation of the DeLong test for the comparison of +pairs of correlated areas under the receiver-operating characteristics curves. +""" + import pandas as pd import numpy as np import scipy.stats +from typing import Tuple # AUC comparison adapted from # https://github.com/Netflix/vmaf/ -def compute_midrank(x): - """Computes midranks. - Args: - x - a 1D numpy array - Returns: - array of midranks - """ - J = np.argsort(x) - Z = x[J] - N = len(x) - T = np.zeros(N, dtype=np.float64) - i = 0 - while i < N: - j = i - while j < N and Z[j] == Z[i]: - j += 1 - T[i:j] = 0.5*(i + j - 1) - i = j - T2 = np.empty(N, dtype=np.float64) - # Note(kazeevn) +1 is due to Python using 0-based indexing - # instead of 1-based in the AUC formula in the paper - T2[J] = T + 1 - return T2 +def compute_midrank(x: np.ndarray) -> np.ndarray: + """ + Computes midranks. + + Parameters + ---------- + x : np.ndarray + a 1-d array of predicted probabilities. 
+ + Returns + ------- + T2 : np.ndarray + array of midranks + """ + J = np.argsort(x) + Z = x[J] + N = len(x) + T = np.zeros(N, dtype=np.float64) + i = 0 + while i < N: + j = i + while j < N and Z[j] == Z[i]: + j += 1 + T[i:j] = 0.5*(i + j - 1) + i = j + T2 = np.empty(N, dtype=np.float64) + # Note(kazeevn) +1 is due to Python using 0-based indexing + # instead of 1-based in the AUC formula in the paper + T2[J] = T + 1 + return T2 -def fastDeLong(predictions_sorted_transposed, label_1_count): - """ - The fast version of DeLong's method for computing the covariance of - unadjusted AUC. - Args: - predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples] - sorted such as the examples with label "1" are first - Returns: - (AUC value, DeLong covariance) - Reference: - @article{sun2014fast, - title={Fast Implementation of DeLong's Algorithm for - Comparing the Areas Under Correlated Receiver Operating Characteristic Curves}, - author={Xu Sun and Weichao Xu}, - journal={IEEE Signal Processing Letters}, - volume={21}, - number={11}, - pages={1389--1393}, - year={2014}, - publisher={IEEE} - } - """ - # Short variables are named as they are in the paper - m = label_1_count - n = predictions_sorted_transposed.shape[1] - m - positive_examples = predictions_sorted_transposed[:, :m] - negative_examples = predictions_sorted_transposed[:, m:] - k = predictions_sorted_transposed.shape[0] +def fastDeLong(predictions_sorted_transposed: np.ndarray, + label_1_count: int) -> Tuple[np.ndarray, np.ndarray]: + """ + The fast version of DeLong's method for computing the covariance of + unadjusted AUC. + + Parameters + ---------- + predictions_sorted_transposed : a (n_classifiers, n_obs) numpy array containing + the predicted probabilities by the two classifiers in the comparison. + These probabilities are sorted such that the examples with label "1" come first. 
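+    label_1_count : int
+        the number of examples with label "1"; these occupy the first `label_1_count`
+        columns of `predictions_sorted_transposed`.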
+
+    Returns
+    -------
+    aucs, delongcov : Tuple[np.ndarray, np.ndarray]
+        aucs: array of AUC values
+        delongcov: DeLong covariance matrix
+
+    References
+    ----------
+    @article{sun2014fast,
+      title={Fast Implementation of DeLong's Algorithm for
+             Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
+      author={Xu Sun and Weichao Xu},
+      journal={IEEE Signal Processing Letters},
+      volume={21},
+      number={11},
+      pages={1389--1393},
+      year={2014},
+      publisher={IEEE}
+    }
+    """
+    # Short variables are named as they are in the paper
+    m = label_1_count
+    n = predictions_sorted_transposed.shape[1] - m
+    positive_examples = predictions_sorted_transposed[:, :m]
+    negative_examples = predictions_sorted_transposed[:, m:]
+    k = predictions_sorted_transposed.shape[0]
-    tx = np.empty([k, m], dtype=np.float64)
-    ty = np.empty([k, n], dtype=np.float64)
-    tz = np.empty([k, m + n], dtype=np.float64)
-    for r in range(k):
-        tx[r, :] = compute_midrank(positive_examples[r, :])
-        ty[r, :] = compute_midrank(negative_examples[r, :])
-        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
-    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
-    v01 = (tz[:, :m] - tx[:, :]) / n
-    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
-    sx = np.cov(v01)
-    sy = np.cov(v10)
-    delongcov = sx / m + sy / n
-    return aucs, delongcov
+    tx = np.empty([k, m], dtype=np.float64)
+    ty = np.empty([k, n], dtype=np.float64)
+    tz = np.empty([k, m + n], dtype=np.float64)
+    for r in range(k):
+        tx[r, :] = compute_midrank(positive_examples[r, :])
+        ty[r, :] = compute_midrank(negative_examples[r, :])
+        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
+    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
+    v01 = (tz[:, :m] - tx[:, :]) / n
+    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
+    sx = np.cov(v01)
+    sy = np.cov(v10)
+    delongcov = sx / m + sy / n
+    return aucs, delongcov
-def calc_pvalue(aucs, sigma):
-    """Computes log(10) of p-values.
-    Args:
-       aucs: 1D array of AUCs
-       sigma: AUC DeLong covariances
-    Returns:
-       log10(pvalue)
-    """
-    l = np.array([[1, -1]])
-    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
-    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
+def calc_pvalue(aucs: np.ndarray, sigma: np.ndarray) -> float:
+    """
+    Computes the base-10 logarithm of the DeLong test p-value.
+
+    Parameters
+    ----------
+    aucs : np.ndarray
+        a 1-d array of AUCs
+    sigma : np.ndarray
+        an array of AUC DeLong covariances
+
+    Returns
+    -------
+    p : float
+        log10 of the p-value
+    """
+    l = np.array([[1, -1]])
+    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
+    p = np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
+    return p
-def compute_ground_truth_statistics(ground_truth):
-    assert np.array_equal(np.unique(ground_truth), [0, 1])
-    order = (-ground_truth).argsort()
-    label_1_count = int(ground_truth.sum())
-    return order, label_1_count
+def compute_ground_truth_statistics(ground_truth: np.ndarray) -> Tuple[np.ndarray, int]:
+    """
+    Compute statistics of the ground-truth array.
+
+    Parameters
+    ----------
+    ground_truth : np.ndarray
+        a (n_obs,) array of 0 and 1 values representing the ground-truth.
-def delong_roc_variance(ground_truth, predictions):
-    """
-    Computes ROC AUC variance for a single set of predictions
-    Args:
-       ground_truth: np.array of 0 and 1
-       predictions: np.array of floats of the probability of being class 1
-    """
-    order, label_1_count = compute_ground_truth_statistics(ground_truth)
-    predictions_sorted_transposed = predictions[np.newaxis, order]
-    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
-    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
-    return aucs[0], delongcov
+
+    Returns
+    -------
+    order, label_1_count : Tuple[np.ndarray, int]
+        order is a numpy array of sorted indexes
+        label_1_count is the count of data points of the positive class.
+    """
+    assert np.array_equal(np.unique(ground_truth), [0, 1])
+    order = (-ground_truth).argsort()
+    label_1_count = int(ground_truth.sum())
+    return order, label_1_count
-def delong_roc_test(ground_truth, predictions_one, predictions_two):
-    """
-    Computes log(p-value) for hypothesis that two ROC AUCs are different
-    Args:
-       ground_truth: np.array of 0 and 1
-       predictions_one: predictions of the first model,
-          np.array of floats of the probability of being class 1
-       predictions_two: predictions of the second model,
-          np.array of floats of the probability of being class 1
-    """
-    order, label_1_count = compute_ground_truth_statistics(ground_truth)
-    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
-    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
-    return 10**calc_pvalue(aucs, delongcov).item()
\ No newline at end of file
+def delong_roc_test(ground_truth: np.ndarray,
+                    predictions_one: np.ndarray,
+                    predictions_two: np.ndarray) -> float:
+    """
+    Compare areas-under-curve of two estimators using the DeLong test.
+    Concretely, it computes the p-value for the hypothesis that the two ROC AUCs are different.
+
+    Parameters
+    ----------
+    ground_truth : np.ndarray
+        a (n_obs,) array of 0 and 1 representing ground-truths.
+    predictions_one : np.ndarray
+        a (n_obs,) array of probabilities of class 1 predicted by the first model.
+    predictions_two : np.ndarray
+        a (n_obs,) array of probabilities of class 1 predicted by the second model.
+
+    Returns
+    -------
+    p : float
+        the p-value for the hypothesis that the two ROC AUCs are different.
+    """
+    order, label_1_count = compute_ground_truth_statistics(ground_truth)
+    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
+    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
+
+    p = 10**calc_pvalue(aucs, delongcov).item()
+    return p
\ No newline at end of file
diff --git a/src/modelsight/curves/compare.py b/src/modelsight/curves/compare.py
index 4d748ec..aa298b8 100644
--- a/src/modelsight/curves/compare.py
+++ b/src/modelsight/curves/compare.py
@@ -1,37 +1,84 @@
+"""
+This file deals with the implementation of functions that allow annotating plots
+with statistical test results between pairs of estimators.
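+
+Illustrative usage (the estimator names and the `cv_preds`, `bars` and `ax` objects are
+hypothetical and must be provided by the caller):
+
+    comparisons = roc_comparisons(cv_preds, target_algo="model_a")
+    add_annotations(comparisons,
+                    alpha=0.05,
+                    bars=bars,
+                    direction="vertical",
+                    order=[("model_a", "model_b"), ("model_a", "model_c")],
+                    ax=ax)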
+""" + from typing import Callable, Dict, Tuple, List import matplotlib from matplotlib import patches import matplotlib.pyplot as plt -from scipy.stats import ttest_ind -from sklearn.metrics import average_precision_score from src.modelsight.curves._delong import delong_roc_test +from src.modelsight._typing import CVModellingOutput -def annot_stat_vertical(text, x, y1, y2, ww, - col='k', - fontsize=13, - voffset = 0, - n_elems = None, +def annot_stat_vertical(text:str, + x: float, + y1: float, y2: float, + ww: float = 0.02, + col: str = 'black', + fontsize: int = 13, + voffset: float = 0, + n_elems: int = None, ax=None, **kwargs): """ - ww: float - whisker width + Draw a vertical whisker at position `x` that spans through `y1` to `y2` with annotation specified + by `text`. + + Parameters + ---------- + text : str + Annotation for whisker. + x : float + x-position the whisker is positioned at. + y1 :float + starting y position. + y2 : float + ending y position. + ww : float, optional + whisker width, by default 0.02 + col : str, optional + whisker color, by default 'black' + fontsize : int, optional + fontsize for the annotation, by default 13 + voffset : float, optional + vertical offset for the annotation, by default 0. + Some font families and characters occupy different vertical spaces; + this parameter allows compensating for such variations. + n_elems : int, optional + number of discrete elements in the y-axis, by default None. + This value is precomputed by the caller (add_annotations) and passed + to this function as input. + ax : plt.Axes, optional + a pyplot Axes to draw annotations on, by default None + **kwargs + rect_h_base: float, optional + base height of rectangle patch for single-character annotations, by default 0.1 + fontsize_nonsignif, optional + fontsize for multi-character annotations (here called non significant annotations + to reflect the fact that single-character annotations most often use some kind + of symbol to denote statistical significance, e.g. *), by default `fontsize` (i.e., 13) """ ax = plt.gca() if ax is None else ax # we want the text to be centered on the whisker text_x_pos = x + ww - #+ 0.01 + text_y_pos = (y1+y2)/2 # draw whisker from y1 to y2 with width `ww` ax.plot([x, x + ww, x + ww, x], [y1, y1, y2, y2], lw=1, c=col) - if len(text) == 1: - #text_y_pos = (y1+y2)/2 - - # draw text at (text_x_pos, text_y_pos) # + 0.15 + # this is the case of a whisker being annotated with a single character. + # by default, symbols do not enforce a white background, hence when + # superimposed on whiskers the readibility is limited. + # here we enforce a white rectangle patch beneath the symbol to enhance + # readibility of annotations. + # the built-in bbox parameter of pyplot's .text() doesn't produce + # acceptable results, hence we came up with a custom implementation for + # single-character annotations. + if len(text) == 1: + # draw text at (text_x_pos, (text_y_pos - voffset) + 0.17) ax.text( text_x_pos, (text_y_pos - voffset) + 0.17, text, ha='center', va='center', color=col, @@ -61,6 +108,9 @@ def annot_stat_vertical(text, x, y1, y2, ww, ax.add_patch(rect) else: + # this is the case of multi-character annotations. + # here, we leverage the built-in bbox of pyplot's text method + # that allows drawing a bounding box beneath the annotation. 
         fontsize_nonsignif = kwargs.pop("fontsize_nonsignif", fontsize)
         ax.text(
             text_x_pos, text_y_pos, text,
             ha='center', va='center', color=col,
             fontsize=fontsize_nonsignif,
             bbox=dict(
             )
         )
 
-from matplotlib import patches
-def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13,
-                          voffset = 0,
-                          n_elems = None,
-                          ax=None,
-                          **kwargs):
+
+def annot_stat_horizontal(text: str,
+                          x1: float, x2: float,
+                          y: float,
+                          wh: float = 0.02,
+                          col: str = "black",
+                          fontsize: int = 13,
+                          voffset: float = 0,
+                          n_elems: int = None,
+                          ax: plt.Axes = None,
+                          **kwargs):
     """
-    ww: float
-        whisker width
+    Draw a horizontal whisker at position `y` that spans from `x1` to `x2`, with the annotation
+    specified by `text`.
+
+    Parameters
+    ----------
+    text : str
+        Annotation for whisker.
+    x1 : float
+        starting x position.
+    x2 : float
+        ending x position.
+    y : float
+        y-position the whisker is positioned at.
+    wh : float, optional
+        whisker height, by default 0.02
+    col : str, optional
+        whisker color, by default 'black'
+    fontsize : int, optional
+        fontsize for the annotation, by default 13
+    voffset : float, optional
+        vertical offset for the annotation, by default 0.
+        Some font families and characters occupy different vertical spaces;
+        this parameter allows compensating for such variations.
+    n_elems : int, optional
+        number of discrete elements in the y-axis, by default None.
+        This value is precomputed by the caller (add_annotations) and passed
+        to this function as input.
+    ax : plt.Axes, optional
+        a pyplot Axes to draw annotations on, by default None
+    **kwargs
+        fontsize_nonsignif : int, optional
+            fontsize for multi-character annotations (here called non-significant annotations
+            to reflect the fact that single-character annotations most often use some kind
+            of symbol to denote statistical significance, e.g. *), by default `fontsize` (i.e., 13)
     """
     ax = plt.gca() if ax is None else ax
@@ -93,10 +180,16 @@ def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13,
 
     # draw whisker from y1 to y2 with width `ww`
     ax.plot([x1, x1, x2, x2], [y, y + wh, y + wh, y], lw=1, c=col, clip_on=False)
-
-    if len(text) == 1:
-        #text_y_pos = (y1+y2)/2
+    # this is the case of a whisker being annotated with a single character.
+    # by default, symbols do not enforce a white background, hence when
+    # superimposed on whiskers the readability is limited.
+    # here we enforce a white rectangle patch beneath the symbol to enhance
+    # readability of annotations.
+    # the built-in bbox parameter of pyplot's .text() doesn't produce
+    # acceptable results, hence we came up with a custom implementation for
+    # single-character annotations.
+    if len(text) == 1:
         # draw text at (text_x_pos, text_y_pos) # + 0.15
         ax.text(
             text_x_pos, text_y_pos + voffset, text,
@@ -140,14 +233,12 @@ def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13,
         )
-from typing import Tuple, List, Dict
-
 def add_annotations(comparisons: Dict[str, Tuple[str, str, float]],
                     alpha: float,
                     bars: matplotlib.container.BarContainer,
                     direction: str,
                     order: List[Tuple[str, str]],
-                    symbol: str,
+                    symbol: str = "*",
                     symbol_fontsize: int = 22,
                     voffset: float = 0,
                     ext_voffset: float = 0,
@@ -155,6 +246,54 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]],
                     P_val_rounding: int = 2,
                     ax: plt.Axes = None,
                     **kwargs):
+    """
+    Annotate the specified plot (`ax`) with the provided comparison results, either vertically or
+    horizontally depending on the value of `direction`.
+
+    Parameters
+    ----------
+    comparisons : Dict[str, Tuple[str, str, float]]
+        The results of model comparisons.
+    alpha : float
+        The significance level used for formatting the P value of comparisons.
+    bars : matplotlib.container.BarContainer
+        A list of matplotlib's bars that is used to access the bar's width or height
+        when annotating horizontally and vertically, respectively.
+    direction : str
+        The direction for annotation. Possible values are "horizontal" and "vertical".
+    order : List[Tuple[str, str]]
+        The order in which the comparisons should be displayed.
+        Each entry of this list is a tuple whose elements are algorithm names.
+    symbol : str, optional
+        The symbol used in place of the P value when statistical significance is achieved
+        according to the specified alpha, by default "*".
+    symbol_fontsize : int, optional
+        Fontsize for the symbol used when statistical significance is achieved, by default 22
+    voffset : float, optional
+        vertical offset for the annotation, by default 0
+    ext_voffset : float, optional
+        Additional vertical offset for vertical annotations.
+        Ignored when direction = "horizontal", by default 0
+    ext_hoffset : float, optional
+        Additional horizontal offset for horizontal annotations.
+        Ignored when direction = "vertical", by default 0
+    P_val_rounding : int, optional
+        Number of decimal places to round P values to, by default 2
+    ax : plt.Axes, optional
+        The plot to be annotated, by default None
+
+    Returns
+    -------
+    ax : plt.Axes
+        The annotated plot.
+
+    Raises
+    ------
+    ValueError
+        When ax is None.
+    ValueError
+        Whenever a comparison key doesn't exist.
+    """
     if not ax:
         raise ValueError("I need an Axes to draw comparisons on.")
@@ -188,7 +327,7 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]],
                 wh=0.02,
                 col="black",
                 fontsize=symbol_fontsize,
-                voffset = -0.02,
+                voffset = voffset, #-0.02
                 ext_offset = ext_hoffset,
                 n_elems = len(entity_labels),
                 ax=ax,
@@ -219,16 +358,58 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]],
     return ax
 
-def roc_single_comparison(cv_preds, fst_algo, snd_algo):
+def roc_single_comparison(cv_preds: Dict[str, CVModellingOutput],
+                          fst_algo: str,
+                          snd_algo: str) -> Dict[str, Tuple[str, str, float]]:
+    """Perform a single comparison of two areas under Receiver Operating Characteristic curves
+    computed on the same set of data points, using the DeLong test.
+
+    Parameters
+    ----------
+    cv_preds : Dict[str, CVModellingOutput]
+        A mapping of algorithm names to the outputs of a cross-validation process
+        encompassing multiple (n>=2) models.
+    fst_algo : str
+        The name of the first algorithm for the comparison.
+        Must be an existing key of `cv_preds`.
+    snd_algo : str
+        The name of the second algorithm for the comparison.
+        Must be an existing key of `cv_preds`.
+
+    Returns
+    -------
+    comparison_result : Dict[str, Tuple[str, str, float]]
+        The output of the comparison. This is a dictionary where the key is
+        of the form "<fst_algo>_<snd_algo>" and the value is a tuple of three
+        elements: the first two are the names of the algorithms being compared
+        and the third element is the P value for the null hypothesis that
+        the two AUC values are equal.
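+
+    Examples
+    --------
+    A minimal, illustrative sketch; `cv_preds` and the keys "model_a" and "model_b"
+    are hypothetical and must be provided by the caller:
+
+    >>> cmp = roc_single_comparison(cv_preds, "model_a", "model_b")
+    >>> fst_algo, snd_algo, p_value = cmp["model_a_model_b"]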
+ """ ground_truths = cv_preds[fst_algo].gts_val_conc fst_algo_probas = cv_preds[fst_algo].probas_val_conc snd_algo_probas = cv_preds[snd_algo].probas_val_conc P = delong_roc_test(ground_truths, fst_algo_probas, snd_algo_probas) cmp_key = f"{fst_algo}_{snd_algo}" - return {cmp_key: (fst_algo, snd_algo, P)} + comparison_result = {cmp_key: (fst_algo, snd_algo, P)} + return comparison_result -def roc_comparisons(cv_preds, target_algo): +def roc_comparisons(cv_preds: CVModellingOutput, + target_algo: str): + """ + Compares the AUC of the specified algorithm with the AUCs of all other algorithms. + + Parameters + ---------- + cv_preds : CVModellingOutput + The output of a cross-validation process encompassing mulitple (n>=2) models. + target_algo : str + The name of the target algorithm's whose AUC will be compared with all other AUCs. + + Returns + ------- + comparisons : Dict[str, Tuple[str, str, float]] + A dictionary containing the results of all comparisons. See output of `roc_single_comparison`. + """ comparisons = dict() for algo_name in cv_preds.keys():