From 4f1690128519b64e6f41ab9c0bdcb4d48bd2572e Mon Sep 17 00:00:00 2001 From: Olivier Date: Wed, 17 Jul 2024 17:32:31 +0200 Subject: [PATCH 01/27] :bug: Fix resnet20 --- tests/models/test_resnets.py | 6 ++++++ .../baselines/classification/resnet.py | 2 +- torch_uncertainty/metrics/classification/fpr95.py | 15 +++++++++------ .../metrics/classification/grouping_loss.py | 2 +- .../metrics/classification/risk_coverage.py | 11 ++++------- torch_uncertainty/models/resnet/batched.py | 7 +++++-- torch_uncertainty/models/resnet/lpbnn.py | 7 +++++-- torch_uncertainty/models/resnet/masked.py | 7 +++++-- torch_uncertainty/models/resnet/mimo.py | 7 +++++-- torch_uncertainty/models/resnet/packed.py | 7 +++++-- torch_uncertainty/models/resnet/std.py | 9 ++++++--- torch_uncertainty/models/resnet/utils.py | 8 ++++++++ 12 files changed, 60 insertions(+), 28 deletions(-) diff --git a/tests/models/test_resnets.py b/tests/models/test_resnets.py index 44c2cd3c..31677f51 100644 --- a/tests/models/test_resnets.py +++ b/tests/models/test_resnets.py @@ -9,6 +9,7 @@ packed_resnet, resnet, ) +from torch_uncertainty.models.resnet.utils import get_resnet_num_blocks class TestResnet: @@ -21,6 +22,11 @@ def test_main(self): model(torch.randn(1, 1, 32, 32)) model.feats_forward(torch.randn(1, 1, 32, 32)) + get_resnet_num_blocks(44) + get_resnet_num_blocks(56) + get_resnet_num_blocks(110) + get_resnet_num_blocks(1202) + def test_mc_dropout(self): resnet(1, 10, arch=20, conv_bias=False, style="cifar") model = resnet(1, 10, arch=50).eval() diff --git a/torch_uncertainty/baselines/classification/resnet.py b/torch_uncertainty/baselines/classification/resnet.py index 47391ff6..36777e10 100644 --- a/torch_uncertainty/baselines/classification/resnet.py +++ b/torch_uncertainty/baselines/classification/resnet.py @@ -34,7 +34,7 @@ class ResNetBaseline(ClassificationRoutine): "mimo": mimo_resnet, "mc-dropout": resnet, } - archs = [18, 20, 34, 50, 101, 152] + archs = [18, 20, 34, 44, 50, 56, 101, 110, 152, 1202] def __init__( self, diff --git a/torch_uncertainty/metrics/classification/fpr95.py b/torch_uncertainty/metrics/classification/fpr95.py index f7e4a660..4fb51f88 100644 --- a/torch_uncertainty/metrics/classification/fpr95.py +++ b/torch_uncertainty/metrics/classification/fpr95.py @@ -47,13 +47,13 @@ def update(self, conf: Tensor, target: Tensor) -> None: Args: conf (Tensor): The confidence scores. - target (Tensor): The target labels. + target (Tensor): The target labels, 0 if ID, 1 if OOD. """ self.conf.append(conf) self.targets.append(target) def compute(self) -> Tensor: - """Compute the actual False Positive Rate at x% Recall. + """Compute the False Positive Rate at x% Recall. Returns: Tensor: The value of the FPRx. 
@@ -61,7 +61,7 @@ def compute(self) -> Tensor: conf = dim_zero_cat(self.conf).cpu().numpy() targets = dim_zero_cat(self.targets).cpu().numpy() - # out_labels is an array of 0s and 1s - 0 if IOD 1 if OOD + # out_labels is an array of 0s and 1s - 0 if ID, 1 if OOD out_labels = targets == self.pos_label in_scores = conf[np.logical_not(out_labels)] @@ -77,7 +77,7 @@ def compute(self) -> Tensor: labels = labels == self.pos_label # sort scores and corresponding truth values - desc_score_indices = np.argsort(examples, kind="mergesort")[::-1] + desc_score_indices = np.argsort(examples)[::-1] examples = examples[desc_score_indices] labels = labels[desc_score_indices] @@ -93,6 +93,8 @@ def compute(self) -> Tensor: thresholds = examples[threshold_idxs] + if tps[-1] == 0: + return torch.tensor([torch.nan], device=self.device) recall = tps / tps[-1] last_ind = tps.searchsorted(tps[-1]) @@ -107,7 +109,9 @@ def compute(self) -> Tensor: cutoff = np.argmin(np.abs(recall - self.recall_level)) return torch.tensor( - fps[cutoff] / (np.sum(np.logical_not(labels))), dtype=torch.float32 + fps[cutoff] / (np.sum(np.logical_not(labels))), + dtype=self.dtype, + device=self.device, ) @@ -116,7 +120,6 @@ def __init__(self, pos_label: int, **kwargs) -> None: """The False Positive Rate at 95% Recall metric. Args: - recall_level (float): The recall level at which to compute the FPR. pos_label (int): The positive label. kwargs: Additional arguments to pass to the metric class. """ diff --git a/torch_uncertainty/metrics/classification/grouping_loss.py b/torch_uncertainty/metrics/classification/grouping_loss.py index da9eab41..155e111f 100644 --- a/torch_uncertainty/metrics/classification/grouping_loss.py +++ b/torch_uncertainty/metrics/classification/grouping_loss.py @@ -117,7 +117,7 @@ def update(self, probs: Tensor, target: Tensor, features: Tensor) -> None: f"{features.shape}." ) - def compute(self) -> torch.Tensor: + def compute(self) -> Tensor: """Compute the final Brier score based on inputs passed to ``update``. 
Returns: diff --git a/torch_uncertainty/metrics/classification/risk_coverage.py b/torch_uncertainty/metrics/classification/risk_coverage.py index 264d363f..5eb525f5 100644 --- a/torch_uncertainty/metrics/classification/risk_coverage.py +++ b/torch_uncertainty/metrics/classification/risk_coverage.py @@ -86,11 +86,8 @@ def compute(self) -> Tensor: error_rates = self.partial_compute() num_samples = error_rates.size(0) if num_samples < 2: - return torch.tensor([float("nan")], device=error_rates.device) - x = ( - torch.arange(1, num_samples + 1, device=error_rates.device) - / num_samples - ) + return torch.tensor([float("nan")], device=self.device) + x = torch.arange(1, num_samples + 1, device=self.device) / num_samples return _auc_compute(x, error_rates) / (1 - 1 / num_samples) def plot( @@ -223,7 +220,7 @@ def compute(self) -> Tensor: errors = dim_zero_cat(self.errors) num_samples = scores.size(0) if num_samples < 1: - return torch.tensor([float("nan")], device=scores.device) + return torch.tensor([float("nan")], device=self.device) error_rates = _aurc_rejection_rate_compute(scores, errors) admissible_risks = (error_rates > self.risk_threshold) * 1 max_cov_at_risk = admissible_risks.flip(0).argmin() @@ -231,7 +228,7 @@ def compute(self) -> Tensor: # check if max_cov_at_risk is really admissible, if not return nan risk = admissible_risks[max_cov_at_risk] if risk > self.risk_threshold: - return torch.tensor([float("nan")], device=scores.device) + return torch.tensor([float("nan")], device=self.device) return 1 - max_cov_at_risk / num_samples diff --git a/torch_uncertainty/models/resnet/batched.py b/torch_uncertainty/models/resnet/batched.py index 795cd13e..cd89e63c 100644 --- a/torch_uncertainty/models/resnet/batched.py +++ b/torch_uncertainty/models/resnet/batched.py @@ -332,7 +332,10 @@ def batched_resnet( Returns: _BatchedResNet: A BatchEnsemble-style ResNet. """ - block = _BasicBlock if arch in [18, 20, 34] else _Bottleneck + block = ( + _BasicBlock if arch in [18, 20, 34, 44, 56, 110, 1202] else _Bottleneck + ) + in_planes = 16 if arch in [20, 44, 56, 110, 1202] else 64 return _BatchedResNet( block=block, num_blocks=get_resnet_num_blocks(arch), @@ -343,6 +346,6 @@ def batched_resnet( dropout_rate=dropout_rate, groups=groups, style=style, - in_planes=int(64 * width_multiplier), + in_planes=int(in_planes * width_multiplier), normalization_layer=normalization_layer, ) diff --git a/torch_uncertainty/models/resnet/lpbnn.py b/torch_uncertainty/models/resnet/lpbnn.py index 36c4d103..b79de57c 100644 --- a/torch_uncertainty/models/resnet/lpbnn.py +++ b/torch_uncertainty/models/resnet/lpbnn.py @@ -326,7 +326,10 @@ def lpbnn_resnet( groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", ) -> _LPBNNResNet: - block = _BasicBlock if arch in [18, 20, 34] else _Bottleneck + block = ( + _BasicBlock if arch in [18, 20, 34, 44, 56, 110, 1202] else _Bottleneck + ) + in_planes = 16 if arch in [20, 44, 56, 110, 1202] else 64 return _LPBNNResNet( block=block, num_blocks=get_resnet_num_blocks(arch), @@ -337,5 +340,5 @@ def lpbnn_resnet( conv_bias=conv_bias, groups=groups, style=style, - in_planes=int(64 * width_multiplier), + in_planes=int(in_planes * width_multiplier), ) diff --git a/torch_uncertainty/models/resnet/masked.py b/torch_uncertainty/models/resnet/masked.py index 45398891..04117e67 100644 --- a/torch_uncertainty/models/resnet/masked.py +++ b/torch_uncertainty/models/resnet/masked.py @@ -351,7 +351,10 @@ def masked_resnet( Returns: _MaskedResNet: A Masksembles-style ResNet. 
""" - block = _BasicBlock if arch in [18, 20, 34] else _Bottleneck + block = ( + _BasicBlock if arch in [18, 20, 34, 44, 56, 110, 1202] else _Bottleneck + ) + in_planes = 16 if arch in [20, 44, 56, 110, 1202] else 64 return _MaskedResNet( block=block, num_blocks=get_resnet_num_blocks(arch), @@ -363,6 +366,6 @@ def masked_resnet( conv_bias=conv_bias, dropout_rate=dropout_rate, style=style, - in_planes=int(64 * width_multiplier), + in_planes=int(in_planes * width_multiplier), normalization_layer=normalization_layer, ) diff --git a/torch_uncertainty/models/resnet/mimo.py b/torch_uncertainty/models/resnet/mimo.py index bf16a933..11ee2228 100644 --- a/torch_uncertainty/models/resnet/mimo.py +++ b/torch_uncertainty/models/resnet/mimo.py @@ -62,7 +62,10 @@ def mimo_resnet( style: Literal["imagenet", "cifar"] = "imagenet", normalization_layer: type[nn.Module] = nn.BatchNorm2d, ) -> _MIMOResNet: - block = _BasicBlock if arch in [18, 20, 34] else _Bottleneck + block = ( + _BasicBlock if arch in [18, 20, 34, 44, 56, 110, 1202] else _Bottleneck + ) + in_planes = 16 if arch in [20, 44, 56, 110, 1202] else 64 return _MIMOResNet( block=block, num_blocks=get_resnet_num_blocks(arch), @@ -73,6 +76,6 @@ def mimo_resnet( dropout_rate=dropout_rate, groups=groups, style=style, - in_planes=int(64 * width_multiplier), + in_planes=int(in_planes * width_multiplier), normalization_layer=normalization_layer, ) diff --git a/torch_uncertainty/models/resnet/packed.py b/torch_uncertainty/models/resnet/packed.py index 1cdd2f98..b353f1d7 100644 --- a/torch_uncertainty/models/resnet/packed.py +++ b/torch_uncertainty/models/resnet/packed.py @@ -425,7 +425,10 @@ def packed_resnet( Returns: _PackedResNet: A Packed-Ensembles ResNet. """ - block = _BasicBlock if arch in [18, 20, 34] else _Bottleneck + block = ( + _BasicBlock if arch in [18, 20, 34, 44, 56, 110, 1202] else _Bottleneck + ) + in_planes = 16 if arch in [20, 44, 56, 110, 1202] else 64 net = _PackedResNet( block=block, num_blocks=get_resnet_num_blocks(arch), @@ -438,7 +441,7 @@ def packed_resnet( groups=groups, num_classes=num_classes, style=style, - in_planes=int(64 * width_multiplier), + in_planes=int(in_planes * width_multiplier), normalization_layer=normalization_layer, ) if pretrained: # coverage: ignore diff --git a/torch_uncertainty/models/resnet/std.py b/torch_uncertainty/models/resnet/std.py index cdf1303d..8f5a3350 100644 --- a/torch_uncertainty/models/resnet/std.py +++ b/torch_uncertainty/models/resnet/std.py @@ -358,7 +358,7 @@ def resnet( activation_fn: Callable = relu, normalization_layer: type[nn.Module] = nn.BatchNorm2d, ) -> _ResNet: - """ResNet-18 model. + """ResNet model. Args: in_channels (int): Number of input channels. @@ -379,7 +379,10 @@ def resnet( Returns: _ResNet: The ResNet model. 
""" - block = _BasicBlock if arch in [18, 20, 34] else _Bottleneck + block = ( + _BasicBlock if arch in [18, 20, 34, 44, 56, 110, 1202] else _Bottleneck + ) + in_planes = 16 if arch in [20, 44, 56, 110, 1202] else 64 return _ResNet( block=block, num_blocks=get_resnet_num_blocks(arch), @@ -389,7 +392,7 @@ def resnet( dropout_rate=dropout_rate, groups=groups, style=style, - in_planes=int(64 * width_multiplier), + in_planes=int(in_planes * width_multiplier), activation_fn=activation_fn, normalization_layer=normalization_layer, ) diff --git a/torch_uncertainty/models/resnet/utils.py b/torch_uncertainty/models/resnet/utils.py index 0e082509..caf2cde9 100644 --- a/torch_uncertainty/models/resnet/utils.py +++ b/torch_uncertainty/models/resnet/utils.py @@ -5,10 +5,18 @@ def get_resnet_num_blocks(arch: int) -> list[int]: num_blocks = [3, 3, 3] elif arch == 34 or arch == 50: num_blocks = [3, 4, 6, 3] + elif arch == 44: + num_blocks = [7, 7, 7] + elif arch == 56: + num_blocks = [9, 9, 9] elif arch == 101: num_blocks = [3, 4, 23, 3] + elif arch == 110: + num_blocks = [18, 18, 18] elif arch == 152: num_blocks = [3, 8, 36, 3] + elif arch == 1202: + num_blocks = [200, 200, 200] else: raise ValueError(f"Unknown ResNet architecture. Got {arch}.") return num_blocks From 3267d05f1bd9775fe3ea661d78759d3b45dc6110 Mon Sep 17 00:00:00 2001 From: Olivier Date: Thu, 18 Jul 2024 11:36:00 +0200 Subject: [PATCH 02/27] :hammer: Rework the FPR metric --- .../metrics/classification/fpr95.py | 90 +++++++++---------- 1 file changed, 41 insertions(+), 49 deletions(-) diff --git a/torch_uncertainty/metrics/classification/fpr95.py b/torch_uncertainty/metrics/classification/fpr95.py index 4fb51f88..a854cc42 100644 --- a/torch_uncertainty/metrics/classification/fpr95.py +++ b/torch_uncertainty/metrics/classification/fpr95.py @@ -23,7 +23,8 @@ def __init__(self, recall_level: float, pos_label: int, **kwargs) -> None: kwargs: Additional arguments to pass to the metric class. Reference: - Inpired by https://github.com/hendrycks/anomaly-seg. + Improved from https://github.com/hendrycks/anomaly-seg and + translated to torch. """ super().__init__(**kwargs) @@ -58,61 +59,52 @@ def compute(self) -> Tensor: Returns: Tensor: The value of the FPRx. """ - conf = dim_zero_cat(self.conf).cpu().numpy() - targets = dim_zero_cat(self.targets).cpu().numpy() - - # out_labels is an array of 0s and 1s - 0 if ID, 1 if OOD - out_labels = targets == self.pos_label - - in_scores = conf[np.logical_not(out_labels)] - out_scores = conf[out_labels] - - neg = np.array(in_scores[:]).reshape((-1, 1)) - pos = np.array(out_scores[:]).reshape((-1, 1)) - examples = np.squeeze(np.vstack((pos, neg))) - labels = np.zeros(len(examples), dtype=np.int32) - labels[: len(pos)] += 1 - - # make labels a boolean vector, True if OOD - labels = labels == self.pos_label - - # sort scores and corresponding truth values - desc_score_indices = np.argsort(examples)[::-1] - examples = examples[desc_score_indices] - labels = labels[desc_score_indices] - - # examples typically has many tied values. Here we extract - # the indices associated with the distinct values. We also - # concatenate a value for the end of the curve. 
- distinct_value_indices = np.where(np.diff(examples))[0] - threshold_idxs = np.r_[distinct_value_indices, labels.shape[0] - 1] + conf = dim_zero_cat(self.conf) + targets = dim_zero_cat(self.targets) + + # map examples and labels to OOD first + indx = torch.argsort(targets, descending=True) + examples = conf[indx] + labels = torch.zeros_like(targets, dtype=torch.bool, device=self.device) + labels[: torch.count_nonzero(targets)] = True + + # sort examples and labels by decreasing confidence + desc_scores_indx = torch.argsort(examples, descending=True) + examples = examples[desc_scores_indx] + labels = labels[desc_scores_indx] + + # Get the indices of the distinct values + distinct_value_indices = torch.where(torch.diff(examples))[0] + threshold_idxs = torch.cat( + [ + distinct_value_indices, + torch.tensor([labels.shape[0] - 1], dtype=torch.long, device=self.device), + ] + ) # accumulate the true positives with decreasing threshold - tps = np.cumsum(labels)[threshold_idxs] - fps = 1 + threshold_idxs - tps # add one because of zero-based indexing - - thresholds = examples[threshold_idxs] + true_pos = torch.cumsum(labels, dim=0)[threshold_idxs] + false_pos = ( + 1 + threshold_idxs - true_pos + ) # add one because of zero-based indexing - if tps[-1] == 0: + if true_pos[-1] == 0: return torch.tensor([torch.nan], device=self.device) - recall = tps / tps[-1] - - last_ind = tps.searchsorted(tps[-1]) - sl = slice(last_ind, None, -1) # [last_ind::-1] - recall, fps, tps, thresholds = ( - np.r_[recall[sl], 1], - np.r_[fps[sl], 0], - np.r_[tps[sl], 0], - thresholds[sl], - ) - cutoff = np.argmin(np.abs(recall - self.recall_level)) + recall = true_pos / true_pos[-1] - return torch.tensor( - fps[cutoff] / (np.sum(np.logical_not(labels))), - dtype=self.dtype, - device=self.device, + last_ind = torch.searchsorted(true_pos, true_pos[-1]) + recall = torch.cat( + [recall[: last_ind + 1].flip(0), torch.tensor([1.0], dtype=self.dtype, device=self.device)] + ) + false_pos = torch.cat( + [ + false_pos[: last_ind + 1].flip(0), + torch.tensor([0.0], dtype=self.dtype, device=self.device), + ] ) + cutoff = torch.argmin(torch.abs(recall - self.recall_level)) + return false_pos[cutoff] / (~labels).sum() class FPR95(FPRx): From b8c7111f38399b53cbbc6a364b17e36d44152abb Mon Sep 17 00:00:00 2001 From: Olivier Date: Thu, 18 Jul 2024 12:47:14 +0200 Subject: [PATCH 03/27] :hammer: Various improvements --- torch_uncertainty/models/lenet.py | 6 -- torch_uncertainty/models/mlp.py | 1 - torch_uncertainty/models/resnet/std.py | 11 ++- torch_uncertainty/models/utils.py | 15 +++- .../models/wideresnet/batched.py | 65 +++++++++------- torch_uncertainty/models/wideresnet/masked.py | 52 ++++++++----- torch_uncertainty/models/wideresnet/mimo.py | 12 ++- torch_uncertainty/models/wideresnet/packed.py | 28 ++++--- torch_uncertainty/models/wideresnet/std.py | 78 +++++++++++-------- torch_uncertainty/post_processing/laplace.py | 12 +-- 10 files changed, 166 insertions(+), 114 deletions(-) diff --git a/torch_uncertainty/models/lenet.py b/torch_uncertainty/models/lenet.py index 36b175c0..55a4c772 100644 --- a/torch_uncertainty/models/lenet.py +++ b/torch_uncertainty/models/lenet.py @@ -25,7 +25,6 @@ def __init__( norm: type[nn.Module], groups: int, dropout_rate: float, - last_layer_dropout: bool, ) -> None: super().__init__() self.activation = activation @@ -44,7 +43,6 @@ def __init__( ) self.dropout_rate = dropout_rate - self.last_layer_dropout = last_layer_dropout self.conv1 = conv2d_layer( in_channels, 6, (5, 5), groups=groups, **layer_args ) @@ -88,7 +86,6 @@ def _lenet( norm: type[nn.Module] = nn.Identity,
groups: int = 1, dropout_rate: float = 0.0, - last_layer_dropout: bool = False, ) -> _LeNet | StochasticModel: model = _LeNet( in_channels=in_channels, @@ -100,7 +97,6 @@ def _lenet( groups=groups, layer_args=layer_args, dropout_rate=dropout_rate, - last_layer_dropout=last_layer_dropout, ) if stochastic: return StochasticModel(model, num_samples) @@ -114,7 +110,6 @@ def lenet( norm: type[nn.Module] = nn.Identity, groups: int = 1, dropout_rate: float = 0.0, - last_layer_dropout: bool = False, ) -> _LeNet: return _lenet( stochastic=False, @@ -127,7 +122,6 @@ def lenet( norm=norm, groups=groups, dropout_rate=dropout_rate, - last_layer_dropout=last_layer_dropout, ) diff --git a/torch_uncertainty/models/mlp.py b/torch_uncertainty/models/mlp.py index d0fdee07..720ce7f0 100644 --- a/torch_uncertainty/models/mlp.py +++ b/torch_uncertainty/models/mlp.py @@ -39,7 +39,6 @@ def __init__( super().__init__() self.activation = activation self.dropout_rate = dropout_rate - layers = nn.ModuleList() if len(hidden_dims) == 0: diff --git a/torch_uncertainty/models/resnet/std.py b/torch_uncertainty/models/resnet/std.py index 8f5a3350..b07e7fc6 100644 --- a/torch_uncertainty/models/resnet/std.py +++ b/torch_uncertainty/models/resnet/std.py @@ -19,9 +19,9 @@ def __init__( stride: int, dropout_rate: float, groups: int, + conv_bias: bool, activation_fn: Callable, normalization_layer: type[nn.Module], - conv_bias: bool, ) -> None: super().__init__() self.activation_fn = activation_fn @@ -366,14 +366,13 @@ def resnet( arch (int): The architecture of the ResNet. conv_bias (bool): Whether to use bias in convolutions. Defaults to ``True``. - conv_bias (bool): Whether to use bias in convolutions. Defaults to - ``True``. - dropout_rate (float): Dropout rate. Defaults to 0. - width_multiplier (float): Width multiplier. Defaults to 1. + dropout_rate (float): Dropout rate. Defaults to 0.0. + width_multiplier (float): Width multiplier. Defaults to 1.0. groups (int): Number of groups in convolutions. Defaults to 1. style (bool, optional): Whether to use the ImageNet structure. Defaults to ``True``. - activation_fn (Callable, optional): Activation function. + activation_fn (Callable, optional): Activation function. Defaults to + ``torch.nn.functional.relu``. normalization_layer (nn.Module, optional): Normalization layer. Returns: diff --git a/torch_uncertainty/models/utils.py b/torch_uncertainty/models/utils.py index 87fd65ee..ecf06466 100644 --- a/torch_uncertainty/models/utils.py +++ b/torch_uncertainty/models/utils.py @@ -1,4 +1,5 @@ from torch import Tensor, nn +from torch.nn.modules.batchnorm import _BatchNorm class Backbone(nn.Module): @@ -27,14 +28,20 @@ def forward(self, x: Tensor) -> list[Tensor]: """ feature = x features = [] - for k, v in self.model._modules.items(): - feature = v(feature) - if k in self.feat_names: + for key, layer in self.model._modules.items(): + feature = layer(feature) + if key in self.feat_names: features.append(feature) return features def set_bn_momentum(model: nn.Module, momentum: float) -> None: + """Set the momentum of all batch normalization layers in the model. + + Args: + model (nn.Module): Model. + momentum (float): Momentum of the batch normalization layers. 
+ """ for m in model.modules(): - if isinstance(m, nn.BatchNorm2d): + if isinstance(m, _BatchNorm): m.momentum = momentum diff --git a/torch_uncertainty/models/wideresnet/batched.py b/torch_uncertainty/models/wideresnet/batched.py index b27b8f99..792c0e46 100644 --- a/torch_uncertainty/models/wideresnet/batched.py +++ b/torch_uncertainty/models/wideresnet/batched.py @@ -1,7 +1,8 @@ +from collections.abc import Callable from typing import Literal -import torch.nn.functional as F from torch import Tensor, nn +from torch.nn.functional import relu from torch_uncertainty.layers import BatchConv2d, BatchLinear @@ -15,20 +16,22 @@ def __init__( self, in_planes: int, planes: int, - conv_bias: bool, dropout_rate: float, stride: int, num_estimators: int, groups: int, + conv_bias: bool, + activation_fn: Callable, ) -> None: super().__init__() + self.activation_fn = activation_fn self.conv1 = BatchConv2d( in_planes, planes, kernel_size=3, num_estimators=num_estimators, - groups=groups, padding=1, + groups=groups, bias=conv_bias, ) self.dropout = nn.Dropout2d(p=dropout_rate) @@ -38,11 +41,13 @@ def __init__( planes, kernel_size=3, num_estimators=num_estimators, - groups=groups, stride=stride, padding=1, + groups=groups, bias=conv_bias, ) + self.bn2 = nn.BatchNorm2d(planes) + self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( @@ -51,19 +56,17 @@ def __init__( planes, kernel_size=1, num_estimators=num_estimators, - groups=groups, stride=stride, + groups=groups, bias=conv_bias, ), ) - self.bn2 = nn.BatchNorm2d(planes) - def forward(self, x: Tensor) -> Tensor: - out = F.relu(self.bn1(self.dropout(self.conv1(x)))) + out = self.activation_fn(self.bn1(self.dropout(self.conv1(x)))) out = self.conv2(out) out += self.shortcut(x) - return F.relu(self.bn2(out)) + return self.activation_fn(self.bn2(out)) class _BatchWideResNet(nn.Module): @@ -78,17 +81,22 @@ def __init__( dropout_rate: float, groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", + activation_fn: Callable = relu, ) -> None: super().__init__() self.num_estimators = num_estimators + self.activation_fn = activation_fn self.in_planes = 16 if (depth - 4) % 6 != 0: - raise ValueError("Wide-resnet depth should be 6n+4.") + raise ValueError(f"Wide-resnet depth should be 6n+4. Got {depth}.") num_blocks = (depth - 4) // 6 - k = widen_factor - - num_stages = [16, 16 * k, 32 * k, 64 * k] + num_stages = [ + 16, + 16 * widen_factor, + 32 * widen_factor, + 64 * widen_factor, + ] if style == "imagenet": self.conv1 = BatchConv2d( @@ -99,7 +107,7 @@ def __init__( kernel_size=7, stride=2, padding=3, - bias=True, + bias=conv_bias, ) elif style == "cifar": self.conv1 = BatchConv2d( @@ -110,7 +118,7 @@ def __init__( kernel_size=3, stride=1, padding=1, - bias=True, + bias=conv_bias, ) else: raise ValueError(f"Unknown WideResNet style: {style}. 
") @@ -128,37 +136,39 @@ def __init__( _WideBasicBlock, num_stages[1], num_blocks=num_blocks, - conv_bias=conv_bias, dropout_rate=dropout_rate, stride=1, num_estimators=self.num_estimators, groups=groups, + conv_bias=conv_bias, + activation_fn=activation_fn, ) self.layer2 = self._wide_layer( _WideBasicBlock, num_stages[2], num_blocks=num_blocks, - conv_bias=conv_bias, dropout_rate=dropout_rate, stride=2, num_estimators=self.num_estimators, groups=groups, + conv_bias=conv_bias, + activation_fn=activation_fn, ) self.layer3 = self._wide_layer( _WideBasicBlock, num_stages[3], num_blocks=num_blocks, - conv_bias=conv_bias, dropout_rate=dropout_rate, stride=2, num_estimators=self.num_estimators, groups=groups, + conv_bias=conv_bias, + activation_fn=activation_fn, ) self.dropout = nn.Dropout(p=dropout_rate) self.pool = nn.AdaptiveAvgPool2d(output_size=1) self.flatten = nn.Flatten(1) - self.linear = BatchLinear( num_stages[3], num_classes, @@ -170,11 +180,12 @@ def _wide_layer( block: type[nn.Module], planes: int, num_blocks: int, - conv_bias: bool, dropout_rate: float, stride: int, num_estimators: int, groups: int, + conv_bias: bool, + activation_fn: Callable, ) -> nn.Module: strides = [stride] + [1] * (int(num_blocks) - 1) layers = [] @@ -189,22 +200,24 @@ def _wide_layer( stride=stride, num_estimators=num_estimators, groups=groups, + activation_fn=activation_fn, ) ) self.in_planes = planes - return nn.Sequential(*layers) - def forward(self, x: Tensor) -> Tensor: + def feats_forward(self, x: Tensor) -> Tensor: out = x.repeat(self.num_estimators, 1, 1, 1) - out = F.relu(self.bn1(self.conv1(out))) + out = self.activation_fn(self.bn1(self.conv1(out))) out = self.optional_pool(out) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.pool(out) - out = self.dropout(self.flatten(out)) - return self.linear(out) + return self.dropout(self.flatten(out)) + + def forward(self, x: Tensor) -> Tensor: + return self.linear(self.feats_forward(x)) def batched_wideresnet28x10( @@ -221,11 +234,11 @@ def batched_wideresnet28x10( Args: in_channels (int): Number of input channels. num_estimators (int): Number of estimators in the ensemble. - groups (int): Number of groups in the convolutions. conv_bias (bool): Whether to use bias in convolutions. Defaults to ``True``. dropout_rate (float, optional): Dropout rate. Defaults to ``0.3``. num_classes (int): Number of classes to predict. + groups (int): Number of groups in the convolutions. Defaults to ``1``. style (bool, optional): Whether to use the ImageNet structure. Defaults to ``True``. 
diff --git a/torch_uncertainty/models/wideresnet/masked.py b/torch_uncertainty/models/wideresnet/masked.py index b0d00581..3a90be81 100644 --- a/torch_uncertainty/models/wideresnet/masked.py +++ b/torch_uncertainty/models/wideresnet/masked.py @@ -1,7 +1,8 @@ +from collections.abc import Callable from typing import Literal -import torch.nn.functional as F from torch import Tensor, nn +from torch.nn.functional import relu from torch_uncertainty.layers import MaskedConv2d, MaskedLinear @@ -21,17 +22,19 @@ def __init__( num_estimators: int, scale: float, groups: int, + activation_fn: Callable, ) -> None: super().__init__() + self.activation_fn = activation_fn self.conv1 = MaskedConv2d( in_planes, planes, kernel_size=3, num_estimators=num_estimators, padding=1, - bias=conv_bias, scale=scale, groups=groups, + bias=conv_bias, ) self.dropout = nn.Dropout2d(p=dropout_rate) self.bn1 = nn.BatchNorm2d(planes) @@ -42,10 +45,12 @@ def __init__( num_estimators=num_estimators, stride=stride, padding=1, - bias=conv_bias, scale=scale, groups=groups, + bias=conv_bias, ) + self.bn2 = nn.BatchNorm2d(planes) + self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( @@ -55,18 +60,17 @@ def __init__( kernel_size=1, num_estimators=num_estimators, stride=stride, - bias=conv_bias, scale=scale, groups=groups, + bias=conv_bias, ), ) - self.bn2 = nn.BatchNorm2d(planes) def forward(self, x: Tensor) -> Tensor: - out = F.relu(self.bn1(self.dropout(self.conv1(x)))) + out = self.activation_fn(self.bn1(self.dropout(self.conv1(x)))) out = self.conv2(out) out += self.shortcut(x) - return F.relu(self.bn2(out)) + return self.activation_fn(self.bn2(out)) class _MaskedWideResNet(nn.Module): @@ -82,17 +86,22 @@ def __init__( scale: float = 2.0, groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", + activation_fn: Callable = relu, ) -> None: super().__init__() self.num_estimators = num_estimators + self.activation_fn = activation_fn self.in_planes = 16 if (depth - 4) % 6 != 0: - raise ValueError("Wide-resnet depth should be 6n+4.") + raise ValueError(f"Wide-resnet depth should be 6n+4. 
Got {depth}.") num_blocks = (depth - 4) // 6 - k = widen_factor - - num_stages = [16, 16 * k, 32 * k, 64 * k] + num_stages = [ + 16, + 16 * widen_factor, + 32 * widen_factor, + 64 * widen_factor, + ] if style == "imagenet": self.conv1 = nn.Conv2d( @@ -136,6 +145,7 @@ def __init__( num_estimators=self.num_estimators, scale=scale, groups=groups, + activation_fn=activation_fn, ) self.layer2 = self._wide_layer( _WideBasicBlock, @@ -147,6 +157,7 @@ def __init__( num_estimators=self.num_estimators, scale=scale, groups=groups, + activation_fn=activation_fn, ) self.layer3 = self._wide_layer( _WideBasicBlock, @@ -158,6 +169,7 @@ def __init__( num_estimators=self.num_estimators, scale=scale, groups=groups, + activation_fn=activation_fn, ) self.dropout = nn.Dropout(p=dropout_rate) @@ -179,6 +191,7 @@ def _wide_layer( num_estimators: int, scale: float = 2.0, groups: int = 1, + activation_fn: Callable = relu, ) -> nn.Module: strides = [stride] + [1] * (int(num_blocks) - 1) layers = [] @@ -194,22 +207,24 @@ def _wide_layer( dropout_rate=dropout_rate, scale=scale, groups=groups, + activation_fn=activation_fn, ) ) self.in_planes = planes - return nn.Sequential(*layers) - def forward(self, x: Tensor) -> Tensor: + def feats_forward(self, x: Tensor) -> Tensor: out = x.repeat(self.num_estimators, 1, 1, 1) - out = F.relu(self.bn1(self.conv1(out))) + out = self.activation_fn(self.bn1(self.conv1(out))) out = self.optional_pool(out) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.pool(out) - out = self.dropout(self.flatten(out)) - return self.linear(out) + return self.dropout(self.flatten(out)) + + def forward(self, x: Tensor) -> Tensor: + return self.linear(self.feats_forward(x)) def masked_wideresnet28x10( @@ -217,9 +232,9 @@ def masked_wideresnet28x10( num_classes: int, num_estimators: int, scale: float, - groups: int, conv_bias: bool = True, dropout_rate: float = 0.3, + groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", ) -> _MaskedWideResNet: """Masksembles of Wide-ResNet-28x10. @@ -229,10 +244,11 @@ def masked_wideresnet28x10( num_classes (int): Number of classes to predict. num_estimators (int): Number of estimators in the ensemble. scale (float): Expansion factor affecting the width of the estimators. - groups (int): Number of groups within each estimator. conv_bias (bool): Whether to use bias in convolutions. Defaults to ``True``. dropout_rate (float, optional): Dropout rate. Defaults to ``0.3``. + groups (int): Number of groups within each estimator. Defaults to + ``1``. style (bool, optional): Whether to use the ImageNet structure. Defaults to ``True``. 
diff --git a/torch_uncertainty/models/wideresnet/mimo.py b/torch_uncertainty/models/wideresnet/mimo.py index c3a25e0a..edb9a588 100644 --- a/torch_uncertainty/models/wideresnet/mimo.py +++ b/torch_uncertainty/models/wideresnet/mimo.py @@ -1,7 +1,9 @@ +from collections.abc import Callable from typing import Literal import torch from einops import rearrange +from torch.nn.functional import relu from .std import _WideResNet @@ -22,6 +24,7 @@ def __init__( dropout_rate: float, groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", + activation_fn: Callable = relu, ) -> None: super().__init__( depth, @@ -32,25 +35,26 @@ def __init__( dropout_rate=dropout_rate, groups=groups, style=style, + activation_fn=activation_fn, ) - self.num_estimators = num_estimators def forward(self, x: torch.Tensor) -> torch.Tensor: if not self.training: x = x.repeat(self.num_estimators, 1, 1, 1) out = rearrange(x, "(m b) c h w -> b (m c) h w", m=self.num_estimators) - out = super().forward(out) - return rearrange(out, "b (m d) -> (m b) d", m=self.num_estimators) + return rearrange( + super().forward(out), "b (m d) -> (m b) d", m=self.num_estimators + ) def mimo_wideresnet28x10( in_channels: int, num_classes: int, num_estimators: int, - groups: int = 1, conv_bias: bool = True, dropout_rate: float = 0.3, + groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", ) -> _MIMOWideResNet: return _MIMOWideResNet( diff --git a/torch_uncertainty/models/wideresnet/packed.py b/torch_uncertainty/models/wideresnet/packed.py index 8d16cecc..60fcc7cf 100644 --- a/torch_uncertainty/models/wideresnet/packed.py +++ b/torch_uncertainty/models/wideresnet/packed.py @@ -1,8 +1,10 @@ +from collections.abc import Callable from typing import Literal import torch.nn.functional as F from einops import rearrange from torch import Tensor, nn +from torch.nn.functional import relu from torch_uncertainty.layers import PackedConv2d, PackedLinear @@ -88,17 +90,22 @@ def __init__( gamma: int = 1, groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", + activation_fn: Callable = relu, ) -> None: super().__init__() self.num_estimators = num_estimators + self.activation_fn = activation_fn self.in_planes = 16 if (depth - 4) % 6 != 0: - raise ValueError("Wide-resnet depth should be 6n+4.") + raise ValueError(f"Wide-resnet depth should be 6n+4. 
Got {depth}.") num_blocks = int((depth - 4) / 6) - k = widen_factor - - num_stages = [16, 16 * k, 32 * k, 64 * k] + num_stages = [ + 16, + 16 * widen_factor, + 32 * widen_factor, + 64 * widen_factor, + ] if style == "imagenet": self.conv1 = PackedConv2d( @@ -220,11 +227,10 @@ def _wide_layer( ) ) self.in_planes = planes - return nn.Sequential(*layers) - def forward(self, x: Tensor) -> Tensor: - out = F.relu(self.bn1(self.conv1(x))) + def feats_forward(self, x: Tensor) -> Tensor: + out = self.activation_fn(self.bn1(self.conv1(x))) out = self.optional_pool(out) out = self.layer1(out) out = self.layer2(out) @@ -233,8 +239,10 @@ def forward(self, x: Tensor) -> Tensor: out, "e (m c) h w -> (m e) c h w", m=self.num_estimators ) out = self.pool(out) - out = self.dropout(self.flatten(out)) - return self.linear(out) + return self.dropout(self.flatten(out)) + + def forward(self, x: Tensor) -> Tensor: + return self.linear(self.feats_forward(x)) def packed_wideresnet28x10( @@ -243,9 +251,9 @@ def packed_wideresnet28x10( num_estimators: int, alpha: int, gamma: int, - groups: int = 1, conv_bias: bool = True, dropout_rate: float = 0.3, + groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", ) -> _PackedWideResNet: """Packed-Ensembles of Wide-ResNet-28x10. diff --git a/torch_uncertainty/models/wideresnet/std.py b/torch_uncertainty/models/wideresnet/std.py index bd3d6a76..963b4d60 100644 --- a/torch_uncertainty/models/wideresnet/std.py +++ b/torch_uncertainty/models/wideresnet/std.py @@ -1,7 +1,8 @@ +from collections.abc import Callable from typing import Literal -import torch.nn.functional as F from torch import Tensor, nn +from torch.nn.functional import relu __all__ = [ "wideresnet28x10", @@ -14,17 +15,20 @@ def __init__( in_planes: int, planes: int, dropout_rate: float, - stride: int = 1, - groups: int = 1, + stride: int, + groups: int, + conv_bias: bool, + activation_fn: Callable, ) -> None: super().__init__() + self.activation_fn = activation_fn self.conv1 = nn.Conv2d( in_planes, planes, kernel_size=3, padding=1, groups=groups, - bias=False, + bias=conv_bias, ) self.dropout = nn.Dropout2d(p=dropout_rate) self.bn1 = nn.BatchNorm2d(planes) @@ -35,8 +39,10 @@ def __init__( stride=stride, padding=1, groups=groups, - bias=False, + bias=conv_bias, ) + self.bn2 = nn.BatchNorm2d(planes) + self.shortcut = nn.Sequential() if stride != 1 or in_planes != planes: self.shortcut = nn.Sequential( @@ -46,28 +52,18 @@ def __init__( kernel_size=1, stride=stride, groups=groups, - bias=True, + bias=conv_bias, ), ) - self.bn2 = nn.BatchNorm2d(planes) def forward(self, x: Tensor) -> Tensor: - out = F.relu(self.bn1(self.dropout(self.conv1(x)))) + out = self.activation_fn(self.bn1(self.dropout(self.conv1(x)))) out = self.conv2(out) out += self.shortcut(x) - return F.relu(self.bn2(out)) + return self.activation_fn(self.bn2(out)) class _WideResNet(nn.Module): - """WideResNet from `Wide Residual Networks`. - - Note: - if `dropout_rate` and `num_estimators` are set, the model will sample - from the dropout distribution during inference. If `last_layer_dropout` - is set, only the last layer will be sampled from the dropout - distribution during inference. 
- """ - def __init__( self, depth: int, @@ -78,17 +74,22 @@ def __init__( dropout_rate: float, groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", + activation_fn: Callable = relu, ) -> None: super().__init__() - self.in_planes = 16 self.dropout_rate = dropout_rate + self.activation_fn = activation_fn + self.in_planes = 16 if (depth - 4) % 6 != 0: raise ValueError(f"Wide-resnet depth should be 6n+4. Got {depth}.") num_blocks = int((depth - 4) / 6) - k = widen_factor - - num_stages = [16, 16 * k, 32 * k, 64 * k] + num_stages = [ + 16, + 16 * widen_factor, + 32 * widen_factor, + 64 * widen_factor, + ] if style == "imagenet": self.conv1 = nn.Conv2d( @@ -129,6 +130,8 @@ def __init__( dropout_rate=dropout_rate, stride=1, groups=groups, + activation_fn=activation_fn, + conv_bias=conv_bias, ) self.layer2 = self._wide_layer( WideBasicBlock, @@ -137,6 +140,8 @@ def __init__( dropout_rate=dropout_rate, stride=2, groups=groups, + activation_fn=activation_fn, + conv_bias=conv_bias, ) self.layer3 = self._wide_layer( WideBasicBlock, @@ -145,12 +150,12 @@ def __init__( dropout_rate=dropout_rate, stride=2, groups=groups, + activation_fn=activation_fn, + conv_bias=conv_bias, ) - self.dropout = nn.Dropout(p=dropout_rate) self.pool = nn.AdaptiveAvgPool2d(output_size=1) self.flatten = nn.Flatten(1) - self.linear = nn.Linear( num_stages[3], num_classes, @@ -164,6 +169,8 @@ def _wide_layer( dropout_rate: float, stride: int, groups: int, + conv_bias: bool, + activation_fn: Callable, ) -> nn.Module: strides = [stride] + [1] * (int(num_blocks) - 1) layers = [] @@ -171,25 +178,26 @@ def _wide_layer( for stride in strides: layers.append( block( - self.in_planes, - planes, - dropout_rate, - stride, - groups, + in_planes=self.in_planes, + planes=planes, + stride=stride, + dropout_rate=dropout_rate, + groups=groups, + conv_bias=conv_bias, + activation_fn=activation_fn, ) ) self.in_planes = planes - return nn.Sequential(*layers) def feats_forward(self, x: Tensor) -> Tensor: - out = F.relu(self.bn1(self.conv1(x))) + out = self.activation_fn(self.bn1(self.conv1(x))) out = self.optional_pool(out) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) out = self.pool(out) - return self.flatten(out) + return self.dropout(self.flatten(out)) def forward(self, x: Tensor) -> Tensor: return self.linear(self.feats_forward(x)) @@ -198,10 +206,11 @@ def forward(self, x: Tensor) -> Tensor: def wideresnet28x10( in_channels: int, num_classes: int, - groups: int = 1, conv_bias: bool = True, dropout_rate: float = 0.3, + groups: int = 1, style: Literal["imagenet", "cifar"] = "imagenet", + activation_fn: Callable = relu, ) -> _WideResNet: """Wide-ResNet-28x10 from `Wide Residual Networks `_. @@ -216,6 +225,8 @@ def wideresnet28x10( dropout_rate (float, optional): Dropout rate. Defaults to ``0.3``. style (bool, optional): Whether to use the ImageNet structure. Defaults to ``True``. + activation_fn (Callable, optional): Activation function. Defaults to + ``torch.nn.functional.relu``. Returns: _Wide: A Wide-ResNet-28x10. 
@@ -229,4 +240,5 @@ def wideresnet28x10( num_classes=num_classes, groups=groups, style=style, + activation_fn=activation_fn, ) diff --git a/torch_uncertainty/post_processing/laplace.py b/torch_uncertainty/post_processing/laplace.py index fc1d2894..dac1b0ba 100644 --- a/torch_uncertainty/post_processing/laplace.py +++ b/torch_uncertainty/post_processing/laplace.py @@ -4,6 +4,8 @@ from torch import Tensor, nn from torch.utils.data import DataLoader, Dataset +from .abstract import PostProcessing + if util.find_spec("laplace"): from laplace import Laplace @@ -12,7 +14,7 @@ laplace_installed = False -class LaplaceApprox(nn.Module): +class LaplaceApprox(PostProcessing): def __init__( self, task: Literal["classification", "regression"], @@ -61,9 +63,10 @@ def __init__( self.batch_size = batch_size if model is not None: - self._setup_model(model) + self.set_model(model) - def _setup_model(self, model) -> None: + def set_model(self, model: nn.Module) -> None: + super().set_model(model) self.la = Laplace( model=model, likelihood=self.task, @@ -71,9 +74,6 @@ def _setup_model(self, model) -> None: hessian_structure=self.hessian_struct, ) - def set_model(self, model: nn.Module) -> None: - self._setup_model(model) - def fit(self, dataset: Dataset) -> None: dl = DataLoader(dataset, batch_size=self.batch_size) self.la.fit(train_loader=dl) From 8e86fdf5aea164aa7c177fcc092b4b1d1f8dfaa9 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 19 Jul 2024 09:42:34 +0200 Subject: [PATCH 04/27] :shirt: Minor cosmetic changes --- auto_tutorials_source/tutorial_bayesian.py | 9 ++++----- auto_tutorials_source/tutorial_mc_batch_norm.py | 2 +- docs/source/cli_guide.rst | 2 +- torch_uncertainty/datamodules/depth/base.py | 2 +- torch_uncertainty/datamodules/depth/kitti.py | 2 +- torch_uncertainty/datamodules/depth/muad.py | 2 +- torch_uncertainty/datamodules/depth/nyu.py | 2 +- torch_uncertainty/datamodules/segmentation/cityscapes.py | 2 +- torch_uncertainty/layers/bayesian/bayes_conv.py | 2 +- torch_uncertainty/layers/bayesian/bayes_linear.py | 2 +- torch_uncertainty/losses.py | 2 +- 11 files changed, 14 insertions(+), 15 deletions(-) diff --git a/auto_tutorials_source/tutorial_bayesian.py b/auto_tutorials_source/tutorial_bayesian.py index d50c7bf7..cc4e830a 100644 --- a/auto_tutorials_source/tutorial_bayesian.py +++ b/auto_tutorials_source/tutorial_bayesian.py @@ -20,7 +20,7 @@ Training a Bayesian LeNet using TorchUncertainty models and Lightning --------------------------------------------------------------------- -In this part, we train a bayesian LeNet, based on the model and routines already implemented in TU. +In this part, we train a Bayesian LeNet, based on the model and routines already implemented in TU. 1. Loading the utilities ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -30,12 +30,11 @@ - the Trainer from Lightning - the model: bayesian_lenet, which lies in the torch_uncertainty.model - the classification training routine from torch_uncertainty.routines -- the bayesian objective: the ELBOLoss, which lies in the torch_uncertainty.losses file +- the Bayesian objective: the ELBOLoss, which lies in the torch_uncertainty.losses file - the datamodule that handles dataloaders: MNISTDataModule from torch_uncertainty.datamodules -We will also need to define an optimizer using torch.optim, the -neural network utils from torch.nn, as well as the partial util to provide -the modified default arguments for the ELBO loss. +We will also need to define an optimizer using torch.optim and Pytorch's +neural network utils from torch.nn. 
""" from pathlib import Path diff --git a/auto_tutorials_source/tutorial_mc_batch_norm.py b/auto_tutorials_source/tutorial_mc_batch_norm.py index a8bed883..bbf495c8 100644 --- a/auto_tutorials_source/tutorial_mc_batch_norm.py +++ b/auto_tutorials_source/tutorial_mc_batch_norm.py @@ -98,7 +98,7 @@ # 6. Testing the Model # ~~~~~~~~~~~~~~~~~~~~ # Now that the model is trained, let's test it on MNIST. Don't forget to call -# .eval() to enable Monte Carlo batch normalization at inference. +# .eval() to enable Monte Carlo batch normalization at evaluation (sometimes called inference). # In this tutorial, we plot the most uncertain images, i.e. the images for which # the variance of the predictions is the highest. # Please note that we apply a reshape to the logits to determine the dimension corresponding to the ensemble diff --git a/docs/source/cli_guide.rst b/docs/source/cli_guide.rst index 24129fe6..0b888ea9 100644 --- a/docs/source/cli_guide.rst +++ b/docs/source/cli_guide.rst @@ -89,7 +89,7 @@ This command will display the available subcommands of the CLI tool. fit Runs the full optimization routine. validate Perform one evaluation epoch over the validation set. test Perform one evaluation epoch over the test set. - predict Run inference on your data. + predict Run evaluation on your data. You can execute whichever subcommand you like and set up all your hyperparameters directly using the command line diff --git a/torch_uncertainty/datamodules/depth/base.py b/torch_uncertainty/datamodules/depth/base.py index 34c69c89..df067d97 100644 --- a/torch_uncertainty/datamodules/depth/base.py +++ b/torch_uncertainty/datamodules/depth/base.py @@ -43,7 +43,7 @@ def __init__( of length :math:`1`, it will be interpreted as :math:`(\text{size[0]},\text{size[1]})`. eval_size (sequence or int, optional): Desired input image and - depth mask sizes during inference. If size is an int, + depth mask sizes during evaluation. If size is an int, smaller edge of the images will be matched to this number, i.e., :math:`\text{height}>\text{width}`, then image will be rescaled to :math:`(\text{size}\times\text{height}/\text{width},\text{size})`. diff --git a/torch_uncertainty/datamodules/depth/kitti.py b/torch_uncertainty/datamodules/depth/kitti.py index c5035893..69227769 100644 --- a/torch_uncertainty/datamodules/depth/kitti.py +++ b/torch_uncertainty/datamodules/depth/kitti.py @@ -37,7 +37,7 @@ def __init__( of length :math:`1`, it will be interpreted as :math:`(\text{size[0]},\text{size[1]})`. Defaults to ``(375, 1242)``. eval_size (sequence or int, optional): Desired input image and - depth mask sizes during inference. If size is an int, + depth mask sizes during evaluation. If size is an int, smaller edge of the images will be matched to this number, i.e., :math:`\text{height}>\text{width}`, then image will be rescaled to :math:`(\text{size}\times\text{height}/\text{width},\text{size})`. diff --git a/torch_uncertainty/datamodules/depth/muad.py b/torch_uncertainty/datamodules/depth/muad.py index cf4f6cde..032a4292 100644 --- a/torch_uncertainty/datamodules/depth/muad.py +++ b/torch_uncertainty/datamodules/depth/muad.py @@ -37,7 +37,7 @@ def __init__( of length :math:`1`, it will be interpreted as :math:`(\text{size[0]},\text{size[1]})`. Defaults to ``1024``. eval_size (sequence or int, optional): Desired input image and - depth mask sizes during inference. If size is an int, + depth mask sizes during evaluation. 
If size is an int, smaller edge of the images will be matched to this number, i.e., :math:`\text{height}>\text{width}`, then image will be rescaled to :math:`(\text{size}\times\text{height}/\text{width},\text{size})`. diff --git a/torch_uncertainty/datamodules/depth/nyu.py b/torch_uncertainty/datamodules/depth/nyu.py index ec925ffa..077badff 100644 --- a/torch_uncertainty/datamodules/depth/nyu.py +++ b/torch_uncertainty/datamodules/depth/nyu.py @@ -37,7 +37,7 @@ def __init__( of length :math:`1`, it will be interpreted as :math:`(\text{size[0]},\text{size[1]})`. Defaults to ``(416, 544)``. eval_size (sequence or int, optional): Desired input image and - depth mask sizes during inference. If size is an int, + depth mask sizes during evaluation. If size is an int, smaller edge of the images will be matched to this number, i.e., :math:`\text{height}>\text{width}`, then image will be rescaled to :math:`(\text{size}\times\text{height}/\text{width},\text{size})`. diff --git a/torch_uncertainty/datamodules/segmentation/cityscapes.py b/torch_uncertainty/datamodules/segmentation/cityscapes.py index baee3d4b..ea4bea8e 100644 --- a/torch_uncertainty/datamodules/segmentation/cityscapes.py +++ b/torch_uncertainty/datamodules/segmentation/cityscapes.py @@ -36,7 +36,7 @@ def __init__( of length :math:`1`, it will be interpreted as :math:`(\text{size[0]},\text{size[1]})`. Defaults to ``1024``. eval_size (sequence or int, optional): Desired input image and - segmentation mask sizes during inference. If size is an int, + segmentation mask sizes during evaluation. If size is an int, smaller edge of the images will be matched to this number, i.e., :math:`\text{height}>\text{width}`, then image will be rescaled to :math:`(\text{size}\times\text{height}/\text{width},\text{size})`. diff --git a/torch_uncertainty/layers/bayesian/bayes_conv.py b/torch_uncertainty/layers/bayesian/bayes_conv.py index 3584ba77..d6122bb5 100644 --- a/torch_uncertainty/layers/bayesian/bayes_conv.py +++ b/torch_uncertainty/layers/bayesian/bayes_conv.py @@ -173,7 +173,7 @@ def unfreeze(self) -> None: self.frozen = False def sample(self) -> tuple[Tensor, Tensor | None]: - """Sample the bayesian layer's posterior.""" + """Sample the Bayesian layer's posterior.""" weight = self.weight_sampler.sample() bias = self.bias_sampler.sample() if self.bias_mu is not None else None return weight, bias diff --git a/torch_uncertainty/layers/bayesian/bayes_linear.py b/torch_uncertainty/layers/bayesian/bayes_linear.py index 2c9f15c4..ff2247d2 100644 --- a/torch_uncertainty/layers/bayesian/bayes_linear.py +++ b/torch_uncertainty/layers/bayesian/bayes_linear.py @@ -140,7 +140,7 @@ def unfreeze(self) -> None: self.frozen = False def sample(self) -> tuple[Tensor, Tensor | None]: - """Sample the bayesian layer's posterior.""" + """Sample the Bayesian layer's posterior.""" weight = self.weight_sampler.sample() bias = self.bias_sampler.sample() if self.bias_mu is not None else None return weight, bias diff --git a/torch_uncertainty/losses.py b/torch_uncertainty/losses.py index c82ab210..65291cac 100644 --- a/torch_uncertainty/losses.py +++ b/torch_uncertainty/losses.py @@ -107,7 +107,7 @@ def __init__( self.num_samples = num_samples def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: - """Gather the KL divergence from the bayesian modules and aggregate + """Gather the KL divergence from the Bayesian modules and aggregate the ELBO loss for a given network. 
Args: From 18b29225283a271df53220271bbf8c6e2a7ea852 Mon Sep 17 00:00:00 2001 From: Olivier Date: Sun, 21 Jul 2024 20:08:45 +0200 Subject: [PATCH 05/27] :hammer: Refactor and add losses --- torch_uncertainty/losses.py | 419 --------------------- torch_uncertainty/losses/__init__.py | 4 + torch_uncertainty/losses/bayesian.py | 114 ++++++ torch_uncertainty/losses/classification.py | 272 +++++++++++++ torch_uncertainty/losses/regression.py | 142 +++++++ 5 files changed, 532 insertions(+), 419 deletions(-) delete mode 100644 torch_uncertainty/losses.py create mode 100644 torch_uncertainty/losses/__init__.py create mode 100644 torch_uncertainty/losses/bayesian.py create mode 100644 torch_uncertainty/losses/classification.py create mode 100644 torch_uncertainty/losses/regression.py diff --git a/torch_uncertainty/losses.py b/torch_uncertainty/losses.py deleted file mode 100644 index 65291cac..00000000 --- a/torch_uncertainty/losses.py +++ /dev/null @@ -1,419 +0,0 @@ -from typing import Literal - -import torch -from torch import Tensor, nn -from torch.distributions import Distribution -from torch.nn import functional as F - -from torch_uncertainty.layers.bayesian import bayesian_modules -from torch_uncertainty.utils.distributions import NormalInverseGamma - - -class DistributionNLLLoss(nn.Module): - def __init__( - self, reduction: Literal["mean", "sum"] | None = "mean" - ) -> None: - """Negative Log-Likelihood loss using given distributions as inputs. - - Args: - reduction (str, optional): specifies the reduction to apply to the - output:``'none'`` | ``'mean'`` | ``'sum'``. Defaults to "mean". - """ - super().__init__() - self.reduction = reduction - - def forward( - self, - dist: Distribution, - targets: Tensor, - padding_mask: Tensor | None = None, - ) -> Tensor: - """Compute the NLL of the targets given predicted distributions. - - Args: - dist (Distribution): The predicted distributions - targets (Tensor): The target values - padding_mask (Tensor, optional): The padding mask. Defaults to None. - Sets the loss to 0 for padded values. - """ - loss = -dist.log_prob(targets) - if padding_mask is not None: - loss = loss.masked_fill(padding_mask, 0.0) - - if self.reduction == "mean": - loss = loss.mean() - elif self.reduction == "sum": - loss = loss.sum() - return loss - - -class KLDiv(nn.Module): - def __init__(self, model: nn.Module) -> None: - """KL divergence loss for Bayesian Neural Networks. Gathers the KL from the - modules computed in the forward passes. - - Args: - model (nn.Module): Bayesian Neural Network - """ - super().__init__() - self.model = model - - def forward(self) -> Tensor: - return self._kl_div() - - def _kl_div(self) -> Tensor: - """Gathers pre-computed KL-Divergences from :attr:`model`.""" - kl_divergence = torch.zeros(1) - count = 0 - for module in self.model.modules(): - if isinstance(module, bayesian_modules): - kl_divergence = kl_divergence.to( - device=module.lvposterior.device - ) - kl_divergence += module.lvposterior - module.lprior - count += 1 - return kl_divergence / count - - -class ELBOLoss(nn.Module): - def __init__( - self, - model: nn.Module | None, - inner_loss: nn.Module, - kl_weight: float, - num_samples: int, - ) -> None: - """The Evidence Lower Bound (ELBO) loss for Bayesian Neural Networks. - - ELBO loss for Bayesian Neural Networks. Use this loss function with the - objective that you seek to minimize as :attr:`inner_loss`. 
- - Args: - model (nn.Module): The Bayesian Neural Network to compute the loss for - inner_loss (nn.Module): The loss function to use during training - kl_weight (float): The weight of the KL divergence term - num_samples (int): The number of samples to use for the ELBO loss - - Note: - Set the model to None if you use the ELBOLoss within - the ClassificationRoutine. It will get filled automatically. - """ - super().__init__() - _elbo_loss_checks(inner_loss, kl_weight, num_samples) - self.set_model(model) - - self.inner_loss = inner_loss - self.kl_weight = kl_weight - self.num_samples = num_samples - - def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: - """Gather the KL divergence from the Bayesian modules and aggregate - the ELBO loss for a given network. - - Args: - inputs (Tensor): The inputs of the Bayesian Neural Network - targets (Tensor): The target values - - Returns: - Tensor: The aggregated ELBO loss - """ - aggregated_elbo = torch.zeros(1, device=inputs.device) - for _ in range(self.num_samples): - logits = self.model(inputs) - aggregated_elbo += self.inner_loss(logits, targets) - # TODO: This shouldn't be necessary - aggregated_elbo += self.kl_weight * self._kl_div().to(inputs.device) - return aggregated_elbo / self.num_samples - - def set_model(self, model: nn.Module | None) -> None: - self.model = model - if model is not None: - self._kl_div = KLDiv(model) - - -def _elbo_loss_checks( - inner_loss: nn.Module, kl_weight: float, num_samples: int -) -> None: - if isinstance(inner_loss, type): - raise TypeError( - "The inner_loss should be an instance of a class." - f"Got {inner_loss}." - ) - - if kl_weight < 0: - raise ValueError( - f"The KL weight should be non-negative. Got {kl_weight}." - ) - - if num_samples < 1: - raise ValueError( - "The number of samples should not be lower than 1." - f"Got {num_samples}." - ) - if not isinstance(num_samples, int): - raise TypeError( - "The number of samples should be an integer. " - f"Got {type(num_samples)}." - ) - - -class DERLoss(DistributionNLLLoss): - def __init__( - self, reg_weight: float, reduction: str | None = "mean" - ) -> None: - """The Deep Evidential loss. - - This loss combines the negative log-likelihood loss of the normal - inverse gamma distribution and a weighted regularization term. - - Args: - reg_weight (float): The weight of the regularization term. - reduction (str, optional): specifies the reduction to apply to the - output:``'none'`` | ``'mean'`` | ``'sum'``. - - Reference: - Amini, A., Schwarting, W., Soleimany, A., & Rus, D. (2019). Deep - evidential regression. https://arxiv.org/abs/1910.02600. - """ - super().__init__(reduction=None) - - if reduction not in (None, "none", "mean", "sum"): - raise ValueError(f"{reduction} is not a valid value for reduction.") - self.final_reduction = reduction - - if reg_weight < 0: - raise ValueError( - "The regularization weight should be non-negative, but got " - f"{reg_weight}." 
- ) - self.reg_weight = reg_weight - - def _reg(self, dist: NormalInverseGamma, targets: Tensor) -> Tensor: - return torch.norm(targets - dist.loc, 1, dim=1, keepdim=True) * ( - 2 * dist.lmbda + dist.alpha - ) - - def forward( - self, - dist: NormalInverseGamma, - targets: Tensor, - ) -> Tensor: - loss_nll = super().forward(dist, targets) - loss_reg = self._reg(dist, targets) - loss = loss_nll + self.reg_weight * loss_reg - - if self.final_reduction == "mean": - return loss.mean() - if self.final_reduction == "sum": - return loss.sum() - return loss - - -class BetaNLL(nn.Module): - def __init__( - self, beta: float = 0.5, reduction: str | None = "mean" - ) -> None: - """The Beta Negative Log-likelihood loss. - - Args: - beta (float): TParameter from range [0, 1] controlling relative - weighting between data points, where `0` corresponds to - high weight on low error points and `1` to an equal weighting. - reduction (str, optional): specifies the reduction to apply to the - output:``'none'`` | ``'mean'`` | ``'sum'``. - - Reference: - Seitzer, M., Tavakoli, A., Antic, D., & Martius, G. (2022). On the - pitfalls of heteroscedastic uncertainty estimation with probabilistic - neural networks. https://arxiv.org/abs/2203.09168. - """ - super().__init__() - - if beta < 0 or beta > 1: - raise ValueError( - "The beta parameter should be in range [0, 1], but got " - f"{beta}." - ) - self.beta = beta - self.nll_loss = nn.GaussianNLLLoss(reduction="none") - if reduction not in ("none", "mean", "sum"): - raise ValueError(f"{reduction} is not a valid value for reduction.") - self.reduction = reduction - - def forward( - self, mean: Tensor, targets: Tensor, variance: Tensor - ) -> Tensor: - loss = self.nll_loss(mean, targets, variance) * ( - variance.detach() ** self.beta - ) - - if self.reduction == "mean": - return loss.mean() - if self.reduction == "sum": - return loss.sum() - return loss - - -class DECLoss(nn.Module): - def __init__( - self, - annealing_step: int | None = None, - reg_weight: float | None = None, - loss_type: str = "log", - reduction: str | None = "mean", - ) -> None: - """The deep evidential classification loss. - - Args: - annealing_step (int): Annealing step for the weight of the - regularization term. - reg_weight (float): Fixed weight of the regularization term. - loss_type (str, optional): Specifies the loss type to apply to the - Dirichlet parameters: ``'mse'`` | ``'log'`` | ``'digamma'``. - reduction (str, optional): Specifies the reduction to apply to the - output:``'none'`` | ``'mean'`` | ``'sum'``. - - Reference: - Sensoy, M., Kaplan, L., & Kandemir, M. (2018). Evidential deep - learning to quantify classification uncertainty. NeurIPS 2018. - https://arxiv.org/abs/1806.01768. - """ - super().__init__() - - if reg_weight is not None and (reg_weight < 0): - raise ValueError( - "The regularization weight should be non-negative, but got " - f"{reg_weight}." - ) - self.reg_weight = reg_weight - - if annealing_step is not None and (annealing_step <= 0): - raise ValueError( - "The annealing step should be positive, but got " - f"{annealing_step}." - ) - self.annealing_step = annealing_step - - if reduction not in ("none", "mean", "sum"): - raise ValueError(f"{reduction} is not a valid value for reduction.") - self.reduction = reduction - - if loss_type not in ["mse", "log", "digamma"]: - raise ValueError( - f"{loss_type} is not a valid value for mse/log/digamma loss." 
- ) - self.loss_type = loss_type - - def _mse_loss(self, evidence: Tensor, targets: Tensor) -> Tensor: - evidence = torch.relu(evidence) - alpha = evidence + 1.0 - strength = torch.sum(alpha, dim=1, keepdim=True) - loglikelihood_err = torch.sum( - (targets - (alpha / strength)) ** 2, dim=1, keepdim=True - ) - loglikelihood_var = torch.sum( - alpha * (strength - alpha) / (strength * strength * (strength + 1)), - dim=1, - keepdim=True, - ) - return loglikelihood_err + loglikelihood_var - - def _log_loss(self, evidence: Tensor, targets: Tensor) -> Tensor: - evidence = torch.relu(evidence) - alpha = evidence + 1.0 - strength = alpha.sum(dim=-1, keepdim=True) - return torch.sum( - targets * (torch.log(strength) - torch.log(alpha)), - dim=1, - keepdim=True, - ) - - def _digamma_loss(self, evidence: Tensor, targets: Tensor) -> Tensor: - evidence = torch.relu(evidence) - alpha = evidence + 1.0 - strength = alpha.sum(dim=-1, keepdim=True) - return torch.sum( - targets * (torch.digamma(strength) - torch.digamma(alpha)), - dim=1, - keepdim=True, - ) - - def _kldiv_reg( - self, - evidence: Tensor, - targets: Tensor, - ) -> Tensor: - num_classes = evidence.size()[-1] - evidence = torch.relu(evidence) - alpha = evidence + 1.0 - - kl_alpha = (alpha - 1) * (1 - targets) + 1 - - ones = torch.ones( - [1, num_classes], dtype=evidence.dtype, device=evidence.device - ) - sum_kl_alpha = torch.sum(kl_alpha, dim=1, keepdim=True) - first_term = ( - torch.lgamma(sum_kl_alpha) - - torch.lgamma(kl_alpha).sum(dim=1, keepdim=True) - + torch.lgamma(ones).sum(dim=1, keepdim=True) - - torch.lgamma(ones.sum(dim=1, keepdim=True)) - ) - second_term = torch.sum( - (kl_alpha - ones) - * (torch.digamma(kl_alpha) - torch.digamma(sum_kl_alpha)), - dim=1, - keepdim=True, - ) - return first_term + second_term - - def forward( - self, - evidence: Tensor, - targets: Tensor, - current_epoch: int | None = None, - ) -> Tensor: - if ( - self.annealing_step is not None - and self.annealing_step > 0 - and current_epoch is None - ): - raise ValueError( - "The epoch num should be positive when \ - annealing_step is settled, but got " - f"{current_epoch}." - ) - - if targets.ndim != 1: # if no mixup or cutmix - raise NotImplementedError( - "DECLoss does not yet support mixup/cutmix." 
- ) - # TODO: handle binary - targets = F.one_hot(targets, num_classes=evidence.size()[-1]) - - if self.loss_type == "mse": - loss_dirichlet = self._mse_loss(evidence, targets) - elif self.loss_type == "log": - loss_dirichlet = self._log_loss(evidence, targets) - else: # self.loss_type == "digamma" - loss_dirichlet = self._digamma_loss(evidence, targets) - - if self.reg_weight is None and self.annealing_step is None: - annealing_coef = 0 - elif self.annealing_step is None and self.reg_weight > 0: - annealing_coef = self.reg_weight - else: - annealing_coef = torch.min( - torch.tensor(1.0, dtype=evidence.dtype), - torch.tensor( - current_epoch / self.annealing_step, dtype=evidence.dtype - ), - ) - - loss_reg = self._kldiv_reg(evidence, targets) - loss = loss_dirichlet + annealing_coef * loss_reg - if self.reduction == "mean": - loss = loss.mean() - elif self.reduction == "sum": - loss = loss.sum() - return loss diff --git a/torch_uncertainty/losses/__init__.py b/torch_uncertainty/losses/__init__.py new file mode 100644 index 00000000..318295e1 --- /dev/null +++ b/torch_uncertainty/losses/__init__.py @@ -0,0 +1,4 @@ +# ruff: noqa: F401 +from .bayesian import ELBOLoss, KLDiv +from .classification import ConfidencePenaltyLoss, ConflictualLoss, DECLoss +from .regression import BetaNLL, DERLoss, DistributionNLLLoss diff --git a/torch_uncertainty/losses/bayesian.py b/torch_uncertainty/losses/bayesian.py new file mode 100644 index 00000000..3621a8f2 --- /dev/null +++ b/torch_uncertainty/losses/bayesian.py @@ -0,0 +1,114 @@ +import torch +from torch import Tensor, nn + +from torch_uncertainty.layers.bayesian import bayesian_modules + + +class KLDiv(nn.Module): + def __init__(self, model: nn.Module) -> None: + """KL divergence loss for Bayesian Neural Networks. Gathers the KL from the + modules computed in the forward passes. + + Args: + model (nn.Module): Bayesian Neural Network + """ + super().__init__() + self.model = model + + def forward(self) -> Tensor: + return self._kl_div() + + def _kl_div(self) -> Tensor: + """Gathers pre-computed KL-Divergences from :attr:`model`.""" + kl_divergence = torch.zeros(1) + count = 0 + for module in self.model.modules(): + if isinstance(module, bayesian_modules): + kl_divergence = kl_divergence.to( + device=module.lvposterior.device + ) + kl_divergence += module.lvposterior - module.lprior + count += 1 + return kl_divergence / count + + +class ELBOLoss(nn.Module): + def __init__( + self, + model: nn.Module | None, + inner_loss: nn.Module, + kl_weight: float, + num_samples: int, + ) -> None: + """The Evidence Lower Bound (ELBO) loss for Bayesian Neural Networks. + + ELBO loss for Bayesian Neural Networks. Use this loss function with the + objective that you seek to minimize as :attr:`inner_loss`. + + Args: + model (nn.Module): The Bayesian Neural Network to compute the loss for + inner_loss (nn.Module): The loss function to use during training + kl_weight (float): The weight of the KL divergence term + num_samples (int): The number of samples to use for the ELBO loss + + Note: + Set the model to None if you use the ELBOLoss within + the ClassificationRoutine. It will get filled automatically. + """ + super().__init__() + _elbo_loss_checks(inner_loss, kl_weight, num_samples) + self.set_model(model) + + self.inner_loss = inner_loss + self.kl_weight = kl_weight + self.num_samples = num_samples + + def forward(self, inputs: Tensor, targets: Tensor) -> Tensor: + """Gather the KL divergence from the Bayesian modules and aggregate + the ELBO loss for a given network. 
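A minimal usage sketch, mirroring the unit test added later in this series, assuming a toy Bayesian layer and a binary criterion:

import torch
from torch import nn
from torch_uncertainty.layers.bayesian import BayesLinear
from torch_uncertainty.losses import ELBOLoss

model = BayesLinear(1, 1)
criterion = nn.BCEWithLogitsLoss()
elbo = ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=3)
# Each of the num_samples passes re-samples the Bayesian weights; the KL
# divergence gathered from the Bayesian modules is added with weight kl_weight.
inputs, targets = torch.randn(8, 1), torch.rand(8, 1)
loss = elbo(inputs, targets)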
+ + Args: + inputs (Tensor): The inputs of the Bayesian Neural Network + targets (Tensor): The target values + + Returns: + Tensor: The aggregated ELBO loss + """ + aggregated_elbo = torch.zeros(1, device=inputs.device) + for _ in range(self.num_samples): + logits = self.model(inputs) + aggregated_elbo += self.inner_loss(logits, targets) + # TODO: This shouldn't be necessary + aggregated_elbo += self.kl_weight * self._kl_div().to(inputs.device) + return aggregated_elbo / self.num_samples + + def set_model(self, model: nn.Module | None) -> None: + self.model = model + if model is not None: + self._kl_div = KLDiv(model) + + +def _elbo_loss_checks( + inner_loss: nn.Module, kl_weight: float, num_samples: int +) -> None: + if isinstance(inner_loss, type): + raise TypeError( + "The inner_loss should be an instance of a class." + f"Got {inner_loss}." + ) + + if kl_weight < 0: + raise ValueError( + f"The KL weight should be non-negative. Got {kl_weight}." + ) + + if num_samples < 1: + raise ValueError( + "The number of samples should not be lower than 1." + f"Got {num_samples}." + ) + if not isinstance(num_samples, int): + raise TypeError( + "The number of samples should be an integer. " + f"Got {type(num_samples)}." + ) diff --git a/torch_uncertainty/losses/classification.py b/torch_uncertainty/losses/classification.py new file mode 100644 index 00000000..1233623f --- /dev/null +++ b/torch_uncertainty/losses/classification.py @@ -0,0 +1,272 @@ +import torch +from torch import Tensor, nn +from torch.nn import functional as F + + +class DECLoss(nn.Module): + def __init__( + self, + annealing_step: int | None = None, + reg_weight: float | None = None, + loss_type: str = "log", + reduction: str | None = "mean", + ) -> None: + """The deep evidential classification loss. + + Args: + annealing_step (int): Annealing step for the weight of the + regularization term. + reg_weight (float): Fixed weight of the regularization term. + loss_type (str, optional): Specifies the loss type to apply to the + Dirichlet parameters: ``'mse'`` | ``'log'`` | ``'digamma'``. + reduction (str, optional): Specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. + + Reference: + Sensoy, M., Kaplan, L., & Kandemir, M. (2018). Evidential deep + learning to quantify classification uncertainty. NeurIPS 2018. + https://arxiv.org/abs/1806.01768. + """ + super().__init__() + + if reg_weight is not None and (reg_weight < 0): + raise ValueError( + "The regularization weight should be non-negative, but got " + f"{reg_weight}." + ) + self.reg_weight = reg_weight + + if annealing_step is not None and (annealing_step <= 0): + raise ValueError( + "The annealing step should be positive, but got " + f"{annealing_step}." + ) + self.annealing_step = annealing_step + + if reduction not in ("none", "mean", "sum") and reduction is not None: + raise ValueError(f"{reduction} is not a valid value for reduction.") + self.reduction = reduction + + if loss_type not in ["mse", "log", "digamma"]: + raise ValueError( + f"{loss_type} is not a valid value for mse/log/digamma loss." 
+ ) + self.loss_type = loss_type + + def _mse_loss(self, evidence: Tensor, targets: Tensor) -> Tensor: + evidence = torch.relu(evidence) + alpha = evidence + 1.0 + strength = torch.sum(alpha, dim=1, keepdim=True) + loglikelihood_err = torch.sum( + (targets - (alpha / strength)) ** 2, dim=1, keepdim=True + ) + loglikelihood_var = torch.sum( + alpha * (strength - alpha) / (strength * strength * (strength + 1)), + dim=1, + keepdim=True, + ) + return loglikelihood_err + loglikelihood_var + + def _log_loss(self, evidence: Tensor, targets: Tensor) -> Tensor: + evidence = torch.relu(evidence) + alpha = evidence + 1.0 + strength = alpha.sum(dim=-1, keepdim=True) + return torch.sum( + targets * (torch.log(strength) - torch.log(alpha)), + dim=1, + keepdim=True, + ) + + def _digamma_loss(self, evidence: Tensor, targets: Tensor) -> Tensor: + evidence = torch.relu(evidence) + alpha = evidence + 1.0 + strength = alpha.sum(dim=-1, keepdim=True) + return torch.sum( + targets * (torch.digamma(strength) - torch.digamma(alpha)), + dim=1, + keepdim=True, + ) + + def _kldiv_reg( + self, + evidence: Tensor, + targets: Tensor, + ) -> Tensor: + num_classes = evidence.size()[-1] + evidence = torch.relu(evidence) + alpha = evidence + 1.0 + + kl_alpha = (alpha - 1) * (1 - targets) + 1 + + ones = torch.ones( + [1, num_classes], dtype=evidence.dtype, device=evidence.device + ) + sum_kl_alpha = torch.sum(kl_alpha, dim=1, keepdim=True) + first_term = ( + torch.lgamma(sum_kl_alpha) + - torch.lgamma(kl_alpha).sum(dim=1, keepdim=True) + + torch.lgamma(ones).sum(dim=1, keepdim=True) + - torch.lgamma(ones.sum(dim=1, keepdim=True)) + ) + second_term = torch.sum( + (kl_alpha - ones) + * (torch.digamma(kl_alpha) - torch.digamma(sum_kl_alpha)), + dim=1, + keepdim=True, + ) + return first_term + second_term + + def forward( + self, + evidence: Tensor, + targets: Tensor, + current_epoch: int | None = None, + ) -> Tensor: + if ( + self.annealing_step is not None + and self.annealing_step > 0 + and current_epoch is None + ): + raise ValueError( + "The epoch num should be positive when \ + annealing_step is settled, but got " + f"{current_epoch}." + ) + + if targets.ndim != 1: # if no mixup or cutmix + raise NotImplementedError( + "DECLoss does not yet support mixup/cutmix." + ) + # TODO: handle binary + targets = F.one_hot(targets, num_classes=evidence.size()[-1]) + + if self.loss_type == "mse": + loss_dirichlet = self._mse_loss(evidence, targets) + elif self.loss_type == "log": + loss_dirichlet = self._log_loss(evidence, targets) + else: # self.loss_type == "digamma" + loss_dirichlet = self._digamma_loss(evidence, targets) + + if self.reg_weight is None and self.annealing_step is None: + annealing_coef = 0 + elif self.annealing_step is None and self.reg_weight > 0: + annealing_coef = self.reg_weight + else: + annealing_coef = torch.min( + torch.tensor(1.0, dtype=evidence.dtype), + torch.tensor( + current_epoch / self.annealing_step, dtype=evidence.dtype + ), + ) + + loss_reg = self._kldiv_reg(evidence, targets) + loss = loss_dirichlet + annealing_coef * loss_reg + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + return loss + + +class ConfidencePenaltyLoss(nn.Module): + def __init__( + self, + reg_weight: float = 1, + reduction: str | None = "mean", + eps: float = 1e-6, + ) -> None: + """The Confidence Penalty Loss. + + Args: + reg_weight (float, optional): The weight of the regularization term. 
+ reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. Defaults to "mean". + eps (float, optional): A small value to avoid numerical instability. + Defaults to 1e-6. + + Reference: + Amini, A., Schwarting, W., Soleimany, A., & Rus, D. (2019). Deep + evidential regression. https://arxiv.org/abs/1910.02600. + + Reference: + Gabriel Pereyra: Regularizing neural networks by penalizing + confident output distributions. https://arxiv.org/pdf/1701.06548. + + """ + super().__init__(reduction=None) + if reduction is None: + reduction = "none" + if reduction not in ("none", "mean", "sum"): + raise ValueError(f"{reduction} is not a valid value for reduction.") + self.reduction = reduction + self.eps = eps + self.reg_weight = reg_weight + + def forward(self, logits: Tensor, targets: Tensor) -> Tensor: + """Compute the Confidence Penalty loss. + + Args: + logits (Tensor): The inputs of the Bayesian Neural Network + targets (Tensor): The target values + + Returns: + Tensor: The Confidence Penalty loss + """ + probs = F.softmax(logits, dim=1) + ce_loss = F.cross_entropy(logits, targets, reduction=self.reduction) + reg_loss = torch.log(logits.shape[-1]) + ( + probs * torch.log(probs + self.eps) + ).sum(dim=-1) + if self.reduction == "sum": + return ce_loss + self.reg_weight * reg_loss.sum() + if self.reduction == "mean": + return ce_loss + self.reg_weight * reg_loss.mean() + return ce_loss + self.reg_weight * reg_loss + + +class ConflictualLoss(nn.Module): + def __init__( + self, + reg_weight: float = 1, + reduction: str | None = "mean", + ) -> None: + """The Conflictual Loss. + + Args: + reg_weight (float, optional): The weight of the regularization term. + reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. + + Reference: + Mohammed Fellaji et al. On the Calibration of Epistemic Uncertainty: + Principles, Paradoxes and Conflictual Loss. https://arxiv.org/pdf/2407.12211 + """ + super().__init__(reduction=None) + + if reduction is None: + reduction = "none" + if reduction not in ("none", "mean", "sum"): + raise ValueError(f"{reduction} is not a valid value for reduction.") + self.reduction = reduction + self.reg_weight = reg_weight + + def forward(self, logits: Tensor, targets: Tensor) -> Tensor: + """Compute the conflictual loss. + + Args: + logits (Tensor): The outputs of the model. + targets (Tensor): The target values. + + Returns: + Tensor: The conflictual loss. 
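With the constructor cleanup applied in a later patch of this series, the loss is exercised as in the new tests: a standard cross-entropy on the logits plus reg_weight times the negative log-probability of one randomly drawn class, shared by the whole batch. A minimal sketch:

import torch
from torch_uncertainty.losses import ConflictualLoss

criterion = ConflictualLoss(reg_weight=1e-2)  # reduction="mean" by default
logits = torch.randn(4, 10)                   # 4 samples, 10 classes
targets = torch.randint(0, 10, (4,))          # integer class labels
loss = criterion(logits, targets)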
+ """ + class_index = torch.randint( + 0, logits.shape[-1], (1,), dtype=torch.long, device=logits.device + ) + ce_loss = F.cross_entropy(logits, targets, reduction=self.reduction) + reg_loss = -F.log_softmax(logits, dim=1)[:, class_index] + if self.reduction == "sum": + return ce_loss + self.reg_weight * reg_loss.sum() + if self.reduction == "mean": + return ce_loss + self.reg_weight * reg_loss.mean() + return ce_loss + self.reg_weight * reg_loss diff --git a/torch_uncertainty/losses/regression.py b/torch_uncertainty/losses/regression.py new file mode 100644 index 00000000..99b9b9fd --- /dev/null +++ b/torch_uncertainty/losses/regression.py @@ -0,0 +1,142 @@ +from typing import Literal + +import torch +from torch import Tensor, nn +from torch.distributions import Distribution + +from torch_uncertainty.utils.distributions import NormalInverseGamma + + +class DistributionNLLLoss(nn.Module): + def __init__( + self, reduction: Literal["mean", "sum"] | None = "mean" + ) -> None: + """Negative Log-Likelihood loss using given distributions as inputs. + + Args: + reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. Defaults to "mean". + """ + super().__init__() + self.reduction = reduction + + def forward( + self, + dist: Distribution, + targets: Tensor, + padding_mask: Tensor | None = None, + ) -> Tensor: + """Compute the NLL of the targets given predicted distributions. + + Args: + dist (Distribution): The predicted distributions + targets (Tensor): The target values + padding_mask (Tensor, optional): The padding mask. Defaults to None. + Sets the loss to 0 for padded values. + """ + loss = -dist.log_prob(targets) + if padding_mask is not None: + loss = loss.masked_fill(padding_mask, 0.0) + + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + return loss + + +class DERLoss(DistributionNLLLoss): + def __init__( + self, reg_weight: float, reduction: str | None = "mean" + ) -> None: + """The Deep Evidential Regression loss. + + This loss combines the negative log-likelihood loss of the normal + inverse gamma distribution and a weighted regularization term. + + Args: + reg_weight (float): The weight of the regularization term. + reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. + + Reference: + Amini, A., Schwarting, W., Soleimany, A., & Rus, D. (2019). Deep + evidential regression. https://arxiv.org/abs/1910.02600. + """ + super().__init__(reduction=None) + + if reduction not in ("none", "mean", "sum") and reduction is not None: + raise ValueError(f"{reduction} is not a valid value for reduction.") + self.der_reduction = reduction + + if reg_weight < 0: + raise ValueError( + "The regularization weight should be non-negative, but got " + f"{reg_weight}." 
+ ) + self.reg_weight = reg_weight + + def _reg(self, dist: NormalInverseGamma, targets: Tensor) -> Tensor: + return torch.norm(targets - dist.loc, 1, dim=1, keepdim=True) * ( + 2 * dist.lmbda + dist.alpha + ) + + def forward( + self, + dist: NormalInverseGamma, + targets: Tensor, + ) -> Tensor: + loss_nll = super().forward(dist, targets) + loss_reg = self._reg(dist, targets) + loss = loss_nll + self.reg_weight * loss_reg + + if self.der_reduction == "mean": + return loss.mean() + if self.der_reduction == "sum": + return loss.sum() + return loss + + +class BetaNLL(nn.Module): + def __init__( + self, beta: float = 0.5, reduction: str | None = "mean" + ) -> None: + """The Beta Negative Log-likelihood loss. + + Args: + beta (float): TParameter from range [0, 1] controlling relative + weighting between data points, where `0` corresponds to + high weight on low error points and `1` to an equal weighting. + reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. + + Reference: + Seitzer, M., Tavakoli, A., Antic, D., & Martius, G. (2022). On the + pitfalls of heteroscedastic uncertainty estimation with probabilistic + neural networks. https://arxiv.org/abs/2203.09168. + """ + super().__init__() + + if beta < 0 or beta > 1: + raise ValueError( + "The beta parameter should be in range [0, 1], but got " + f"{beta}." + ) + self.beta = beta + self.nll_loss = nn.GaussianNLLLoss(reduction="none") + if reduction not in ("none", "mean", "sum"): + raise ValueError(f"{reduction} is not a valid value for reduction.") + self.reduction = reduction + + def forward( + self, mean: Tensor, targets: Tensor, variance: Tensor + ) -> Tensor: + loss = self.nll_loss(mean, targets, variance) * ( + variance.detach() ** self.beta + ) + + if self.reduction == "mean": + return loss.mean() + if self.reduction == "sum": + return loss.sum() + return loss From d4b88f2e859e89936714447e155b473f937dd6ee Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 22 Jul 2024 17:00:26 +0200 Subject: [PATCH 06/27] :Fix new losses & add tests --- tests/losses/__init__.py | 0 tests/losses/test_bayesian.py | 50 ++++++ tests/losses/test_classification.py | 108 ++++++++++++ tests/losses/test_regression.py | 119 +++++++++++++ tests/test_losses.py | 184 --------------------- torch_uncertainty/losses/classification.py | 29 ++-- 6 files changed, 296 insertions(+), 194 deletions(-) create mode 100644 tests/losses/__init__.py create mode 100644 tests/losses/test_bayesian.py create mode 100644 tests/losses/test_classification.py create mode 100644 tests/losses/test_regression.py delete mode 100644 tests/test_losses.py diff --git a/tests/losses/__init__.py b/tests/losses/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/losses/test_bayesian.py b/tests/losses/test_bayesian.py new file mode 100644 index 00000000..4135e3c8 --- /dev/null +++ b/tests/losses/test_bayesian.py @@ -0,0 +1,50 @@ +import pytest +import torch +from torch import nn + +from torch_uncertainty.layers.bayesian import BayesLinear +from torch_uncertainty.losses import ( + ELBOLoss, +) + + +class TestELBOLoss: + """Testing the ELBOLoss class.""" + + def test_main(self): + model = BayesLinear(1, 1) + criterion = nn.BCEWithLogitsLoss() + loss = ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=1) + loss(model(torch.randn(1, 1)), torch.randn(1, 1)) + + model = nn.Linear(1, 1) + criterion = nn.BCEWithLogitsLoss() + + ELBOLoss(None, criterion, kl_weight=1e-5, num_samples=1) + loss = ELBOLoss(model, criterion, 
kl_weight=1e-5, num_samples=1) + loss(model(torch.randn(1, 1)), torch.randn(1, 1)) + + def test_failures(self): + model = BayesLinear(1, 1) + criterion = nn.BCEWithLogitsLoss() + + with pytest.raises( + TypeError, match="The inner_loss should be an instance of a class." + ): + ELBOLoss(model, nn.BCEWithLogitsLoss, kl_weight=1, num_samples=1) + + with pytest.raises( + ValueError, match="The KL weight should be non-negative. Got " + ): + ELBOLoss(model, criterion, kl_weight=-1, num_samples=1) + + with pytest.raises( + ValueError, + match="The number of samples should not be lower than 1.", + ): + ELBOLoss(model, criterion, kl_weight=1, num_samples=-1) + + with pytest.raises( + TypeError, match="The number of samples should be an integer. " + ): + ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=1.5) diff --git a/tests/losses/test_classification.py b/tests/losses/test_classification.py new file mode 100644 index 00000000..469e732a --- /dev/null +++ b/tests/losses/test_classification.py @@ -0,0 +1,108 @@ +import pytest +import torch + +from torch_uncertainty.losses import ( + ConfidencePenaltyLoss, + ConflictualLoss, + DECLoss, +) + + +class TestDECLoss: + """Testing the DECLoss class.""" + + def test_main(self): + loss = DECLoss( + loss_type="mse", reg_weight=1e-2, annealing_step=1, reduction="sum" + ) + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]), current_epoch=1) + loss = DECLoss(loss_type="mse", reg_weight=1e-2, annealing_step=1) + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]), current_epoch=0) + loss = DECLoss(loss_type="log", reg_weight=1e-2, reduction="none") + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + loss = DECLoss(loss_type="digamma") + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + + def test_failures(self): + with pytest.raises( + ValueError, + match="The regularization weight should be non-negative, but got", + ): + DECLoss(reg_weight=-1) + + with pytest.raises( + ValueError, match="The annealing step should be positive, but got " + ): + DECLoss(annealing_step=0) + + loss = DECLoss(annealing_step=10) + with pytest.raises(ValueError): + loss( + torch.tensor([[0.0, 0.0]]), + torch.tensor([0]), + current_epoch=None, + ) + + with pytest.raises( + ValueError, match=" is not a valid value for reduction." + ): + DECLoss(reduction="median") + + with pytest.raises( + ValueError, match="is not a valid value for mse/log/digamma loss." + ): + DECLoss(loss_type="regression") + + +class TestConfidencePenaltyLoss: + """Testing the ConfidencePenaltyLoss class.""" + + def test_main(self): + loss = ConfidencePenaltyLoss(reg_weight=1e-2, reduction="sum") + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + loss = ConfidencePenaltyLoss(reg_weight=1e-2) + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + loss = ConfidencePenaltyLoss(reg_weight=1e-2, reduction="none") + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + + def test_failures(self): + with pytest.raises( + ValueError, + match="The regularization weight should be non-negative, but got", + ): + ConfidencePenaltyLoss(reg_weight=-1) + + with pytest.raises( + ValueError, match="is not a valid value for reduction." 
+ ): + ConfidencePenaltyLoss(reduction="median") + + with pytest.raises( + ValueError, + match="The epsilon value should be non-negative, but got", + ): + ConfidencePenaltyLoss(eps=-1) + + +class TestConflictualLoss: + """Testing the ConflictualLoss class.""" + + def test_main(self): + loss = ConflictualLoss(reg_weight=1e-2, reduction="sum") + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + loss = ConflictualLoss(reg_weight=1e-2) + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + loss = ConflictualLoss(reg_weight=1e-2, reduction="none") + loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) + + def test_failures(self): + with pytest.raises( + ValueError, + match="The regularization weight should be non-negative, but got", + ): + ConflictualLoss(reg_weight=-1) + + with pytest.raises( + ValueError, match="is not a valid value for reduction." + ): + ConflictualLoss(reduction="median") diff --git a/tests/losses/test_regression.py b/tests/losses/test_regression.py new file mode 100644 index 00000000..41f413a1 --- /dev/null +++ b/tests/losses/test_regression.py @@ -0,0 +1,119 @@ +import math + +import pytest +import torch +from torch.distributions import Normal + +from torch_uncertainty.layers.distributions import NormalInverseGamma +from torch_uncertainty.losses import ( + BetaNLL, + DERLoss, + DistributionNLLLoss, +) + + +class TestDistributionNLL: + """Testing the DistributionNLLLoss class.""" + + def test_sum(self): + loss = DistributionNLLLoss(reduction="sum") + dist = Normal(0, 1) + loss(dist, torch.tensor([0.0])) + + +class TestDERLoss: + """Testing the DERLoss class.""" + + def test_main(self): + loss = DERLoss(reg_weight=1e-2) + layer = NormalInverseGamma + inputs = layer( + torch.ones(1), torch.ones(1), torch.ones(1), torch.ones(1) + ) + targets = torch.tensor([[1.0]], dtype=torch.float32) + + assert loss(inputs, targets) == pytest.approx(2 * math.log(2)) + + loss = DERLoss( + reg_weight=1e-2, + reduction="sum", + ) + inputs = layer( + torch.ones((2, 1)), + torch.ones((2, 1)), + torch.ones((2, 1)), + torch.ones((2, 1)), + ) + + assert loss( + inputs, + targets, + ) == pytest.approx(4 * math.log(2)) + + loss = DERLoss( + reg_weight=1e-2, + reduction="none", + ) + + assert loss( + inputs, + targets, + ) == pytest.approx([2 * math.log(2), 2 * math.log(2)]) + + def test_failures(self): + with pytest.raises( + ValueError, + match="The regularization weight should be non-negative, but got ", + ): + DERLoss(reg_weight=-1) + + with pytest.raises( + ValueError, match="is not a valid value for reduction." + ): + DERLoss(reg_weight=1.0, reduction="median") + + +class TestBetaNLL: + """Testing the BetaNLL class.""" + + def test_main(self): + loss = BetaNLL(beta=0.5) + + inputs = torch.tensor([[1.0, 1.0]], dtype=torch.float32) + targets = torch.tensor([[1.0]], dtype=torch.float32) + + assert loss(*inputs.split(1, dim=-1), targets) == 0 + + loss = BetaNLL( + beta=0.5, + reduction="sum", + ) + + assert ( + loss( + *inputs.repeat(2, 1).split(1, dim=-1), + targets.repeat(2, 1), + ) + == 0 + ) + + loss = BetaNLL( + beta=0.5, + reduction="none", + ) + + assert loss( + *inputs.repeat(2, 1).split(1, dim=-1), + targets.repeat(2, 1), + ) == pytest.approx([0.0, 0.0]) + + def test_failures(self): + with pytest.raises( + ValueError, match="The beta parameter should be in range " + ): + BetaNLL(beta=-1) + + with pytest.raises( + ValueError, match="is not a valid value for reduction." 
+ ): + BetaNLL(beta=1.0, reduction="median") diff --git a/tests/test_losses.py b/tests/test_losses.py deleted file mode 100644 index f368e6cc..00000000 --- a/tests/test_losses.py +++ /dev/null @@ -1,184 +0,0 @@ -import math - -import pytest -import torch -from torch import nn -from torch.distributions import Normal - -from torch_uncertainty.layers.bayesian import BayesLinear -from torch_uncertainty.layers.distributions import NormalInverseGamma -from torch_uncertainty.losses import ( - BetaNLL, - DECLoss, - DERLoss, - DistributionNLLLoss, - ELBOLoss, -) - - -class TestDistributionNLL: - """Testing the DistributionNLLLoss class.""" - - def test_sum(self): - loss = DistributionNLLLoss(reduction="sum") - dist = Normal(0, 1) - loss(dist, torch.tensor([0.0])) - - -class TestELBOLoss: - """Testing the ELBOLoss class.""" - - def test_main(self): - model = BayesLinear(1, 1) - criterion = nn.BCEWithLogitsLoss() - loss = ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=1) - loss(model(torch.randn(1, 1)), torch.randn(1, 1)) - - model = nn.Linear(1, 1) - criterion = nn.BCEWithLogitsLoss() - - ELBOLoss(None, criterion, kl_weight=1e-5, num_samples=1) - loss = ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=1) - loss(model(torch.randn(1, 1)), torch.randn(1, 1)) - - def test_failures(self): - model = BayesLinear(1, 1) - criterion = nn.BCEWithLogitsLoss() - - with pytest.raises(TypeError): - ELBOLoss(model, nn.BCEWithLogitsLoss, kl_weight=1, num_samples=1) - - with pytest.raises(ValueError): - ELBOLoss(model, criterion, kl_weight=-1, num_samples=1) - - with pytest.raises(ValueError): - ELBOLoss(model, criterion, kl_weight=1, num_samples=-1) - - with pytest.raises(TypeError): - ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=1.5) - - -class TestDERLoss: - """Testing the DERLoss class.""" - - def test_main(self): - loss = DERLoss(reg_weight=1e-2) - layer = NormalInverseGamma - inputs = layer( - torch.ones(1), torch.ones(1), torch.ones(1), torch.ones(1) - ) - targets = torch.tensor([[1.0]], dtype=torch.float32) - - assert loss(inputs, targets) == pytest.approx(2 * math.log(2)) - - loss = DERLoss( - reg_weight=1e-2, - reduction="sum", - ) - inputs = layer( - torch.ones((2, 1)), - torch.ones((2, 1)), - torch.ones((2, 1)), - torch.ones((2, 1)), - ) - - assert loss( - inputs, - targets, - ) == pytest.approx(4 * math.log(2)) - - loss = DERLoss( - reg_weight=1e-2, - reduction="none", - ) - - assert loss( - inputs, - targets, - ) == pytest.approx([2 * math.log(2), 2 * math.log(2)]) - - def test_failures(self): - with pytest.raises(ValueError): - DERLoss(reg_weight=-1) - - with pytest.raises(ValueError): - DERLoss(reg_weight=1.0, reduction="median") - - -class TestDECLoss: - """Testing the DECLoss class.""" - - def test_main(self): - loss = DECLoss( - loss_type="mse", reg_weight=1e-2, annealing_step=1, reduction="sum" - ) - loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]), current_epoch=1) - loss = DECLoss(loss_type="mse", reg_weight=1e-2, annealing_step=1) - loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]), current_epoch=0) - loss = DECLoss(loss_type="log", reg_weight=1e-2, reduction="none") - loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) - loss = DECLoss(loss_type="digamma") - loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0])) - - def test_failures(self): - with pytest.raises(ValueError): - DECLoss(reg_weight=-1) - - with pytest.raises(ValueError): - DECLoss(annealing_step=0) - - loss = DECLoss(annealing_step=10) - with pytest.raises(ValueError): - loss( - torch.tensor([[0.0, 0.0]]), 
- torch.tensor([0]), - current_epoch=None, - ) - - with pytest.raises(ValueError): - DECLoss(reduction="median") - - with pytest.raises(ValueError): - DECLoss(loss_type="regression") - - -class TestBetaNLL: - """Testing the BetaNLL class.""" - - def test_main(self): - loss = BetaNLL(beta=0.5) - - inputs = torch.tensor([[1.0, 1.0]], dtype=torch.float32) - targets = torch.tensor([[1.0]], dtype=torch.float32) - - assert loss(*inputs.split(1, dim=-1), targets) == 0 - - loss = BetaNLL( - beta=0.5, - reduction="sum", - ) - - assert ( - loss( - *inputs.repeat(2, 1).split(1, dim=-1), - targets.repeat(2, 1), - ) - == 0 - ) - - loss = BetaNLL( - beta=0.5, - reduction="none", - ) - - assert loss( - *inputs.repeat(2, 1).split(1, dim=-1), - targets.repeat(2, 1), - ) == pytest.approx([0.0, 0.0]) - - def test_failures(self): - with pytest.raises(ValueError): - BetaNLL(beta=-1) - - with pytest.raises(ValueError): - BetaNLL(beta=1.0, reduction="median") diff --git a/torch_uncertainty/losses/classification.py b/torch_uncertainty/losses/classification.py index 1233623f..1b358fc5 100644 --- a/torch_uncertainty/losses/classification.py +++ b/torch_uncertainty/losses/classification.py @@ -184,22 +184,27 @@ def __init__( eps (float, optional): A small value to avoid numerical instability. Defaults to 1e-6. - Reference: - Amini, A., Schwarting, W., Soleimany, A., & Rus, D. (2019). Deep - evidential regression. https://arxiv.org/abs/1910.02600. - Reference: Gabriel Pereyra: Regularizing neural networks by penalizing confident output distributions. https://arxiv.org/pdf/1701.06548. """ - super().__init__(reduction=None) + super().__init__() if reduction is None: reduction = "none" if reduction not in ("none", "mean", "sum"): raise ValueError(f"{reduction} is not a valid value for reduction.") self.reduction = reduction + if eps < 0: + raise ValueError( + "The epsilon value should be non-negative, but got " f"{eps}." + ) self.eps = eps + if reg_weight < 0: + raise ValueError( + "The regularization weight should be non-negative, but got " + f"{reg_weight}." + ) self.reg_weight = reg_weight def forward(self, logits: Tensor, targets: Tensor) -> Tensor: @@ -214,9 +219,9 @@ def forward(self, logits: Tensor, targets: Tensor) -> Tensor: """ probs = F.softmax(logits, dim=1) ce_loss = F.cross_entropy(logits, targets, reduction=self.reduction) - reg_loss = torch.log(logits.shape[-1]) + ( - probs * torch.log(probs + self.eps) - ).sum(dim=-1) + reg_loss = torch.log( + torch.tensor(logits.shape[-1], device=probs.device) + ) + (probs * torch.log(probs + self.eps)).sum(dim=-1) if self.reduction == "sum": return ce_loss + self.reg_weight * reg_loss.sum() if self.reduction == "mean": @@ -241,13 +246,17 @@ def __init__( Mohammed Fellaji et al. On the Calibration of Epistemic Uncertainty: Principles, Paradoxes and Conflictual Loss. https://arxiv.org/pdf/2407.12211 """ - super().__init__(reduction=None) - + super().__init__() if reduction is None: reduction = "none" if reduction not in ("none", "mean", "sum"): raise ValueError(f"{reduction} is not a valid value for reduction.") self.reduction = reduction + if reg_weight < 0: + raise ValueError( + "The regularization weight should be non-negative, but got " + f"{reg_weight}." 
+ ) self.reg_weight = reg_weight def forward(self, logits: Tensor, targets: Tensor) -> Tensor: From 94b7a370de976de0e878219343afa613646051a0 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 22 Jul 2024 17:02:02 +0200 Subject: [PATCH 07/27] :sparkles: Add CosineAnnealingWarmup --- torch_uncertainty/optim_recipes.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/torch_uncertainty/optim_recipes.py b/torch_uncertainty/optim_recipes.py index 8d648400..4d7ce5be 100644 --- a/torch_uncertainty/optim_recipes.py +++ b/torch_uncertainty/optim_recipes.py @@ -433,6 +433,43 @@ def get_procedure( return procedure +class CosineAnnealingWarmup(torch.optim.lr_scheduler.SequentialLR): + def __init__( + self, + optimizer: Optimizer, + warmup_start_factor: float, + warmup_epochs: int, + annealing_epochs: int, + eta_min: float = 0, + ) -> None: + """Cosine annealing scheduler with linear warmup. + + Args: + optimizer (Optimizer): The optimizer to be used. + warmup_start_factor (float): The multiplicative factor to apply to + the learning rate at the start of the warmup. + warmup_epochs (int): The number of epochs to warmup the learning + rate. + annealing_epochs (int): The number of epochs to anneal the + learning rate. + eta_min (float): The minimum learning rate. + """ + warmup_scheduler = optim.lr_scheduler.LinearLR( + optimizer, + start_factor=warmup_start_factor, + end_factor=1, + total_iters=warmup_epochs, + ) + cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=annealing_epochs - warmup_epochs, eta_min=eta_min + ) + super().__init__( + optimizer=optimizer, + schedulers=[warmup_scheduler, cosine_scheduler], + milestones=[warmup_epochs], + ) + + class FullSWALR(torch.optim.lr_scheduler.SequentialLR): def __init__( self, From 04b86c0044bae85d9dec3a59e79413e95ec249f2 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 22 Jul 2024 20:35:11 +0200 Subject: [PATCH 08/27] :bug: Fix FPR95 --- torch_uncertainty/metrics/classification/fpr95.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_uncertainty/metrics/classification/fpr95.py b/torch_uncertainty/metrics/classification/fpr95.py index a854cc42..7cc7d650 100644 --- a/torch_uncertainty/metrics/classification/fpr95.py +++ b/torch_uncertainty/metrics/classification/fpr95.py @@ -78,7 +78,9 @@ def compute(self) -> Tensor: threshold_idxs = torch.cat( [ distinct_value_indices, - torch.LongTensor([labels.shape[0] - 1], device=self.device), + torch.tensor( + [labels.shape[0] - 1], dtype=torch.long, device=self.device + ), ] ) From 917e3da6a1e887b889aa4030998e08e5ce3072ee Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 22 Jul 2024 20:37:07 +0200 Subject: [PATCH 09/27] :bug: Continue fixing FPR --- torch_uncertainty/metrics/classification/fpr95.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torch_uncertainty/metrics/classification/fpr95.py b/torch_uncertainty/metrics/classification/fpr95.py index 7cc7d650..23dabd47 100644 --- a/torch_uncertainty/metrics/classification/fpr95.py +++ b/torch_uncertainty/metrics/classification/fpr95.py @@ -1,4 +1,3 @@ -import numpy as np import torch from torch import Tensor from torchmetrics import Metric @@ -97,7 +96,10 @@ def compute(self) -> Tensor: last_ind = torch.searchsorted(true_pos, true_pos[-1]) recall = torch.cat( - [recall[: last_ind + 1].flip(0), torch.tensor([1.0])] + [ + recall[: last_ind + 1].flip(0), + torch.tensor([1.0], device=self.device), + ] ) false_pos = torch.cat( [ @@ -105,7 +107,7 @@ def 
compute(self) -> Tensor: torch.tensor([0.0], dtype=self.dtype, device=self.device), ] ) - cutoff = np.argmin(torch.abs(recall - 0.6)) + cutoff = torch.argmin(torch.abs(recall - 0.6)) return false_pos[cutoff] / (~labels).sum() From 3259ff954f36970f94c2e204ba6f07c032dc42ae Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 22 Jul 2024 20:47:31 +0200 Subject: [PATCH 10/27] :shirt: Use logging in datasets --- torch_uncertainty/datasets/classification/cifar/cifar_c.py | 3 ++- torch_uncertainty/datasets/classification/imagenet/base.py | 3 ++- .../datasets/classification/imagenet/tiny_imagenet_c.py | 3 ++- torch_uncertainty/datasets/classification/mnist_c.py | 3 ++- torch_uncertainty/datasets/classification/not_mnist.py | 3 ++- torch_uncertainty/datasets/classification/openimage_o.py | 3 ++- torch_uncertainty/datasets/fractals.py | 3 ++- torch_uncertainty/datasets/frost.py | 3 ++- torch_uncertainty/datasets/regression/uci_regression.py | 3 ++- torch_uncertainty/datasets/segmentation/camvid.py | 3 ++- torch_uncertainty/optim_recipes.py | 7 +++---- 11 files changed, 23 insertions(+), 14 deletions(-) diff --git a/torch_uncertainty/datasets/classification/cifar/cifar_c.py b/torch_uncertainty/datasets/classification/cifar/cifar_c.py index 10f9f230..b10fa0b9 100644 --- a/torch_uncertainty/datasets/classification/cifar/cifar_c.py +++ b/torch_uncertainty/datasets/classification/cifar/cifar_c.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path @@ -197,7 +198,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: """Download the dataset.""" if self._check_integrity(): - print("Files already downloaded and verified.") + logging.info("Files already downloaded and verified") return download_and_extract_archive( self.url, self.root, filename=self.filename, md5=self.tgz_md5 diff --git a/torch_uncertainty/datasets/classification/imagenet/base.py b/torch_uncertainty/datasets/classification/imagenet/base.py index 7d69d0f9..c5229df7 100644 --- a/torch_uncertainty/datasets/classification/imagenet/base.py +++ b/torch_uncertainty/datasets/classification/imagenet/base.py @@ -1,4 +1,5 @@ import json +import logging from collections.abc import Callable from pathlib import Path @@ -85,7 +86,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: """Download and extract dataset.""" if self._check_integrity(): - print("Files already downloaded and verified") + logging.info("Files already downloaded and verified") return if isinstance(self.filename, str): download_and_extract_archive( diff --git a/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet_c.py b/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet_c.py index 1f0bcc38..762ff346 100644 --- a/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet_c.py +++ b/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet_c.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path @@ -155,7 +156,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: """Download the dataset.""" if self._check_integrity(): - print("Files already downloaded and verified.") + logging.info("Files already downloaded and verified") return for filename, md5 in list( zip(self.filename, self.tgz_md5, strict=True) diff --git a/torch_uncertainty/datasets/classification/mnist_c.py b/torch_uncertainty/datasets/classification/mnist_c.py index 65febcf9..ae1bf563 100644 --- a/torch_uncertainty/datasets/classification/mnist_c.py +++ 
b/torch_uncertainty/datasets/classification/mnist_c.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path from typing import Any, Literal @@ -168,7 +169,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: """Download the dataset.""" if self._check_integrity(): - print("Files already downloaded and verified.") + logging.info("Files already downloaded and verified") return download_and_extract_archive( self.url, self.root, filename=self.filename, md5=self.zip_md5 diff --git a/torch_uncertainty/datasets/classification/not_mnist.py b/torch_uncertainty/datasets/classification/not_mnist.py index 9bd27f8c..1f97d33b 100644 --- a/torch_uncertainty/datasets/classification/not_mnist.py +++ b/torch_uncertainty/datasets/classification/not_mnist.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path from typing import Any, Literal @@ -80,7 +81,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: if self._check_integrity(): - print("Files already downloaded and verified") + logging.info("Files already downloaded and verified") return download_and_extract_archive( diff --git a/torch_uncertainty/datasets/classification/openimage_o.py b/torch_uncertainty/datasets/classification/openimage_o.py index 2cc9104a..14c839de 100644 --- a/torch_uncertainty/datasets/classification/openimage_o.py +++ b/torch_uncertainty/datasets/classification/openimage_o.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path @@ -69,7 +70,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: if self._check_integrity(): - print("Files already downloaded and verified") + logging.info("Files already downloaded and verified") return download_and_extract_archive( diff --git a/torch_uncertainty/datasets/fractals.py b/torch_uncertainty/datasets/fractals.py index c609dd9d..d46358b5 100644 --- a/torch_uncertainty/datasets/fractals.py +++ b/torch_uncertainty/datasets/fractals.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path from typing import Any @@ -56,7 +57,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: if self._check_integrity(): - print("Files already downloaded and verified") + logging.info("Files already downloaded and verified") return download_file_from_google_drive( diff --git a/torch_uncertainty/datasets/frost.py b/torch_uncertainty/datasets/frost.py index 9cdc533e..c2de25ab 100644 --- a/torch_uncertainty/datasets/frost.py +++ b/torch_uncertainty/datasets/frost.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from pathlib import Path from typing import Any @@ -63,7 +64,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: if self._check_integrity(): - print("Files already downloaded and verified") + logging.info("Files already downloaded and verified") return download_and_extract_archive( diff --git a/torch_uncertainty/datasets/regression/uci_regression.py b/torch_uncertainty/datasets/regression/uci_regression.py index 0f4be30c..3a23ae8d 100644 --- a/torch_uncertainty/datasets/regression/uci_regression.py +++ b/torch_uncertainty/datasets/regression/uci_regression.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from importlib import util from pathlib import Path @@ -193,7 +194,7 @@ def _compute_statistics(self) -> None: def download(self) -> None: """Download and extract dataset.""" if self._check_integrity(): - print("Files already 
downloaded and verified") + logging.info("Files already downloaded and verified") return if self.url is None: raise ValueError( diff --git a/torch_uncertainty/datasets/segmentation/camvid.py b/torch_uncertainty/datasets/segmentation/camvid.py index 5a25c821..5bf3c7fa 100644 --- a/torch_uncertainty/datasets/segmentation/camvid.py +++ b/torch_uncertainty/datasets/segmentation/camvid.py @@ -1,4 +1,5 @@ import json +import logging import shutil from collections.abc import Callable from pathlib import Path @@ -219,7 +220,7 @@ def _check_integrity(self) -> bool: def download(self) -> None: """Download the CamVid data if it doesn't exist already.""" if self._check_integrity(): - print("Files already downloaded and verified") + logging.info("Files already downloaded and verified") return if (Path(self.root) / self.base_folder).exists(): diff --git a/torch_uncertainty/optim_recipes.py b/torch_uncertainty/optim_recipes.py index 4d7ce5be..0b477448 100644 --- a/torch_uncertainty/optim_recipes.py +++ b/torch_uncertainty/optim_recipes.py @@ -439,7 +439,7 @@ def __init__( optimizer: Optimizer, warmup_start_factor: float, warmup_epochs: int, - annealing_epochs: int, + max_epochs: int, eta_min: float = 0, ) -> None: """Cosine annealing scheduler with linear warmup. @@ -450,8 +450,7 @@ def __init__( the learning rate at the start of the warmup. warmup_epochs (int): The number of epochs to warmup the learning rate. - annealing_epochs (int): The number of epochs to anneal the - learning rate. + max_epochs (int): The total number of epochs. eta_min (float): The minimum learning rate. """ warmup_scheduler = optim.lr_scheduler.LinearLR( @@ -461,7 +460,7 @@ def __init__( total_iters=warmup_epochs, ) cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=annealing_epochs - warmup_epochs, eta_min=eta_min + optimizer, T_max=max_epochs - warmup_epochs, eta_min=eta_min ) super().__init__( optimizer=optimizer, From 5133e1d6d83acbeed69f674b7215ebb302d65c77 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 22 Jul 2024 21:01:24 +0200 Subject: [PATCH 11/27] :shirt: Remove all prints to switch to logging --- .../datasets/classification/not_mnist.py | 2 +- torch_uncertainty/datasets/frost.py | 2 +- torch_uncertainty/datasets/kitti.py | 17 +++++++++-------- torch_uncertainty/datasets/muad.py | 3 ++- .../models/segmentation/segformer.py | 12 ++++++++---- torch_uncertainty/optim_recipes.py | 5 ++++- .../post_processing/calibration/scaler.py | 3 ++- torch_uncertainty/routines/regression.py | 1 - 8 files changed, 27 insertions(+), 18 deletions(-) diff --git a/torch_uncertainty/datasets/classification/not_mnist.py b/torch_uncertainty/datasets/classification/not_mnist.py index 1f97d33b..8fa77b4c 100644 --- a/torch_uncertainty/datasets/classification/not_mnist.py +++ b/torch_uncertainty/datasets/classification/not_mnist.py @@ -90,7 +90,7 @@ def download(self) -> None: filename=self.filename, md5=self.tgz_md5, ) - print(f"Downloaded {self.filename} to {self.root}") + logging.info("Downloaded %s to %s.", self.filename, self.root) def __getitem__(self, index: int) -> tuple[Any, Any]: """Get the samples and targets of the dataset. 
diff --git a/torch_uncertainty/datasets/frost.py b/torch_uncertainty/datasets/frost.py index c2de25ab..6e391b93 100644 --- a/torch_uncertainty/datasets/frost.py +++ b/torch_uncertainty/datasets/frost.py @@ -73,7 +73,7 @@ def download(self) -> None: filename=self.filename, md5=self.zip_md5, ) - print(f"Downloaded {self.filename} to {self.root}") + logging.info("Downloaded %s to %s.", self.filename, self.root) def __getitem__(self, index: int) -> Any: """Get the samples of the dataset. diff --git a/torch_uncertainty/datasets/kitti.py b/torch_uncertainty/datasets/kitti.py index f2b2a35f..5256f385 100644 --- a/torch_uncertainty/datasets/kitti.py +++ b/torch_uncertainty/datasets/kitti.py @@ -1,4 +1,5 @@ import json +import logging import shutil from collections.abc import Callable from pathlib import Path @@ -38,7 +39,7 @@ def __init__( download: bool = False, remove_unused: bool = False, ) -> None: - print( + logging.info( "KITTIDepth is copyrighted by the Karlsruhe Institute of Technology " "(KIT) and the Toyota Technological Institute at Chicago (TTIC). " "By using KITTIDepth, you agree to the terms and conditions of the " @@ -135,7 +136,7 @@ def _download_depth(self) -> None: md5=self.depth_md5, ) - print("Re-structuring the depth annotations...") + logging.info("Re-structuring the depth annotations...") if (self.root / "train" / "leftDepth").exists(): shutil.rmtree(self.root / "train" / "leftDepth") @@ -143,7 +144,7 @@ def _download_depth(self) -> None: (self.root / "train" / "leftDepth").mkdir(parents=True, exist_ok=False) depth_files = list((self.root).glob("**/tmp/train/**/image_02/*.png")) - print("Train files:") + logging.info("Train files...") for file in tqdm(depth_files): exp_code = file.parents[3].name.split("_") filecode = "_".join( @@ -157,7 +158,7 @@ def _download_depth(self) -> None: (self.root / "val" / "leftDepth").mkdir(parents=True, exist_ok=False) depth_files = list((self.root).glob("**/tmp/val/**/image_02/*.png")) - print("Validation files:") + logging.info("Validation files...") for file in tqdm(depth_files): exp_code = file.parents[3].name.split("_") filecode = "_".join( @@ -179,7 +180,7 @@ def _download_raw(self, remove_unused: bool) -> None: raw_filenames = json.load(file) for filename in tqdm(raw_filenames): - print(self.raw_url + filename) + logging.info("%s", self.raw_url + filename) download_and_extract_archive( self.raw_url + filename, download_root=self.root, @@ -187,7 +188,7 @@ def _download_raw(self, remove_unused: bool) -> None: md5=None, ) - print("Re-structuring the raw data...") + logging.info("Re-structuring the raw data...") samples_to_keep = list( (self.root / "train" / "leftDepth").glob("*.png") @@ -200,7 +201,7 @@ def _download_raw(self, remove_unused: bool) -> None: parents=True, exist_ok=False ) - print("Train files:") + logging.info("Train files...") for sample in tqdm(samples_to_keep): filecode = sample.name.split("_") first_level = "_".join([filecode[0], filecode[1], filecode[2]]) @@ -234,7 +235,7 @@ def _download_raw(self, remove_unused: bool) -> None: (self.root / "val" / "leftImg8bit").mkdir(parents=True, exist_ok=False) - print("Validation files:") + logging.info("Validation files...") for sample in tqdm(samples_to_keep): filecode = sample.name.split("_") first_level = "_".join([filecode[0], filecode[1], filecode[2]]) diff --git a/torch_uncertainty/datasets/muad.py b/torch_uncertainty/datasets/muad.py index 9cde371a..f27ffe57 100644 --- a/torch_uncertainty/datasets/muad.py +++ b/torch_uncertainty/datasets/muad.py @@ -1,4 +1,5 @@ import json 
+import logging import os import shutil from collections.abc import Callable @@ -73,7 +74,7 @@ def __init__( MUAD cannot be used for commercial purposes. Read MUAD's license carefully before using it and verify that you can comply. """ - print( + logging.info( "MUAD is restricted to non-commercial use. By using MUAD, you " "agree to the terms and conditions." ) diff --git a/torch_uncertainty/models/segmentation/segformer.py b/torch_uncertainty/models/segmentation/segformer.py index 6c34dfcb..e13258e8 100644 --- a/torch_uncertainty/models/segmentation/segformer.py +++ b/torch_uncertainty/models/segmentation/segformer.py @@ -1,3 +1,4 @@ +import logging import math from functools import partial @@ -522,11 +523,14 @@ def resize( and (output_h - 1) % (input_h - 1) and (output_w - 1) % (input_w - 1) ): - print( - f"When align_corners={align_corners}, " + logging.info( + "When align_corners=%s, " "the output would more aligned if " - f"input size {(input_h, input_w)} is `x+1` and " - f"out size {(output_h, output_w)} is `nx+1`", + "input size %s is `x+1` and " + "out size %s is `nx+1`", + align_corners, + (input_h, input_w), + (output_h, output_w), ) if isinstance(size, torch.Size): size = tuple(int(x) for x in size) diff --git a/torch_uncertainty/optim_recipes.py b/torch_uncertainty/optim_recipes.py index 0b477448..5b089430 100644 --- a/torch_uncertainty/optim_recipes.py +++ b/torch_uncertainty/optim_recipes.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Callable from functools import partial from typing import Literal @@ -207,7 +208,9 @@ def optim_imagenet_resnet50_a3( dict: The optimizer and the scheduler for the training. """ if effective_batch_size is None: - print("Setting effective batch size to 2048 for steps computations !") + logging.warning( + "Setting effective batch size to 2048 for steps computations !" + ) effective_batch_size = 2048 optimizer = Lamb(model.parameters(), lr=0.008, weight_decay=0.02) diff --git a/torch_uncertainty/post_processing/calibration/scaler.py b/torch_uncertainty/post_processing/calibration/scaler.py index d3400dfe..3dcf08e6 100644 --- a/torch_uncertainty/post_processing/calibration/scaler.py +++ b/torch_uncertainty/post_processing/calibration/scaler.py @@ -1,3 +1,4 @@ +import logging from typing import Literal import torch @@ -91,7 +92,7 @@ def calib_eval() -> float: @torch.no_grad() def forward(self, inputs: Tensor) -> Tensor: if not self.trained: - print( + logging.error( "TemperatureScaler has not been trained yet. Returning " "manually tempered inputs." 
) diff --git a/torch_uncertainty/routines/regression.py b/torch_uncertainty/routines/regression.py index 2beeb435..b61d3ee0 100644 --- a/torch_uncertainty/routines/regression.py +++ b/torch_uncertainty/routines/regression.py @@ -182,7 +182,6 @@ def validation_step( dist_size(preds)[0] // batch_size, device=self.device ) ) - print(ens_dist, type(ens_dist)) mixture = MixtureSameFamily(mix, ens_dist) preds = mixture.mean else: From bc5d2ea25f1d4259dfa51d77570f061eb48752fe Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 26 Jul 2024 12:17:21 +0200 Subject: [PATCH 12/27] :sparkles: Add ChannelLayerNorm --- ...t_filter_response_norm.py => test_norm.py} | 12 ++++ .../baselines/classification/resnet.py | 4 ++ torch_uncertainty/layers/__init__.py | 1 + .../layers/channel_layer_norm.py | 59 +++++++++++++++++++ torch_uncertainty/layers/utils.py | 12 ++++ torch_uncertainty/models/resnet/packed.py | 6 +- 6 files changed, 91 insertions(+), 3 deletions(-) rename tests/layers/{test_filter_response_norm.py => test_norm.py} (83%) create mode 100644 torch_uncertainty/layers/channel_layer_norm.py create mode 100644 torch_uncertainty/layers/utils.py diff --git a/tests/layers/test_filter_response_norm.py b/tests/layers/test_norm.py similarity index 83% rename from tests/layers/test_filter_response_norm.py rename to tests/layers/test_norm.py index e1f58eb1..89fe23eb 100644 --- a/tests/layers/test_filter_response_norm.py +++ b/tests/layers/test_norm.py @@ -1,6 +1,7 @@ import pytest import torch +from torch_uncertainty.layers.channel_layer_norm import ChannelLayerNorm from torch_uncertainty.layers.filter_response_norm import ( FilterResponseNorm1d, FilterResponseNorm2d, @@ -62,3 +63,14 @@ def test_errors(self): layer2d(torch.randn(1, 1, 1, 1, 20)) with pytest.raises(ValueError): layer3d(torch.randn(1, 1, 1, 1, 1, 20)) + + +class TestChannelLayerNorm: + """Testing the FRN2d layer.""" + + def test_main(self): + """Test initialization.""" + cln = ChannelLayerNorm(1) + cln(torch.randn(1, 1, 4, 4)) + cln = ChannelLayerNorm(18) + cln(torch.randn(1, 18, 2, 3)) diff --git a/torch_uncertainty/baselines/classification/resnet.py b/torch_uncertainty/baselines/classification/resnet.py index 36777e10..00ea94ce 100644 --- a/torch_uncertainty/baselines/classification/resnet.py +++ b/torch_uncertainty/baselines/classification/resnet.py @@ -52,6 +52,7 @@ def __init__( ], arch: int, style: str = "imagenet", + normalization_layer: type[nn.Module] = nn.BatchNorm2d, num_estimators: int = 1, dropout_rate: float = 0.0, mixup_params: dict | None = None, @@ -106,6 +107,8 @@ def __init__( style (str, optional): Which ResNet style to use. Defaults to ``imagenet``. + normalization_layer (type[nn.Module], optional): Normalization layer + to use. Defaults to ``nn.BatchNorm2d``. num_estimators (int, optional): Number of estimators in the ensemble. Only used if :attr:`version` is either ``"packed"``, ``"batched"``, ``"masked"`` or ``"mc-dropout"`` Defaults to ``None``. 
@@ -175,6 +178,7 @@ def __init__( "in_channels": in_channels, "num_classes": num_classes, "style": style, + "normalization_layer": normalization_layer, } format_batch_fn = nn.Identity() diff --git a/torch_uncertainty/layers/__init__.py b/torch_uncertainty/layers/__init__.py index f91746bd..210e0bea 100644 --- a/torch_uncertainty/layers/__init__.py +++ b/torch_uncertainty/layers/__init__.py @@ -1,6 +1,7 @@ # ruff: noqa: F401 from .batch_ensemble import BatchConv2d, BatchLinear from .bayesian import BayesConv1d, BayesConv2d, BayesConv3d, BayesLinear +from .channel_layer_norm import ChannelLayerNorm from .masksembles import MaskedConv2d, MaskedLinear from .modules import Identity from .packed import PackedConv1d, PackedConv2d, PackedConv3d, PackedLinear diff --git a/torch_uncertainty/layers/channel_layer_norm.py b/torch_uncertainty/layers/channel_layer_norm.py new file mode 100644 index 00000000..69999324 --- /dev/null +++ b/torch_uncertainty/layers/channel_layer_norm.py @@ -0,0 +1,59 @@ +import torch +from torch import Tensor +from torch.nn import LayerNorm + +from .utils import ChannelBack, ChannelFront + + +class ChannelLayerNorm(LayerNorm): + def __init__( + self, + normalized_shape: int | list[int], + eps: float = 0.00001, + elementwise_affine: bool = True, + bias: bool = True, + device: torch.device | str | None = None, + dtype: torch.dtype | str | None = None, + ) -> None: + r"""Layer normalization over the channel dimension. + + Args: + normalized_shape (int or list or torch.Size): input shape from an expected input + of size + + .. math:: + [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] + \times \ldots \times \text{normalized\_shape}[-1]] + + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the channel dimension which is expected to be of that specific size. + eps (float): a value added to the denominator for numerical stability. Default: 1e-5 + elementwise_affine (bool): a boolean value that when set to ``True``, this module + has learnable per-element affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. + bias (bool): If set to ``False``, the layer will not learn an additive bias (only relevant if + :attr:`elementwise_affine` is ``True``). Default: ``True``. + device (torch.device or str or None): the desired device of the module. + dtype (torch.dtype or str or None): the desired floating point type of the module. + + Attributes: + weight: the learnable weights of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 1. + bias: the learnable bias of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 0. 
+ + Shape: + - Input: :math:`(N, *)` + - Output: :math:`(N, *)` (same shape as input) + + """ + super().__init__( + normalized_shape, eps, elementwise_affine, bias, device, dtype + ) + self.cback = ChannelBack() + self.cfront = ChannelFront() + + def forward(self, inputs: Tensor) -> Tensor: + return self.cfront(super().forward(self.cback(inputs))) diff --git a/torch_uncertainty/layers/utils.py b/torch_uncertainty/layers/utils.py new file mode 100644 index 00000000..050d56d9 --- /dev/null +++ b/torch_uncertainty/layers/utils.py @@ -0,0 +1,12 @@ +from einops import rearrange +from torch import Tensor, nn + + +class ChannelBack(nn.Module): + def forward(self, x: Tensor) -> Tensor: + return rearrange(x, "b c h w -> b h w c") + + +class ChannelFront(nn.Module): + def forward(self, x: Tensor) -> Tensor: + return rearrange(x, "b h w c -> b c h w") diff --git a/torch_uncertainty/models/resnet/packed.py b/torch_uncertainty/models/resnet/packed.py index b353f1d7..4bf170d8 100644 --- a/torch_uncertainty/models/resnet/packed.py +++ b/torch_uncertainty/models/resnet/packed.py @@ -46,7 +46,7 @@ def __init__( in_planes: int, planes: int, stride: int, - alpha: float, + alpha: int, num_estimators: int, gamma: int, conv_bias: bool, @@ -116,7 +116,7 @@ def __init__( in_planes: int, planes: int, stride: int, - alpha: float, + alpha: int, num_estimators: int, gamma: int, conv_bias: bool, @@ -333,7 +333,7 @@ def _make_layer( planes: int, num_blocks: int, stride: int, - alpha: float, + alpha: int, num_estimators: int, conv_bias: bool, dropout_rate: float, From 681350b9310d7beec90d073098db94f274743fff Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 23 Aug 2024 15:13:38 +0200 Subject: [PATCH 13/27] :shirt: Refine datasets and datamodules --- .../classification/test_cifar10.py | 2 +- .../classification/test_cifar100.py | 2 +- .../datamodules/classification/test_mnist.py | 4 ++-- .../datamodules/classification/cifar10.py | 16 +++++++++++++-- .../datamodules/classification/cifar100.py | 16 +++++++++++++-- .../datamodules/classification/imagenet.py | 20 +++++++++++++++---- .../datamodules/classification/mnist.py | 10 +++++++++- .../classification/tiny_imagenet.py | 15 +++++++++++--- .../classification/imagenet/tiny_imagenet.py | 5 ++++- .../datasets/segmentation/cityscapes.py | 4 ++-- 10 files changed, 75 insertions(+), 19 deletions(-) diff --git a/tests/datamodules/classification/test_cifar10.py b/tests/datamodules/classification/test_cifar10.py index df12f214..64944684 100644 --- a/tests/datamodules/classification/test_cifar10.py +++ b/tests/datamodules/classification/test_cifar10.py @@ -13,7 +13,7 @@ def test_cifar10_main(self): dm = CIFAR10DataModule(root="./data/", batch_size=128, cutout=16) assert dm.dataset == CIFAR10 - assert isinstance(dm.train_transform.transforms[2], Cutout) + assert isinstance(dm.train_transform.transforms[1], Cutout) dm.dataset = DummyClassificationDataset dm.ood_dataset = DummyClassificationDataset diff --git a/tests/datamodules/classification/test_cifar100.py b/tests/datamodules/classification/test_cifar100.py index e24af243..f2f00aa2 100644 --- a/tests/datamodules/classification/test_cifar100.py +++ b/tests/datamodules/classification/test_cifar100.py @@ -13,7 +13,7 @@ def test_cifar100(self): dm = CIFAR100DataModule(root="./data/", batch_size=128, cutout=16) assert dm.dataset == CIFAR100 - assert isinstance(dm.train_transform.transforms[2], Cutout) + assert isinstance(dm.train_transform.transforms[1], Cutout) dm.dataset = DummyClassificationDataset dm.ood_dataset = 
DummyClassificationDataset diff --git a/tests/datamodules/classification/test_mnist.py b/tests/datamodules/classification/test_mnist.py index f52c9abf..ba30fad3 100644 --- a/tests/datamodules/classification/test_mnist.py +++ b/tests/datamodules/classification/test_mnist.py @@ -20,7 +20,7 @@ def test_mnist_cutout(self): ) assert dm.dataset == MNIST - assert isinstance(dm.train_transform.transforms[0], Cutout) + assert isinstance(dm.train_transform.transforms[1], Cutout) dm = MNISTDataModule( root="./data/", @@ -29,7 +29,7 @@ def test_mnist_cutout(self): cutout=0, val_split=0, ) - assert isinstance(dm.train_transform.transforms[0], nn.Identity) + assert isinstance(dm.train_transform.transforms[1], nn.Identity) with pytest.raises(ValueError): MNISTDataModule(root="./data/", batch_size=128, ood_ds="other") diff --git a/torch_uncertainty/datamodules/classification/cifar10.py b/torch_uncertainty/datamodules/classification/cifar10.py index 1e5eda4a..a87d61c0 100644 --- a/torch_uncertainty/datamodules/classification/cifar10.py +++ b/torch_uncertainty/datamodules/classification/cifar10.py @@ -29,6 +29,7 @@ def __init__( eval_ood: bool = False, val_split: float | None = None, num_workers: int = 1, + basic_augment: bool = True, cutout: int | None = None, auto_augment: str | None = None, test_alt: Literal["c", "h"] | None = None, @@ -47,6 +48,8 @@ def __init__( to ``0.0``. num_workers (int): Number of workers to use for data loading. Defaults to ``1``. + basic_augment (bool): Whether to apply base augmentations. Defaults to + ``True``. cutout (int): Size of cutout to apply to images. Defaults to ``None``. randaugment (bool): Whether to apply RandAugment. Defaults to ``False``. @@ -89,6 +92,16 @@ def __init__( "GitHub issue if needed." ) + if basic_augment: + basic_transform = T.Compose( + [ + T.RandomCrop(32, padding=4), + T.RandomHorizontalFlip(), + ] + ) + else: + basic_transform = nn.Identity() + if cutout: main_transform = Cutout(cutout) elif auto_augment: @@ -98,8 +111,7 @@ def __init__( self.train_transform = T.Compose( [ - T.RandomCrop(32, padding=4), - T.RandomHorizontalFlip(), + basic_transform, main_transform, T.ToTensor(), T.Normalize( diff --git a/torch_uncertainty/datamodules/classification/cifar100.py b/torch_uncertainty/datamodules/classification/cifar100.py index 373430bd..fa759853 100644 --- a/torch_uncertainty/datamodules/classification/cifar100.py +++ b/torch_uncertainty/datamodules/classification/cifar100.py @@ -29,6 +29,7 @@ def __init__( batch_size: int, eval_ood: bool = False, val_split: float | None = None, + basic_augment: bool = True, cutout: int | None = None, randaugment: bool = False, auto_augment: str | None = None, @@ -48,6 +49,8 @@ def __init__( batch_size (int): Number of samples per batch. val_split (float): Share of samples to use for validation. Defaults to ``0.0``. + basic_augment (bool): Whether to apply base augmentations. Defaults to + ``True``. cutout (int): Size of cutout to apply to images. Defaults to ``None``. randaugment (bool): Whether to apply RandAugment. Defaults to ``False``. @@ -93,6 +96,16 @@ def __init__( "GitHub issue if needed." 
) + if basic_augment: + basic_transform = T.Compose( + [ + T.RandomCrop(32, padding=4), + T.RandomHorizontalFlip(), + ] + ) + else: + basic_transform = nn.Identity() + if cutout: main_transform = Cutout(cutout) elif randaugment: @@ -104,8 +117,7 @@ def __init__( self.train_transform = T.Compose( [ - T.RandomCrop(32, padding=4), - T.RandomHorizontalFlip(), + basic_transform, main_transform, T.ToTensor(), T.ConvertImageDtype(torch.float32), diff --git a/torch_uncertainty/datamodules/classification/imagenet.py b/torch_uncertainty/datamodules/classification/imagenet.py index 6d35303c..1e19ed4a 100644 --- a/torch_uncertainty/datamodules/classification/imagenet.py +++ b/torch_uncertainty/datamodules/classification/imagenet.py @@ -49,6 +49,7 @@ def __init__( procedure: str | None = None, train_size: int = 224, interpolation: str = "bilinear", + basic_augment: bool = True, rand_augment_opt: str | None = None, num_workers: int = 1, pin_memory: bool = True, @@ -71,6 +72,8 @@ def __init__( train_size (int): Size of training images. Defaults to ``224``. interpolation (str): Interpolation method for the Resize Crops. Defaults to ``"bilinear"``. + basic_augment (bool): Whether to apply base augmentations. Defaults to + ``True``. rand_augment_opt (str): Which RandAugment to use. Defaults to ``None``. num_workers (int): Number of workers to use for data loading. Defaults to ``1``. @@ -123,6 +126,18 @@ def __init__( self.procedure = procedure + if basic_augment: + basic_transform = T.Compose( + [ + T.RandomResizedCrop( + train_size, interpolation=self.interpolation + ), + T.RandomHorizontalFlip(), + ] + ) + else: + basic_transform = nn.Identity() + if self.procedure is None: if rand_augment_opt is not None: main_transform = rand_augment_transform(rand_augment_opt, {}) @@ -144,10 +159,7 @@ def __init__( self.train_transform = T.Compose( [ - T.RandomResizedCrop( - train_size, interpolation=self.interpolation - ), - T.RandomHorizontalFlip(), + basic_transform, main_transform, T.ToTensor(), T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), diff --git a/torch_uncertainty/datamodules/classification/mnist.py b/torch_uncertainty/datamodules/classification/mnist.py index b411f502..9be45ea6 100644 --- a/torch_uncertainty/datamodules/classification/mnist.py +++ b/torch_uncertainty/datamodules/classification/mnist.py @@ -27,6 +27,7 @@ def __init__( ood_ds: Literal["fashion", "notMNIST"] = "fashion", val_split: float | None = None, num_workers: int = 1, + basic_augment: bool = True, cutout: int | None = None, test_alt: Literal["c"] | None = None, pin_memory: bool = True, @@ -45,6 +46,8 @@ def __init__( to ``0.0``. num_workers (int): Number of workers to use for data loading. Defaults to ``1``. + basic_augment (bool): Whether to apply base augmentations. Defaults to + ``True``. cutout (int): Size of cutout to apply to images. Defaults to ``None``. test_alt (str): Which test set to use. Defaults to ``None``. pin_memory (bool): Whether to pin memory. Defaults to ``True``. @@ -78,13 +81,18 @@ def __init__( f"`ood_ds` should be in {self.ood_datasets}. Got {ood_ds}." 
) + if basic_augment: + basic_transform = T.RandomCrop(28, padding=4) + else: + basic_transform = nn.Identity() + main_transform = Cutout(cutout) if cutout else nn.Identity() self.train_transform = T.Compose( [ + basic_transform, main_transform, T.ToTensor(), - T.RandomCrop(28, padding=4), T.Normalize((0.1307,), (0.3081,)), ] ) diff --git a/torch_uncertainty/datamodules/classification/tiny_imagenet.py b/torch_uncertainty/datamodules/classification/tiny_imagenet.py index 49506d48..bec3025d 100644 --- a/torch_uncertainty/datamodules/classification/tiny_imagenet.py +++ b/torch_uncertainty/datamodules/classification/tiny_imagenet.py @@ -30,6 +30,7 @@ def __init__( val_split: float | None = None, ood_ds: str = "svhn", interpolation: str = "bilinear", + basic_augment: bool = True, rand_augment_opt: str | None = None, num_workers: int = 1, pin_memory: bool = True, @@ -44,7 +45,6 @@ def __init__( persistent_workers=persistent_workers, ) # TODO: COMPUTE STATS - self.eval_ood = eval_ood self.ood_ds = ood_ds self.interpolation = interpolation_modes_from_str(interpolation) @@ -62,6 +62,16 @@ def __init__( f"OOD dataset {ood_ds} not supported for TinyImageNet." ) + if basic_augment: + basic_transform = T.Compose( + [ + T.RandomCrop(64, padding=4), + T.RandomHorizontalFlip(), + ] + ) + else: + basic_transform = nn.Identity() + if rand_augment_opt is not None: main_transform = rand_augment_transform(rand_augment_opt, {}) else: @@ -69,8 +79,7 @@ def __init__( self.train_transform = T.Compose( [ - T.RandomCrop(64, padding=4), - T.RandomHorizontalFlip(), + basic_transform, main_transform, T.ToTensor(), T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), diff --git a/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet.py b/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet.py index 553fbd1b..0e42331e 100644 --- a/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet.py +++ b/torch_uncertainty/datasets/classification/imagenet/tiny_imagenet.py @@ -24,8 +24,11 @@ def __init__( ) -> None: self.root = Path(root) / "tiny-imagenet-200" + if split not in ["train", "val", "test"]: + raise ValueError(f"Split {split} is not supported.") + self.split = split - self.label_idx = 1 # from [image, id, nid, box] + self.label_idx = 1 self.transform = transform self.target_transform = target_transform diff --git a/torch_uncertainty/datasets/segmentation/cityscapes.py b/torch_uncertainty/datasets/segmentation/cityscapes.py index 234a6ee5..97e48ef0 100644 --- a/torch_uncertainty/datasets/segmentation/cityscapes.py +++ b/torch_uncertainty/datasets/segmentation/cityscapes.py @@ -5,11 +5,11 @@ from PIL import Image from torchmetrics.utilities.plot import _AX_TYPE, _PLOT_OUT_TYPE from torchvision import tv_tensors -from torchvision.datasets import Cityscapes as OriginalCityscapes +from torchvision.datasets import Cityscapes as TVCityscapes from torchvision.transforms.v2 import functional as F -class Cityscapes(OriginalCityscapes): +class Cityscapes(TVCityscapes): def encode_target(self, target: Image.Image) -> Image.Image: """Encode target image to tensor. 
From 6a6c2e400d0eda7d72959d77cecdaeb676f200eb Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 23 Aug 2024 16:14:13 +0200 Subject: [PATCH 14/27] :bug: Fix FPRx --- torch_uncertainty/metrics/classification/fpr95.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_uncertainty/metrics/classification/fpr95.py b/torch_uncertainty/metrics/classification/fpr95.py index 23dabd47..eb6bf66b 100644 --- a/torch_uncertainty/metrics/classification/fpr95.py +++ b/torch_uncertainty/metrics/classification/fpr95.py @@ -104,10 +104,10 @@ def compute(self) -> Tensor: false_pos = torch.cat( [ false_pos[: last_ind + 1].flip(0), - torch.tensor([0.0], dtype=self.dtype, device=self.device), + torch.tensor([0.0], device=self.device), ] ) - cutoff = torch.argmin(torch.abs(recall - 0.6)) + cutoff = torch.argmin(torch.abs(recall - self.recall_level)) return false_pos[cutoff] / (~labels).sum() From f7d21bd226d8c819c79e02cd2aec98f70c04efeb Mon Sep 17 00:00:00 2001 From: Olivier Laurent Date: Fri, 23 Aug 2024 16:26:44 +0200 Subject: [PATCH 15/27] :hammer: Rename fpr & misc --- .../metrics/classification/test_calibration.py | 18 +++++++++++++++++- .../metrics/classification/__init__.py | 2 +- .../adaptive_calibration_error.py | 2 +- .../classification/{fpr95.py => fpr.py} | 0 4 files changed, 19 insertions(+), 3 deletions(-) rename torch_uncertainty/metrics/classification/{fpr95.py => fpr.py} (100%) diff --git a/tests/metrics/classification/test_calibration.py b/tests/metrics/classification/test_calibration.py index 3ad5e3f3..def3a851 100644 --- a/tests/metrics/classification/test_calibration.py +++ b/tests/metrics/classification/test_calibration.py @@ -58,6 +58,7 @@ def test_main(self) -> None: ace = AdaptiveCalibrationError( task="binary", num_bins=2, norm="l1", validate_args=True ) + ace = AdaptiveCalibrationError( task="binary", num_bins=2, norm="l1", validate_args=False ) @@ -112,7 +113,22 @@ def test_main(self) -> None: ), torch.as_tensor([0, 0, 0, 1, 1]), ) - assert ace.compute().item() ** 2 == pytest.approx((0.8 - 0.5) ** 2) + assert ace.compute().item() == pytest.approx(0.8 - 0.5) + + ace = AdaptiveCalibrationError( + task="binary", num_bins=3, norm="l2" + ) + ece = CalibrationError(task="binary", num_bins=3, norm="l2") + + ace.update( + torch.as_tensor([0.12, 0.26, 0.70, 0.71, 0.91, 0.92]), + torch.as_tensor([0, 1, 0, 0, 1, 1]), + ) + ece.update( + torch.as_tensor([0.12, 0.26, 0.70, 0.71, 0.91, 0.92]), + torch.as_tensor([0, 1, 0, 0, 1, 1]), + ) + assert ace.compute().item() > ece.compute().item() def test_errors(self) -> None: with pytest.raises(TypeError, match="is expected to be `int`"): diff --git a/torch_uncertainty/metrics/classification/__init__.py b/torch_uncertainty/metrics/classification/__init__.py index de375588..f604f1f7 100644 --- a/torch_uncertainty/metrics/classification/__init__.py +++ b/torch_uncertainty/metrics/classification/__init__.py @@ -5,7 +5,7 @@ from .categorical_nll import CategoricalNLL from .disagreement import Disagreement from .entropy import Entropy -from .fpr95 import FPR95, FPRx +from .fpr import FPR95, FPRx from .grouping_loss import GroupingLoss from .mean_iou import MeanIntersectionOverUnion from .mutual_information import MutualInformation diff --git a/torch_uncertainty/metrics/classification/adaptive_calibration_error.py b/torch_uncertainty/metrics/classification/adaptive_calibration_error.py index 4f4c4850..c1e066d9 100644 --- a/torch_uncertainty/metrics/classification/adaptive_calibration_error.py +++ 
b/torch_uncertainty/metrics/classification/adaptive_calibration_error.py @@ -64,7 +64,7 @@ def _ace_compute( norm: Norm function to use when computing calibration error. Defaults to "l1". debias: Apply debiasing to L2 norm computation as in - `Verified Uncertainty Calibration`_. Defaults to False. + `Verified Uncertainty Calibration`. Defaults to False. Returns: Tensor: Adaptive Calibration error scalar. diff --git a/torch_uncertainty/metrics/classification/fpr95.py b/torch_uncertainty/metrics/classification/fpr.py similarity index 100% rename from torch_uncertainty/metrics/classification/fpr95.py rename to torch_uncertainty/metrics/classification/fpr.py From e86dd2341bd806b472a3ef214f6fbdbc40e0c0dc Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 23 Aug 2024 16:38:48 +0200 Subject: [PATCH 16/27] :bug: Fix test & renaming --- .../classification/test_calibration.py | 6 ++-- ...test_corruptions.py => test_corruption.py} | 29 ++++++++++++------- .../{corruptions.py => corruption.py} | 0 3 files changed, 20 insertions(+), 15 deletions(-) rename tests/transforms/{test_corruptions.py => test_corruption.py} (86%) rename torch_uncertainty/transforms/{corruptions.py => corruption.py} (100%) diff --git a/tests/metrics/classification/test_calibration.py b/tests/metrics/classification/test_calibration.py index def3a851..cf94da78 100644 --- a/tests/metrics/classification/test_calibration.py +++ b/tests/metrics/classification/test_calibration.py @@ -58,7 +58,7 @@ def test_main(self) -> None: ace = AdaptiveCalibrationError( task="binary", num_bins=2, norm="l1", validate_args=True ) - + ace = AdaptiveCalibrationError( task="binary", num_bins=2, norm="l1", validate_args=False ) @@ -115,9 +115,7 @@ def test_main(self) -> None: ) assert ace.compute().item() == pytest.approx(0.8 - 0.5) - ace = AdaptiveCalibrationError( - task="binary", num_bins=3, norm="l2" - ) + ace = AdaptiveCalibrationError(task="binary", num_bins=3, norm="l2") ece = CalibrationError(task="binary", num_bins=3, norm="l2") ace.update( diff --git a/tests/transforms/test_corruptions.py b/tests/transforms/test_corruption.py similarity index 86% rename from tests/transforms/test_corruptions.py rename to tests/transforms/test_corruption.py index 46b07ce3..4d979f89 100644 --- a/tests/transforms/test_corruptions.py +++ b/tests/transforms/test_corruption.py @@ -1,7 +1,8 @@ import pytest import torch +from requests.exceptions import HTTPError -from torch_uncertainty.transforms.corruptions import ( +from torch_uncertainty.transforms.corruption import ( DefocusBlur, Frost, GaussianBlur, @@ -127,13 +128,19 @@ def test_pixelate(self): print(transform) def test_frost(self): - with pytest.raises(ValueError): - _ = Frost(-1) - with pytest.raises(TypeError): - _ = Frost(0.1) - inputs = torch.rand(3, 32, 32) - transform = Frost(1) - transform(inputs) - transform = Frost(0) - transform(inputs) - print(transform) + try: + Frost(1) + frost_ok = True + except HTTPError: + frost_ok = False + if frost_ok: + with pytest.raises(ValueError): + _ = Frost(-1) + with pytest.raises(TypeError): + _ = Frost(0.1) + inputs = torch.rand(3, 32, 32) + transform = Frost(1) + transform(inputs) + transform = Frost(0) + transform(inputs) + print(transform) diff --git a/torch_uncertainty/transforms/corruptions.py b/torch_uncertainty/transforms/corruption.py similarity index 100% rename from torch_uncertainty/transforms/corruptions.py rename to torch_uncertainty/transforms/corruption.py From 9ffec86760739ce7730d0d0575abd0f75ce42cf6 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 26 
Aug 2024 11:55:42 +0200 Subject: [PATCH 17/27] :sparkles: Add AUGRC --- docs/source/api.rst | 82 ++++++++++-- docs/source/references.rst | 12 ++ torch_uncertainty/metrics/__init__.py | 1 + .../metrics/classification/__init__.py | 9 +- .../metrics/classification/risk_coverage.py | 122 ++++++++++++++++-- torch_uncertainty/routines/classification.py | 8 +- torch_uncertainty/routines/segmentation.py | 15 ++- 7 files changed, 228 insertions(+), 21 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index ed5e07ce..20d1a0ea 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -156,8 +156,6 @@ Models Wrappers ^^^^^^^^ - - Functions """"""""" @@ -188,30 +186,82 @@ Metrics Classification ^^^^^^^^^^^^^^ - .. currentmodule:: torch_uncertainty.metrics.classification +Proper Scores +""""""""""""" + +.. autosummary:: + :toctree: generated/ + :nosignatures: + :template: class.rst + + BrierScore + CategoricalNLL + +Out-of-Distribution Detection +""""""""""""""""""""""""""""" + .. autosummary:: :toctree: generated/ :nosignatures: :template: class.rst AURC - AUSE + FPRx FPR95 + + +Selective Classification +"""""""""""""""""""""""" + +.. autosummary:: + :toctree: generated/ + :nosignatures: + :template: class.rst + + AUGRC + RiskAtxCov + RiskAt80Cov + CovAtxRisk + CovAt5Risk + +Calibration +""""""""""" + +.. autosummary:: + :toctree: generated/ + :nosignatures: + :template: class.rst + AdaptiveCalibrationError - BrierScore CalibrationError - CategoricalNLL - CovAt5Risk + +Diversity +""""""""" + +.. autosummary:: + :toctree: generated/ + :nosignatures: + :template: class.rst + Disagreement Entropy - GroupingLoss - MeanIntersectionOverUnion MutualInformation - RiskAt80Cov VariationRatio + +Others +"""""" + +.. autosummary:: + :toctree: generated/ + :nosignatures: + :template: class.rst + + AUSE + GroupingLoss + Regression ^^^^^^^^^^ @@ -232,6 +282,18 @@ Regression SILog ThresholdAccuracy +Segmentation +^^^^^^^^^^^^ + +.. currentmodule:: torch_uncertainty.metrics.classification + +.. autosummary:: + :toctree: generated/ + :nosignatures: + :template: class.rst + + MeanIntersectionOverUnion + Losses ------ diff --git a/docs/source/references.rst b/docs/source/references.rst index 89829c16..1165c6db 100644 --- a/docs/source/references.rst +++ b/docs/source/references.rst @@ -278,6 +278,18 @@ For the area under the risk-coverage curve, consider citing: * Authors: *Yonatan Geifman and Ran El-Yaniv* * Paper: `NeurIPS 2017 `__. + +Area Under the Generalized Risk-Coverage curve +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For the area under the generalized risk-coverage curve, consider citing: + +**Overcoming Common Flaws in the Evaluation of Selective Classification Systems** + +* Authors: *Jeremias Traub, Till J. Bungert, Carsten T. Lüth, Michael Baumgartner, Klaus H. Maier-Hein, Lena Maier-Hein, and Paul F Jaeger* +* Paper: `ArXiv `__. 
+ + Grouping Loss ^^^^^^^^^^^^^ diff --git a/torch_uncertainty/metrics/__init__.py b/torch_uncertainty/metrics/__init__.py index ee1a63b9..93458695 100644 --- a/torch_uncertainty/metrics/__init__.py +++ b/torch_uncertainty/metrics/__init__.py @@ -1,5 +1,6 @@ # ruff: noqa: F401 from .classification import ( + AUGRC, AURC, AUSE, FPR95, diff --git a/torch_uncertainty/metrics/classification/__init__.py b/torch_uncertainty/metrics/classification/__init__.py index f604f1f7..0e454888 100644 --- a/torch_uncertainty/metrics/classification/__init__.py +++ b/torch_uncertainty/metrics/classification/__init__.py @@ -9,6 +9,13 @@ from .grouping_loss import GroupingLoss from .mean_iou import MeanIntersectionOverUnion from .mutual_information import MutualInformation -from .risk_coverage import AURC, CovAt5Risk, CovAtxRisk, RiskAt80Cov, RiskAtxCov +from .risk_coverage import ( + AUGRC, + AURC, + CovAt5Risk, + CovAtxRisk, + RiskAt80Cov, + RiskAtxCov, +) from .sparsification import AUSE from .variation_ratio import VariationRatio diff --git a/torch_uncertainty/metrics/classification/risk_coverage.py b/torch_uncertainty/metrics/classification/risk_coverage.py index 5eb525f5..c4a434c2 100644 --- a/torch_uncertainty/metrics/classification/risk_coverage.py +++ b/torch_uncertainty/metrics/classification/risk_coverage.py @@ -4,7 +4,7 @@ import numpy as np import torch from torch import Tensor -from torchmetrics.metric import Metric +from torchmetrics import Metric from torchmetrics.utilities.compute import _auc_compute from torchmetrics.utilities.data import dim_zero_cat from torchmetrics.utilities.plot import _AX_TYPE @@ -87,8 +87,8 @@ def compute(self) -> Tensor: num_samples = error_rates.size(0) if num_samples < 2: return torch.tensor([float("nan")], device=self.device) - x = torch.arange(1, num_samples + 1, device=self.device) / num_samples - return _auc_compute(x, error_rates) / (1 - 1 / num_samples) + cov = torch.arange(1, num_samples + 1, device=self.device) / num_samples + return _auc_compute(cov, error_rates) / (1 - 1 / num_samples) def plot( self, @@ -97,7 +97,7 @@ def plot( name: str | None = None, ) -> tuple[plt.Figure | None, plt.Axes]: """Plot the risk-cov. curve corresponding to the inputs passed to - ``update``, and the oracle risk-cov. curve. + ``update``. Args: ax (Axes | None, optional): An matplotlib axis object. If provided @@ -111,7 +111,7 @@ def plot( """ fig, ax = plt.subplots(figsize=(6, 6)) if ax is None else (None, ax) - # Computation of AUSEC + # Computation of AURC error_rates = self.partial_compute().cpu().flip(0) num_samples = error_rates.size(0) @@ -136,7 +136,7 @@ def plot( ax.text( 0.02, 0.95, - f"AUSEC={aurc:.2%}", + f"AURC={aurc:.2%}", color="black", ha="left", va="bottom", @@ -163,13 +163,119 @@ def _aurc_rejection_rate_compute( scores (Tensor): uncertainty scores of shape :math:`(B,)` errors (Tensor): binary errors of shape :math:`(B,)` """ - num_samples = scores.size(0) errors = errors[scores.argsort(descending=True)] return errors.cumsum(dim=-1) / torch.arange( - 1, num_samples + 1, dtype=scores.dtype, device=scores.device + 1, scores.size(0) + 1, dtype=scores.dtype, device=scores.device ) +class AUGRC(AURC): + """Area Under the Generalized Risk-Coverage curve. + + The Area Under the Generalized Risk-Coverage curve (AUGRC) for + Selective Classification (SC) performance assessment. It avoids putting too much + weight on the most confident samples. 
+ + As input to ``forward`` and ``update`` the metric accepts the following input: + + - ``preds`` (:class:`~torch.Tensor`): A float tensor of shape + ``(N, ...)`` containing probabilities for each observation. + - ``target`` (:class:`~torch.Tensor`): An int tensor of shape + ``(N, ...)`` containing ground-truth labels. + + As output to ``forward`` and ``compute`` the metric returns the + following output: + + - ``augrc`` (:class:`~torch.Tensor`): A scalar tensor containing the + area under the generalized risk-coverage curve. + + Args: + kwargs: Additional keyword arguments. + + Reference: + Traub et al. Overcoming Common Flaws in the Evaluation of Selective + Classification Systems. ArXiv. + """ + + def compute(self) -> Tensor: + """Compute the Area Under the Generalized Risk-Coverage curve (AUGRC). + + Normalize the AUGRC as if its support was between 0 and 1. This has an + impact on the AUGRC when the number of samples is small. + + Returns: + Tensor: The AUGRC. + """ + error_rates = self.partial_compute() + num_samples = error_rates.size(0) + if num_samples < 2: + return torch.tensor([float("nan")], device=self.device) + cov = torch.arange(1, num_samples + 1, device=self.device) / num_samples + return _auc_compute(cov, error_rates * cov) / (1 - 1 / num_samples) + + def plot( + self, + ax: _AX_TYPE | None = None, + plot_value: bool = True, + name: str | None = None, + ) -> tuple[plt.Figure | None, plt.Axes]: + """Plot the generalized risk-cov. curve corresponding to the inputs passed to + ``update``. + + Args: + ax (Axes | None, optional): A matplotlib axis object. If provided + will add plot to this axis. Defaults to None. + plot_value (bool, optional): Whether to print the AUGRC value on the + plot. Defaults to True. + name (str | None, optional): Name of the model. Defaults to None.
+ + Returns: + tuple[[Figure | None], Axes]: Figure object and Axes object + """ + fig, ax = plt.subplots(figsize=(6, 6)) if ax is None else (None, ax) + + # Computation of AUGRC + error_rates = self.partial_compute().cpu().flip(0) + num_samples = error_rates.size(0) + cov = torch.arange(num_samples) / num_samples + + augrc = _auc_compute(cov, error_rates * cov).cpu().item() + + # reduce plot size + plot_covs = np.arange(0.01, 100 + 0.01, 0.01) + covs = np.arange(start=1, stop=num_samples + 1) / num_samples + + rejection_rates = np.interp(plot_covs, covs, cov * 100) + error_rates = np.interp(plot_covs, covs, error_rates * covs[::-1] * 100) + + # plot + ax.plot( + 100 - rejection_rates, + error_rates, + label="Model" if name is None else name, + ) + + if plot_value: + ax.text( + 0.02, + 0.95, + f"AUGRC={augrc:.2%}", + color="black", + ha="left", + va="bottom", + transform=ax.transAxes, + ) + plt.grid(True, linestyle="--", alpha=0.7, zorder=0) + ax.set_xlabel("Coverage (%)", fontsize=16) + ax.set_ylabel("Generalized Risk (%)", fontsize=16) + ax.set_xlim(0, 100) + ax.set_ylim(0, 100) + ax.set_aspect("equal", "box") + ax.legend(loc="upper right") + fig.tight_layout() + return fig, ax + + class CovAtxRisk(Metric): is_differentiable: bool = False higher_is_better: bool = False diff --git a/torch_uncertainty/routines/classification.py b/torch_uncertainty/routines/classification.py index 2d45976a..b7e0262b 100644 --- a/torch_uncertainty/routines/classification.py +++ b/torch_uncertainty/routines/classification.py @@ -20,6 +20,7 @@ from torch_uncertainty.layers import Identity from torch_uncertainty.losses import DECLoss, ELBOLoss from torch_uncertainty.metrics import ( + AUGRC, AURC, FPR95, BrierScore, @@ -194,6 +195,7 @@ def _init_metrics(self) -> None: num_classes=self.num_classes, ), "sc/AURC": AURC(), + "sc/AUGRC": AUGRC(), "sc/CovAt5Risk": CovAt5Risk(), "sc/RiskAt80Cov": RiskAt80Cov(), }, @@ -202,7 +204,7 @@ def _init_metrics(self) -> None: ["cls/Brier"], ["cls/NLL"], ["cal/ECE", "cal/aECE"], - ["sc/AURC", "sc/CovAt5Risk", "sc/RiskAt80Cov"], + ["sc/AURC", "sc/AUGRC", "sc/CovAt5Risk", "sc/RiskAt80Cov"], ], ) @@ -552,6 +554,10 @@ def on_test_epoch_end(self) -> None: "Risk-Coverage curve", self.test_cls_metrics["sc/AURC"].plot()[0], ) + self.logger.experiment.add_figure( + "Generalized Risk-Coverage curve", + self.test_cls_metrics["sc/AUGRC"].plot()[0], + ) if self.post_processing is not None: self.logger.experiment.add_figure( diff --git a/torch_uncertainty/routines/segmentation.py b/torch_uncertainty/routines/segmentation.py index f3ece492..966553d1 100644 --- a/torch_uncertainty/routines/segmentation.py +++ b/torch_uncertainty/routines/segmentation.py @@ -9,6 +9,7 @@ from torchvision.transforms.v2 import functional as F from torch_uncertainty.metrics import ( + AUGRC, AURC, BrierScore, CalibrationError, @@ -108,8 +109,16 @@ def __init__( num_classes=num_classes, ), "sc/AURC": AURC(), + "sc/AUGRC": AUGRC(), }, - compute_groups=False, + compute_groups=[ + ["seg/mAcc"], + ["seg/Brier"], + ["seg/NLL"], + ["seg/pixAcc"], + ["cal/ECE", "cal/aECE"], + ["sc/AURC", "sc/AUGRC"], + ], ) self.val_seg_metrics = seg_metrics.clone(prefix="val/") @@ -222,6 +231,10 @@ def on_test_epoch_end(self) -> None: "Risk-Coverage curve", self.test_sbsmpl_seg_metrics["sc/AURC"].plot()[0], ) + self.logger.experiment.add_figure( + "Generalized Risk-Coverage curve", + self.test_sbsmpl_seg_metrics["sc/AUGRC"].plot()[0], + ) def subsample(self, pred: Tensor, target: Tensor) -> tuple[Tensor, Tensor]: total_size = 
target.size(0) From 18d358376af97033d96d1e4c75a1ae9e68cb0ced Mon Sep 17 00:00:00 2001 From: Olivier Laurent Date: Wed, 28 Aug 2024 15:37:22 +0200 Subject: [PATCH 18/27] :hammer: Rework optim recipes --- .../mnist/configs/lenet_swa.yaml | 2 +- .../mnist/configs/lenet_swag.yaml | 2 +- tests/test_optim_recipes.py | 10 +- torch_uncertainty/optim_recipes.py | 101 ++++++++++++------ 4 files changed, 78 insertions(+), 37 deletions(-) diff --git a/experiments/classification/mnist/configs/lenet_swa.yaml b/experiments/classification/mnist/configs/lenet_swa.yaml index fa3eb77d..2274bdb5 100644 --- a/experiments/classification/mnist/configs/lenet_swa.yaml +++ b/experiments/classification/mnist/configs/lenet_swa.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 5e-4 nesterov: true lr_scheduler: - class_path: torch_uncertainty.optim_recipes.FullSWALR + class_path: torch_uncertainty.optim_recipes.CosineSWALR init_args: milestone: 20 swa_lr: 0.01 diff --git a/experiments/classification/mnist/configs/lenet_swag.yaml b/experiments/classification/mnist/configs/lenet_swag.yaml index 292b49f0..ddff0067 100644 --- a/experiments/classification/mnist/configs/lenet_swag.yaml +++ b/experiments/classification/mnist/configs/lenet_swag.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 5e-4 nesterov: true lr_scheduler: - class_path: torch_uncertainty.optim_recipes.FullSWALR + class_path: torch_uncertainty.optim_recipes.CosineSWALR init_args: milestone: 10 swa_lr: 0.01 diff --git a/tests/test_optim_recipes.py b/tests/test_optim_recipes.py index b6d15863..86f438de 100644 --- a/tests/test_optim_recipes.py +++ b/tests/test_optim_recipes.py @@ -2,12 +2,16 @@ import pytest import torch -from torch_uncertainty.optim_recipes import FullSWALR, get_procedure, optim_abnn +from torch_uncertainty.optim_recipes import ( + CosineSWALR, + get_procedure, + optim_abnn, +) -class TestFullSWALR: +class TestCosineSWALR: def test_full_swa_lr(self): - FullSWALR( + CosineSWALR( torch.optim.SGD(torch.nn.Linear(1, 1).parameters(), lr=1e-3), swa_lr=1, milestone=12, diff --git a/torch_uncertainty/optim_recipes.py b/torch_uncertainty/optim_recipes.py index 5b089430..638aa56b 100644 --- a/torch_uncertainty/optim_recipes.py +++ b/torch_uncertainty/optim_recipes.py @@ -7,7 +7,13 @@ from timm.optim import Lamb from torch import nn, optim from torch.optim import Optimizer -from torch.optim.lr_scheduler import LRScheduler +from torch.optim.lr_scheduler import ( + CosineAnnealingLR, + LinearLR, + LRScheduler, + MultiStepLR, + SequentialLR, +) def optim_abnn( @@ -25,7 +31,7 @@ def optim_abnn( weight_decay=weight_decay, nesterov=nesterov, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[1, 4], gamma=0.1, @@ -44,7 +50,7 @@ def optim_cifar10_resnet18( weight_decay=5e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[25, 50], gamma=0.1, @@ -65,7 +71,7 @@ def optim_cifar10_resnet50( weight_decay=5e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[60, 120, 160], gamma=0.2, @@ -84,7 +90,7 @@ def optim_cifar10_wideresnet( weight_decay=5e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[60, 120, 160], gamma=0.2, @@ -101,7 +107,7 @@ def optim_cifar10_vgg16( lr=0.005, weight_decay=1e-6, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[25, 50], gamma=0.1, @@ -119,7 +125,7 @@ def 
optim_cifar100_resnet18( weight_decay=5e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[25, 50], gamma=0.1, @@ -140,7 +146,7 @@ def optim_cifar100_resnet50( weight_decay=5e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[60, 120, 160], gamma=0.2, @@ -159,7 +165,7 @@ def optim_cifar100_vgg16( weight_decay=1e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[60, 120, 160], gamma=0.2, @@ -183,9 +189,7 @@ def optim_imagenet_resnet50( weight_decay=3.0517578125e-05, nesterov=False, ) - scheduler = optim.lr_scheduler.CosineAnnealingLR( - optimizer, num_epochs, eta_min=end_lr - ) + scheduler = CosineAnnealingLR(optimizer, num_epochs, eta_min=end_lr) return { "optimizer": optimizer, "lr_scheduler": scheduler, @@ -215,18 +219,18 @@ def optim_imagenet_resnet50_a3( optimizer = Lamb(model.parameters(), lr=0.008, weight_decay=0.02) - warmup = optim.lr_scheduler.LinearLR( + warmup = LinearLR( optimizer, start_factor=1e-4, end_factor=1, total_iters=5 * (1281167 // effective_batch_size + 1), ) - cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR( + cosine_scheduler = CosineAnnealingLR( optimizer, eta_min=1e-6, T_max=105 * (1281167 // effective_batch_size + 1), ) - scheduler = optim.lr_scheduler.SequentialLR( + scheduler = SequentialLR( optimizer, schedulers=[warmup, cosine_scheduler], milestones=[5 * (1281167 // effective_batch_size + 1)], @@ -252,7 +256,7 @@ def optim_cifar10_resnet34( weight_decay=1e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[100, 150], gamma=0.1, @@ -270,7 +274,7 @@ def optim_cifar100_resnet34( weight_decay=1e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[100, 150], gamma=0.1, @@ -295,7 +299,7 @@ def optim_tinyimagenet_resnet34( weight_decay=1e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[40, 60], gamma=0.1, @@ -320,7 +324,7 @@ def optim_tinyimagenet_resnet50( weight_decay=1e-4, nesterov=True, ) - scheduler = optim.lr_scheduler.MultiStepLR( + scheduler = MultiStepLR( optimizer, milestones=[40, 60], gamma=0.1, @@ -436,43 +440,76 @@ def get_procedure( return procedure -class CosineAnnealingWarmup(torch.optim.lr_scheduler.SequentialLR): +class WarmupScheduler(torch.SequentialLR): def __init__( self, optimizer: Optimizer, + base_scheduler: type[LRScheduler], warmup_start_factor: float, warmup_epochs: int, - max_epochs: int, - eta_min: float = 0, + scheduler_args: dict[str, float], ) -> None: - """Cosine annealing scheduler with linear warmup. + """Scheduler with linear warmup. Args: - optimizer (Optimizer): The optimizer to be used. + optimizer (Optimizer): The optimizer to be used.* + base_scheduler (type[LRScheduler]): The base scheduler class to use after + the warmup. warmup_start_factor (float): The multiplicative factor to apply to the learning rate at the start of the warmup. warmup_epochs (int): The number of epochs to warmup the learning rate. - max_epochs (int): The total number of epochs. - eta_min (float): The minimum learning rate. + scheduler_args (dict[str, float]): The arguments to pass to the base + scheduler. 
""" - warmup_scheduler = optim.lr_scheduler.LinearLR( + warmup_scheduler = LinearLR( optimizer, start_factor=warmup_start_factor, end_factor=1, total_iters=warmup_epochs, ) - cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=max_epochs - warmup_epochs, eta_min=eta_min + base_scheduler = base_scheduler(optimizer, **scheduler_args) + super().__init__( + optimizer=optimizer, + schedulers=[warmup_scheduler, base_scheduler], + milestones=[warmup_epochs], ) + + +class CosineAnnealingWarmup(WarmupScheduler): + def __init__( + self, + optimizer: Optimizer, + warmup_start_factor: float, + warmup_epochs: int, + max_epochs: int, + eta_min: float = 0, + ) -> None: + """Cosine annealing scheduler with linear warmup. + + Args: + optimizer (Optimizer): The optimizer to be used. + warmup_start_factor (float): The multiplicative factor to apply to + the learning rate at the start of the warmup. + warmup_epochs (int): The number of epochs to warmup the learning + rate. + max_epochs (int): The total number of epochs including warmup. + eta_min (float): The minimum learning rate. + """ super().__init__( optimizer=optimizer, - schedulers=[warmup_scheduler, cosine_scheduler], + base_scheduler=CosineAnnealingLR, + warmup_start_factor=warmup_start_factor, + warmup_epochs=warmup_epochs, + scheduler_args={ + "T_max": max_epochs - warmup_epochs, + "eta_min": eta_min, + }, milestones=[warmup_epochs], ) -class FullSWALR(torch.optim.lr_scheduler.SequentialLR): +class CosineSWALR(torch.SequentialLR): def __init__( self, optimizer: Optimizer, @@ -496,7 +533,7 @@ def __init__( optim_eta_min (float): The minimum learning rate for the first optimizer. anneal_strategy (Literal["cos", "linear"]): The strategy to anneal the learning rate. """ - optim_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optim_scheduler = CosineAnnealingLR( optimizer=optimizer, T_max=milestone, eta_min=optim_eta_min ) swa_scheduler = torch.optim.swa_utils.SWALR( From b66499a0f5e3dde42744bef9320615384223c7ae Mon Sep 17 00:00:00 2001 From: Olivier Date: Wed, 4 Sep 2024 11:06:29 +0200 Subject: [PATCH 19/27] :bug: Small fixes --- auto_tutorials_source/tutorial_corruption.py | 12 ++++++------ torch_uncertainty/optim_recipes.py | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/auto_tutorials_source/tutorial_corruption.py b/auto_tutorials_source/tutorial_corruption.py index 01b6a2d9..faf9633c 100644 --- a/auto_tutorials_source/tutorial_corruption.py +++ b/auto_tutorials_source/tutorial_corruption.py @@ -2,13 +2,13 @@ Corrupting Images with TorchUncertainty to Benchmark Robustness =============================================================== -This tutorial shows the impact of the different corruptions available in the -TorchUncertainty library. These corruptions were first proposed in the paper +This tutorial shows the impact of the different corruption transforms available in the +TorchUncertainty library. These corruption transforms were first proposed in the paper Benchmarking Neural Network Robustness to Common Corruptions and Perturbations by Dan Hendrycks and Thomas Dietterich. For this tutorial, we will only load the corruption transforms available in -torch_uncertainty.transforms.corruptions. We also need to load utilities from +torch_uncertainty.transforms.corruption. We also need to load utilities from torchvision and matplotlib. """ from torchvision.datasets import CIFAR10 @@ -60,7 +60,7 @@ def show_images(transforms): # %% # 1. 
Noise Corruptions # ~~~~~~~~~~~~~~~~~~~~ -from torch_uncertainty.transforms.corruptions import ( +from torch_uncertainty.transforms.corruption import ( GaussianNoise, ShotNoise, ImpulseNoise, @@ -79,7 +79,7 @@ def show_images(transforms): # %% # 2. Blur Corruptions # ~~~~~~~~~~~~~~~~~~~~ -from torch_uncertainty.transforms.corruptions import ( +from torch_uncertainty.transforms.corruption import ( GaussianBlur, GlassBlur, DefocusBlur, @@ -96,7 +96,7 @@ def show_images(transforms): # %% # 3. Other Corruptions # ~~~~~~~~~~~~~~~~~~~~ -from torch_uncertainty.transforms.corruptions import ( +from torch_uncertainty.transforms.corruption import ( JPEGCompression, Pixelate, Frost, diff --git a/torch_uncertainty/optim_recipes.py b/torch_uncertainty/optim_recipes.py index 638aa56b..82b147c8 100644 --- a/torch_uncertainty/optim_recipes.py +++ b/torch_uncertainty/optim_recipes.py @@ -440,7 +440,7 @@ def get_procedure( return procedure -class WarmupScheduler(torch.SequentialLR): +class WarmupScheduler(SequentialLR): def __init__( self, optimizer: Optimizer, @@ -505,11 +505,10 @@ def __init__( "T_max": max_epochs - warmup_epochs, "eta_min": eta_min, }, - milestones=[warmup_epochs], ) -class CosineSWALR(torch.SequentialLR): +class CosineSWALR(SequentialLR): def __init__( self, optimizer: Optimizer, From 5a393e929b6078f2bfcaba3cc4fcf25f808db269 Mon Sep 17 00:00:00 2001 From: alafage Date: Wed, 4 Sep 2024 11:45:49 +0200 Subject: [PATCH 20/27] :bug: Fix bad ELBOLoss handling in RegressionRoutine #110 - add relevant tests to check the behavior. --- tests/losses/test_bayesian.py | 28 ++++++++++++++++++++---- torch_uncertainty/routines/regression.py | 10 +++++++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/tests/losses/test_bayesian.py b/tests/losses/test_bayesian.py index 4135e3c8..9e43c1f3 100644 --- a/tests/losses/test_bayesian.py +++ b/tests/losses/test_bayesian.py @@ -1,11 +1,10 @@ import pytest import torch -from torch import nn +from torch import nn, optim from torch_uncertainty.layers.bayesian import BayesLinear -from torch_uncertainty.losses import ( - ELBOLoss, -) +from torch_uncertainty.losses import ELBOLoss +from torch_uncertainty.routines import RegressionRoutine class TestELBOLoss: @@ -24,6 +23,27 @@ def test_main(self): loss = ELBOLoss(model, criterion, kl_weight=1e-5, num_samples=1) loss(model(torch.randn(1, 1)), torch.randn(1, 1)) + def test_training_step(self): + model = BayesLinear(10, 4) + criterion = nn.MSELoss() + loss = ELBOLoss(model, criterion, kl_weight=1 / 50000, num_samples=3) + + routine = RegressionRoutine( + probabilistic=False, + output_dim=4, + model=model, + loss=loss, + optim_recipe=optim.Adam( + model.parameters(), + lr=5e-4, + weight_decay=0, + ), + ) + + inputs = torch.randn(1, 10) + targets = torch.randn(1, 4) + routine.training_step((inputs, targets), 0) + def test_failures(self): model = BayesLinear(1, 1) criterion = nn.BCEWithLogitsLoss() diff --git a/torch_uncertainty/routines/regression.py b/torch_uncertainty/routines/regression.py index b61d3ee0..b0fdc0f1 100644 --- a/torch_uncertainty/routines/regression.py +++ b/torch_uncertainty/routines/regression.py @@ -11,6 +11,7 @@ from torch.optim import Optimizer from torchmetrics import MeanAbsoluteError, MeanSquaredError, MetricCollection +from torch_uncertainty.losses import ELBOLoss from torch_uncertainty.metrics import ( DistributionNLL, ) @@ -154,12 +155,17 @@ def training_step( self, batch: tuple[Tensor, Tensor], batch_idx: int ) -> STEP_OUTPUT: inputs, targets = self.format_batch_fn(batch) - 
dists = self.model(inputs) + print(inputs.shape, targets.shape) if self.one_dim_regression: targets = targets.unsqueeze(-1) - loss = self.loss(dists, targets) + if isinstance(self.loss, ELBOLoss): + loss = self.loss(inputs, targets) + else: + dists = self.model(inputs) + loss = self.loss(dists, targets) + if self.needs_step_update: self.model.update_wrapper(self.current_epoch) self.log("train_loss", loss) From 1bb6dbcd76b96986ebd718f4fab0d8c11897c6a2 Mon Sep 17 00:00:00 2001 From: alafage Date: Wed, 4 Sep 2024 11:48:03 +0200 Subject: [PATCH 21/27] :books: Small update in tutorials and fix CovAtxRisk import --- auto_tutorials_source/tutorial_bayesian.py | 3 ++- auto_tutorials_source/tutorial_corruption.py | 1 + auto_tutorials_source/tutorial_der_cubic.py | 1 + auto_tutorials_source/tutorial_evidential_classification.py | 1 + auto_tutorials_source/tutorial_from_de_to_pe.py | 5 +++-- auto_tutorials_source/tutorial_mc_batch_norm.py | 1 + auto_tutorials_source/tutorial_mc_dropout.py | 2 +- auto_tutorials_source/tutorial_scaler.py | 1 + torch_uncertainty/metrics/__init__.py | 1 + torch_uncertainty/metrics/classification/risk_coverage.py | 2 +- 10 files changed, 13 insertions(+), 5 deletions(-) diff --git a/auto_tutorials_source/tutorial_bayesian.py b/auto_tutorials_source/tutorial_bayesian.py index cc4e830a..939e83c1 100644 --- a/auto_tutorials_source/tutorial_bayesian.py +++ b/auto_tutorials_source/tutorial_bayesian.py @@ -36,6 +36,7 @@ We will also need to define an optimizer using torch.optim and Pytorch's neural network utils from torch.nn. """ +# %% from pathlib import Path from lightning.pytorch import Trainer @@ -93,7 +94,7 @@ def optim_lenet(model: nn.Module): loss = ELBOLoss( model=model, inner_loss=nn.CrossEntropyLoss(), - kl_weight=1 / 50000, + kl_weight=1 / 10000, num_samples=3, ) diff --git a/auto_tutorials_source/tutorial_corruption.py b/auto_tutorials_source/tutorial_corruption.py index faf9633c..4eecd6ff 100644 --- a/auto_tutorials_source/tutorial_corruption.py +++ b/auto_tutorials_source/tutorial_corruption.py @@ -11,6 +11,7 @@ torch_uncertainty.transforms.corruption. We also need to load utilities from torchvision and matplotlib. """ +# %% from torchvision.datasets import CIFAR10 from torchvision.transforms import Compose, ToTensor, Resize diff --git a/auto_tutorials_source/tutorial_der_cubic.py b/auto_tutorials_source/tutorial_der_cubic.py index 96d72375..a30b49d5 100644 --- a/auto_tutorials_source/tutorial_der_cubic.py +++ b/auto_tutorials_source/tutorial_der_cubic.py @@ -29,6 +29,7 @@ We also need to define an optimizer using torch.optim and the neural network utils within torch.nn. """ +# %% import torch from lightning.pytorch import Trainer from lightning import LightningDataModule diff --git a/auto_tutorials_source/tutorial_evidential_classification.py b/auto_tutorials_source/tutorial_evidential_classification.py index dccda568..cd124f5d 100644 --- a/auto_tutorials_source/tutorial_evidential_classification.py +++ b/auto_tutorials_source/tutorial_evidential_classification.py @@ -24,6 +24,7 @@ We also need to define an optimizer using torch.optim, the neural network utils within torch.nn. """ +# %% from pathlib import Path import torch diff --git a/auto_tutorials_source/tutorial_from_de_to_pe.py b/auto_tutorials_source/tutorial_from_de_to_pe.py index 24933566..55de3735 100644 --- a/auto_tutorials_source/tutorial_from_de_to_pe.py +++ b/auto_tutorials_source/tutorial_from_de_to_pe.py @@ -30,6 +30,7 @@ The dataset is automatically downloaded using torchvision. 
We then visualize a few images to see a bit what we are working with. """ # Create the transforms for the images +# %% import torch import torchvision.transforms as T @@ -241,7 +242,7 @@ def optim_recipe(model, lr_mult: float = 1.0): # We have put the pre-trained models on Hugging Face that you can download with the utility function # "hf_hub_download" imported just below. These models are trained for 75 epochs and are therefore not # comparable to the all the other models trained in this notebook. The pretrained models can be seen -# `here `_ and TorchUncertainty's are `here `_. +# on `HuggingFace `_ and TorchUncertainty's are `here `_. from torch_uncertainty.utils.hub import hf_hub_download @@ -297,7 +298,7 @@ def optim_recipe(model, lr_mult: float = 1.0): # This modification is particularly useful when the ensemble size is large, as it is often the case in practice. # # We will need to update the model and replace the layers with their Packed equivalents. You can find the -# documentation of the Packed-Linear layer `here `_, +# documentation of the Packed-Linear layer using this `link `_, # and the Packed-Conv2D, `here `_. import torch diff --git a/auto_tutorials_source/tutorial_mc_batch_norm.py b/auto_tutorials_source/tutorial_mc_batch_norm.py index bbf495c8..886ed9cf 100644 --- a/auto_tutorials_source/tutorial_mc_batch_norm.py +++ b/auto_tutorials_source/tutorial_mc_batch_norm.py @@ -22,6 +22,7 @@ We also need import the neural network utils within `torch.nn`. """ +# %% from pathlib import Path from lightning import Trainer diff --git a/auto_tutorials_source/tutorial_mc_dropout.py b/auto_tutorials_source/tutorial_mc_dropout.py index 4bd8373e..b8f01fb0 100644 --- a/auto_tutorials_source/tutorial_mc_dropout.py +++ b/auto_tutorials_source/tutorial_mc_dropout.py @@ -28,7 +28,7 @@ We also need import the neural network utils within `torch.nn`. """ - +# %% from pathlib import Path from torch_uncertainty.utils import TUTrainer diff --git a/auto_tutorials_source/tutorial_scaler.py b/auto_tutorials_source/tutorial_scaler.py index fdbfc469..ceaaa036 100644 --- a/auto_tutorials_source/tutorial_scaler.py +++ b/auto_tutorials_source/tutorial_scaler.py @@ -25,6 +25,7 @@ If you use the classification routine, the plots will be automatically available in the tensorboard logs if you use the `log_plots` flag. """ +# %% from torch_uncertainty.datamodules import CIFAR100DataModule from torch_uncertainty.metrics import CalibrationError from torch_uncertainty.models.resnet import resnet diff --git a/torch_uncertainty/metrics/__init__.py b/torch_uncertainty/metrics/__init__.py index 93458695..52e55366 100644 --- a/torch_uncertainty/metrics/__init__.py +++ b/torch_uncertainty/metrics/__init__.py @@ -9,6 +9,7 @@ CalibrationError, CategoricalNLL, CovAt5Risk, + CovAtxRisk, Disagreement, Entropy, GroupingLoss, diff --git a/torch_uncertainty/metrics/classification/risk_coverage.py b/torch_uncertainty/metrics/classification/risk_coverage.py index c4a434c2..dced8409 100644 --- a/torch_uncertainty/metrics/classification/risk_coverage.py +++ b/torch_uncertainty/metrics/classification/risk_coverage.py @@ -285,7 +285,7 @@ class CovAtxRisk(Metric): errors: list[Tensor] def __init__(self, risk_threshold: float, **kwargs) -> None: - r"""`Coverage at x Risk`_. + r"""Coverage at x Risk. 
If there are multiple coverage values corresponding to the given risk, i.e., the risk(coverage) is not monotonic, the coverage at x risk is From 1856ae4a88d0ba1d3273b6aa59f619494ef8af0b Mon Sep 17 00:00:00 2001 From: alafage Date: Wed, 4 Sep 2024 12:03:16 +0200 Subject: [PATCH 22/27] :construction: Try fixing dependency issue --- .github/workflows/build-docs.yml | 2 +- .github/workflows/run-tests.yml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index f28dcb77..084adb66 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -40,7 +40,7 @@ jobs: - name: Install dependencies run: | - python3 -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu + python3 -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu python3 -m pip install .[image,dev,docs] - name: Sphinx build diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 86564ecc..6c6ccc62 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -64,7 +64,7 @@ jobs: - name: Install dependencies if: steps.changed-files-specific.outputs.only_changed != 'true' run: | - python3 -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu + python3 -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu python3 -m pip install .[all] - name: Check style & format diff --git a/pyproject.toml b/pyproject.toml index 94e8415e..02dd3598 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ classifiers = [ ] dependencies = [ "timm", - "lightning[pytorch-extra]", + "lightning[pytorch-extra]>=2.0", "torchvision>=0.16", "tensorboard", "einops", From 28d5a211bf27f4e5661978e7177c2297d295fcaa Mon Sep 17 00:00:00 2001 From: alafage Date: Wed, 4 Sep 2024 12:05:37 +0200 Subject: [PATCH 23/27] :zap: Bump package version --- docs/source/conf.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index a9a685d5..fad0cd1c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,7 +15,7 @@ f"{datetime.now().year!s}, Adrien Lafage and Olivier Laurent" ) author = "Adrien Lafage and Olivier Laurent" -release = "0.2.1.post0" +release = "0.2.1.post1" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/pyproject.toml b/pyproject.toml index 02dd3598..8122c12f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi" [project] name = "torch_uncertainty" -version = "0.2.1.post0" +version = "0.2.1.post1" authors = [ { name = "ENSTA U2IS", email = "olivier.laurent@ensta-paris.fr" }, { name = "Adrien Lafage", email = "adrienlafage@outlook.com" }, From 6be9343c7ca144ec97f01d9f47fc9867525a2618 Mon Sep 17 00:00:00 2001 From: alafage Date: Wed, 4 Sep 2024 14:50:30 +0200 Subject: [PATCH 24/27] :white_check_mark: Improve coverage prior to merging --- tests/losses/test_classification.py | 4 ++-- tests/metrics/classification/test_fpr95.py | 8 ++++++++ tests/metrics/classification/test_risk_coverage.py | 6 ++++++ tests/test_optim_recipes.py | 12 ++++++++++++ torch_uncertainty/metrics/classification/fpr.py | 1 + 5 files changed, 29 insertions(+), 2 deletions(-) diff --git 
diff --git a/tests/losses/test_classification.py b/tests/losses/test_classification.py
index 469e732a..f5bb2400 100644
--- a/tests/losses/test_classification.py
+++ b/tests/losses/test_classification.py
@@ -62,7 +62,7 @@ def test_main(self):
         loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]))
         loss = ConfidencePenaltyLoss(reg_weight=1e-2)
         loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]))
-        loss = ConfidencePenaltyLoss(reg_weight=1e-2, reduction="none")
+        loss = ConfidencePenaltyLoss(reg_weight=1e-2, reduction=None)
         loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]))

     def test_failures(self):
@@ -92,7 +92,7 @@ def test_main(self):
         loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]))
         loss = ConflictualLoss(reg_weight=1e-2)
         loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]))
-        loss = ConflictualLoss(reg_weight=1e-2, reduction="none")
+        loss = ConflictualLoss(reg_weight=1e-2, reduction=None)
         loss(torch.tensor([[0.0, 0.0]]), torch.tensor([0]))

     def test_failures(self):
diff --git a/tests/metrics/classification/test_fpr95.py b/tests/metrics/classification/test_fpr95.py
index 99bb0dc3..3c10fd01 100644
--- a/tests/metrics/classification/test_fpr95.py
+++ b/tests/metrics/classification/test_fpr95.py
@@ -32,6 +32,14 @@ def test_compute_one(self):
         res = metric.compute()
         assert res == 1

+    def test_compute_nan(self):
+        metric = FPR95(pos_label=1)
+        metric.update(
+            torch.as_tensor([0.1] * 50 + [0.4] * 50), torch.as_tensor([0] * 100)
+        )
+        res = metric.compute()
+        assert torch.isnan(res).all()
+
     def test_error(self):
         with pytest.raises(ValueError):
             FPRx(recall_level=1.2, pos_label=1)
diff --git a/tests/metrics/classification/test_risk_coverage.py b/tests/metrics/classification/test_risk_coverage.py
index 868443d6..63e82f43 100644
--- a/tests/metrics/classification/test_risk_coverage.py
+++ b/tests/metrics/classification/test_risk_coverage.py
@@ -40,6 +40,12 @@ def test_compute_multiclass(self) -> None:
         value = (0 * 0.4 + 0.25 * 0.2 / 2 + 0.25 * 0.2 + 0.15 * 0.2 / 2) / 0.8
         assert metric(probs, targets).item() == pytest.approx(value)

+    def test_compute_nan(self) -> None:
+        probs = torch.as_tensor([[0.1, 0.9]])
+        targets = torch.as_tensor([1]).long()
+        metric = AURC()
+        assert torch.isnan(metric(probs, targets)).all()
+
     def test_plot(self) -> None:
         scores = torch.as_tensor([0.2, 0.1, 0.5, 0.3, 0.4])
         values = torch.as_tensor([0.1, 0.2, 0.3, 0.4, 0.5])
diff --git a/tests/test_optim_recipes.py b/tests/test_optim_recipes.py
index 86f438de..ab4b455d 100644
--- a/tests/test_optim_recipes.py
+++ b/tests/test_optim_recipes.py
@@ -3,12 +3,24 @@
 import torch

 from torch_uncertainty.optim_recipes import (
+    CosineAnnealingWarmup,
     CosineSWALR,
     get_procedure,
     optim_abnn,
 )

+class TestCosineAnnealingWarmup:
+    def test_full_cosine_annealing_warmup(self):
+        CosineAnnealingWarmup(
+            torch.optim.SGD(torch.nn.Linear(1, 1).parameters(), lr=1e-3),
+            warmup_start_factor=0.1,
+            warmup_epochs=5,
+            max_epochs=100,
+            eta_min=1e-5,
+        )
+
+
 class TestCosineSWALR:
     def test_full_swa_lr(self):
         CosineSWALR(
diff --git a/torch_uncertainty/metrics/classification/fpr.py b/torch_uncertainty/metrics/classification/fpr.py
index eb6bf66b..214daded 100644
--- a/torch_uncertainty/metrics/classification/fpr.py
+++ b/torch_uncertainty/metrics/classification/fpr.py
@@ -89,6 +89,7 @@ def compute(self) -> Tensor:
             1 + threshold_idxs - true_pos
         )  # add one because of zero-based indexing

+        # check that there is at least one OOD example
         if true_pos[-1] == 0:
             return torch.tensor([torch.nan], device=self.device)

From 4e3a0557d346f89661c7038321f7665516412b48 Mon Sep 17 00:00:00 2001
From: alafage
Date: Wed, 4 Sep 2024 15:03:59 +0200
Subject: [PATCH 25/27] :wrench: Bump package version

---
 docs/source/conf.py | 2 +-
 pyproject.toml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index fad0cd1c..418b398a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -15,7 +15,7 @@
     f"{datetime.now().year!s}, Adrien Lafage and Olivier Laurent"
 )
 author = "Adrien Lafage and Olivier Laurent"
-release = "0.2.1.post1"
+release = "0.2.2"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/pyproject.toml b/pyproject.toml
index 8122c12f..60ee19af 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

 [project]
 name = "torch_uncertainty"
-version = "0.2.1.post1"
+version = "0.2.2"
 authors = [
     { name = "ENSTA U2IS", email = "olivier.laurent@ensta-paris.fr" },
     { name = "Adrien Lafage", email = "adrienlafage@outlook.com" },

From 2dee70bd7ab6b9492d55a55ce97d2195b9528c0e Mon Sep 17 00:00:00 2001
From: alafage
Date: Wed, 4 Sep 2024 15:07:48 +0200
Subject: [PATCH 26/27] :ok_hand: Remove print() in RegressionRoutine

---
 torch_uncertainty/routines/regression.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch_uncertainty/routines/regression.py b/torch_uncertainty/routines/regression.py
index b0fdc0f1..b118590a 100644
--- a/torch_uncertainty/routines/regression.py
+++ b/torch_uncertainty/routines/regression.py
@@ -155,7 +155,6 @@ def training_step(
         self, batch: tuple[Tensor, Tensor], batch_idx: int
     ) -> STEP_OUTPUT:
         inputs, targets = self.format_batch_fn(batch)
-        print(inputs.shape, targets.shape)

         if self.one_dim_regression:
             targets = targets.unsqueeze(-1)

From d47f0f37520d798548f9965c32adf9b18e3f4ce2 Mon Sep 17 00:00:00 2001
From: alafage
Date: Wed, 4 Sep 2024 15:16:33 +0200
Subject: [PATCH 27/27] :books: Add ConflictualLoss reference in documentation

---
 docs/source/references.rst                 | 13 +++++++++++++
 torch_uncertainty/losses/classification.py |  6 +++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/docs/source/references.rst b/docs/source/references.rst
index 1165c6db..490e080a 100644
--- a/docs/source/references.rst
+++ b/docs/source/references.rst
@@ -243,6 +243,19 @@ For Laplace Approximation, consider citing:
 * Authors: *Erik Daxberger, Agustinus Kristiadi, Alexander Immer, Runa Eschenhagen, Matthias Bauer, and Philipp Hennig*
 * Paper: `NeurIPS 2021 `__.
+
+Losses
+------
+
+Conflictual Loss
+^^^^^^^^^^^^^^^^
+
+For the conflictual loss, consider citing:
+
+**On the Calibration of Epistemic Uncertainty: Principles, Paradoxes and Conflictual Loss**
+
+* Authors: *Mohammed Fellaji, Frédéric Pennerath, Brieuc Conan-Guez, and Miguel Couceiro*
+* Paper: `ArXiv 2024 `__.
diff --git a/torch_uncertainty/losses/classification.py b/torch_uncertainty/losses/classification.py
--- a/torch_uncertainty/losses/classification.py
+++ b/torch_uncertainty/losses/classification.py
     ) -> None:
-        """The Conflictual Loss.
+        r"""The Conflictual Loss.

         Args:
             reg_weight (float, optional): The weight of the regularization term.
@@ -243,8 +243,8 @@ def __init__(
                 output:``'none'`` | ``'mean'`` | ``'sum'``.

         Reference:
-            Mohammed Fellaji et al. On the Calibration of Epistemic Uncertainty:
-            Principles, Paradoxes and Conflictual Loss. https://arxiv.org/pdf/2407.12211
+            `Mohammed Fellaji et al. On the Calibration of Epistemic Uncertainty:
+            Principles, Paradoxes and Conflictual Loss `_.
         """
         super().__init__()
         if reduction is None: