diff --git a/docs/source/hooks.rst b/docs/source/hooks.rst
index f75b45cd..6d80f286 100644
--- a/docs/source/hooks.rst
+++ b/docs/source/hooks.rst
@@ -29,8 +29,8 @@ Hooks can be used to modify the learning process
 
-GPU Hooks
---------- 
+Distributed
+-----------
 
 Hooks can be used to distribute a model over GPUs
diff --git a/hf_models/README.md b/hf_models/README.md
new file mode 100644
index 00000000..ca8d1324
--- /dev/null
+++ b/hf_models/README.md
@@ -0,0 +1 @@
+This directory contains HuggingFace models wrapped for use with experimaestro IR.
diff --git a/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py b/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py
index 82118153..033287d9 100644
--- a/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py
+++ b/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py
@@ -1,4 +1,4 @@
-# See documentation on http://experimaestro.github.io/datamaestro/
+# See documentation on https://datamaestro.readthedocs.io
 
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro_text.data.ir.huggingface import HuggingFacePairwiseSampleDataset
diff --git a/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py b/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py
index bae5c670..fbbb076a 100644
--- a/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py
+++ b/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py
@@ -1,7 +1,6 @@
-# See documentation on http://experimaestro.github.io/datamaestro/
+# See documentation on https://datamaestro.readthedocs.io
 from hashlib import md5
 from datamaestro.definitions import dataset
-from datamaestro.data import File
 from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
 from xpmir.letor.distillation.samplers import PairwiseDistillationSamplesTSV
@@ -9,7 +8,8 @@
 
 @filedownloader(
     "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv",
-    "https://zenodo.org/record/4068216/files/bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
+    "https://zenodo.org/record/4068216/files/"
+    "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
     checker=HashCheck("4d99696386f96a7f1631076bcc53ac3c", md5),
 )
 @dataset(
@@ -19,7 +19,9 @@ def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
     """Training files without the text content, instead using the ids from MSMARCO
 
-    The teacher files (using the data from "Train Triples Small" with ~40 million triples) with the format pos_score neg_score query_id pos_passage_id neg_passage_id (with tab separation)
+    The teacher files (using the data from "Train Triples Small" with ~40
+    million triples) with the format pos_score neg_score query_id pos_passage_id
+    neg_passage_id (with tab separation)
     """
     return {
         "with_docid": True,
@@ -30,7 +32,8 @@
 
 @filedownloader(
     "bertbase_cat_msmarcopassage_train_scores_ids.tsv",
-    "https://zenodo.org/record/4068216/files/bertbase_cat_msmarcopassage_train_scores_ids.tsv?download=1",
+    "https://zenodo.org/record/4068216/files/"
+    "bertbase_cat_msmarcopassage_train_scores_ids.tsv?download=1",
     checker=HashCheck("a2575af08a19b47c2041e67c9efcd917", md5),
 )
 @dataset(
@@ -40,7 +43,9 @@ def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
 def msmarco_bert_teacher(bertbase_cat_msmarcopassage_train_scores_ids):
     """Training files without the text content, instead using the ids from MSMARCO
 
-    The teacher files (using the data from "Train Triples Small" with ~40 million triples) with the format pos_score neg_score query_id pos_passage_id neg_passage_id (with tab separation)
+    The teacher files (using the data from "Train Triples Small" with ~40
+    million triples) with the format pos_score neg_score query_id pos_passage_id
+    neg_passage_id (with tab separation)
     """
     return {
         "with_docid": True,
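
Not part of the patch: the teacher files documented above are plain TSV with the columns pos_score, neg_score, query_id, pos_passage_id, neg_passage_id. A minimal reading sketch, assuming only that layout — the helper name and path are placeholders:

import csv

def read_teacher_scores(path: str):
    """Yield (pos_score, neg_score, query_id, pos_id, neg_id) tuples."""
    with open(path, newline="") as fp:
        for pos_score, neg_score, query_id, pos_id, neg_id in csv.reader(
            fp, delimiter="\t"
        ):
            yield float(pos_score), float(neg_score), query_id, pos_id, neg_id
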
diff --git a/src/xpmir/evaluation.py b/src/xpmir/evaluation.py
index bf75f7b3..8d3808cd 100644
--- a/src/xpmir/evaluation.py
+++ b/src/xpmir/evaluation.py
@@ -1,14 +1,13 @@
 import sys
 from itertools import chain
+import pandas as pd
 from pathlib import Path
 from typing import DefaultDict, Dict, List, Protocol, Union, Tuple
 import ir_measures
 from datamaestro_text.data.ir import Adhoc, AdhocAssessments, AdhocDocuments
 from experimaestro import Task, Param, pathgenerator, Annotated, tags, TagDict
-from datamaestro_text.data.ir.trec import (
-    TrecAdhocRun,
-    TrecAdhocResults,
-)
+from datamaestro_text.data.ir import AdhocResults
+from datamaestro_text.data.ir.trec import TrecAdhocRun, TrecAdhocResults
 from xpmir.measures import Measure
 import xpmir.measures as m
 from xpmir.metrics import evaluator
@@ -137,7 +136,7 @@ class Evaluations:
     dataset: Adhoc
     measures: List[Measure]
     results: List[BaseEvaluation]
-    per_tag: Dict[TagDict, TrecAdhocResults]
+    per_tag: Dict[TagDict, AdhocResults]
 
     def __init__(self, dataset: Adhoc, measures: List[Measure]):
         self.dataset = dataset
@@ -147,12 +146,12 @@ def __init__(self, dataset: Adhoc, measures: List[Measure]):
     def evaluate_retriever(
         self, retriever: Union[Retriever, RetrieverFactory], launcher: Launcher = None
-    ) -> Tuple[Retriever, TrecAdhocResults]:
+    ) -> Tuple[Retriever, AdhocResults]:
         """Evaluates a retriever"""
         if not isinstance(retriever, Retriever):
             retriever = retriever(self.dataset.documents)
 
-        evaluation: TrecAdhocResults = Evaluate(
+        evaluation: AdhocResults = Evaluate(
             retriever=retriever, measures=self.measures, dataset=self.dataset
         ).submit(launcher=launcher)
         self.add(evaluation)
@@ -168,6 +167,9 @@ def add(self, *results: BaseEvaluation):
         self.results.extend(results)
 
     def output_results_per_tag(self, file=sys.stdout):
+        return self.to_dataframe().to_markdown(file)
+
+    def to_dataframe(self) -> pd.DataFrame:
         # Get all the tags
         tags = list(
             set(chain(*[tags_dict.keys() for tags_dict in self.per_tags.keys()]))
         )
@@ -183,24 +185,27 @@ def output_results_per_tag(self, file=sys.stdout):
             to_process.append((tags_dict, results))
 
         # Table header
-        file.write(f"| {' | '.join(tags)}")
-        file.write(f" | {' | '.join(metrics)} |")
-        file.write("\n")
-        file.write(f"| {' | '.join('---' for _ in tags)}")
-        file.write(f" | {' | '.join('---' for _ in metrics)} |")
-        file.write("\n")
+        columns = []
+        for tag in tags:
+            columns.append(["tag", tag])
+        for metric in metrics:
+            columns.append(["metric", metric])
 
         # Output the results
+        rows = []
         for tags_dict, results in to_process:
+            row = []
             # tag values
             for k in tags:
-                file.write(f'| {str(tags_dict.get(k, ""))}')
+                row.append(str(tags_dict.get(k, "")))
 
             # metric values
             for metric in metrics:
-                file.write(f' | {str(results.get(metric, ""))}')
-            file.write(" |\n")
-        file.write("\n")
+                row.append(results.get(metric, ""))
+            rows.append(row)
+
+        index = pd.MultiIndex.from_tuples(columns)
+
+        return pd.DataFrame(rows, columns=index)
 
 
 class EvaluationsCollection:
@@ -212,7 +217,7 @@ class EvaluationsCollection:
 
     collection: Dict[str, Evaluations]
 
-    per_model: Dict[str, List[Tuple[str, TrecAdhocResults]]]
+    per_model: Dict[str, List[Tuple[str, AdhocResults]]]
     """List of results per model"""
 
    def __init__(self, **collection: Evaluations):
@@ -227,7 +232,7 @@ def evaluate_retriever(
         overwrite: bool = False,
     ):
         """Evaluate a retriever for all the evaluations in this collection (the
-        tasks are submitted to experimaestro the scheduler)"""
+        tasks are submitted to the experimaestro scheduler)"""
         if model_id is not None and not overwrite:
             assert (
                 model_id not in self.per_model
@@ -258,6 +263,15 @@ def output_results(self, file=sys.stdout):
                 file=file,
             )
 
+    def to_dataframe(self) -> pd.DataFrame:
+        """Returns a Pandas dataframe"""
+        all_data = []
+        for key, evaluations in self.collection.items():
+            data = evaluations.to_dataframe()
+            data["dataset"] = key
+            all_data.append(data)
+        return pd.concat(all_data, ignore_index=True)
+
     def output_results_per_tag(self, file=sys.stdout):
         """Outputs the results for each collection, based on the retriever
         tags to build the table
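
Not part of the patch — a sketch of the frame shape that Evaluations.to_dataframe builds above: tag columns grouped under a "tag" level and metric columns under a "metric" level of a column MultiIndex. The tag and metric names below are made up, and DataFrame.to_markdown needs the tabulate package.

import pandas as pd

columns = pd.MultiIndex.from_tuples(
    [("tag", "model"), ("metric", "RR@10"), ("metric", "nDCG@10")]
)
frame = pd.DataFrame([["monobert", 0.35, 0.48]], columns=columns)
# output_results_per_tag now writes this same markdown rendering to its file
print(frame.to_markdown())
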
diff --git a/src/xpmir/letor/trainers/pairwise.py b/src/xpmir/letor/trainers/pairwise.py
index 1580b67a..d9e643e6 100644
--- a/src/xpmir/letor/trainers/pairwise.py
+++ b/src/xpmir/letor/trainers/pairwise.py
@@ -73,6 +73,11 @@ class LogSoftmaxLoss(PairwiseLoss):
 
     NAME = "ranknet"
 
+    def __initialize__(self, ranker: LearnableScorer):
+        assert (
+            ranker.outputType != ScorerOutputType.PROBABILITY
+        ), "Probability outputs are not handled"
+
     def compute(self, scores: torch.Tensor, info: TrainerContext):
         return -F.logsigmoid(scores[:, 0] - scores[:, 1]).mean()
 
@@ -81,9 +86,12 @@ def compute(self, scores: torch.Tensor, info: TrainerContext):
 
 
 class HingeLoss(PairwiseLoss):
+    """Hinge loss"""
+
     NAME = "hinge"
 
     margin: Param[float] = 1.0
+    """The margin for the hinge loss"""
 
     def compute(self, rel_scores_by_record, info: TrainerContext):
         return F.relu(
@@ -92,29 +100,25 @@ def compute(self, rel_scores_by_record, info: TrainerContext):
 
 
 class BCEWithLogLoss(nn.Module):
-    def __call__(self, log_probs, info: TrainerContext):
-        assert info.metrics is not None, "No metrics object in context"
+    """Custom cross-entropy loss when outputs are log probabilities"""
 
+    def __call__(self, log_probs: torch.Tensor, targets: torch.Tensor):
         # Assumes target is a two column matrix (rel. / not rel.)
-        rel_cost, nrel_cost = (
-            -log_probs[:, 0].mean(),
-            -(1.0 - log_probs[:, 1].exp()).log().mean(),
-        )
-        info.metrics.add(
-            ScalarMetric("pairwise-pce-rel", rel_cost.item(), len(log_probs))
-        )
-        info.metrics.add(
-            ScalarMetric("pairwise-pce-nrel", nrel_cost.item(), len(log_probs))
+
+        loss = (
+            -log_probs[targets > 0].sum()
+            - (1.0 - log_probs[targets == 0].exp()).log().sum()
         )
-        return (rel_cost + nrel_cost) / 2
+
+        return loss / log_probs.numel()
 
 
 class PointwiseCrossEntropyLoss(PairwiseLoss):
-    """Regular PCE (>0 for relevant, 0 otherwise)
-    Uses the ranker output type:
+    """Point-wise cross-entropy loss
+
+    This loss adapts to the ranker output type:
 
     - If real, uses a BCEWithLogitsLoss (sigmoid transformation)
     - If probability, uses the BCELoss
-    - If log probability, uses a custom BCE loss
+    - If log probability, uses a BCEWithLogLoss
     """
 
     NAME = "pointwise-cross-entropy"
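
Not part of the patch — a numerical check of the BCEWithLogLoss rewrite above: with log-probability outputs, a relevant entry costs -log(p) and a non-relevant one -log(1 - p), so the averaged loss must agree with the standard binary cross-entropy on the probabilities. Values below are made up.

import torch
import torch.nn.functional as F

# Two pairs; column 0 holds the relevant score, column 1 the non-relevant one
log_probs = torch.tensor([[0.9, 0.3], [0.8, 0.1]]).log()
targets = torch.tensor([[1.0, 0.0], [1.0, 0.0]])

loss = (
    -log_probs[targets > 0].sum()
    - (1.0 - log_probs[targets == 0].exp()).log().sum()
) / log_probs.numel()

# Same value through torch's BCE applied to the probabilities
assert torch.isclose(loss, F.binary_cross_entropy(log_probs.exp(), targets))
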
diff --git a/src/xpmir/text/huggingface.py b/src/xpmir/text/huggingface.py
index d2aedeab..91c07e7e 100644
--- a/src/xpmir/text/huggingface.py
+++ b/src/xpmir/text/huggingface.py
@@ -325,11 +325,14 @@ class DualTransformerEncoder(BaseTransformer, DualTextEncoder):
     def forward(self, texts: List[Tuple[str, str]]):
         tokenized = self.batch_tokenize(texts, maxlen=self.maxlen, mask=True)
 
         with torch.set_grad_enabled(torch.is_grad_enabled() and self.trainable):
+            kwargs = {}
+            if tokenized.token_type_ids is not None:
+                kwargs["token_type_ids"] = tokenized.token_type_ids.to(self.device)
+
             y = self.model(
-                tokenized.ids,
-                token_type_ids=tokenized.token_type_ids.to(self.device),
-                attention_mask=tokenized.mask.to(self.device),
+                tokenized.ids, attention_mask=tokenized.mask.to(self.device), **kwargs
             )
 
         # Assumes that [CLS] is the first token
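
Not part of the patch — why forward above only passes token_type_ids when the tokenizer produced them: some HuggingFace models take no such argument. A small check with the transformers library; the model names are just examples.

from transformers import AutoTokenizer

for name in ("bert-base-uncased", "distilbert-base-uncased"):
    tokenizer = AutoTokenizer.from_pretrained(name)
    encoded = tokenizer("a query", "a document")
    # BERT produces token_type_ids, DistilBERT does not
    print(name, "token_type_ids" in encoded)
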