diff --git a/docs/source/hooks.rst b/docs/source/hooks.rst
index f75b45cd..6d80f286 100644
--- a/docs/source/hooks.rst
+++ b/docs/source/hooks.rst
@@ -29,8 +29,8 @@ Hooks can be used to modify the learning process
 
-GPU Hooks
---------- 
+Distributed
+-----------
 
 Hooks can be used to distribute a model over GPUs
diff --git a/hf_models/README.md b/hf_models/README.md
new file mode 100644
index 00000000..ca8d1324
--- /dev/null
+++ b/hf_models/README.md
@@ -0,0 +1 @@
+This directory contains HuggingFace models wrapped for use with experimaestro IR.
diff --git a/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py b/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py
index 82118153..033287d9 100644
--- a/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py
+++ b/src/xpmir/dm/config/co/huggingface/datasets/sentence-transformers/msmarco-hard-negatives.py
@@ -1,4 +1,4 @@
-# See documentation on http://experimaestro.github.io/datamaestro/
+# See documentation on https://datamaestro.readthedocs.io
 
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro_text.data.ir.huggingface import HuggingFacePairwiseSampleDataset
diff --git a/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py b/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py
index bae5c670..fbbb076a 100644
--- a/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py
+++ b/src/xpmir/dm/config/com/github/sebastian-hofstaetter/neural-ranking-kd.py
@@ -1,7 +1,6 @@
-# See documentation on http://experimaestro.github.io/datamaestro/
+# See documentation on https://datamaestro.readthedocs.io
 from hashlib import md5
 from datamaestro.definitions import dataset
-from datamaestro.data import File
 from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
 from xpmir.letor.distillation.samplers import PairwiseDistillationSamplesTSV
@@ -9,7 +8,8 @@
 
 @filedownloader(
     "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv",
-    "https://zenodo.org/record/4068216/files/bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
+    "https://zenodo.org/record/4068216/files/"
+    "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
     checker=HashCheck("4d99696386f96a7f1631076bcc53ac3c", md5),
 )
 @dataset(
@@ -19,7 +19,9 @@ def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
     """Training files without the text content, instead using the ids from MSMARCO
 
-    The teacher files (using the data from "Train Triples Small" with ~40 million triples) with the format pos_score neg_score query_id pos_passage_id neg_passage_id (with tab separation)
+    The teacher files (using the data from "Train Triples Small" with ~40
+    million triples) with the format pos_score neg_score query_id pos_passage_id
+    neg_passage_id (with tab separation)
     """
     return {
         "with_docid": True,
@@ -30,7 +32,8 @@
 
 @filedownloader(
     "bertbase_cat_msmarcopassage_train_scores_ids.tsv",
-    "https://zenodo.org/record/4068216/files/bertbase_cat_msmarcopassage_train_scores_ids.tsv?download=1",
+    "https://zenodo.org/record/4068216/files/"
+    "bertbase_cat_msmarcopassage_train_scores_ids.tsv?download=1",
     checker=HashCheck("a2575af08a19b47c2041e67c9efcd917", md5),
 )
 @dataset(
@@ -40,7 +43,9 @@ def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
 def msmarco_bert_teacher(bertbase_cat_msmarcopassage_train_scores_ids):
     """Training files without the text content, instead using the ids from MSMARCO
 
-    The teacher files (using the data from "Train Triples Small" with ~40 million triples) with the format pos_score neg_score query_id pos_passage_id neg_passage_id (with tab separation)
+    The teacher files (using the data from "Train Triples Small" with ~40
+    million triples) with the format pos_score neg_score query_id pos_passage_id
+    neg_passage_id (with tab separation)
     """
     return {
         "with_docid": True,
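
Not part of the patch: the teacher files documented above are plain TSV with the columns pos_score, neg_score, query_id, pos_passage_id, neg_passage_id. A minimal reading sketch, assuming only that layout — the helper name and path are placeholders:

import csv

def read_teacher_scores(path: str):
    """Yield (pos_score, neg_score, query_id, pos_id, neg_id) tuples."""
    with open(path, newline="") as fp:
        for pos_score, neg_score, query_id, pos_id, neg_id in csv.reader(
            fp, delimiter="\t"
        ):
            yield float(pos_score), float(neg_score), query_id, pos_id, neg_id
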
diff --git a/src/xpmir/evaluation.py b/src/xpmir/evaluation.py
index bf75f7b3..8d3808cd 100644
--- a/src/xpmir/evaluation.py
+++ b/src/xpmir/evaluation.py
@@ -1,14 +1,13 @@
 import sys
 from itertools import chain
+import pandas as pd
 from pathlib import Path
 from typing import DefaultDict, Dict, List, Protocol, Union, Tuple
 import ir_measures
 from datamaestro_text.data.ir import Adhoc, AdhocAssessments, AdhocDocuments
 from experimaestro import Task, Param, pathgenerator, Annotated, tags, TagDict
-from datamaestro_text.data.ir.trec import (
-    TrecAdhocRun,
-    TrecAdhocResults,
-)
+from datamaestro_text.data.ir import AdhocResults
+from datamaestro_text.data.ir.trec import TrecAdhocRun, TrecAdhocResults
 from xpmir.measures import Measure
 import xpmir.measures as m
 from xpmir.metrics import evaluator
@@ -137,7 +136,7 @@ class Evaluations:
     dataset: Adhoc
     measures: List[Measure]
     results: List[BaseEvaluation]
-    per_tag: Dict[TagDict, TrecAdhocResults]
+    per_tag: Dict[TagDict, AdhocResults]
 
     def __init__(self, dataset: Adhoc, measures: List[Measure]):
         self.dataset = dataset
@@ -147,12 +146,12 @@ def __init__(self, dataset: Adhoc, measures: List[Measure]):
     def evaluate_retriever(
         self, retriever: Union[Retriever, RetrieverFactory], launcher: Launcher = None
-    ) -> Tuple[Retriever, TrecAdhocResults]:
+    ) -> Tuple[Retriever, AdhocResults]:
         """Evaluates a retriever"""
         if not isinstance(retriever, Retriever):
             retriever = retriever(self.dataset.documents)
 
-        evaluation: TrecAdhocResults = Evaluate(
+        evaluation: AdhocResults = Evaluate(
             retriever=retriever, measures=self.measures, dataset=self.dataset
         ).submit(launcher=launcher)
         self.add(evaluation)
@@ -168,6 +167,9 @@ def add(self, *results: BaseEvaluation):
         self.results.extend(results)
 
     def output_results_per_tag(self, file=sys.stdout):
+        return self.to_dataframe().to_markdown(file)
+
+    def to_dataframe(self) -> pd.DataFrame:
         # Get all the tags
         tags = list(
             set(chain(*[tags_dict.keys() for tags_dict in self.per_tags.keys()]))
         )
@@ -183,24 +185,27 @@ def output_results_per_tag(self, file=sys.stdout):
             to_process.append((tags_dict, results))
 
         # Table header
-        file.write(f"| {' | '.join(tags)}")
-        file.write(f" | {' | '.join(metrics)} |")
-        file.write("\n")
-        file.write(f"| {' | '.join('---' for _ in tags)}")
-        file.write(f" | {' | '.join('---' for _ in metrics)} |")
-        file.write("\n")
+        columns = []
+        for tag in tags:
+            columns.append(["tag", tag])
+        for metric in metrics:
+            columns.append(["metric", metric])
 
         # Output the results
+        rows = []
         for tags_dict, results in to_process:
+            row = []
             # tag values
             for k in tags:
-                file.write(f'| {str(tags_dict.get(k, ""))}')
+                row.append(str(tags_dict.get(k, "")))
 
             # metric values
             for metric in metrics:
-                file.write(f' | {str(results.get(metric, ""))}')
-            file.write(" |\n")
-        file.write("\n")
+                row.append(results.get(metric, ""))
+            rows.append(row)
+
+        index = pd.MultiIndex.from_tuples(columns)
+
+        return pd.DataFrame(rows, columns=index)
 
 
 class EvaluationsCollection:
@@ -212,7 +217,7 @@ class EvaluationsCollection:
 
     collection: Dict[str, Evaluations]
 
-    per_model: Dict[str, List[Tuple[str, TrecAdhocResults]]]
+    per_model: Dict[str, List[Tuple[str, AdhocResults]]]
     """List of results per model"""
 
    def __init__(self, **collection: Evaluations):
@@ -227,7 +232,7 @@ def evaluate_retriever(
         overwrite: bool = False,
     ):
         """Evaluate a retriever for all the evaluations in this collection (the
-        tasks are submitted to experimaestro the scheduler)"""
+        tasks are submitted to the experimaestro scheduler)"""
         if model_id is not None and not overwrite:
             assert (
                 model_id not in self.per_model
@@ -258,6 +263,15 @@ def output_results(self, file=sys.stdout):
                 file=file,
             )
 
+    def to_dataframe(self) -> pd.DataFrame:
+        """Returns a Pandas dataframe"""
+        all_data = []
+        for key, evaluations in self.collection.items():
+            data = evaluations.to_dataframe()
+            data["dataset"] = key
+            all_data.append(data)
+        return pd.concat(all_data, ignore_index=True)
+
     def output_results_per_tag(self, file=sys.stdout):
         """Outputs the results for each collection, based on the retriever
         tags to build the table
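
Not part of the patch — a sketch of the frame shape that Evaluations.to_dataframe builds above: tag columns grouped under a "tag" level and metric columns under a "metric" level of a column MultiIndex. The tag and metric names below are made up, and DataFrame.to_markdown needs the tabulate package.

import pandas as pd

columns = pd.MultiIndex.from_tuples(
    [("tag", "model"), ("metric", "RR@10"), ("metric", "nDCG@10")]
)
frame = pd.DataFrame([["monobert", 0.35, 0.48]], columns=columns)
# output_results_per_tag now writes this same markdown rendering to its file
print(frame.to_markdown())
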
diff --git a/src/xpmir/letor/trainers/pairwise.py b/src/xpmir/letor/trainers/pairwise.py
index 1580b67a..d9e643e6 100644
--- a/src/xpmir/letor/trainers/pairwise.py
+++ b/src/xpmir/letor/trainers/pairwise.py
@@ -73,6 +73,11 @@ class LogSoftmaxLoss(PairwiseLoss):
 
     NAME = "ranknet"
 
+    def __initialize__(self, ranker: LearnableScorer):
+        assert (
+            ranker.outputType != ScorerOutputType.PROBABILITY
+        ), "Probability outputs are not handled"
+
     def compute(self, scores: torch.Tensor, info: TrainerContext):
         return -F.logsigmoid(scores[:, 0] - scores[:, 1]).mean()
 
@@ -81,9 +86,12 @@ def compute(self, scores: torch.Tensor, info: TrainerContext):
 
 
 class HingeLoss(PairwiseLoss):
+    """Hinge loss"""
+
     NAME = "hinge"
 
     margin: Param[float] = 1.0
+    """The margin for the hinge loss"""
 
     def compute(self, rel_scores_by_record, info: TrainerContext):
         return F.relu(
@@ -92,29 +100,25 @@ def compute(self, rel_scores_by_record, info: TrainerContext):
 
 
 class BCEWithLogLoss(nn.Module):
-    def __call__(self, log_probs, info: TrainerContext):
-        assert info.metrics is not None, "No metrics object in context"
+    """Custom cross-entropy loss when outputs are log probabilities"""
 
+    def __call__(self, log_probs: torch.Tensor, targets: torch.Tensor):
         # Assumes target is a two column matrix (rel. / not rel.)
-        rel_cost, nrel_cost = (
-            -log_probs[:, 0].mean(),
-            -(1.0 - log_probs[:, 1].exp()).log().mean(),
-        )
-        info.metrics.add(
-            ScalarMetric("pairwise-pce-rel", rel_cost.item(), len(log_probs))
-        )
-        info.metrics.add(
-            ScalarMetric("pairwise-pce-nrel", nrel_cost.item(), len(log_probs))
+
+        loss = (
+            -log_probs[targets > 0].sum()
+            - (1.0 - log_probs[targets == 0].exp()).log().sum()
         )
-        return (rel_cost + nrel_cost) / 2
+
+        return loss / log_probs.numel()
 
 
 class PointwiseCrossEntropyLoss(PairwiseLoss):
-    """Regular PCE (>0 for relevant, 0 otherwise)
-    Uses the ranker output type:
+    """Point-wise cross-entropy loss
+
+    This loss adapts to the ranker output type:
 
     - If real, uses a BCEWithLogitsLoss (sigmoid transformation)
     - If probability, uses the BCELoss
-    - If log probability, uses a custom BCE loss
+    - If log probability, uses a BCEWithLogLoss
     """
 
     NAME = "pointwise-cross-entropy"
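
Not part of the patch — a numerical check of the BCEWithLogLoss rewrite above: with log-probability outputs, a relevant entry costs -log(p) and a non-relevant one -log(1 - p), so the averaged loss must agree with the standard binary cross-entropy on the probabilities. Values below are made up.

import torch
import torch.nn.functional as F

# Two pairs; column 0 holds the relevant score, column 1 the non-relevant one
log_probs = torch.tensor([[0.9, 0.3], [0.8, 0.1]]).log()
targets = torch.tensor([[1.0, 0.0], [1.0, 0.0]])

loss = (
    -log_probs[targets > 0].sum()
    - (1.0 - log_probs[targets == 0].exp()).log().sum()
) / log_probs.numel()

# Same value through torch's BCE applied to the probabilities
assert torch.isclose(loss, F.binary_cross_entropy(log_probs.exp(), targets))
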
diff --git a/src/xpmir/text/huggingface.py b/src/xpmir/text/huggingface.py
index d2aedeab..91c07e7e 100644
--- a/src/xpmir/text/huggingface.py
+++ b/src/xpmir/text/huggingface.py
@@ -325,11 +325,14 @@ class DualTransformerEncoder(BaseTransformer, DualTextEncoder):
     def forward(self, texts: List[Tuple[str, str]]):
         tokenized = self.batch_tokenize(texts, maxlen=self.maxlen, mask=True)
 
         with torch.set_grad_enabled(torch.is_grad_enabled() and self.trainable):
+            kwargs = {}
+            if tokenized.token_type_ids is not None:
+                kwargs["token_type_ids"] = tokenized.token_type_ids.to(self.device)
+
             y = self.model(
-                tokenized.ids,
-                token_type_ids=tokenized.token_type_ids.to(self.device),
-                attention_mask=tokenized.mask.to(self.device),
+                tokenized.ids, attention_mask=tokenized.mask.to(self.device), **kwargs
             )
 
         # Assumes that [CLS] is the first token
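
Not part of the patch — why forward above only passes token_type_ids when the tokenizer produced them: some HuggingFace models take no such argument. A small check with the transformers library; the model names are just examples.

from transformers import AutoTokenizer

for name in ("bert-base-uncased", "distilbert-base-uncased"):
    tokenizer = AutoTokenizer.from_pretrained(name)
    encoded = tokenizer("a query", "a document")
    # BERT produces token_type_ids, DistilBERT does not
    print(name, "token_type_ids" in encoded)
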