Commit 56dae60

fix: fix bugs and documentation

bpiwowar committed Apr 6, 2023
1 parent b6ad015 commit 56dae60
Showing 7 changed files with 73 additions and 46 deletions.
4 changes: 2 additions & 2 deletions docs/source/hooks.rst
@@ -29,8 +29,8 @@ Hooks can be used to modify the learning process
 
 
-GPU Hooks
----------
+Distributed
+-----------
 
 Hooks can be used to distribute a model over GPUs
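For context, a generic sketch of the kind of operation such a distribution hook performs, in plain PyTorch (the xpmir hook API itself is not shown in this diff, so the helper below is purely illustrative):

import torch

def distribute(model: torch.nn.Module) -> torch.nn.Module:
    # Replicate the model over all visible GPUs when more than one is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    return model.to("cuda" if torch.cuda.is_available() else "cpu")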
1 change: 1 addition & 0 deletions hf_models/README.md
@@ -0,0 +1 @@
+This directory contains HuggingFace models produced by wrapping huggingface models for experimaestro IR.
@@ -1,4 +1,4 @@
-# See documentation on http://experimaestro.github.io/datamaestro/
+# See documentation on https://datamaestro.readthedocs.io
 
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro_text.data.ir.huggingface import HuggingFacePairwiseSampleDataset
@@ -1,15 +1,15 @@
-# See documentation on http://experimaestro.github.io/datamaestro/
+# See documentation on https://datamaestro.readthedocs.io
 from hashlib import md5
 from datamaestro.definitions import dataset
 from datamaestro.data import File
 from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
 from xpmir.letor.distillation.samplers import PairwiseDistillationSamplesTSV
 
 
 @filedownloader(
     "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv",
-    "https://zenodo.org/record/4068216/files/bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
+    "https://zenodo.org/record/4068216/files/"
+    "bert_cat_ensemble_msmarcopassage_train_scores_ids.tsv?download=1",
     checker=HashCheck("4d99696386f96a7f1631076bcc53ac3c", md5),
 )
 @dataset(
@@ -19,7 +19,9 @@
 def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
     """Training files without the text content instead using the ids from MSMARCO
-    The teacher files (using the data from "Train Triples Small" with ~40 million triples) with the format pos_score neg_score query_id pos_passage_id neg_passage_id (with tab separation)
+    The teacher files (using the data from "Train Triples Small" with ~40
+    million triples) with the format pos_score neg_score query_id pos_passage_id
+    neg_passage_id (with tab separation)
     """
     return {
         "with_docid": True,
@@ -30,7 +32,8 @@ def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
 
 @filedownloader(
     "bertbase_cat_msmarcopassage_train_scores_ids.tsv",
-    "https://zenodo.org/record/4068216/files/bertbase_cat_msmarcopassage_train_scores_ids.tsv?download=1",
+    "https://zenodo.org/record/4068216/files/"
+    "bertbase_cat_msmarcopassage_train_scores_ids.tsv?download=1",
     checker=HashCheck("a2575af08a19b47c2041e67c9efcd917", md5),
 )
 @dataset(
@@ -40,7 +43,9 @@ def msmarco_ensemble_teacher(bert_cat_ensemble_msmarcopassage_train_scores_ids):
 def msmarco_bert_teacher(bertbase_cat_msmarcopassage_train_scores_ids):
     """Training files without the text content instead using the ids from MSMARCO
-    The teacher files (using the data from "Train Triples Small" with ~40 million triples) with the format pos_score neg_score query_id pos_passage_id neg_passage_id (with tab separation)
+    The teacher files (using the data from "Train Triples Small" with ~40
+    million triples) with the format pos_score neg_score query_id pos_passage_id
+    neg_passage_id (with tab separation)
     """
     return {
         "with_docid": True,
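Both datasets expose the same TSV layout. A minimal sketch of reading one record, assuming only the format documented in the docstrings above (the sample line and its values are hypothetical):

# One record: pos_score, neg_score, query_id, pos_passage_id, neg_passage_id
line = "12.54\t3.17\t400296\t1540783\t3616450"
pos_score, neg_score, query_id, pos_id, neg_id = line.split("\t")
pos_score, neg_score = float(pos_score), float(neg_score)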
52 changes: 33 additions & 19 deletions src/xpmir/evaluation.py
@@ -1,14 +1,13 @@
 import sys
 from itertools import chain
+import pandas as pd
 from pathlib import Path
 from typing import DefaultDict, Dict, List, Protocol, Union, Tuple
 import ir_measures
 from datamaestro_text.data.ir import Adhoc, AdhocAssessments, AdhocDocuments
 from experimaestro import Task, Param, pathgenerator, Annotated, tags, TagDict
-from datamaestro_text.data.ir.trec import (
-    TrecAdhocRun,
-    TrecAdhocResults,
-)
+from datamaestro_text.data.ir import AdhocResults
+from datamaestro_text.data.ir.trec import TrecAdhocRun, TrecAdhocResults
 from xpmir.measures import Measure
 import xpmir.measures as m
 from xpmir.metrics import evaluator
@@ -137,7 +136,7 @@ class Evaluations:
     dataset: Adhoc
     measures: List[Measure]
     results: List[BaseEvaluation]
-    per_tag: Dict[TagDict, TrecAdhocResults]
+    per_tag: Dict[TagDict, AdhocResults]
 
     def __init__(self, dataset: Adhoc, measures: List[Measure]):
         self.dataset = dataset
@@ -147,12 +146,12 @@ def __init__(self, dataset: Adhoc, measures: List[Measure]):
 
     def evaluate_retriever(
         self, retriever: Union[Retriever, RetrieverFactory], launcher: Launcher = None
-    ) -> Tuple[Retriever, TrecAdhocResults]:
+    ) -> Tuple[Retriever, AdhocResults]:
         """Evaluates a retriever"""
         if not isinstance(retriever, Retriever):
             retriever = retriever(self.dataset.documents)
 
-        evaluation: TrecAdhocResults = Evaluate(
+        evaluation: AdhocResults = Evaluate(
             retriever=retriever, measures=self.measures, dataset=self.dataset
         ).submit(launcher=launcher)
         self.add(evaluation)
@@ -168,6 +167,9 @@ def add(self, *results: BaseEvaluation):
         self.results.extend(results)
 
     def output_results_per_tag(self, file=sys.stdout):
+        return self.to_dataframe().to_markdown(file)
+
+    def to_dataframe(self) -> pd.DataFrame:
         # Get all the tags
         tags = list(
             set(chain(*[tags_dict.keys() for tags_dict in self.per_tags.keys()]))
@@ -183,24 +185,27 @@ def output_results_per_tag(self, file=sys.stdout):
                 to_process.append((tags_dict, results))
 
         # Table header
-        file.write(f"| {' | '.join(tags)}")
-        file.write(f" | {' | '.join(metrics)} |")
-        file.write("\n")
-        file.write(f"| {' | '.join('---' for _ in tags)}")
-        file.write(f" | {' | '.join('---' for _ in metrics)} |")
-        file.write("\n")
+        columns = []
+        for tag in tags:
+            columns.append(["tag", tag])
+        for metric in metrics:
+            columns.append(["metric", metric])
 
         # Output the results
+        rows = []
         for tags_dict, results in to_process:
+            row = []
             # tag values
             for k in tags:
-                file.write(f'| {str(tags_dict.get(k, ""))}')
+                row.append(str(tags_dict.get(k, "")))
 
             # metric values
             for metric in metrics:
-                file.write(f' | {str(results.get(metric, ""))}')
-            file.write(" |\n")
-        file.write("\n")
+                row.append(results.get(metric, ""))
+            rows.append(row)
 
+        index = pd.MultiIndex.from_tuples(columns)
+        return pd.DataFrame(rows, columns=index)
 
 
 class EvaluationsCollection:
@@ -212,7 +217,7 @@ class EvaluationsCollection:
 
     collection: Dict[str, Evaluations]
 
-    per_model: Dict[str, List[Tuple[str, TrecAdhocResults]]]
+    per_model: Dict[str, List[Tuple[str, AdhocResults]]]
     """List of results per model"""
 
     def __init__(self, **collection: Evaluations):
@@ -227,7 +232,7 @@ def evaluate_retriever(
         overwrite: bool = False,
     ):
         """Evaluate a retriever for all the evaluations in this collection (the
-        tasks are submitted to experimaestro the scheduler)"""
+        tasks are submitted to the experimaestro scheduler)"""
         if model_id is not None and not overwrite:
             assert (
                 model_id not in self.per_model
@@ -258,6 +263,15 @@ def output_results(self, file=sys.stdout):
             file=file,
         )
 
+    def to_dataframe(self) -> pd.DataFrame:
+        """Returns a Pandas dataframe"""
+        all_data = []
+        for key, evaluations in self.collection.items():
+            data = evaluations.to_dataframe()
+            data.dataset = key
+            all_data.append(data)
+        return pd.concat(all_data, ignore_index=True)
+
     def output_results_per_tag(self, file=sys.stdout):
         """Outputs the results for each collection, based on the retriever tags
         to build the table
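For reference, a minimal sketch (plain pandas, no xpmir imports) of the table shape that the new Evaluations.to_dataframe builds; the tag names, metric names and values below are hypothetical:

import sys
import pandas as pd

# One ("tag", ...) column per retriever tag, one ("metric", ...) column per measure
columns = pd.MultiIndex.from_tuples(
    [("tag", "model"), ("metric", "nDCG@10"), ("metric", "RR@10")]
)
rows = [["bm25", 0.48, 0.62], ["monobert", 0.67, 0.79]]
df = pd.DataFrame(rows, columns=columns)

# output_results_per_tag is now a thin wrapper that renders this table
# as markdown (requires the tabulate package)
df.to_markdown(sys.stdout)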
34 changes: 19 additions & 15 deletions src/xpmir/letor/trainers/pairwise.py
@@ -73,6 +73,11 @@ class LogSoftmaxLoss(PairwiseLoss):
 
     NAME = "ranknet"
 
+    def __initialize__(self, ranker: LearnableScorer):
+        assert (
+            ranker.outputType != ScorerOutputType.PROBABILITY
+        ), "Probability outputs are not handled"
+
     def compute(self, scores: torch.Tensor, info: TrainerContext):
         return -F.logsigmoid(scores[:, 0] - scores[:, 1]).mean()
 
@@ -81,9 +86,12 @@ def compute(self, scores: torch.Tensor, info: TrainerContext):
 
 
 class HingeLoss(PairwiseLoss):
+    """Hinge loss"""
+
     NAME = "hinge"
 
     margin: Param[float] = 1.0
+    """The margin for the Hinge loss"""
 
     def compute(self, rel_scores_by_record, info: TrainerContext):
         return F.relu(
@@ -92,29 +100,25 @@ def compute(self, rel_scores_by_record, info: TrainerContext):
 
 
 class BCEWithLogLoss(nn.Module):
-    def __call__(self, log_probs, info: TrainerContext):
-        assert info.metrics is not None, "No metrics object in context"
+    """Custom cross-entropy loss when outputs are log probabilities"""
+
+    def __call__(self, log_probs: torch.Tensor, targets: torch.Tensor):
         # Assumes target is a two column matrix (rel. / not rel.)
-        rel_cost, nrel_cost = (
-            -log_probs[:, 0].mean(),
-            -(1.0 - log_probs[:, 1].exp()).log().mean(),
-        )
-        info.metrics.add(
-            ScalarMetric("pairwise-pce-rel", rel_cost.item(), len(log_probs))
-        )
-        info.metrics.add(
-            ScalarMetric("pairwise-pce-nrel", nrel_cost.item(), len(log_probs))
-        )
-        return (rel_cost + nrel_cost) / 2
+        loss = (
+            -log_probs[targets > 0].sum() + (1.0 - log_probs[targets == 0].exp()).sum()
+        )
+        return loss / log_probs.numel()
 
 
 class PointwiseCrossEntropyLoss(PairwiseLoss):
-    """Regular PCE (>0 for relevant, 0 otherwise)
-    Uses the ranker output type:
+    """Point-wise cross-entropy loss
+    This loss adapts to the ranker output type:
     - If real, uses a BCELossWithLogits (sigmoid transformation)
     - If probability, uses the BCELoss
-    - If log probability, uses a custom BCE loss
+    - If log probability, uses a BCEWithLogLoss
     """
 
     NAME = "pointwise-cross-entropy"
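As a quick check on the formulas above, a self-contained sketch of the two pairwise scoring losses outside of the PairwiseLoss/TrainerContext machinery; since the hinge body is truncated in this diff, the standard pairwise hinge formula is assumed. Column 0 holds the positive-document score and column 1 the negative one:

import torch
import torch.nn.functional as F

scores = torch.randn(8, 2)  # hypothetical batch of (positive, negative) score pairs

# RankNet / log-softmax loss, as in LogSoftmaxLoss.compute
ranknet = -F.logsigmoid(scores[:, 0] - scores[:, 1]).mean()

# Pairwise hinge loss, assuming the default margin of 1.0
margin = 1.0
hinge = F.relu(margin - (scores[:, 0] - scores[:, 1])).mean()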
9 changes: 6 additions & 3 deletions src/xpmir/text/huggingface.py
@@ -325,11 +325,14 @@ class DualTransformerEncoder(BaseTransformer, DualTextEncoder):
 
     def forward(self, texts: List[Tuple[str, str]]):
         tokenized = self.batch_tokenize(texts, maxlen=self.maxlen, mask=True)
+
         with torch.set_grad_enabled(torch.is_grad_enabled() and self.trainable):
+            kwargs = {}
+            if tokenized.token_type_ids:
+                kwargs["token_type_ids"] = tokenized.token_type_ids.to(self.device)
+
             y = self.model(
-                tokenized.ids,
-                token_type_ids=tokenized.token_type_ids.to(self.device),
-                attention_mask=tokenized.mask.to(self.device),
+                tokenized.ids, attention_mask=tokenized.mask.to(self.device), **kwargs
             )
 
         # Assumes that [CLS] is the first token
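The rewritten forward pass only sends token_type_ids to the model when the tokenizer actually produced them (models such as DistilBERT do not use them). A sketch of the same pattern in plain transformers, with an illustrative model name:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Batch of (query, document) text pairs, as in DualTransformerEncoder.forward
tokenized = tokenizer([("a query", "a document")], return_tensors="pt", padding=True)

kwargs = {}
if "token_type_ids" in tokenized:  # only pass them when the tokenizer emits them
    kwargs["token_type_ids"] = tokenized["token_type_ids"]

with torch.no_grad():
    y = model(
        tokenized["input_ids"],
        attention_mask=tokenized["attention_mask"],
        **kwargs,
    )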
