From d6c4c83f15c4ef29172507375358dcefd0d808eb Mon Sep 17 00:00:00 2001 From: Gyorgy Kovacs Date: Sun, 18 Feb 2024 12:29:29 +0100 Subject: [PATCH 1/5] Mainly documentation changes --- README.rst | 17 ++++++++-- docs/01a_requirements.rst | 2 ++ docs/01c_consistency_checking.rst | 13 +++++++- .../binary/_check_1_dataset_kfold_som.py | 9 ++++-- .../_check_1_dataset_known_folds_mos.py | 9 +++++- .../_check_1_dataset_unknown_folds_mos.py | 9 +++++- .../check/binary/_check_1_testset_no_kfold.py | 9 ++++-- .../binary/_check_n_datasets_mos_kfold_som.py | 8 ++++- .../_check_n_datasets_mos_known_folds_mos.py | 8 ++++- ..._check_n_datasets_mos_unknown_folds_mos.py | 11 ++++++- .../binary/_check_n_datasets_som_kfold_som.py | 9 ++++-- .../binary/_check_n_testsets_mos_no_kfold.py | 8 ++++- .../binary/_check_n_testsets_som_no_kfold.py | 9 ++++-- mlscorecheck/check/bundles/ehg/_tpehg.py | 5 ++- .../check/bundles/retina/_chasedb1.py | 18 +++++++---- .../check/bundles/retina/_diaretdb0.py | 14 +++++--- .../check/bundles/retina/_diaretdb1.py | 27 ++++++++++------ .../check/bundles/retina/_drishti_gs.py | 23 +++++++++---- mlscorecheck/check/bundles/retina/_drive.py | 27 ++++++++++------ mlscorecheck/check/bundles/retina/_hrf.py | 32 +++++++++++++------ mlscorecheck/check/bundles/retina/_stare.py | 24 ++++++++++---- .../check/bundles/skinlesion/_isic2016.py | 9 ++++-- .../check/bundles/skinlesion/_isic2017.py | 9 ++++-- .../_check_1_dataset_known_folds_mos_macro.py | 4 +++ .../_check_1_dataset_known_folds_mos_micro.py | 5 ++- .../_check_1_dataset_known_folds_som_macro.py | 5 ++- .../_check_1_testset_no_kfold_macro.py | 5 ++- .../_check_1_testset_no_kfold_micro.py | 9 ++++-- mlscorecheck/individual/_utils.py | 31 ++++++++++++++++++ mlscorecheck/scores/scores.json | 4 +-- tests/individual/test_utils.py | 18 +++++++++++ 31 files changed, 306 insertions(+), 84 deletions(-) diff --git a/README.rst b/README.rst index 8fb7ccb..8582c44 100644 --- a/README.rst +++ b/README.rst @@ -76,7 +76,7 @@ If you use the package, please consider citing the following paper: .. code-block:: BibTex @misc{fazekas2023testing, - title={Testing the Consistency of Performance Scores Reported for Binary Classification Problems}, + title={Testing the Consistency of Performance Scores Reported for Binary Classification Problems}, author={Attila Fazekas and György Kovács}, year={2023}, eprint={2310.12527}, @@ -159,6 +159,8 @@ A simple binary classification testset consisting of ``p`` positive samples (usu testset = {"p": 10, "n": 20} +We note that alternative notations, like using ``n_positive``, ``n_minority`` or ``n_1`` instead of ``p`` and similarly, ``n_negative``, ``n_majority`` and ``n_0`` instead of ``n`` are supported. + One can also specify a commonly used dataset by its name and the package will look up the ``p`` and ``n`` counts of the datasets from its internal registry (based on the representations in the ``common-datasets`` package): .. code-block:: Python @@ -261,7 +263,18 @@ Depending on the experimental setup, the consistency tests developed for binary * prevalence threshold (``pt``), * diagnostic odds ratio (``dor``), * Jaccard index (``ji``), - * Cohen's kappa (``kappa``) + * Cohen's kappa (``kappa``). 
+ +We note that synonyms and full names are also supported, for example: + + * alternatives to ``sens`` are ``sensitivity``, ``true_positive_rate``, ``tpr`` and ``recall``, + * alternatives to ``spec`` are ``specificity``, ``true_negative_rate``, ``tnr`` and ``selectivity``, + * alternative to ``ppv`` are ``positive_predictive_value`` and ``precision``. + +Similarly, complements are supported as: + + * one can specify ``false_positive_rate`` or ``fpr`` as a complement of ``spec``, + * and similarly, ``false_negative_rate`` or ``fnr`` as a complement of ``sens``. The tests are designed to detect inconsistencies. If the resulting ``inconsistency`` flag is ``False``, the scores can still be calculated in non-standard ways. However, **if the resulting ``inconsistency`` flag is ``True``, it conclusively indicates that inconsistencies are detected, and the reported scores could not be the outcome of the presumed experiment**. diff --git a/docs/01a_requirements.rst b/docs/01a_requirements.rst index fd18813..19e5f2a 100644 --- a/docs/01a_requirements.rst +++ b/docs/01a_requirements.rst @@ -26,6 +26,8 @@ A simple binary classification testset consisting of ``p`` positive samples (usu testset = {"p": 10, "n": 20} +We note that alternative notations, like using ``n_positive``, ``n_minority`` or ``n_1`` instead of ``p`` and similarly, ``n_negative``, ``n_majority`` and ``n_0`` instead of ``n`` are supported. + One can also specify a commonly used dataset by its name and the package will look up the ``p`` and ``n`` counts of the datasets from its internal registry (based on the representations in the ``common-datasets`` package): .. code-block:: Python diff --git a/docs/01c_consistency_checking.rst b/docs/01c_consistency_checking.rst index 33ccb9a..4b0118e 100644 --- a/docs/01c_consistency_checking.rst +++ b/docs/01c_consistency_checking.rst @@ -24,7 +24,18 @@ Depending on the experimental setup, the consistency tests developed for binary * prevalence threshold (``pt``), * diagnostic odds ratio (``dor``), * Jaccard index (``ji``), - * Cohen's kappa (``kappa``) + * Cohen's kappa (``kappa``). + +We note that synonyms and full names are also supported, for example: + + * alternatives to ``sens`` are ``sensitivity``, ``true_positive_rate``, ``tpr`` and ``recall``, + * alternatives to ``spec`` are ``specificity``, ``true_negative_rate``, ``tnr`` and ``selectivity``, + * alternative to ``ppv`` are ``positive_predictive_value`` and ``precision``. + +Similarly, complements are supported as: + + * one can specify ``false_positive_rate`` or ``fpr`` as a complement of ``spec``, + * and similarly, ``false_negative_rate`` or ``fnr`` as a complement of ``sens``. The tests are designed to detect inconsistencies. If the resulting ``inconsistency`` flag is ``False``, the scores can still be calculated in non-standard ways. However, **if the resulting ``inconsistency`` flag is ``True``, it conclusively indicates that inconsistencies are detected, and the reported scores could not be the outcome of the presumed experiment**. 
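The two documentation hunks above describe the same feature from the user's side; a minimal sketch of how it is meant to be exercised (the public import path follows the package layout visible in this patch series, and the concrete numbers are only illustrative):

.. code-block:: Python

    from mlscorecheck.check.binary import check_1_testset_no_kfold

    # alternative notation for {"p": 10, "n": 20}
    testset = {"n_positive": 10, "n_negative": 20}

    # 'tpr' is a synonym of 'sens'; 'fpr' is the complement of 'spec'
    scores = {"tpr": 0.9, "fpr": 0.15, "acc": 0.8667}

    result = check_1_testset_no_kfold(testset=testset, scores=scores, eps=1e-4)
    result["inconsistency"]
    # False: tp = 9 and tn = 17 reproduce all three scores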
diff --git a/mlscorecheck/check/binary/_check_1_dataset_kfold_som.py b/mlscorecheck/check/binary/_check_1_dataset_kfold_som.py index 21d8d64..549a9a3 100644 --- a/mlscorecheck/check/binary/_check_1_dataset_kfold_som.py +++ b/mlscorecheck/check/binary/_check_1_dataset_kfold_som.py @@ -5,7 +5,7 @@ """ from ...core import NUMERICAL_TOLERANCE -from ...individual import check_scores_tptn_pairs +from ...individual import check_scores_tptn_pairs, translate_metadata from ...aggregated import Experiment __all__ = ["check_1_dataset_kfold_som"] @@ -32,7 +32,10 @@ def check_1_dataset_kfold_som( 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta positive or f-beta negative, also set 'beta_positive' and - 'beta_negative'. + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): The numerical uncertainty(ies) of the scores. numerical_tolerance (float, optional): In practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This @@ -90,6 +93,8 @@ def check_1_dataset_kfold_som( # True """ + folding = translate_metadata(folding) + if folding.get("folds") is None and folding.get("strategy") is None: # any folding strategy results the same folding = {**folding} | {"strategy": "stratified_sklearn"} diff --git a/mlscorecheck/check/binary/_check_1_dataset_known_folds_mos.py b/mlscorecheck/check/binary/_check_1_dataset_known_folds_mos.py index 07b78b3..da8c706 100644 --- a/mlscorecheck/check/binary/_check_1_dataset_known_folds_mos.py +++ b/mlscorecheck/check/binary/_check_1_dataset_known_folds_mos.py @@ -6,6 +6,7 @@ from ...core import NUMERICAL_TOLERANCE from ...aggregated import check_aggregated_scores, Experiment, Evaluation +from ...individual import translate_metadata __all__ = ["check_1_dataset_known_folds_mos"] @@ -31,7 +32,10 @@ def check_1_dataset_known_folds_mos( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``fold_score_bounds`` when, for example, the minimum - and the maximum scores over the folds are also provided. + and the maximum scores over the folds are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: dataset (dict): The dataset specification. 
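The ``translate_metadata`` calls wired into these checks make the alternative notations usable for dataset and folding specifications as well; a sketch under the same assumptions (the folding convention matches the examples in these docstrings, and the scores are illustrative):

.. code-block:: Python

    from mlscorecheck.check.binary import check_1_dataset_known_folds_mos

    # 'n_minority'/'n_majority' are translated internally to 'p'/'n'
    dataset = {"n_minority": 10, "n_majority": 20}
    folding = {"n_folds": 5, "n_repeats": 1, "strategy": "stratified_sklearn"}
    scores = {"acc": 0.8, "sens": 0.7, "spec": 0.85}

    result = check_1_dataset_known_folds_mos(
        dataset=dataset, folding=folding, scores=scores, eps=1e-2
    )
    result["inconsistency"]
    # False is expected: means over 5 stratified folds can realize these scores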
@@ -105,6 +109,9 @@ def check_1_dataset_known_folds_mos( # True """ + dataset = translate_metadata(dataset) + folding = translate_metadata(folding) + evaluation = Evaluation( dataset=dataset, folding=folding, diff --git a/mlscorecheck/check/binary/_check_1_dataset_unknown_folds_mos.py b/mlscorecheck/check/binary/_check_1_dataset_unknown_folds_mos.py index 04bd032..3670fe3 100644 --- a/mlscorecheck/check/binary/_check_1_dataset_unknown_folds_mos.py +++ b/mlscorecheck/check/binary/_check_1_dataset_unknown_folds_mos.py @@ -5,6 +5,7 @@ from ...core import NUMERICAL_TOLERANCE from ...aggregated import Dataset, repeated_kfolds_generator, kfolds_generator +from ...individual import translate_metadata from ._check_1_dataset_known_folds_mos import check_1_dataset_known_folds_mos __all__ = ["check_1_dataset_unknown_folds_mos", "estimate_n_evaluations"] @@ -63,7 +64,10 @@ def check_1_dataset_unknown_folds_mos( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add fold_score_bounds when, for example, the minimum and - the maximum scores over the folds are also provided. + the maximum scores over the folds are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Note that depending on the size of the dataset (especially the number of minority instances) and the folding configuration, this test might lead to an untractable number of problems to @@ -126,6 +130,9 @@ def check_1_dataset_unknown_folds_mos( >>> result['inconsistency'] # True """ + dataset = translate_metadata(dataset) + folding = translate_metadata(folding) + evaluation = { "dataset": dataset, "folding": folding, diff --git a/mlscorecheck/check/binary/_check_1_testset_no_kfold.py b/mlscorecheck/check/binary/_check_1_testset_no_kfold.py index c411d6b..b64140a 100644 --- a/mlscorecheck/check/binary/_check_1_testset_no_kfold.py +++ b/mlscorecheck/check/binary/_check_1_testset_no_kfold.py @@ -6,7 +6,7 @@ import warnings from ...core import logger, NUMERICAL_TOLERANCE -from ...individual import check_scores_tptn_pairs +from ...individual import check_scores_tptn_pairs, translate_metadata from ...experiments import dataset_statistics __all__ = ["check_1_testset_no_kfold"] @@ -32,7 +32,10 @@ def check_1_testset_no_kfold( 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'), when using f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'beta_positive' and 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty (potentially for each score) numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -90,6 +93,8 @@ def check_1_testset_no_kfold( "no aggregation of any kind." 
) + testset = translate_metadata(testset) + if ("p" not in testset or "n" not in testset) and ("name" not in testset): raise ValueError('either "p" and "n" or "name" should be specified') diff --git a/mlscorecheck/check/binary/_check_n_datasets_mos_kfold_som.py b/mlscorecheck/check/binary/_check_n_datasets_mos_kfold_som.py index fe46e10..ff71693 100644 --- a/mlscorecheck/check/binary/_check_n_datasets_mos_kfold_som.py +++ b/mlscorecheck/check/binary/_check_n_datasets_mos_kfold_som.py @@ -7,6 +7,7 @@ import copy from ...aggregated import check_aggregated_scores, Experiment +from ...individual import translate_metadata from ...core import NUMERICAL_TOLERANCE __all__ = ["check_n_datasets_mos_kfold_som"] @@ -33,7 +34,10 @@ def check_n_datasets_mos_kfold_som( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``dataset_score_bounds`` when, for example, the minimum - and the maximum scores over the datasets are also provided. + and the maximum scores over the datasets are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: evaluations (list(dict)): the list of evaluation specifications @@ -105,6 +109,8 @@ def check_n_datasets_mos_kfold_som( # True """ + evaluations = translate_metadata(evaluations) + if any(evaluation.get("aggregation", "som") != "som" for evaluation in evaluations): raise ValueError( 'the aggregation specified in each dataset must be "rom" or nothing.' diff --git a/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py b/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py index 2d20457..ebc3a87 100644 --- a/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py +++ b/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py @@ -7,6 +7,7 @@ import copy from ...aggregated import check_aggregated_scores, Experiment +from ...individual import translate_metadata from ...core import NUMERICAL_TOLERANCE __all__ = ["check_n_datasets_mos_known_folds_mos"] @@ -33,7 +34,10 @@ def check_n_datasets_mos_known_folds_mos( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``dataset_score_bounds`` when, for example, the - minimum and the maximum scores over the datasets are also provided. + minimum and the maximum scores over the datasets are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: evaluations (list): The list of evaluation specifications. 
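Translating the whole ``evaluations`` list in one call relies on ``translate_metadata`` (introduced in ``mlscorecheck/individual/_utils.py`` further down in this patch) recursing into lists and nested dictionaries; a small sketch of the intended behaviour:

.. code-block:: Python

    from mlscorecheck.individual import translate_metadata

    evaluations = [
        {"dataset": {"n_minority": 5, "n_majority": 10},
         "folding": {"n_folds": 2, "n_repeats": 1, "strategy": "stratified_sklearn"}}
    ]
    translate_metadata(evaluations)
    # [{'dataset': {'p': 5, 'n': 10},
    #   'folding': {'n_folds': 2, 'n_repeats': 1, 'strategy': 'stratified_sklearn'}}]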
@@ -107,6 +111,8 @@ def check_n_datasets_mos_known_folds_mos( ): raise ValueError("do not specify fold_score_bounds through this interface") + evaluations = translate_metadata(evaluations) + evaluations = copy.deepcopy(evaluations) for evaluation in evaluations: diff --git a/mlscorecheck/check/binary/_check_n_datasets_mos_unknown_folds_mos.py b/mlscorecheck/check/binary/_check_n_datasets_mos_unknown_folds_mos.py index 58b1352..3b8127a 100644 --- a/mlscorecheck/check/binary/_check_n_datasets_mos_unknown_folds_mos.py +++ b/mlscorecheck/check/binary/_check_n_datasets_mos_unknown_folds_mos.py @@ -13,6 +13,7 @@ from ._check_1_dataset_unknown_folds_mos import estimate_n_evaluations from ...core import NUMERICAL_TOLERANCE from ...aggregated import experiment_kfolds_generator +from ...individual import translate_metadata __all__ = ["check_n_datasets_mos_unknown_folds_mos", "estimate_n_experiments"] @@ -27,6 +28,9 @@ def estimate_n_experiments(evaluations: list, available_scores: list = None) -> Returns: int: the estimated number of different fold configurations. """ + + evaluations = translate_metadata(evaluations) + available_scores = [] if available_scores is None else available_scores counts = [ @@ -63,7 +67,10 @@ def check_n_datasets_mos_unknown_folds_mos( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add dataset_score_bounds when, for example, the minimum and - the maximum scores over the datasets are also provided. + the maximum scores over the datasets are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Note that depending on the size of the dataset (especially the number of minority instances) and the folding configuration, this test might lead to an untractable number of problems to @@ -130,6 +137,8 @@ def check_n_datasets_mos_unknown_folds_mos( >>> result['inconsistency'] # True """ + evaluations = translate_metadata(evaluations) + if any(evaluation.get("aggregation", "mos") != "mos" for evaluation in evaluations): raise ValueError( 'the aggregation specified in each dataset must be "mor" or nothing.' diff --git a/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py b/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py index 4502629..a541777 100644 --- a/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py +++ b/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py @@ -7,7 +7,7 @@ import copy from ...core import NUMERICAL_TOLERANCE -from ...individual import check_scores_tptn_pairs +from ...individual import check_scores_tptn_pairs, translate_metadata from ...aggregated import Experiment __all__ = ["check_n_datasets_som_kfold_som"] @@ -34,7 +34,10 @@ def check_n_datasets_som_kfold_som( 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'), when using f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'beta_positive' and 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. 
This is @@ -97,6 +100,8 @@ def check_n_datasets_som_kfold_som( >>> result['inconsistency'] # True """ + evaluations = translate_metadata(evaluations) + if any(evaluation.get("aggregation", "som") != "som" for evaluation in evaluations): raise ValueError( "the aggregation specifications cannot be anything else but 'rom'" diff --git a/mlscorecheck/check/binary/_check_n_testsets_mos_no_kfold.py b/mlscorecheck/check/binary/_check_n_testsets_mos_no_kfold.py index 1c23180..bad793d 100644 --- a/mlscorecheck/check/binary/_check_n_testsets_mos_no_kfold.py +++ b/mlscorecheck/check/binary/_check_n_testsets_mos_no_kfold.py @@ -5,6 +5,7 @@ """ from ...aggregated import check_aggregated_scores, Experiment, Dataset +from ...individual import translate_metadata from ...core import NUMERICAL_TOLERANCE __all__ = ["check_n_testsets_mos_no_kfold"] @@ -30,7 +31,10 @@ def check_n_testsets_mos_no_kfold( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``testset_score_bounds`` when, for example, the minimum - and the maximum scores over the testsets are also provided. + and the maximum scores over the testsets are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: testsets (list(dict)): the list of testset specifications @@ -90,6 +94,8 @@ def check_n_testsets_mos_no_kfold( # True """ + testsets = translate_metadata(testsets) + datasets = [Dataset(**dataset) for dataset in testsets] evaluations = [ diff --git a/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py b/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py index c02f74c..d928f36 100644 --- a/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py +++ b/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py @@ -5,7 +5,7 @@ """ from ...core import NUMERICAL_TOLERANCE -from ...individual import check_scores_tptn_pairs +from ...individual import check_scores_tptn_pairs, translate_metadata from ...aggregated import Experiment, Dataset __all__ = ["check_n_testsets_som_no_kfold"] @@ -32,7 +32,10 @@ def check_n_testsets_som_no_kfold( 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'), when using f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'beta_positive' and 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -83,6 +86,8 @@ def check_n_testsets_som_no_kfold( # True """ + testsets = translate_metadata(testsets) + datasets = [Dataset(**dataset) for dataset in testsets] evaluations = [ diff --git a/mlscorecheck/check/bundles/ehg/_tpehg.py b/mlscorecheck/check/bundles/ehg/_tpehg.py index e3008fe..f667bf7 100644 --- a/mlscorecheck/check/bundles/ehg/_tpehg.py +++ b/mlscorecheck/check/bundles/ehg/_tpehg.py @@ -23,7 +23,10 @@ def check_tpehg(scores: dict, Args: scores (dict(str,float)): the dictionary of scores (supports only 'acc', 'sens', 'spec', - 'bacc') + 'bacc'). 
Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainties n_folds (int): the number of folds n_repeats (int): the number of repetitions diff --git a/mlscorecheck/check/bundles/retina/_chasedb1.py b/mlscorecheck/check/bundles/retina/_chasedb1.py index fb0a0e3..c1b96e2 100644 --- a/mlscorecheck/check/bundles/retina/_chasedb1.py +++ b/mlscorecheck/check/bundles/retina/_chasedb1.py @@ -112,9 +112,12 @@ def check_chasedb1_vessel_aggregated_som(imageset, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -243,9 +246,12 @@ def check_chasedb1_vessel_image(image_identifier: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/bundles/retina/_diaretdb0.py b/mlscorecheck/check/bundles/retina/_diaretdb0.py index 7124fe3..d31837e 100644 --- a/mlscorecheck/check/bundles/retina/_diaretdb0.py +++ b/mlscorecheck/check/bundles/retina/_diaretdb0.py @@ -77,9 +77,12 @@ def check_diaretdb0_class_som(subset: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. 
This is @@ -149,7 +152,10 @@ def check_diaretdb0_class_mos(subset: str, 'hardexudates'|'softexudates'|'hemorrhages'|'redsmalldots'), a list if a list of classes is treated as positive scores (dict(str,float)): the scores to be tested (supports only 'acc', 'sens', 'spec', - 'bacc') + 'bacc'). Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty score_bounds (dict(str,tuple(float,float))): the potential bounds on the scores of the images diff --git a/mlscorecheck/check/bundles/retina/_diaretdb1.py b/mlscorecheck/check/bundles/retina/_diaretdb1.py index c1ef1d1..4375816 100644 --- a/mlscorecheck/check/bundles/retina/_diaretdb1.py +++ b/mlscorecheck/check/bundles/retina/_diaretdb1.py @@ -128,9 +128,12 @@ def check_diaretdb1_class(*, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -201,9 +204,12 @@ def check_diaretdb1_segmentation_image_assumption(*, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -269,9 +275,12 @@ def check_diaretdb1_segmentation_image(*, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. 
This is diff --git a/mlscorecheck/check/bundles/retina/_drishti_gs.py b/mlscorecheck/check/bundles/retina/_drishti_gs.py index a15d166..16e4a74 100644 --- a/mlscorecheck/check/bundles/retina/_drishti_gs.py +++ b/mlscorecheck/check/bundles/retina/_drishti_gs.py @@ -73,9 +73,12 @@ def check_drishti_gs_segmentation_image(image_identifier: str, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -144,7 +147,10 @@ def check_drishti_gs_segmentation_aggregated_mos(subset, the soft segmentation ground truth image at threshold*255 target (str): the target anatomical part ('OD'/'OC') scores (dict(str,float)): the scores to be tested (supports only 'acc', 'sens', 'spec', - 'bacc') + 'bacc'). Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores score_bounds (dict(str,tuple(float,float))): the potential bounds on the scores of the images @@ -208,9 +214,12 @@ def check_drishti_gs_segmentation_aggregated_som(subset: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/bundles/retina/_drive.py b/mlscorecheck/check/bundles/retina/_drive.py index 8a6cb9e..cab2f3b 100644 --- a/mlscorecheck/check/bundles/retina/_drive.py +++ b/mlscorecheck/check/bundles/retina/_drive.py @@ -129,9 +129,12 @@ def check_drive_vessel_aggregated_som_assumption(imageset, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. 
Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -188,9 +191,12 @@ def check_drive_vessel_image_assumption(image_identifier: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -334,9 +340,12 @@ def check_drive_vessel_image(image_identifier: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/bundles/retina/_hrf.py b/mlscorecheck/check/bundles/retina/_hrf.py index b8297a2..dba373a 100644 --- a/mlscorecheck/check/bundles/retina/_hrf.py +++ b/mlscorecheck/check/bundles/retina/_hrf.py @@ -57,7 +57,10 @@ def check_hrf_vessel_aggregated_mos_assumption(imageset, Args: imageset (str|list): 'all' or the list of identifiers of images (e.g. ['13_h', '01_g']) assumption (str): the assumption on the region of evaluation to test ('fov'/'all') - scores (dict): the scores to check (supports only 'acc', 'sens', 'spec', 'bacc') + scores (dict): the scores to check (supports only 'acc', 'sens', 'spec', 'bacc'). Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. 
eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores score_bounds (dict(str,tuple(float,float))): the potential bounds on the scores of the images @@ -118,9 +121,12 @@ def check_hrf_vessel_aggregated_som_assumption(imageset, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -175,9 +181,12 @@ def check_hrf_vessel_image_assumption(image_identifier: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): the additional numerical tolerance @@ -305,9 +314,12 @@ def check_hrf_vessel_image(image_identifier: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/bundles/retina/_stare.py b/mlscorecheck/check/bundles/retina/_stare.py index 30eea19..df66355 100644 --- a/mlscorecheck/check/bundles/retina/_stare.py +++ b/mlscorecheck/check/bundles/retina/_stare.py @@ -49,7 +49,11 @@ def check_stare_vessel_aggregated_mos(imageset, imageset (str|list): 'all' if all images are used, or a list of identifiers of images (e.g. ['im0082', 'im0235']) annotator (str): the annotation to be used ('ah'/'vk') - scores (dict): the scores to check (supports only 'acc', 'sens', 'spec', 'bacc') + scores (dict): the scores to check (supports only 'acc', 'sens', 'spec', 'bacc'). 
+ Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores score_bounds (dict(str,tuple(float,float))): the potential bounds on the scores of the images @@ -111,9 +115,12 @@ def check_stare_vessel_aggregated_som(imageset, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is @@ -240,9 +247,12 @@ def check_stare_vessel_image(image_identifier: str, scores (dict(str,float)): the scores to be tested ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float): the numerical uncertainty numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/bundles/skinlesion/_isic2016.py b/mlscorecheck/check/bundles/skinlesion/_isic2016.py index f3e6bba..3608059 100644 --- a/mlscorecheck/check/bundles/skinlesion/_isic2016.py +++ b/mlscorecheck/check/bundles/skinlesion/_isic2016.py @@ -20,9 +20,12 @@ def check_isic2016(*, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. 
This is diff --git a/mlscorecheck/check/bundles/skinlesion/_isic2017.py b/mlscorecheck/check/bundles/skinlesion/_isic2017.py index f302b16..099e1fc 100644 --- a/mlscorecheck/check/bundles/skinlesion/_isic2017.py +++ b/mlscorecheck/check/bundles/skinlesion/_isic2017.py @@ -55,9 +55,12 @@ def check_isic2017(*, scores (dict): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_macro.py b/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_macro.py index 94d8f22..1e455f7 100644 --- a/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_macro.py +++ b/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_macro.py @@ -34,6 +34,10 @@ def check_1_dataset_known_folds_mos_macro( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``class_score_bounds`` or ``fold_score_bounds`` when, for example, the minimum and the maximum scores over the classes or folds are available. + Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: testset (dict): the specification of the testset diff --git a/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_micro.py b/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_micro.py index 6534ebe..ce659ce 100644 --- a/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_micro.py +++ b/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_mos_micro.py @@ -32,7 +32,10 @@ def check_1_dataset_known_folds_mos_micro( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``fold_score_bounds`` when, for example, the minimum - and the maximum scores over the folds are available. + and the maximum scores over the folds are available. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: dataset (dict): The specification of the dataset. diff --git a/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_som_macro.py b/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_som_macro.py index a3aae1d..2618c9e 100644 --- a/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_som_macro.py +++ b/mlscorecheck/check/multiclass/_check_1_dataset_known_folds_som_macro.py @@ -34,7 +34,10 @@ def check_1_dataset_known_folds_som_macro( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. 
For a stronger test, one can add ``class_score_bounds`` when, for example, the minimum - and the maximum scores over the classes are available. + and the maximum scores over the classes are available. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: dataset (dict): The specification of the dataset. diff --git a/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_macro.py b/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_macro.py index 927a7d9..ab82c3e 100644 --- a/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_macro.py +++ b/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_macro.py @@ -34,7 +34,10 @@ class level scores on one single multiclass dataset. likely that there will be a configuration matching the scores provided. In order to increase the strength of the test, one can add ``class_scores_bounds`` when, for example, besides the average score, the minimum and the maximum scores over the classes - are also provided. + are also provided. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. Args: testset (dict): the specification of the testset diff --git a/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_micro.py b/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_micro.py index 8414f60..8ee06f6 100644 --- a/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_micro.py +++ b/mlscorecheck/check/multiclass/_check_1_testset_no_kfold_micro.py @@ -30,9 +30,12 @@ def check_1_testset_no_kfold_micro( scores (dict(str,float)): the scores to check ('acc', 'sens', 'spec', 'bacc', 'npv', 'ppv', 'f1', 'fm', 'f1n', 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', - 'bm', 'pt', 'dor', 'ji', 'kappa'), when using - f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. + 'bm', 'pt', 'dor', 'ji', 'kappa'). When using f-beta + positive or f-beta negative, also set 'beta_positive' and + 'beta_negative'. Full names in camel case, like + 'positive_predictive_value', synonyms, like 'true_positive_rate' + or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/individual/_utils.py b/mlscorecheck/individual/_utils.py index 657dc2f..83140ad 100644 --- a/mlscorecheck/individual/_utils.py +++ b/mlscorecheck/individual/_utils.py @@ -24,6 +24,7 @@ "is_less_than_zero", "is_zero", "unify_results", + "translate_metadata" ] solutions = solution_specifications @@ -34,6 +35,36 @@ functions = score_functions_without_complements functions_standardized = score_functions_standardized_without_complements +def translate_metadata(original): + """ + Translates the metadata: internally, p is used instead of n_positive, n_minority or + n_1, and similarly, n is used instead of n_negative, n_majority or n_0.
+ + Args: + original (dict(str, int)|list): the original metadata specification + + Returns: + dict(str, int)|list: the translated metadata specification + """ + + if isinstance(original, dict): + result = {} + for key, val in original.items(): + if isinstance(val, dict): + result[key] = translate_metadata(val) + else: + if key in ('n_positive', 'n_minority', 'n_1'): + result['p'] = val + elif key in ('n_negative', 'n_majority', 'n_0'): + result['n'] = val + else: + result[key] = val + elif isinstance(original, list): + result = [translate_metadata(item) for item in original] + else: + result = original + + return result def resolve_aliases_and_complements(scores: dict) -> dict: """ diff --git a/mlscorecheck/scores/scores.json b/mlscorecheck/scores/scores.json index 16482ea..f95a6a8 100644 --- a/mlscorecheck/scores/scores.json +++ b/mlscorecheck/scores/scores.json @@ -48,7 +48,7 @@ "formula": "tp/p", "args_standardized": ["tp", "p"], "formula_standardized": "tp/p", - "synonyms": ["recall", "true_positive_rate"], + "synonyms": ["recall", "true_positive_rate", "tpr"], "polynomial_equation": "sens*p - tp", "higher_better": true, "description": "The proportion of correctly classified positive items.", @@ -96,7 +96,7 @@ "formula": "tn/n", "args_standardized": ["tn", "n"], "formula_standardized": "tn/n", - "synonyms": ["selectivity", "true_negative_rate"], + "synonyms": ["selectivity", "true_negative_rate", "tnr"], "polynomial_equation": "spec*n - tn", "higher_better": true, "description": "The proportion of correctly classified negative items.", diff --git a/tests/individual/test_utils.py b/tests/individual/test_utils.py index e0c05d1..a1824e8 100644 --- a/tests/individual/test_utils.py +++ b/tests/individual/test_utils.py @@ -12,6 +12,7 @@ is_less_than_zero, unify_results, IntervalUnion, + translate_metadata ) from mlscorecheck.scores import score_functions_with_solutions, score_specifications @@ -22,6 +23,23 @@ random_seeds = [5] +def test_translate_metadata(): + """ + Testing the metadata translation + """ + + result = translate_metadata({'n_negative': 10, 'n_minority': 5}) + + assert len(result) == 2 + assert 'n' in result + assert 'p' in result + + result = translate_metadata([1, 2, 3]) + + assert result == [1, 2, 3] + + assert 1 == translate_metadata(1) + def test_resolve_aliases_and_complements(): """ Resolve the score aliases and complemnets From d92a44087bf30dfa4c234b8b37ccfd7965e0c7c1 Mon Sep 17 00:00:00 2001 From: Gyorgy Kovacs Date: Sun, 18 Feb 2024 12:31:21 +0100 Subject: [PATCH 2/5] pylint added to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index adef61b..1dc7c77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ sphinx sphinx-gallery sphinx_rtd_theme sympy +pylint From 3dadeea3de0165ab6316148d453c633885a2c91d Mon Sep 17 00:00:00 2001 From: Gyorgy Kovacs Date: Sun, 18 Feb 2024 12:41:51 +0100 Subject: [PATCH 3/5] random states added to tests --- .pylintrc | 2 ++ mlscorecheck/aggregated/_fold_enumeration.py | 12 ++++++++---- .../binary/_check_n_datasets_mos_known_folds_mos.py | 7 +++---- .../check/binary/_check_n_datasets_som_kfold_som.py | 9 +++++---- .../check/binary/_check_n_testsets_som_no_kfold.py | 9 +++++---- mlscorecheck/check/bundles/retina/_hrf.py | 8 ++++---- tests/aggregated/test_check_aggregated_scores.py | 5 ++++- tests/aggregated/test_evaluation.py | 4 +++- tests/aggregated/test_experiment.py | 4 +++- tests/check/multiclass/__init__.py | 2 +- 
.../test_1_dataset_known_folds_som_micro.py | 1 - tests/check/multiclass/test_evaluate.py | 2 -- 12 files changed, 38 insertions(+), 27 deletions(-) diff --git a/.pylintrc b/.pylintrc index 1ed3d17..3abc574 100644 --- a/.pylintrc +++ b/.pylintrc @@ -6,3 +6,5 @@ ignored-modules = numpy # Minimum lines number of a similarity. min-similarity-lines=30 + +disable = too-many-arguments \ No newline at end of file diff --git a/mlscorecheck/aggregated/_fold_enumeration.py b/mlscorecheck/aggregated/_fold_enumeration.py index e1898ea..1ab8ec6 100644 --- a/mlscorecheck/aggregated/_fold_enumeration.py +++ b/mlscorecheck/aggregated/_fold_enumeration.py @@ -370,7 +370,7 @@ def experiment_kfolds_generator(experiment: dict, available_scores: list): "aggregation": experiment["aggregation"], } -def multiclass_fold_partitioning_generator_22(n0: int, n1: int, c0: int) -> dict: +def multiclass_fold_partitioning_generator_22(n0: int, n1: int, c0: int): """ Generates the configurations for two folds of cardinalities n0 and n1 and two classes of cardinalities c0 and n0 + n1 - c0 @@ -392,7 +392,7 @@ def multiclass_fold_partitioning_generator_22(n0: int, n1: int, c0: int) -> dict 1: (c0 - c_00, n1 - c0 + c_00) } -def multiclass_fold_partitioning_generator_2n(n0: int, n1: int, cs: list) -> dict: +def multiclass_fold_partitioning_generator_2n(n0: int, n1: int, cs: list): """ Generates the configurations for two folds of cardinalities n0 and n1 and a list of classes with sizes in cs @@ -409,13 +409,17 @@ def multiclass_fold_partitioning_generator_2n(n0: int, n1: int, cs: list) -> dic if len(cs) == 2: yield part else: - for part_deep in multiclass_fold_partitioning_generator_2n(part[0][1], part[1][1], cs[1:]): + for part_deep in multiclass_fold_partitioning_generator_2n( + part[0][1], + part[1][1], + cs[1:] + ): yield { 0: (part[0][0], *(part_deep[0])), 1: (part[1][0], *(part_deep[1])) } -def multiclass_fold_partitioning_generator_kn(ns: list, cs: list) -> dict: +def multiclass_fold_partitioning_generator_kn(ns: list, cs: list): """ Generates the configurations for a list of folds of sizes ns and a list of classes with sizes in cs diff --git a/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py b/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py index ebc3a87..9ab1169 100644 --- a/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py +++ b/mlscorecheck/check/binary/_check_n_datasets_mos_known_folds_mos.py @@ -34,10 +34,9 @@ def check_n_datasets_mos_known_folds_mos( The test can only check the consistency of the 'acc', 'sens', 'spec' and 'bacc' scores. For a stronger test, one can add ``dataset_score_bounds`` when, for example, the - minimum and the maximum scores over the datasets are also provided. Full names in camel case, like - 'positive_predictive_value', synonyms, like 'true_positive_rate' - or 'tpr' instead of 'sens' and complements, like - 'false_positive_rate' for (1 - 'spec') can also be used. + minimum and the maximum scores over the datasets are also provided. Full names in camel case, + like 'positive_predictive_value', synonyms, like 'true_positive_rate' or 'tpr' instead of + 'sens' and complements, like 'false_positive_rate' for (1 - 'spec') can also be used. Args: evaluations (list): The list of evaluation specifications. 
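The annotations removed from the partitioning functions in ``_fold_enumeration.py`` above acknowledge that they are generators: they ``yield`` configurations rather than returning a single ``dict``. A self-contained sketch of the two-fold, two-class case; the yielded structure is taken from the diff, while the loop bounds are an assumption:

.. code-block:: Python

    def partition_22(n0: int, n1: int, c0: int):
        """Yields the ways c0 items of class 0 can split across folds of sizes n0, n1."""
        # the class-0 count in fold 0 must leave room for the rest of class 0 in fold 1
        for c_00 in range(max(0, c0 - n1), min(n0, c0) + 1):
            yield {0: (c_00, n0 - c_00), 1: (c0 - c_00, n1 - c0 + c_00)}

    list(partition_22(3, 3, 2))
    # [{0: (0, 3), 1: (2, 1)}, {0: (1, 2), 1: (1, 2)}, {0: (2, 1), 1: (0, 3)}]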
diff --git a/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py b/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py index a541777..2b52834 100644 --- a/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py +++ b/mlscorecheck/check/binary/_check_n_datasets_som_kfold_som.py @@ -34,10 +34,11 @@ def check_n_datasets_som_kfold_som( 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'), when using f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. Full names in camel case, like - 'positive_predictive_value', synonyms, like 'true_positive_rate' - or 'tpr' instead of 'sens' and complements, like - 'false_positive_rate' for (1 - 'spec') can also be used. + 'beta_positive' and 'beta_negative'. Full names in camel case, + like 'positive_predictive_value', synonyms, like + 'true_positive_rate' or 'tpr' instead of 'sens' and + complements, like 'false_positive_rate' for (1 - 'spec') can + also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py b/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py index d928f36..4d9a9df 100644 --- a/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py +++ b/mlscorecheck/check/binary/_check_n_testsets_som_no_kfold.py @@ -32,10 +32,11 @@ def check_n_testsets_som_no_kfold( 'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc', 'bm', 'pt', 'dor', 'ji', 'kappa'), when using f-beta positive or f-beta negative, also set - 'beta_positive' and 'beta_negative'. Full names in camel case, like - 'positive_predictive_value', synonyms, like 'true_positive_rate' - or 'tpr' instead of 'sens' and complements, like - 'false_positive_rate' for (1 - 'spec') can also be used. + 'beta_positive' and 'beta_negative'. Full names in camel case, + like 'positive_predictive_value', synonyms, like + 'true_positive_rate' or 'tpr' instead of 'sens' and + complements, like 'false_positive_rate' for (1 - 'spec') can + also be used. eps (float|dict(str,float)): the numerical uncertainty(ies) of the scores numerical_tolerance (float): in practice, beyond the numerical uncertainty of the scores, some further tolerance is applied. This is diff --git a/mlscorecheck/check/bundles/retina/_hrf.py b/mlscorecheck/check/bundles/retina/_hrf.py index dba373a..1e3a78f 100644 --- a/mlscorecheck/check/bundles/retina/_hrf.py +++ b/mlscorecheck/check/bundles/retina/_hrf.py @@ -57,10 +57,10 @@ def check_hrf_vessel_aggregated_mos_assumption(imageset, Args: imageset (str|list): 'all' or the list of identifiers of images (e.g. ['13_h', '01_g']) assumption (str): the assumption on the region of evaluation to test ('fov'/'all') - scores (dict): the scores to check (supports only 'acc', 'sens', 'spec', 'bacc'). Full names in camel case, like - 'positive_predictive_value', synonyms, like 'true_positive_rate' - or 'tpr' instead of 'sens' and complements, like - 'false_positive_rate' for (1 - 'spec') can also be used. + scores (dict): the scores to check (supports only 'acc', 'sens', 'spec', 'bacc'). Full + names in camel case, like 'positive_predictive_value', synonyms, like + 'true_positive_rate' or 'tpr' instead of 'sens' and complements, like + 'false_positive_rate' for (1 - 'spec') can also be used. 
diff --git a/tests/aggregated/test_check_aggregated_scores.py b/tests/aggregated/test_check_aggregated_scores.py
index 7e50610..06b2310 100644
--- a/tests/aggregated/test_check_aggregated_scores.py
+++ b/tests/aggregated/test_check_aggregated_scores.py
@@ -207,7 +207,10 @@ def test_others():
     """
 
     experiment, scores = generate_experiment(
-        aggregation="som", evaluation_params={"aggregation": "mos"}, return_scores=True
+        aggregation="som",
+        evaluation_params={"aggregation": "mos"},
+        return_scores=True,
+        random_state=5
     )
     with pytest.raises(ValueError):
         check_aggregated_scores(experiment=experiment, scores=scores, eps=1e-4)
diff --git a/tests/aggregated/test_evaluation.py b/tests/aggregated/test_evaluation.py
index 64be9cc..b317135 100644
--- a/tests/aggregated/test_evaluation.py
+++ b/tests/aggregated/test_evaluation.py
@@ -336,6 +336,8 @@ def test_others():
     Testing other functionalities
     """
 
-    evaluation = generate_evaluation(aggregation="som", feasible_fold_score_bounds=True)
+    evaluation = generate_evaluation(aggregation="som",
+                                     feasible_fold_score_bounds=True,
+                                     random_state=5)
     with pytest.raises(ValueError):
         Evaluation(**evaluation)
diff --git a/tests/aggregated/test_experiment.py b/tests/aggregated/test_experiment.py
index f61ead9..50930ce 100644
--- a/tests/aggregated/test_experiment.py
+++ b/tests/aggregated/test_experiment.py
@@ -342,7 +342,9 @@ def test_others():
     """
 
     experiment = generate_experiment(
-        aggregation="som", feasible_dataset_score_bounds=True
+        aggregation="som",
+        feasible_dataset_score_bounds=True,
+        random_state=5
     )
     with pytest.raises(ValueError):
         Experiment(**experiment)
diff --git a/tests/check/multiclass/__init__.py b/tests/check/multiclass/__init__.py
index a05d840..45f894f 100644
--- a/tests/check/multiclass/__init__.py
+++ b/tests/check/multiclass/__init__.py
@@ -1,3 +1,3 @@
 """
 This file turns the multiclass folder into a module
-"""
\ No newline at end of file
+"""
diff --git a/tests/check/multiclass/test_1_dataset_known_folds_som_micro.py b/tests/check/multiclass/test_1_dataset_known_folds_som_micro.py
index 2b4eec1..9ebd55a 100644
--- a/tests/check/multiclass/test_1_dataset_known_folds_som_micro.py
+++ b/tests/check/multiclass/test_1_dataset_known_folds_som_micro.py
@@ -53,4 +53,3 @@ def test_inconsistent(random_seed: int):
     )
 
     assert result["inconsistency"]
-
diff --git a/tests/check/multiclass/test_evaluate.py b/tests/check/multiclass/test_evaluate.py
index 0f02410..5271039 100644
--- a/tests/check/multiclass/test_evaluate.py
+++ b/tests/check/multiclass/test_evaluate.py
@@ -4,8 +4,6 @@
 
 import warnings
 
-import pytest
-
 from ._evaluate import evaluate_timeout
 
 def test_evaluate_timeout():
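Before the documentation patches below, one remark on the ``eps=1e-4`` pinned in ``test_check_aggregated_scores.py`` above: it encodes the rounding uncertainty of the reported scores, so a reported value is accepted whenever some feasible true value lies within ``eps`` of it. A hypothetical helper (not part of the package) illustrating the interval logic:

.. code-block:: Python

    def compatible(reported: float, true_value: float, eps: float = 1e-4) -> bool:
        # a score reported as 0.7654 stands for any true value
        # in [reported - eps, reported + eps]
        return abs(reported - true_value) <= eps

    print(compatible(0.7654, 0.76544))  # True: within the uncertainty
    print(compatible(0.7654, 0.7666))   # False: too far to be a rounding of 0.7654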
From aab44f0953ef7fc0c11f0ea79bbb66315a4757c4 Mon Sep 17 00:00:00 2001
From: Gyorgy Kovacs
Date: Sun, 18 Feb 2024 12:42:40 +0100
Subject: [PATCH 4/5] typo fixes in documentation

---
 README.rst                        | 2 +-
 docs/01c_consistency_checking.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 8582c44..32024f1 100644
--- a/README.rst
+++ b/README.rst
@@ -269,7 +269,7 @@ We note that synonyms and full names are also supported, for example:
 
   * alternatives to ``sens`` are ``sensitivity``, ``true_positive_rate``, ``tpr`` and ``recall``,
   * alternatives to ``spec`` are ``specificity``, ``true_negative_rate``, ``tnr`` and ``selectivity``,
-  * alternative to ``ppv`` are ``positive_predictive_value`` and ``precision``.
+  * alternatives to ``ppv`` are ``positive_predictive_value`` and ``precision``.
 
 Similarly, complements are supported as:
 
diff --git a/docs/01c_consistency_checking.rst b/docs/01c_consistency_checking.rst
index 4b0118e..fb2009c 100644
--- a/docs/01c_consistency_checking.rst
+++ b/docs/01c_consistency_checking.rst
@@ -30,7 +30,7 @@ We note that synonyms and full names are also supported, for example:
 
   * alternatives to ``sens`` are ``sensitivity``, ``true_positive_rate``, ``tpr`` and ``recall``,
   * alternatives to ``spec`` are ``specificity``, ``true_negative_rate``, ``tnr`` and ``selectivity``,
-  * alternative to ``ppv`` are ``positive_predictive_value`` and ``precision``.
+  * alternatives to ``ppv`` are ``positive_predictive_value`` and ``precision``.
 
 Similarly, complements are supported as:
 

From 71257000dee91df5b26b1d01ce31066aae8a0179 Mon Sep 17 00:00:00 2001
From: Gyorgy Kovacs
Date: Sun, 18 Feb 2024 12:43:36 +0100
Subject: [PATCH 5/5] line too long

---
 mlscorecheck/check/binary/_check_1_testset_no_kfold.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mlscorecheck/check/binary/_check_1_testset_no_kfold.py b/mlscorecheck/check/binary/_check_1_testset_no_kfold.py
index b64140a..65544cc 100644
--- a/mlscorecheck/check/binary/_check_1_testset_no_kfold.py
+++ b/mlscorecheck/check/binary/_check_1_testset_no_kfold.py
@@ -32,10 +32,11 @@ def check_1_testset_no_kfold(
                         'fbp', 'fbn', 'upm', 'gm', 'mk', 'lrp', 'lrn', 'mcc',
                         'bm', 'pt', 'dor', 'ji', 'kappa'), when using
                         f-beta positive or f-beta negative, also set
-                        'beta_positive' and 'beta_negative'. Full names in camel case, like
-                        'positive_predictive_value', synonyms, like 'true_positive_rate'
-                        or 'tpr' instead of 'sens' and complements, like
-                        'false_positive_rate' for (1 - 'spec') can also be used.
+                        'beta_positive' and 'beta_negative'. Full names in camel case,
+                        like 'positive_predictive_value', synonyms, like
+                        'true_positive_rate' or 'tpr' instead of 'sens' and
+                        complements, like 'false_positive_rate' for (1 - 'spec') can
+                        also be used.
         eps (float|dict(str,float)): the numerical uncertainty (potentially for each score)
         numerical_tolerance (float): in practice, beyond the numerical uncertainty
                                     of the scores, some further tolerance is applied. This is