class JsonschemaValidationErrorType(NamedTuple):
    """
    A named tuple for representing types of `jsonschema.exceptions.ValidationError`
    objects.

    The type of a `jsonschema.exceptions.ValidationError` is decided by the value of
    its `validator` field and the value of its `validator_value` field. The values
    of these fields are bundled in an instance of this named tuple to represent a
    type of `jsonschema.exceptions.ValidationError` objects.

    Equality is stricter than plain tuple equality: the `validator_value` fields
    must be of exactly the same Python type in addition to comparing equal, so,
    for example, `1`, `True`, and `1.0` denote three different error types.
    """

    validator: str
    validator_value: Any

    def __eq__(self, other: object) -> bool:
        """
        Tell whether `other` represents the same type of validation error

        :param other: The object to compare with
        :return: `True` if `other` is a `JsonschemaValidationErrorType` with an
            equal `validator` and a `validator_value` that is of exactly the same
            type as, and equal to, the one of this object; `False` otherwise
        """
        return (
            isinstance(other, JsonschemaValidationErrorType)
            and self.validator == other.validator
            # An exact type match is intended here, not an `isinstance` check
            and type(self.validator_value) is type(other.validator_value)  # noqa: E721
            and self.validator_value == other.validator_value
        )

    def __ne__(self, other: object) -> bool:
        """
        Tell whether `other` represents a different type of validation error

        `tuple` provides its own `__ne__`, so without this override `!=` would fall
        back to plain tuple comparison and could disagree with `__eq__`
        (e.g., for `validator_value` of `1` vs. `True`).

        :param other: The object to compare with
        :return: The negation of the result of `__eq__`
        """
        return not self.__eq__(other)
class _JsonschemaValidationErrorCounts(NamedTuple):
    """
    A tally of JSON schema validation errors, kept per unique error type
    """

    # The unique types of JSON schema validation errors
    types: list[JsonschemaValidationErrorType]

    # The counts of the errors; `counts[i]` is the count for `types[i]`
    counts: list[int]


def get_linkml_err_counts(
    errs: LinkmlValidationErrsType,
) -> list[tuple[JsonschemaValidationErrorType, int]]:
    """
    Tally the given LinkML validation errors by type

    The type of an error is the `JsonschemaValidationErrorType` built from the
    `validator` and `validator_value` attributes of the error's `source`.

    :param errs: A list of LinkML validation errors to be counted
    :return: A list of `(error type, count)` tuples, one per unique error type,
        sorted by the `validator` of the error type and then by descending count
    """
    # Tallies grouped by `validator`. The error types are tracked in lists and
    # matched with `==` one by one instead of being used as dictionary keys.
    tallies: dict[str, _JsonschemaValidationErrorCounts] = {}

    for err in errs:
        jsonschema_err = err.source
        validator = jsonschema_err.validator
        err_type = JsonschemaValidationErrorType(
            validator, jsonschema_err.validator_value
        )

        tally = tallies.get(validator)
        if tally is None:
            # First error seen for this validator
            tallies[validator] = _JsonschemaValidationErrorCounts(
                types=[err_type], counts=[1]
            )
            continue

        # Position of the already recorded, equal error type, if there is one
        position = next(
            (i for i, known in enumerate(tally.types) if known == err_type), None
        )
        if position is None:
            tally.types.append(err_type)
            tally.counts.append(1)
        else:
            tally.counts[position] += 1

    # Flatten the per-validator tallies into `(error type, count)` tuples
    pairs = [
        pair
        for tally in tallies.values()
        for pair in zip(tally.types, tally.counts)
    ]

    # The sort is stable, so ties keep the order in which they were first seen
    pairs.sort(key=lambda pair: (pair[0].validator, -pair[1]))
    return pairs
list[tuple[JsonschemaValidationErrorType, int]], +): + """ + Test the `get_linkml_err_counts` function + + :param error_types: A list of JSON schema validation error types + :param expected_counts: A list of tuples of JSON schema validation error types + and their expected counts + """ + errs = [] + for t in error_types: + # noinspection PyTypeChecker + jsonschema_validation_error = ValidationError( + message="An artificial error", + validator=t.validator, + validator_value=t.validator_value, + ) + validation_result = ValidationResult( + type="jsonschema", + severity=Severity.ERROR, + message="What need to be fixed", + source=jsonschema_validation_error, + ) + errs.append(validation_result) + + counts = get_linkml_err_counts(errs) + assert counts == expected_counts