[REF] Upgrade codebase to Pydantic>2 (#389)
* update extra fields config, regex validation

* replace conlist with List and custom AfterValidator

* add type annotation to all schemaKey fields
- now required to prevent the error: pydantic.errors.PydanticUserError: Field 'schemaKey' defined on a base class was overridden by a non-annotated attribute. All field definitions, including overrides, require a type annotation.

* use pydantic.RootModel instead of the removed custom root type (`__root__`)

* update pydantic model method names

* update generate_context to better accommodate changes to Pydantic internals

* update URL type annotations and add notes re: URL types no longer inheriting from str

* remove example portal URL missing TLD
- Pydantic v2's HttpUrl no longer requires one

* shorten test name

* update dependencies including pydantic>2

* replace deprecated DataFrame.applymap call

* require Python >=3.10 for the package and test on 3.11

* add python versions badge

* add README note about pip-compile's Python version awareness

* test validate_unique_list() in pydantic model instance

* remove seemingly unnecessary Optional type hint

* raise explicit exception for non-unique missing value list

---------

Co-authored-by: Sebastian Urchs <surchs@users.noreply.github.com>
alyssadai and surchs authored Nov 15, 2024
1 parent 3156101 commit b1823ee
Showing 11 changed files with 254 additions and 140 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10"]
+        python-version: ["3.10", "3.11"]

     steps:
       - uses: actions/checkout@v4
5 changes: 4 additions & 1 deletion README.md
@@ -4,6 +4,7 @@
 [![Coverage Status](https://coveralls.io/repos/github/neurobagel/bagel-cli/badge.svg?branch=main)](https://coveralls.io/github/neurobagel/bagel-cli?branch=main)
 [![Tests](https://github.com/neurobagel/bagel-cli/actions/workflows/test.yml/badge.svg)](https://github.com/neurobagel/bagel-cli/actions/workflows/test.yml)
 [![Docker Image Version](https://img.shields.io/docker/v/neurobagel/bagelcli?label=docker)](https://hub.docker.com/r/neurobagel/bagelcli/tags)
+[![Python versions](https://img.shields.io/badge/Python-3.10%20%7C%203.11-blue?style=flat)](https://www.python.org)

 </div>

@@ -61,6 +62,8 @@ The `requirements.txt` file is automatically generated from the `setup.cfg`
 constraints. To update it, we use `pip-compile` from the `pip-tools` package.
 Here is how you can use these tools to update the `requirements.txt` file.

+_Note: `pip-compile` will update dependencies based on the Python version of the environment it's running in._
+
 1. Ensure `pip-tools` is installed:
    ```bash
    pip install pip-tools
@@ -72,7 +75,7 @@ Here is how you can use these tools to update the `requirements.txt` file.
 3. The above command only updates the runtime dependencies.
    Now, update the developer dependencies in `dev_requirements.txt`:
    ```bash
-   pip-compile -o dev_requirements.txt --extra all
+   pip-compile -o dev_requirements.txt --extra all --upgrade
    ```

 ## Regenerating the Neurobagel vocabulary file
57 changes: 43 additions & 14 deletions bagel/dictionary_models.py
@@ -1,6 +1,27 @@
-from typing import Dict, Optional, Union
+from typing import Dict, List, Union

-from pydantic import BaseModel, Extra, Field, conlist
+from pydantic import AfterValidator, BaseModel, ConfigDict, Field, RootModel
+from pydantic_core import PydanticCustomError
+from typing_extensions import Annotated
+
+
+def validate_unique_list(values: List[str]) -> List[str]:
+    """
+    Check that provided list only has unique elements.
+    This custom validator is needed because constrained dtypes and their `unique_items` parameter
+    were deprecated in Pydantic v2. This function was adapted from https://github.com/pydantic/pydantic-core/pull/820#issuecomment-1656228704
+    and https://docs.pydantic.dev/latest/concepts/validators/#annotated-validators.
+    See also:
+    - https://docs.pydantic.dev/latest/migration/#changes-to-pydanticfield
+    - https://docs.pydantic.dev/latest/api/types/#pydantic.types.conlist
+    """
+    if len(values) != len(set(values)):
+        raise PydanticCustomError(
+            "unique_list", f"{values} is not a unique list"
+        )
+    return values


 class Identifier(BaseModel):
@@ -27,15 +48,19 @@ class Neurobagel(BaseModel):
         description="The concept or controlled term that describes this column",
         alias="IsAbout",
     )
-    missingValues: conlist(str, unique_items=True) = Field(
-        [],
-        description="A list of unique values that represent "
-        "invalid responses, typos, or missing data",
-        alias="MissingValues",
-    )
+    missingValues: Annotated[
+        List[str],
+        AfterValidator(validate_unique_list),
+        Field(
+            [],
+            description="A list of unique values that represent "
+            "invalid responses, typos, or missing data",
+            alias="MissingValues",
+            json_schema_extra={"uniqueItems": True},
+        ),
+    ]

-    class Config:
-        extra = Extra.forbid
+    model_config = ConfigDict(extra="forbid")


 class CategoricalNeurobagel(Neurobagel):
@@ -66,7 +91,7 @@ class ContinuousNeurobagel(Neurobagel):
 class IdentifierNeurobagel(Neurobagel):
     """A Neurobagel annotation for an identifier column"""

-    identifies: "str" = Field(
+    identifies: str = Field(
         ...,
         description="For identifier columns, the type of observation uniquely identified by this column.",
         alias="Identifies",
@@ -76,7 +101,9 @@ class IdentifierNeurobagel(Neurobagel):
 class ToolNeurobagel(Neurobagel):
     """A Neurobagel annotation for an assessment tool column"""

-    isPartOf: Optional[Identifier] = Field(
+    # NOTE: Optional[Identifier] was removed as part of https://github.com/neurobagel/bagel-cli/pull/389
+    # because we couldn't tell what the Optional was doing
+    isPartOf: Identifier = Field(
         ...,
         description="If the column is a subscale or item of an assessment tool "
         "then the assessment tool should be specified here.",
@@ -123,7 +150,9 @@ class ContinuousColumn(Column):
     )


-class DataDictionary(BaseModel):
+class DataDictionary(
+    RootModel[Dict[str, Union[ContinuousColumn, CategoricalColumn]]]
+):
     """A data dictionary with human and machine readable information for a tabular data file"""

-    __root__: Dict[str, Union[ContinuousColumn, CategoricalColumn]]
+    pass
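
For readers migrating similar code, here is a minimal, self-contained sketch of the two v2 patterns adopted in this file: an `Annotated` `AfterValidator` standing in for v1's `conlist(..., unique_items=True)`, and `RootModel` standing in for the removed `__root__` custom root type. The `Demo` and `Mapping` model names are illustrative, not part of the codebase.

```python
from typing import Annotated, Dict, List

from pydantic import AfterValidator, BaseModel, Field, RootModel, ValidationError


def validate_unique_list(values: List[str]) -> List[str]:
    # Replaces conlist's unique_items=True, which was dropped in Pydantic v2.
    if len(values) != len(set(values)):
        raise ValueError(f"{values} is not a unique list")
    return values


class Demo(BaseModel):
    # The AfterValidator runs after standard List[str] validation succeeds.
    items: Annotated[List[str], AfterValidator(validate_unique_list)] = Field(
        default=[]
    )


class Mapping(RootModel[Dict[str, int]]):
    # RootModel wraps a bare dict payload; the data is exposed as `.root`.
    pass


print(Demo(items=["a", "b"]).items)           # ['a', 'b']
try:
    Demo(items=["a", "a"])
except ValidationError as err:
    print(err.errors()[0]["msg"])             # reports the non-unique list

print(Mapping.model_validate({"x": 1}).root)  # {'x': 1}
```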
28 changes: 16 additions & 12 deletions bagel/models.py
@@ -1,47 +1,49 @@
 import uuid
 from typing import List, Literal, Optional, Union

-from pydantic import BaseModel, Extra, Field, HttpUrl
+from pydantic import BaseModel, ConfigDict, Field, HttpUrl

 from bagel.mappings import NB

 UUID_PATTERN = r"[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"
 BAGEL_UUID_PATTERN = rf"^{NB.pf}:{UUID_PATTERN}"


-class Bagel(BaseModel, extra=Extra.forbid):
+class Bagel(BaseModel):
     """identifier has to be a valid UUID prepended by the Neurobagel namespace
     by default, a random (uuid4) string UUID will be created"""

     identifier: str = Field(
-        regex=BAGEL_UUID_PATTERN,
+        pattern=BAGEL_UUID_PATTERN,
         default_factory=lambda: NB.pf + ":" + str(uuid.uuid4()),
     )

+    model_config = ConfigDict(extra="forbid")
+

 class ControlledTerm(BaseModel):
     identifier: Union[str, HttpUrl]
     schemaKey: str


 class Sex(ControlledTerm):
-    schemaKey = "Sex"
+    schemaKey: Literal["Sex"] = "Sex"


 class Diagnosis(ControlledTerm):
-    schemaKey = "Diagnosis"
+    schemaKey: Literal["Diagnosis"] = "Diagnosis"


 class SubjectGroup(ControlledTerm):
-    schemaKey = "SubjectGroup"
+    schemaKey: Literal["SubjectGroup"] = "SubjectGroup"


 class Assessment(ControlledTerm):
-    schemaKey = "Assessment"
+    schemaKey: Literal["Assessment"] = "Assessment"


 class Image(ControlledTerm):
-    schemaKey = "Image"
+    schemaKey: Literal["Image"] = "Image"


 class Acquisition(Bagel):
@@ -50,7 +52,7 @@ class Acquisition(Bagel):


 class Pipeline(ControlledTerm):
-    schemaKey = "Pipeline"
+    schemaKey: Literal["Pipeline"] = "Pipeline"


 class CompletedPipeline(Bagel):
@@ -69,15 +71,15 @@ class PhenotypicSession(Session):
     isSubjectGroup: Optional[SubjectGroup] = None
     hasDiagnosis: Optional[List[Diagnosis]] = None
     hasAssessment: Optional[List[Assessment]] = None
-    schemaKey = "PhenotypicSession"
+    schemaKey: Literal["PhenotypicSession"] = "PhenotypicSession"


 class ImagingSession(Session):
     # NOTE: Do imaging session have to have at least one acquisition OR at least one completed pipeline to be valid?
     hasFilePath: Optional[str] = None
     hasAcquisition: Optional[List[Acquisition]] = None
     hasCompletedPipeline: Optional[List[CompletedPipeline]] = None
-    schemaKey = "ImagingSession"
+    schemaKey: Literal["ImagingSession"] = "ImagingSession"


 class Subject(Bagel):
@@ -88,6 +90,8 @@ class Subject(Bagel):

 class Dataset(Bagel):
     hasLabel: str
-    hasPortalURI: Optional[HttpUrl] = None
+    # NOTE: Since Pydantic v2, URL types no longer inherit from `str`
+    # (see https://docs.pydantic.dev/latest/migration/#url-and-dsn-types-in-pydanticnetworks-no-longer-inherit-from-str)
+    hasPortalURI: Optional[Union[str, HttpUrl]] = None
     hasSamples: List[Subject]
     schemaKey: Literal["Dataset"] = "Dataset"
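
A compact illustration of the two changes driving most of this diff, as a sketch under the same assumptions as the models above (trimmed to the relevant fields; the `nb:` prefix in the pattern is illustrative):

```python
from typing import Literal

from pydantic import BaseModel, Field


class ControlledTerm(BaseModel):
    schemaKey: str


# Under Pydantic v2 this override raises PydanticUserError at class definition
# time, because it shadows an annotated base-class field without an annotation:
#
#     class Sex(ControlledTerm):
#         schemaKey = "Sex"

class Sex(ControlledTerm):
    # Literal satisfies the annotation requirement and pins the allowed value.
    schemaKey: Literal["Sex"] = "Sex"


class Bagel(BaseModel):
    # Field's keyword for regex validation was renamed from regex= to pattern=.
    identifier: str = Field(default="nb:1234", pattern=r"^nb:")


print(Sex().schemaKey)     # Sex
print(Bagel().identifier)  # nb:1234
```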
29 changes: 14 additions & 15 deletions bagel/utilities/model_utils.py
@@ -1,3 +1,4 @@
+import inspect
 from pathlib import Path
 from typing import Iterable

@@ -11,24 +12,22 @@


 def generate_context():
-    # Direct copy of the dandi-schema context generation function
+    # Adapted from the dandi-schema context generation function
     # https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
     field_preamble = {
         namespace.pf: namespace.url for namespace in ALL_NAMESPACES
     }
     fields = {}
-    for val in dir(models):
-        klass = getattr(models, val)
-        if not isinstance(klass, pydantic.main.ModelMetaclass):
-            continue
-        fields[klass.__name__] = f"{NB.pf}:{klass.__name__}"
-        for name, field in klass.__fields__.items():
-            if name == "schemaKey":
-                fields[name] = "@type"
-            elif name == "identifier":
-                fields[name] = "@id"
-            elif name not in fields:
-                fields[name] = {"@id": f"{NB.pf}:{name}"}
+    for klass_name, klass in inspect.getmembers(models):
+        if inspect.isclass(klass) and issubclass(klass, pydantic.BaseModel):
+            fields[klass_name] = f"{NB.pf}:{klass_name}"
+            for name, field in klass.model_fields.items():
+                if name == "schemaKey":
+                    fields[name] = "@type"
+                elif name == "identifier":
+                    fields[name] = "@id"
+                elif name not in fields:
+                    fields[name] = {"@id": f"{NB.pf}:{name}"}

     field_preamble.update(**fields)

@@ -41,7 +40,7 @@ def add_context_to_graph_dataset(dataset: models.Dataset) -> dict:
     # We can't just exclude_unset here because the identifier and schemaKey
     # for each instance are created as default values and so technically are never set
     # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
-    return {**context, **dataset.dict(exclude_none=True)}
+    return {**context, **dataset.model_dump(exclude_none=True)}


 def get_subs_missing_from_pheno_data(
@@ -80,7 +79,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
     jsonld = file_utils.load_json(file_path)
     jsonld.pop("@context")
     try:
-        jsonld_dataset = models.Dataset.parse_obj(jsonld)
+        jsonld_dataset = models.Dataset.model_validate(jsonld)
     except ValidationError as err:
         typer.echo(
             typer.style(
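
The renames in this file follow the standard v1-to-v2 method mapping (`parse_obj` to `model_validate`, `dict` to `model_dump`, `__fields__` to `model_fields`); a minimal sketch with an illustrative `Point` model:

```python
import inspect
import sys

from pydantic import BaseModel


class Point(BaseModel):
    x: int
    y: int | None = None


p = Point.model_validate({"x": 1})         # v1: Point.parse_obj(...)
print(p.model_dump(exclude_none=True))     # v1: p.dict(...)      -> {'x': 1}
print(Point.model_json_schema()["title"])  # v1: Point.schema()   -> Point

# Model discovery via inspect.getmembers, as in the rewritten generate_context();
# checking pydantic.main.ModelMetaclass is no longer dependable in v2.
this_module = sys.modules[__name__]
for name, klass in inspect.getmembers(this_module, inspect.isclass):
    if issubclass(klass, BaseModel) and klass is not BaseModel:
        print(name, list(klass.model_fields))  # Point ['x', 'y']
```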
15 changes: 7 additions & 8 deletions bagel/utilities/pheno_utils.py
@@ -13,7 +13,7 @@
 from bagel import dictionary_models, mappings
 from bagel.mappings import NB

-DICTIONARY_SCHEMA = dictionary_models.DataDictionary.schema()
+DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema()

 AGE_HEURISTICS = {
     "float": NB.pf + ":FromFloat",
@@ -24,10 +24,13 @@
 }


-def validate_portal_uri(portal: str) -> Optional[str]:
+def validate_portal_uri(portal: Optional[str]) -> Optional[str]:
     """Custom validation that portal is a valid HttpUrl"""
+    # NOTE: We need Optional in the validation type below to account for --portal being an optional argument in the pheno command
     try:
-        pydantic.parse_obj_as(Optional[pydantic.HttpUrl], portal)
+        pydantic.TypeAdapter(Optional[pydantic.HttpUrl]).validate_python(
+            portal
+        )
     except pydantic.ValidationError as err:
         raise BadParameter(
             "Not a valid http or https URL: "
@@ -281,11 +284,7 @@ def get_rows_with_empty_strings(df: pd.DataFrame, columns: list) -> list:
     """For specified columns, returns the indices of rows with empty strings"""
     # NOTE: Profile this section if things get slow, transforming "" -> nan and then
     # using .isna() will very likely be much faster
-    empty_row = (
-        df[columns]
-        .applymap(lambda cell: cell == "")
-        .apply(lambda row: any([value for value in row]), axis=1)
-    )
+    empty_row = df[columns].eq("").any(axis=1)
     return list(empty_row[empty_row].index)
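
Two of the swaps above, sketched in isolation. First, `pydantic.parse_obj_as` is gone in v2; `TypeAdapter` validates a value against a bare type (the example URL is illustrative):

```python
from typing import Optional

import pydantic

adapter = pydantic.TypeAdapter(Optional[pydantic.HttpUrl])

print(adapter.validate_python("https://github.com/neurobagel"))  # parsed Url object
print(adapter.validate_python(None))                             # None passes the Optional
try:
    adapter.validate_python("not a url")
except pydantic.ValidationError as err:
    print(err.errors()[0]["type"])                                # url_parsing
```

Second, the deprecated `DataFrame.applymap` chain reduces to vectorized comparisons; a toy frame (not repo data) shows the equivalence:

```python
import pandas as pd

df = pd.DataFrame({"a": ["x", ""], "b": ["y", "y"], "c": ["1", "2"]})
columns = ["a", "b"]

empty_row = df[columns].eq("").any(axis=1)  # True where any selected cell is ""
print(list(empty_row[empty_row].index))     # [1]
```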
