[REF] Upgrade codebase to Pydantic>2 (#389)
* update extra fields config, regex validation

* replace conlist with List and custom AfterValidator

* add type annotation to all schemaKey fields
- now required to prevent the error: pydantic.errors.PydanticUserError: Field 'schemaKey' defined on a base class was overridden by a non-annotated attribute. All field definitions, including overrides, require a type annotation.

* use pydantic.RootModel instead of the removed custom root type (`__root__`)

* update pydantic model method names

* update generate_context to better accommodate changes to Pydantic internals

* update URL type annotations and add notes re: URL types no longer inheriting from str

* remove example portal URL missing TLD
- Pydantic v2's HttpUrl no longer requires one

* shorten test name

* update dependencies including pydantic>2

* replace deprecated DataFrame.applymap call

* require Python >=3.10 for the package and test on 3.11

* add python versions badge

* add README note about pip-compile's Python version awareness

* test validate_unique_list() in pydantic model instance

* remove seemingly unnecessary Optional type hint

* raise explicit exception for non-unique missing value list

---------

Co-authored-by: Sebastian Urchs <surchs@users.noreply.github.com>
alyssadai and surchs authored Nov 15, 2024
1 parent 3156101 commit b1823ee
Showing 11 changed files with 254 additions and 140 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10"]
+        python-version: ["3.10", "3.11"]

     steps:
       - uses: actions/checkout@v4
5 changes: 4 additions & 1 deletion README.md
@@ -4,6 +4,7 @@
 [![Coverage Status](https://coveralls.io/repos/github/neurobagel/bagel-cli/badge.svg?branch=main)](https://coveralls.io/github/neurobagel/bagel-cli?branch=main)
 [![Tests](https://github.com/neurobagel/bagel-cli/actions/workflows/test.yml/badge.svg)](https://github.com/neurobagel/bagel-cli/actions/workflows/test.yml)
 [![Docker Image Version](https://img.shields.io/docker/v/neurobagel/bagelcli?label=docker)](https://hub.docker.com/r/neurobagel/bagelcli/tags)
+[![Python versions](https://img.shields.io/badge/Python-3.10%20%7C%203.11-blue?style=flat)](https://www.python.org)

 </div>

@@ -61,6 +62,8 @@ The `requirements.txt` file is automatically generated from the `setup.cfg`
 constraints. To update it, we use `pip-compile` from the `pip-tools` package.
 Here is how you can use these tools to update the `requirements.txt` file.

+_Note: `pip-compile` will update dependencies based on the Python version of the environment it's running in._
+
 1. Ensure `pip-tools` is installed:
    ```bash
    pip install pip-tools
@@ -72,7 +75,7 @@ Here is how you can use these tools to update the `requirements.txt` file.
 3. The above command only updates the runtime dependencies.
    Now, update the developer dependencies in `dev_requirements.txt`:
    ```bash
-   pip-compile -o dev_requirements.txt --extra all
+   pip-compile -o dev_requirements.txt --extra all --upgrade
    ```

 ## Regenerating the Neurobagel vocabulary file
57 changes: 43 additions & 14 deletions bagel/dictionary_models.py
@@ -1,6 +1,27 @@
-from typing import Dict, Optional, Union
+from typing import Dict, List, Union

-from pydantic import BaseModel, Extra, Field, conlist
+from pydantic import AfterValidator, BaseModel, ConfigDict, Field, RootModel
+from pydantic_core import PydanticCustomError
+from typing_extensions import Annotated
+
+
+def validate_unique_list(values: List[str]) -> List[str]:
+    """
+    Check that provided list only has unique elements.
+    This custom validator is needed because constrained dtypes and their `unique_items` parameter
+    were deprecated in Pydantic v2. This function was adapted from https://github.com/pydantic/pydantic-core/pull/820#issuecomment-1656228704
+    and https://docs.pydantic.dev/latest/concepts/validators/#annotated-validators.
+    See also:
+    - https://docs.pydantic.dev/latest/migration/#changes-to-pydanticfield
+    - https://docs.pydantic.dev/latest/api/types/#pydantic.types.conlist
+    """
+    if len(values) != len(set(values)):
+        raise PydanticCustomError(
+            "unique_list", f"{values} is not a unique list"
+        )
+    return values


 class Identifier(BaseModel):
@@ -27,15 +48,19 @@ class Neurobagel(BaseModel):
         description="The concept or controlled term that describes this column",
         alias="IsAbout",
     )
-    missingValues: conlist(str, unique_items=True) = Field(
-        [],
-        description="A list of unique values that represent "
-        "invalid responses, typos, or missing data",
-        alias="MissingValues",
-    )
+    missingValues: Annotated[
+        List[str],
+        AfterValidator(validate_unique_list),
+        Field(
+            [],
+            description="A list of unique values that represent "
+            "invalid responses, typos, or missing data",
+            alias="MissingValues",
+            json_schema_extra={"uniqueItems": True},
+        ),
+    ]

-    class Config:
-        extra = Extra.forbid
+    model_config = ConfigDict(extra="forbid")


 class CategoricalNeurobagel(Neurobagel):
@@ -66,7 +91,7 @@ class ContinuousNeurobagel(Neurobagel):
 class IdentifierNeurobagel(Neurobagel):
     """A Neurobagel annotation for an identifier column"""

-    identifies: "str" = Field(
+    identifies: str = Field(
         ...,
         description="For identifier columns, the type of observation uniquely identified by this column.",
         alias="Identifies",
@@ -76,7 +101,9 @@ class IdentifierNeurobagel(Neurobagel):
 class ToolNeurobagel(Neurobagel):
     """A Neurobagel annotation for an assessment tool column"""

-    isPartOf: Optional[Identifier] = Field(
+    # NOTE: Optional[Identifier] was removed as part of https://github.com/neurobagel/bagel-cli/pull/389
+    # because we couldn't tell what the Optional was doing
+    isPartOf: Identifier = Field(
         ...,
         description="If the column is a subscale or item of an assessment tool "
         "then the assessment tool should be specified here.",
@@ -123,7 +150,9 @@ class ContinuousColumn(Column):
     )


-class DataDictionary(BaseModel):
+class DataDictionary(
+    RootModel[Dict[str, Union[ContinuousColumn, CategoricalColumn]]]
+):
     """A data dictionary with human and machine readable information for a tabular data file"""

-    __root__: Dict[str, Union[ContinuousColumn, CategoricalColumn]]
+    pass
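
For readers migrating similar code, here is a minimal, self-contained sketch of the two v2 patterns adopted in this file: an `Annotated` `AfterValidator` standing in for v1's `conlist(..., unique_items=True)`, and `RootModel` standing in for the removed `__root__` custom root type. The `Demo` and `Mapping` model names are illustrative, not part of the codebase.

```python
from typing import Annotated, Dict, List

from pydantic import AfterValidator, BaseModel, Field, RootModel, ValidationError


def validate_unique_list(values: List[str]) -> List[str]:
    # Replaces conlist's unique_items=True, which was dropped in Pydantic v2.
    if len(values) != len(set(values)):
        raise ValueError(f"{values} is not a unique list")
    return values


class Demo(BaseModel):
    # The AfterValidator runs after standard List[str] validation succeeds.
    items: Annotated[List[str], AfterValidator(validate_unique_list)] = Field(
        default=[]
    )


class Mapping(RootModel[Dict[str, int]]):
    # RootModel wraps a bare dict payload; the data is exposed as `.root`.
    pass


print(Demo(items=["a", "b"]).items)           # ['a', 'b']
try:
    Demo(items=["a", "a"])
except ValidationError as err:
    print(err.errors()[0]["msg"])             # reports the non-unique list

print(Mapping.model_validate({"x": 1}).root)  # {'x': 1}
```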
28 changes: 16 additions & 12 deletions bagel/models.py
@@ -1,47 +1,49 @@
 import uuid
 from typing import List, Literal, Optional, Union

-from pydantic import BaseModel, Extra, Field, HttpUrl
+from pydantic import BaseModel, ConfigDict, Field, HttpUrl

 from bagel.mappings import NB

 UUID_PATTERN = r"[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$"
 BAGEL_UUID_PATTERN = rf"^{NB.pf}:{UUID_PATTERN}"


-class Bagel(BaseModel, extra=Extra.forbid):
+class Bagel(BaseModel):
     """identifier has to be a valid UUID prepended by the Neurobagel namespace
     by default, a random (uuid4) string UUID will be created"""

     identifier: str = Field(
-        regex=BAGEL_UUID_PATTERN,
+        pattern=BAGEL_UUID_PATTERN,
         default_factory=lambda: NB.pf + ":" + str(uuid.uuid4()),
     )

+    model_config = ConfigDict(extra="forbid")
+

 class ControlledTerm(BaseModel):
     identifier: Union[str, HttpUrl]
     schemaKey: str


 class Sex(ControlledTerm):
-    schemaKey = "Sex"
+    schemaKey: Literal["Sex"] = "Sex"


 class Diagnosis(ControlledTerm):
-    schemaKey = "Diagnosis"
+    schemaKey: Literal["Diagnosis"] = "Diagnosis"


 class SubjectGroup(ControlledTerm):
-    schemaKey = "SubjectGroup"
+    schemaKey: Literal["SubjectGroup"] = "SubjectGroup"


 class Assessment(ControlledTerm):
-    schemaKey = "Assessment"
+    schemaKey: Literal["Assessment"] = "Assessment"


 class Image(ControlledTerm):
-    schemaKey = "Image"
+    schemaKey: Literal["Image"] = "Image"


 class Acquisition(Bagel):
@@ -50,7 +52,7 @@ class Acquisition(Bagel):


 class Pipeline(ControlledTerm):
-    schemaKey = "Pipeline"
+    schemaKey: Literal["Pipeline"] = "Pipeline"


 class CompletedPipeline(Bagel):
@@ -69,15 +71,15 @@ class PhenotypicSession(Session):
     isSubjectGroup: Optional[SubjectGroup] = None
     hasDiagnosis: Optional[List[Diagnosis]] = None
     hasAssessment: Optional[List[Assessment]] = None
-    schemaKey = "PhenotypicSession"
+    schemaKey: Literal["PhenotypicSession"] = "PhenotypicSession"


 class ImagingSession(Session):
     # NOTE: Do imaging session have to have at least one acquisition OR at least one completed pipeline to be valid?
     hasFilePath: Optional[str] = None
     hasAcquisition: Optional[List[Acquisition]] = None
     hasCompletedPipeline: Optional[List[CompletedPipeline]] = None
-    schemaKey = "ImagingSession"
+    schemaKey: Literal["ImagingSession"] = "ImagingSession"


 class Subject(Bagel):
@@ -88,6 +90,8 @@ class Subject(Bagel):

 class Dataset(Bagel):
     hasLabel: str
-    hasPortalURI: Optional[HttpUrl] = None
+    # NOTE: Since Pydantic v2, URL types no longer inherit from `str`
+    # (see https://docs.pydantic.dev/latest/migration/#url-and-dsn-types-in-pydanticnetworks-no-longer-inherit-from-str)
+    hasPortalURI: Optional[Union[str, HttpUrl]] = None
     hasSamples: List[Subject]
     schemaKey: Literal["Dataset"] = "Dataset"
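
A compact illustration of the two changes driving most of this diff, as a sketch under the same assumptions as the models above (trimmed to the relevant fields; the `nb:` prefix in the pattern is illustrative):

```python
from typing import Literal

from pydantic import BaseModel, Field


class ControlledTerm(BaseModel):
    schemaKey: str


# Under Pydantic v2 this override raises PydanticUserError at class definition
# time, because it shadows an annotated base-class field without an annotation:
#
#     class Sex(ControlledTerm):
#         schemaKey = "Sex"

class Sex(ControlledTerm):
    # Literal satisfies the annotation requirement and pins the allowed value.
    schemaKey: Literal["Sex"] = "Sex"


class Bagel(BaseModel):
    # Field's keyword for regex validation was renamed from regex= to pattern=.
    identifier: str = Field(default="nb:1234", pattern=r"^nb:")


print(Sex().schemaKey)     # Sex
print(Bagel().identifier)  # nb:1234
```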
29 changes: 14 additions & 15 deletions bagel/utilities/model_utils.py
@@ -1,3 +1,4 @@
+import inspect
 from pathlib import Path
 from typing import Iterable

@@ -11,24 +12,22 @@


 def generate_context():
-    # Direct copy of the dandi-schema context generation function
+    # Adapted from the dandi-schema context generation function
     # https://github.com/dandi/dandi-schema/blob/c616d87eaae8869770df0cb5405c24afdb9db096/dandischema/metadata.py
     field_preamble = {
         namespace.pf: namespace.url for namespace in ALL_NAMESPACES
     }
     fields = {}
-    for val in dir(models):
-        klass = getattr(models, val)
-        if not isinstance(klass, pydantic.main.ModelMetaclass):
-            continue
-        fields[klass.__name__] = f"{NB.pf}:{klass.__name__}"
-        for name, field in klass.__fields__.items():
-            if name == "schemaKey":
-                fields[name] = "@type"
-            elif name == "identifier":
-                fields[name] = "@id"
-            elif name not in fields:
-                fields[name] = {"@id": f"{NB.pf}:{name}"}
+    for klass_name, klass in inspect.getmembers(models):
+        if inspect.isclass(klass) and issubclass(klass, pydantic.BaseModel):
+            fields[klass_name] = f"{NB.pf}:{klass_name}"
+            for name, field in klass.model_fields.items():
+                if name == "schemaKey":
+                    fields[name] = "@type"
+                elif name == "identifier":
+                    fields[name] = "@id"
+                elif name not in fields:
+                    fields[name] = {"@id": f"{NB.pf}:{name}"}

     field_preamble.update(**fields)

@@ -41,7 +40,7 @@ def add_context_to_graph_dataset(dataset: models.Dataset) -> dict:
     # We can't just exclude_unset here because the identifier and schemaKey
     # for each instance are created as default values and so technically are never set
     # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
-    return {**context, **dataset.dict(exclude_none=True)}
+    return {**context, **dataset.model_dump(exclude_none=True)}


 def get_subs_missing_from_pheno_data(
@@ -80,7 +79,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
     jsonld = file_utils.load_json(file_path)
     jsonld.pop("@context")
     try:
-        jsonld_dataset = models.Dataset.parse_obj(jsonld)
+        jsonld_dataset = models.Dataset.model_validate(jsonld)
     except ValidationError as err:
         typer.echo(
             typer.style(
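
The renames in this file follow the standard v1-to-v2 method mapping (`parse_obj` to `model_validate`, `dict` to `model_dump`, `__fields__` to `model_fields`); a minimal sketch with an illustrative `Point` model:

```python
import inspect
import sys

from pydantic import BaseModel


class Point(BaseModel):
    x: int
    y: int | None = None


p = Point.model_validate({"x": 1})         # v1: Point.parse_obj(...)
print(p.model_dump(exclude_none=True))     # v1: p.dict(...)      -> {'x': 1}
print(Point.model_json_schema()["title"])  # v1: Point.schema()   -> Point

# Model discovery via inspect.getmembers, as in the rewritten generate_context();
# checking pydantic.main.ModelMetaclass is no longer dependable in v2.
this_module = sys.modules[__name__]
for name, klass in inspect.getmembers(this_module, inspect.isclass):
    if issubclass(klass, BaseModel) and klass is not BaseModel:
        print(name, list(klass.model_fields))  # Point ['x', 'y']
```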
15 changes: 7 additions & 8 deletions bagel/utilities/pheno_utils.py
@@ -13,7 +13,7 @@
 from bagel import dictionary_models, mappings
 from bagel.mappings import NB

-DICTIONARY_SCHEMA = dictionary_models.DataDictionary.schema()
+DICTIONARY_SCHEMA = dictionary_models.DataDictionary.model_json_schema()

 AGE_HEURISTICS = {
     "float": NB.pf + ":FromFloat",
@@ -24,10 +24,13 @@
 }


-def validate_portal_uri(portal: str) -> Optional[str]:
+def validate_portal_uri(portal: Optional[str]) -> Optional[str]:
     """Custom validation that portal is a valid HttpUrl"""
+    # NOTE: We need Optional in the validation type below to account for --portal being an optional argument in the pheno command
     try:
-        pydantic.parse_obj_as(Optional[pydantic.HttpUrl], portal)
+        pydantic.TypeAdapter(Optional[pydantic.HttpUrl]).validate_python(
+            portal
+        )
     except pydantic.ValidationError as err:
         raise BadParameter(
             "Not a valid http or https URL: "
@@ -281,11 +284,7 @@ def get_rows_with_empty_strings(df: pd.DataFrame, columns: list) -> list:
     """For specified columns, returns the indices of rows with empty strings"""
     # NOTE: Profile this section if things get slow, transforming "" -> nan and then
     # using .isna() will very likely be much faster
-    empty_row = (
-        df[columns]
-        .applymap(lambda cell: cell == "")
-        .apply(lambda row: any([value for value in row]), axis=1)
-    )
+    empty_row = df[columns].eq("").any(axis=1)
     return list(empty_row[empty_row].index)
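
Two of the swaps above, sketched in isolation. First, `pydantic.parse_obj_as` is gone in v2; `TypeAdapter` validates a value against a bare type (the example URL is illustrative):

```python
from typing import Optional

import pydantic

adapter = pydantic.TypeAdapter(Optional[pydantic.HttpUrl])

print(adapter.validate_python("https://github.com/neurobagel"))  # parsed Url object
print(adapter.validate_python(None))                             # None passes the Optional
try:
    adapter.validate_python("not a url")
except pydantic.ValidationError as err:
    print(err.errors()[0]["type"])                                # url_parsing
```

Second, the deprecated `DataFrame.applymap` chain reduces to vectorized comparisons; a toy frame (not repo data) shows the equivalence:

```python
import pandas as pd

df = pd.DataFrame({"a": ["x", ""], "b": ["y", "y"], "c": ["1", "2"]})
columns = ["a", "b"]

empty_row = df[columns].eq("").any(axis=1)  # True where any selected cell is ""
print(list(empty_row[empty_row].index))     # [1]
```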
