Skip to content

Commit

Permalink
Fix/urls (#142)
Browse files Browse the repository at this point in the history
* Update minio URL to reflect new bucket-object structure

* Add configurable base urls for arff and minio links

* Add configurable FastAPI kwargs, currently for root_path

* default target can contain multiple values, standardize null as []

* Update tests to reflect changes in server responses
  • Loading branch information
PGijsbers authored Jan 9, 2024
1 parent 7c0f2de commit 19bab14
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 14 deletions.
4 changes: 4 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ def load_database_configuration(file: Path = Path(__file__).parent / "config.tom
"ok",
)
return database_configuration


def load_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable:
    """Parse the TOML configuration file at ``file`` and return it as a table.

    Defaults to the ``config.toml`` that sits next to this module.
    """
    raw_text = file.read_text()
    return tomllib.loads(raw_text)
6 changes: 6 additions & 0 deletions src/config.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
arff_base_url="https://test.openml.org"
minio_base_url="https://openml1.win.tue.nl"

[fastapi]
root_path=""

[databases.defaults]
host="openml-test-database"
port="3306"
Expand Down
8 changes: 5 additions & 3 deletions src/core/formatting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import html

from config import load_configuration
from schemas.datasets.openml import DatasetFileFormat
from sqlalchemy.engine import Row

Expand All @@ -24,12 +25,13 @@ def _format_parquet_url(dataset: Row) -> str | None:
if dataset.format.lower() != DatasetFileFormat.ARFF:
return None

minio_base_url = "https://openml1.win.tue.nl"
return f"{minio_base_url}/dataset{dataset.did}/dataset_{dataset.did}.pq"
minio_base_url = load_configuration()["minio_base_url"]
prefix = dataset.did // 10_000
return f"{minio_base_url}/{prefix:04d}/{dataset.did:04d}/dataset_{dataset.did}.pq"


def _format_dataset_url(dataset: Row) -> str:
    """Build the public ARFF download URL for ``dataset``.

    The base URL is read from configuration (``arff_base_url``) so different
    deployments can serve files from different hosts without code changes.
    """
    base_url = load_configuration()["arff_base_url"]
    # Escape the name so it is safe to embed in a URL/HTML context.
    filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}"
    # The scraped diff garbled the placeholder here; `filename` (computed
    # above and otherwise unused) is the value this URL must end with.
    return f"{base_url}/data/v1/download/{dataset.file_id}/{filename}"

Expand Down
4 changes: 3 additions & 1 deletion src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse

import uvicorn
from config import load_configuration
from fastapi import FastAPI
from routers.mldcat_ap.dataset import router as mldcat_ap_router
from routers.openml.datasets import router as datasets_router
Expand Down Expand Up @@ -40,7 +41,8 @@ def _parse_args() -> argparse.Namespace:


def create_api() -> FastAPI:
app = FastAPI()
fastapi_kwargs = load_configuration()["fastapi"]
app = FastAPI(**fastapi_kwargs)

app.include_router(datasets_router)
app.include_router(qualities_router)
Expand Down
4 changes: 2 additions & 2 deletions src/routers/openml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
_format_dataset_url,
_format_error,
_format_parquet_url,
_safe_unquote,
)
from database.datasets import (
_get_qualities_for_datasets,
Expand Down Expand Up @@ -412,6 +411,7 @@ def get_dataset(
ignore_attribute = _csv_as_list(dataset.ignore_attribute, unquote_items=True)
row_id_attribute = _csv_as_list(dataset.row_id_attribute, unquote_items=True)
original_data_url = _csv_as_list(dataset.original_data_url, unquote_items=True)
default_target_attribute = _csv_as_list(dataset.default_target_attribute, unquote_items=True)

# Not sure which properties are set by this bit:
# foreach( $this->xml_fields_dataset['csv'] as $field ) {
Expand All @@ -437,7 +437,7 @@ def get_dataset(
description=description_,
description_version=description.version if description else 0,
tag=tags,
default_target_attribute=_safe_unquote(dataset.default_target_attribute),
default_target_attribute=default_target_attribute,
ignore_attribute=ignore_attribute,
row_id_attribute=row_id_attribute,
url=dataset_url,
Expand Down
6 changes: 3 additions & 3 deletions src/schemas/datasets/openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ class DatasetMetadata(BaseModel):
)
description_version: int = Field(json_schema_extra={"example": 2})
tags: list[str] = Field(json_schema_extra={"example": ["study_1", "uci"]}, alias="tag")
default_target_attribute: str | None = Field(json_schema_extra={"example": "class"})
ignore_attribute: list[str] | None = Field(json_schema_extra={"example": "sensitive_feature"})
row_id_attribute: list[str] | None = Field(json_schema_extra={"example": "ssn"})
default_target_attribute: list[str] = Field(json_schema_extra={"example": "class"})
ignore_attribute: list[str] = Field(json_schema_extra={"example": "sensitive_feature"})
row_id_attribute: list[str] = Field(json_schema_extra={"example": "ssn"})

url: HttpUrl = Field(
json_schema_extra={
Expand Down
6 changes: 3 additions & 3 deletions tests/routers/openml/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ def test_get_dataset(py_api: TestClient) -> None:
"upload_date": "2014-04-06T23:19:24",
"licence": "Public",
"url": "https://test.openml.org/data/v1/download/1/anneal.arff",
"parquet_url": "https://openml1.win.tue.nl/dataset1/dataset_1.pq",
"parquet_url": "https://openml1.win.tue.nl/0000/0001/dataset_1.pq",
"file_id": 1,
"default_target_attribute": "class",
"default_target_attribute": ["class"],
"version_label": "1",
"tag": ["study_14"],
"visibility": "public",
"minio_url": "https://openml1.win.tue.nl/dataset1/dataset_1.pq",
"minio_url": "https://openml1.win.tue.nl/0000/0001/dataset_1.pq",
"status": "in_preparation",
"processing_date": "2024-01-04T10:13:59",
"md5_checksum": "4eaed8b6ec9d8211024b6c089b064761",
Expand Down
9 changes: 7 additions & 2 deletions tests/routers/openml/migration/datasets_migration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,13 @@ def test_dataset_response_is_identical(
if processing_data := new_body.get("processing_date"):
new_body["processing_date"] = str(processing_data).replace("T", " ")
if parquet_url := new_body.get("parquet_url"):
new_body["parquet_url"] = str(parquet_url).replace("https", "http")
bucket, prefix, did, filename = parquet_url.rsplit("/", 3)
        new_body["parquet_url"] = f"{bucket}/dataset{did.lstrip('0')}/{filename}"
new_body["parquet_url"] = new_body["parquet_url"].replace("https", "http")
if minio_url := new_body.get("minio_url"):
new_body["minio_url"] = str(minio_url).replace("https", "http")
bucket, prefix, did, filename = minio_url.rsplit("/", 3)
        new_body["minio_url"] = f"{bucket}/dataset{did.lstrip('0')}/{filename}"
new_body["minio_url"] = new_body["minio_url"].replace("https", "http")

manual = []
# ref test.openml.org/d/33 (contributor) and d/34 (creator)
Expand All @@ -87,6 +91,7 @@ def test_dataset_response_is_identical(

if "description" not in new_body:
new_body["description"] = []

assert original == new_body


Expand Down

0 comments on commit 19bab14

Please sign in to comment.