Skip to content

Commit

Permalink
Fix/urls (#142)
Browse files Browse the repository at this point in the history
* Update minio URL to reflect new bucket-object structure

* Add configurable base urls for arff and minio links

* Add configurable FastAPI kwargs, currently for root_path

* default target can contain multiple values, standardize null as []

* Update tests to reflect changes in server responses
  • Loading branch information
PGijsbers authored Jan 9, 2024
1 parent 7c0f2de commit 19bab14
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 14 deletions.
4 changes: 4 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ def load_database_configuration(file: Path = Path(__file__).parent / "config.tom
"ok",
)
return database_configuration


def load_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable:
    """Parse the TOML configuration file at ``file`` and return it as a table.

    Defaults to the ``config.toml`` that sits next to this module.
    """
    raw_text = file.read_text()
    return tomllib.loads(raw_text)
6 changes: 6 additions & 0 deletions src/config.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
arff_base_url="https://test.openml.org"
minio_base_url="https://openml1.win.tue.nl"

[fastapi]
root_path=""

[databases.defaults]
host="openml-test-database"
port="3306"
Expand Down
8 changes: 5 additions & 3 deletions src/core/formatting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import html

from config import load_configuration
from schemas.datasets.openml import DatasetFileFormat
from sqlalchemy.engine import Row

Expand All @@ -24,12 +25,13 @@ def _format_parquet_url(dataset: Row) -> str | None:
if dataset.format.lower() != DatasetFileFormat.ARFF:
return None

minio_base_url = "https://openml1.win.tue.nl"
return f"{minio_base_url}/dataset{dataset.did}/dataset_{dataset.did}.pq"
minio_base_url = load_configuration()["minio_base_url"]
prefix = dataset.did // 10_000
return f"{minio_base_url}/{prefix:04d}/{dataset.did:04d}/dataset_{dataset.did}.pq"


def _format_dataset_url(dataset: Row) -> str:
    """Build the public ARFF download URL for ``dataset``.

    The base URL is read from configuration (``arff_base_url``) so different
    deployments can serve files from different hosts without code changes.
    """
    base_url = load_configuration()["arff_base_url"]
    # Escape the name so it is safe to embed in a URL/HTML context.
    filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}"
    # The scraped diff garbled the placeholder here; `filename` (computed
    # above and otherwise unused) is the value this URL must end with.
    return f"{base_url}/data/v1/download/{dataset.file_id}/{filename}"

Expand Down
4 changes: 3 additions & 1 deletion src/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse

import uvicorn
from config import load_configuration
from fastapi import FastAPI
from routers.mldcat_ap.dataset import router as mldcat_ap_router
from routers.openml.datasets import router as datasets_router
Expand Down Expand Up @@ -40,7 +41,8 @@ def _parse_args() -> argparse.Namespace:


def create_api() -> FastAPI:
app = FastAPI()
fastapi_kwargs = load_configuration()["fastapi"]
app = FastAPI(**fastapi_kwargs)

app.include_router(datasets_router)
app.include_router(qualities_router)
Expand Down
4 changes: 2 additions & 2 deletions src/routers/openml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
_format_dataset_url,
_format_error,
_format_parquet_url,
_safe_unquote,
)
from database.datasets import (
_get_qualities_for_datasets,
Expand Down Expand Up @@ -412,6 +411,7 @@ def get_dataset(
ignore_attribute = _csv_as_list(dataset.ignore_attribute, unquote_items=True)
row_id_attribute = _csv_as_list(dataset.row_id_attribute, unquote_items=True)
original_data_url = _csv_as_list(dataset.original_data_url, unquote_items=True)
default_target_attribute = _csv_as_list(dataset.default_target_attribute, unquote_items=True)

# Not sure which properties are set by this bit:
# foreach( $this->xml_fields_dataset['csv'] as $field ) {
Expand All @@ -437,7 +437,7 @@ def get_dataset(
description=description_,
description_version=description.version if description else 0,
tag=tags,
default_target_attribute=_safe_unquote(dataset.default_target_attribute),
default_target_attribute=default_target_attribute,
ignore_attribute=ignore_attribute,
row_id_attribute=row_id_attribute,
url=dataset_url,
Expand Down
6 changes: 3 additions & 3 deletions src/schemas/datasets/openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ class DatasetMetadata(BaseModel):
)
description_version: int = Field(json_schema_extra={"example": 2})
tags: list[str] = Field(json_schema_extra={"example": ["study_1", "uci"]}, alias="tag")
default_target_attribute: str | None = Field(json_schema_extra={"example": "class"})
ignore_attribute: list[str] | None = Field(json_schema_extra={"example": "sensitive_feature"})
row_id_attribute: list[str] | None = Field(json_schema_extra={"example": "ssn"})
default_target_attribute: list[str] = Field(json_schema_extra={"example": "class"})
ignore_attribute: list[str] = Field(json_schema_extra={"example": "sensitive_feature"})
row_id_attribute: list[str] = Field(json_schema_extra={"example": "ssn"})

url: HttpUrl = Field(
json_schema_extra={
Expand Down
6 changes: 3 additions & 3 deletions tests/routers/openml/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ def test_get_dataset(py_api: TestClient) -> None:
"upload_date": "2014-04-06T23:19:24",
"licence": "Public",
"url": "https://test.openml.org/data/v1/download/1/anneal.arff",
"parquet_url": "https://openml1.win.tue.nl/dataset1/dataset_1.pq",
"parquet_url": "https://openml1.win.tue.nl/0000/0001/dataset_1.pq",
"file_id": 1,
"default_target_attribute": "class",
"default_target_attribute": ["class"],
"version_label": "1",
"tag": ["study_14"],
"visibility": "public",
"minio_url": "https://openml1.win.tue.nl/dataset1/dataset_1.pq",
"minio_url": "https://openml1.win.tue.nl/0000/0001/dataset_1.pq",
"status": "in_preparation",
"processing_date": "2024-01-04T10:13:59",
"md5_checksum": "4eaed8b6ec9d8211024b6c089b064761",
Expand Down
9 changes: 7 additions & 2 deletions tests/routers/openml/migration/datasets_migration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,13 @@ def test_dataset_response_is_identical(
if processing_data := new_body.get("processing_date"):
new_body["processing_date"] = str(processing_data).replace("T", " ")
if parquet_url := new_body.get("parquet_url"):
new_body["parquet_url"] = str(parquet_url).replace("https", "http")
bucket, prefix, did, filename = parquet_url.rsplit("/", 3)
        new_body["parquet_url"] = f"{bucket}/dataset{did.lstrip('0')}/{filename}"
new_body["parquet_url"] = new_body["parquet_url"].replace("https", "http")
if minio_url := new_body.get("minio_url"):
new_body["minio_url"] = str(minio_url).replace("https", "http")
bucket, prefix, did, filename = minio_url.rsplit("/", 3)
        new_body["minio_url"] = f"{bucket}/dataset{did.lstrip('0')}/{filename}"
new_body["minio_url"] = new_body["minio_url"].replace("https", "http")

manual = []
# ref test.openml.org/d/33 (contributor) and d/34 (creator)
Expand All @@ -87,6 +91,7 @@ def test_dataset_response_is_identical(

if "description" not in new_body:
new_body["description"] = []

assert original == new_body


Expand Down

0 comments on commit 19bab14

Please sign in to comment.