Commit

Ruff formatting

PGijsbers committed Dec 4, 2024
1 parent 5707f2e · commit a7f61cf
Showing 186 changed files with 6,446 additions and 3,815 deletions.
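The commit message does not record the exact invocation, but a tree-wide formatting pass like this one is typically produced by running Ruff's formatter, e.g. `ruff format .`, over the repository. A minimal sketch of the two changes that recur throughout the diffs below, assuming Ruff's default Black-compatible style (88-character line limit, double quotes, magic trailing comma); the function and values here are illustrative, not from the commit:

def some_function(a: str, b: str, c: str, *, keyword: str = "") -> tuple:
    return a, b, c, keyword

# Before formatting (single quotes, one long line):
#   result = some_function('first argument', 'second argument', 'third argument', keyword='value')
# After formatting (double quotes; wrapped, one argument per line,
# with a trailing comma so the layout stays stable under future edits):
result = some_function(
    "first argument",
    "second argument",
    "third argument",
    keyword="value",
)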
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -35,4 +35,4 @@ preferred-citation:
   issue: 101
   volume: 25
   year: 2024
-  url: http://jmlr.org/papers/v25/22-0493.html
\ No newline at end of file
+  url: http://jmlr.org/papers/v25/22-0493.html
2 changes: 1 addition & 1 deletion amlb/__init__.py
@@ -20,5 +20,5 @@
     "AWSBenchmark",
     "SetupMode",
     "TaskResult",
-    "__version__"
+    "__version__",
 ]
493 changes: 343 additions & 150 deletions amlb/benchmark.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions amlb/benchmarks/file.py
@@ -7,7 +7,9 @@
 log = logging.getLogger(__name__)
 
 
-def _find_local_benchmark_definition(name: str, benchmark_definition_dirs: List[str]) -> str:
+def _find_local_benchmark_definition(
+    name: str, benchmark_definition_dirs: List[str]
+) -> str:
     # 'name' should be either a full path to the benchmark,
     # or a filename (without extension) in the benchmark directory.
     if os.path.exists(name):
@@ -20,11 +22,15 @@ def _find_local_benchmark_definition(name: str, benchmark_definition_dirs: List[
             return bf
 
     # should we support s3 and check for s3 path before raising error?
-    raise ValueError(f"Incorrect benchmark name or path `{name}`, name not available in {benchmark_definition_dirs}.")
+    raise ValueError(
+        f"Incorrect benchmark name or path `{name}`, name not available in {benchmark_definition_dirs}."
+    )
 
 
-def load_file_benchmark(name: str, benchmark_definition_dirs: List[str]) -> Tuple[str, Optional[str], List[Namespace]]:
-    """ Loads benchmark from a local file. """
+def load_file_benchmark(
+    name: str, benchmark_definition_dirs: List[str]
+) -> Tuple[str, Optional[str], List[Namespace]]:
+    """Loads benchmark from a local file."""
     benchmark_file = _find_local_benchmark_definition(name, benchmark_definition_dirs)
     log.info("Loading benchmark definitions from %s.", benchmark_file)
     tasks = config_load(benchmark_file)
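For context, the resolution rule wrapped by the reformatted signatures above is unchanged: an existing path is returned as-is, otherwise each definition directory is searched for `<name>.yaml`. A hedged usage sketch (the benchmark name and directory are illustrative, not taken from this commit; the unpacking order follows the annotated return type):

from amlb.benchmarks.file import load_file_benchmark

# A bare name resolves to e.g. resources/benchmarks/validation.yaml if present;
# an existing full path would be returned unchanged.
benchmark_name, benchmark_path, tasks = load_file_benchmark(
    "validation", ["resources/benchmarks"]
)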
50 changes: 33 additions & 17 deletions amlb/benchmarks/openml.py
@@ -14,26 +14,28 @@
 
 
 def is_openml_benchmark(benchmark: str) -> bool:
-    """ Check if 'benchmark' is a valid identifier for an openml task or suite. """
+    """Check if 'benchmark' is a valid identifier for an openml task or suite."""
     return re.match(r"(openml|test\.openml)/[st]/\d+", benchmark) is not None
 
 
 def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]]:
-    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
-    domain, oml_type, oml_id = benchmark.split('/')
+    """Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y."""
+    domain, oml_type, oml_id = benchmark.split("/")
 
     if domain == "test.openml":
         log.debug("Setting openml server to the test server.")
         openml.config.server = "https://test.openml.org/api/v1/xml"
 
     if openml.config.retry_policy != "robot":
         log.debug(
-            "Setting openml retry_policy from '%s' to 'robot'." % openml.config.retry_policy)
+            "Setting openml retry_policy from '%s' to 'robot'."
+            % openml.config.retry_policy
+        )
         openml.config.set_retry_policy("robot")
 
-    if oml_type == 't':
+    if oml_type == "t":
         tasks = load_openml_task(domain, oml_id)
-    elif oml_type == 's':
+    elif oml_type == "s":
         tasks = load_openml_tasks_from_suite(domain, oml_id)
     else:
         raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
@@ -47,21 +49,35 @@ def load_openml_tasks_from_suite(domain: str, oml_id: str) -> list[Namespace]:
     suite = openml.study.get_suite(oml_id)
     # Here we know the (task, dataset) pairs so only download dataset meta-data is sufficient
     tasks = []
-    datasets = cast(pd.DataFrame, openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe'))
-    datasets.set_index('did', inplace=True)
+    datasets = cast(
+        pd.DataFrame,
+        openml.datasets.list_datasets(data_id=suite.data, output_format="dataframe"),
+    )
+    datasets.set_index("did", inplace=True)
     for tid, did in zip(cast(list[int], suite.tasks), cast(list[int], suite.data)):
-        tasks.append(Namespace(name=str_sanitize(datasets.loc[did]['name']),
-                               description=f"{domain}/d/{did}",
-                               openml_task_id=tid,
-                               id="{}.org/t/{}".format(domain, tid)))
+        tasks.append(
+            Namespace(
+                name=str_sanitize(datasets.loc[did]["name"]),
+                description=f"{domain}/d/{did}",
+                openml_task_id=tid,
+                id="{}.org/t/{}".format(domain, tid),
+            )
+        )
     return tasks
 
 
 def load_openml_task(domain: str, oml_id: str) -> list[Namespace]:
     log.info("Loading openml task %s.", oml_id)
     # We first have the retrieve the task because we don't know the dataset id
     t = openml.tasks.get_task(oml_id, download_data=False, download_qualities=False)
-    data = openml.datasets.get_dataset(t.dataset_id, download_data=False, download_qualities=False)
-    return [Namespace(name=str_sanitize(data.name),
-                      description=data.description,
-                      openml_task_id=t.id,
-                      id="{}.org/t/{}".format(domain, t.id))]
+    data = openml.datasets.get_dataset(
+        t.dataset_id, download_data=False, download_qualities=False
+    )
+    return [
+        Namespace(
+            name=str_sanitize(data.name),
+            description=data.description,
+            openml_task_id=t.id,
+            id="{}.org/t/{}".format(domain, t.id),
+        )
+    ]
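For context, the identifier grammar accepted by `is_openml_benchmark` can be exercised standalone with the pattern shown in the diff above; the IDs are illustrative:

import re

pattern = r"(openml|test\.openml)/[st]/\d+"
assert re.match(pattern, "openml/s/271") is not None      # an OpenML suite
assert re.match(pattern, "test.openml/t/59") is not None  # a task on the test server
assert re.match(pattern, "validation") is None            # treated as a file benchmark instead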
18 changes: 7 additions & 11 deletions amlb/benchmarks/parser.py
@@ -8,15 +8,9 @@
 
 
 def benchmark_load(
-    name: str,
-    benchmark_definition_dirs: List[str]
-) -> Tuple[
-    Namespace | None,
-    List[Namespace],
-    str | None,
-    str
-]:
-    """ Loads the benchmark definition for the 'benchmark' cli input string.
+    name: str, benchmark_definition_dirs: List[str]
+) -> Tuple[Namespace | None, List[Namespace], str | None, str]:
+    """Loads the benchmark definition for the 'benchmark' cli input string.
 
     :param name: the value for 'benchmark'
     :param benchmark_definition_dirs: directories in which benchmark definitions can be found
@@ -28,9 +22,11 @@ def benchmark_load(
     if is_openml_benchmark(name):
         benchmark_name, benchmark_path, tasks = load_oml_benchmark(name)
     else:
-        benchmark_name, benchmark_path, tasks = load_file_benchmark(name, benchmark_definition_dirs)
+        benchmark_name, benchmark_path, tasks = load_file_benchmark(
+            name, benchmark_definition_dirs
+        )
 
-    hard_defaults = next((task for task in tasks if task.name == '__defaults__'), None)
+    hard_defaults = next((task for task in tasks if task.name == "__defaults__"), None)
     tasks = [task for task in tasks if task is not hard_defaults]
     for t in tasks:
         t.name = str_sanitize(t.name)
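For context, both identifier styles funnel through `benchmark_load`, which also splits off the optional `__defaults__` task. A hedged sketch: the unpacking order below is inferred from the annotated return type (the return statement itself is not shown in this diff), and the arguments are illustrative:

from amlb.benchmarks.parser import benchmark_load

hard_defaults, tasks, benchmark_path, benchmark_name = benchmark_load(
    "openml/s/271", ["resources/benchmarks"]
)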
61 changes: 37 additions & 24 deletions amlb/data.py
@@ -10,6 +10,7 @@
 which can also be encoded (``y_enc``, ``X_enc``)
 - **Feature** provides metadata for a given feature/column as well as encoding functions.
 """
+
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
@@ -32,8 +33,15 @@
 
 
 class Feature:
-
-    def __init__(self, index: int, name: str, data_type: str | None, values: Iterable[str] | None = None, has_missing_values: bool = False, is_target: bool = False):
+    def __init__(
+        self,
+        index: int,
+        name: str,
+        data_type: str | None,
+        values: Iterable[str] | None = None,
+        has_missing_values: bool = False,
+        is_target: bool = False,
+    ):
         """
         :param index: index of the feature in the full data frame.
         :param name: name of the feature.
@@ -51,31 +59,33 @@ def __init__(self, index: int, name: str, data_type: str | None, values: Iterabl
 
     def is_categorical(self, strict: bool = True) -> bool:
         if strict:
-            return self.data_type == 'category'
+            return self.data_type == "category"
         return self.data_type is not None and not self.is_numerical()
 
     def is_numerical(self) -> bool:
-        return self.data_type in ['int', 'float', 'number']
+        return self.data_type in ["int", "float", "number"]
 
     @lazy_property
     def label_encoder(self) -> Encoder:
-        return Encoder('label' if self.values is not None else 'no-op',
-                       target=self.is_target,
-                       encoded_type=int if self.is_target and not self.is_numerical() else float,
-                       missing_values=[None, np.nan, pd.NA],
-                       missing_policy='mask' if self.has_missing_values else 'ignore',
-                       normalize_fn=Feature.normalize
-                       ).fit(self.values)
+        return Encoder(
+            "label" if self.values is not None else "no-op",
+            target=self.is_target,
+            encoded_type=int if self.is_target and not self.is_numerical() else float,
+            missing_values=[None, np.nan, pd.NA],
+            missing_policy="mask" if self.has_missing_values else "ignore",
+            normalize_fn=Feature.normalize,
+        ).fit(self.values)
 
     @lazy_property
     def one_hot_encoder(self) -> Encoder:
-        return Encoder('one-hot' if self.values is not None else 'no-op',
-                       target=self.is_target,
-                       encoded_type=int if self.is_target and not self.is_numerical() else float,
-                       missing_values=[None, np.nan, pd.NA],
-                       missing_policy='mask' if self.has_missing_values else 'ignore',
-                       normalize_fn=Feature.normalize
-                       ).fit(self.values)
+        return Encoder(
+            "one-hot" if self.values is not None else "no-op",
+            target=self.is_target,
+            encoded_type=int if self.is_target and not self.is_numerical() else float,
+            missing_values=[None, np.nan, pd.NA],
+            missing_policy="mask" if self.has_missing_values else "ignore",
+            normalize_fn=Feature.normalize,
+        ).fit(self.values)
 
     @staticmethod
     def normalize(arr: Iterable[str]) -> np.ndarray:
@@ -87,14 +97,15 @@ def values(self) -> list[str] | None:
 
     @values.setter
     def values(self, values: Iterable[str]) -> None:
-        self._values = Feature.normalize(values).tolist() if values is not None else None
+        self._values = (
+            Feature.normalize(values).tolist() if values is not None else None
+        )
 
     def __repr__(self) -> str:
-        return repr_def(self, 'all')
+        return repr_def(self, "all")
 
 
 class Datasplit(ABC):
-
     def __init__(self, dataset: Dataset, file_format: str):
         """
         :param file_format: the default format of the data file, obtained through the 'path' property.
@@ -143,10 +154,13 @@ def y(self) -> DF:
     @lazy_property
     @profile(logger=log)
     def data_enc(self) -> AM:
-        encoded_cols = [f.label_encoder.transform(self.data.iloc[:, f.index]) for f in self.dataset.features]
+        encoded_cols = [
+            f.label_encoder.transform(self.data.iloc[:, f.index])
+            for f in self.dataset.features
+        ]
         # optimize mem usage : frameworks use either raw data or encoded ones,
         # so we can clear the cached raw data once they've been encoded
-        self.release(['data', 'X', 'y'])
+        self.release(["data", "X", "y"])
         return np.hstack(tuple(col.reshape(-1, 1) for col in encoded_cols))
 
     @lazy_property
@@ -177,7 +191,6 @@ class DatasetType(Enum):
 
 
 class Dataset(ABC):
-
     def __init__(self) -> None:
         super().__init__()
 
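For context, a hedged sketch of the lazily built encoders on the reformatted `Feature` class above (the feature is illustrative, and `Encoder` behavior beyond the constructor arguments shown in the diff is an assumption):

from amlb.data import Feature

color = Feature(
    index=0,
    name="color",
    data_type="category",
    values=["red", "green", "blue"],
    has_missing_values=False,
    is_target=True,
)
# label_encoder is a lazy_property: constructed on first access, then cached.
# As a non-numerical target, its encoded_type is int (per the diff above).
encoded = color.label_encoder.transform(["green", "red"])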
5 changes: 3 additions & 2 deletions amlb/datasets/__init__.py
@@ -11,9 +11,10 @@ class DataSourceType(Enum):
 
 
 class DataLoader:
-
     def __init__(self, config):
-        self.openml_loader = OpenmlLoader(api_key=config.openml.apikey, cache_dir=config.input_dir)
+        self.openml_loader = OpenmlLoader(
+            api_key=config.openml.apikey, cache_dir=config.input_dir
+        )
         self.file_loader = FileLoader(cache_dir=config.input_dir)
 
     def load(self, source: DataSourceType, *args, **kwargs):