Commit

Ruff formatting

PGijsbers committed Dec 4, 2024
1 parent 5707f2e · commit a7f61cf
Showing 186 changed files with 6,446 additions and 3,815 deletions.
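The commit message does not record the exact invocation, but a tree-wide formatting pass like this one is typically produced by running Ruff's formatter, e.g. `ruff format .`, over the repository. A minimal sketch of the two changes that recur throughout the diffs below, assuming Ruff's default Black-compatible style (88-character line limit, double quotes, magic trailing comma); the function and values here are illustrative, not from the commit:

def some_function(a: str, b: str, c: str, *, keyword: str = "") -> tuple:
    return a, b, c, keyword

# Before formatting (single quotes, one long line):
#   result = some_function('first argument', 'second argument', 'third argument', keyword='value')
# After formatting (double quotes; wrapped, one argument per line,
# with a trailing comma so the layout stays stable under future edits):
result = some_function(
    "first argument",
    "second argument",
    "third argument",
    keyword="value",
)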
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -35,4 +35,4 @@ preferred-citation:
   issue: 101
   volume: 25
   year: 2024
-  url: http://jmlr.org/papers/v25/22-0493.html
\ No newline at end of file
+  url: http://jmlr.org/papers/v25/22-0493.html
2 changes: 1 addition & 1 deletion amlb/__init__.py
@@ -20,5 +20,5 @@
     "AWSBenchmark",
     "SetupMode",
     "TaskResult",
-    "__version__"
+    "__version__",
 ]
493 changes: 343 additions & 150 deletions amlb/benchmark.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions amlb/benchmarks/file.py
@@ -7,7 +7,9 @@
 log = logging.getLogger(__name__)
 
 
-def _find_local_benchmark_definition(name: str, benchmark_definition_dirs: List[str]) -> str:
+def _find_local_benchmark_definition(
+    name: str, benchmark_definition_dirs: List[str]
+) -> str:
     # 'name' should be either a full path to the benchmark,
     # or a filename (without extension) in the benchmark directory.
     if os.path.exists(name):
@@ -20,11 +22,15 @@ def _find_local_benchmark_definition(name: str, benchmark_definition_dirs: List[
             return bf
 
     # should we support s3 and check for s3 path before raising error?
-    raise ValueError(f"Incorrect benchmark name or path `{name}`, name not available in {benchmark_definition_dirs}.")
+    raise ValueError(
+        f"Incorrect benchmark name or path `{name}`, name not available in {benchmark_definition_dirs}."
+    )
 
 
-def load_file_benchmark(name: str, benchmark_definition_dirs: List[str]) -> Tuple[str, Optional[str], List[Namespace]]:
-    """ Loads benchmark from a local file. """
+def load_file_benchmark(
+    name: str, benchmark_definition_dirs: List[str]
+) -> Tuple[str, Optional[str], List[Namespace]]:
+    """Loads benchmark from a local file."""
     benchmark_file = _find_local_benchmark_definition(name, benchmark_definition_dirs)
     log.info("Loading benchmark definitions from %s.", benchmark_file)
     tasks = config_load(benchmark_file)
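For context, the resolution rule wrapped by the reformatted signatures above is unchanged: an existing path is returned as-is, otherwise each definition directory is searched for `<name>.yaml`. A hedged usage sketch (the benchmark name and directory are illustrative, not taken from this commit; the unpacking order follows the annotated return type):

from amlb.benchmarks.file import load_file_benchmark

# A bare name resolves to e.g. resources/benchmarks/validation.yaml if present;
# an existing full path would be returned unchanged.
benchmark_name, benchmark_path, tasks = load_file_benchmark(
    "validation", ["resources/benchmarks"]
)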
50 changes: 33 additions & 17 deletions amlb/benchmarks/openml.py
@@ -14,26 +14,28 @@
 
 
 def is_openml_benchmark(benchmark: str) -> bool:
-    """ Check if 'benchmark' is a valid identifier for an openml task or suite. """
+    """Check if 'benchmark' is a valid identifier for an openml task or suite."""
     return re.match(r"(openml|test\.openml)/[st]/\d+", benchmark) is not None
 
 
 def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]]:
-    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
-    domain, oml_type, oml_id = benchmark.split('/')
+    """Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y."""
+    domain, oml_type, oml_id = benchmark.split("/")
 
     if domain == "test.openml":
         log.debug("Setting openml server to the test server.")
         openml.config.server = "https://test.openml.org/api/v1/xml"
 
     if openml.config.retry_policy != "robot":
         log.debug(
-            "Setting openml retry_policy from '%s' to 'robot'." % openml.config.retry_policy)
+            "Setting openml retry_policy from '%s' to 'robot'."
+            % openml.config.retry_policy
+        )
         openml.config.set_retry_policy("robot")
 
-    if oml_type == 't':
+    if oml_type == "t":
         tasks = load_openml_task(domain, oml_id)
-    elif oml_type == 's':
+    elif oml_type == "s":
         tasks = load_openml_tasks_from_suite(domain, oml_id)
     else:
         raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
@@ -47,21 +49,35 @@ def load_openml_tasks_from_suite(domain: str, oml_id: str) -> list[Namespace]:
     suite = openml.study.get_suite(oml_id)
     # Here we know the (task, dataset) pairs so only download dataset meta-data is sufficient
     tasks = []
-    datasets = cast(pd.DataFrame, openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe'))
-    datasets.set_index('did', inplace=True)
+    datasets = cast(
+        pd.DataFrame,
+        openml.datasets.list_datasets(data_id=suite.data, output_format="dataframe"),
+    )
+    datasets.set_index("did", inplace=True)
     for tid, did in zip(cast(list[int], suite.tasks), cast(list[int], suite.data)):
-        tasks.append(Namespace(name=str_sanitize(datasets.loc[did]['name']),
-                               description=f"{domain}/d/{did}",
-                               openml_task_id=tid,
-                               id="{}.org/t/{}".format(domain, tid)))
+        tasks.append(
+            Namespace(
+                name=str_sanitize(datasets.loc[did]["name"]),
+                description=f"{domain}/d/{did}",
+                openml_task_id=tid,
+                id="{}.org/t/{}".format(domain, tid),
+            )
+        )
     return tasks
 
 
 def load_openml_task(domain: str, oml_id: str) -> list[Namespace]:
     log.info("Loading openml task %s.", oml_id)
     # We first have the retrieve the task because we don't know the dataset id
     t = openml.tasks.get_task(oml_id, download_data=False, download_qualities=False)
-    data = openml.datasets.get_dataset(t.dataset_id, download_data=False, download_qualities=False)
-    return [Namespace(name=str_sanitize(data.name),
-                      description=data.description,
-                      openml_task_id=t.id,
-                      id="{}.org/t/{}".format(domain, t.id))]
+    data = openml.datasets.get_dataset(
+        t.dataset_id, download_data=False, download_qualities=False
+    )
+    return [
+        Namespace(
+            name=str_sanitize(data.name),
+            description=data.description,
+            openml_task_id=t.id,
+            id="{}.org/t/{}".format(domain, t.id),
+        )
+    ]
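For context, the identifier grammar accepted by `is_openml_benchmark` can be exercised standalone with the pattern shown in the diff above; the IDs are illustrative:

import re

pattern = r"(openml|test\.openml)/[st]/\d+"
assert re.match(pattern, "openml/s/271") is not None      # an OpenML suite
assert re.match(pattern, "test.openml/t/59") is not None  # a task on the test server
assert re.match(pattern, "validation") is None            # treated as a file benchmark instead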
18 changes: 7 additions & 11 deletions amlb/benchmarks/parser.py
@@ -8,15 +8,9 @@
 
 
 def benchmark_load(
-    name: str,
-    benchmark_definition_dirs: List[str]
-) -> Tuple[
-    Namespace | None,
-    List[Namespace],
-    str | None,
-    str
-]:
-    """ Loads the benchmark definition for the 'benchmark' cli input string.
+    name: str, benchmark_definition_dirs: List[str]
+) -> Tuple[Namespace | None, List[Namespace], str | None, str]:
+    """Loads the benchmark definition for the 'benchmark' cli input string.
 
     :param name: the value for 'benchmark'
     :param benchmark_definition_dirs: directories in which benchmark definitions can be found
@@ -28,9 +22,11 @@ def benchmark_load(
     if is_openml_benchmark(name):
         benchmark_name, benchmark_path, tasks = load_oml_benchmark(name)
     else:
-        benchmark_name, benchmark_path, tasks = load_file_benchmark(name, benchmark_definition_dirs)
+        benchmark_name, benchmark_path, tasks = load_file_benchmark(
+            name, benchmark_definition_dirs
+        )
 
-    hard_defaults = next((task for task in tasks if task.name == '__defaults__'), None)
+    hard_defaults = next((task for task in tasks if task.name == "__defaults__"), None)
     tasks = [task for task in tasks if task is not hard_defaults]
     for t in tasks:
         t.name = str_sanitize(t.name)
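For context, both identifier styles funnel through `benchmark_load`, which also splits off the optional `__defaults__` task. A hedged sketch: the unpacking order below is inferred from the annotated return type (the return statement itself is not shown in this diff), and the arguments are illustrative:

from amlb.benchmarks.parser import benchmark_load

hard_defaults, tasks, benchmark_path, benchmark_name = benchmark_load(
    "openml/s/271", ["resources/benchmarks"]
)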
61 changes: 37 additions & 24 deletions amlb/data.py
@@ -10,6 +10,7 @@
 which can also be encoded (``y_enc``, ``X_enc``)
 - **Feature** provides metadata for a given feature/column as well as encoding functions.
 """
+
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
@@ -32,8 +33,15 @@
 
 
 class Feature:
-
-    def __init__(self, index: int, name: str, data_type: str | None, values: Iterable[str] | None = None, has_missing_values: bool = False, is_target: bool = False):
+    def __init__(
+        self,
+        index: int,
+        name: str,
+        data_type: str | None,
+        values: Iterable[str] | None = None,
+        has_missing_values: bool = False,
+        is_target: bool = False,
+    ):
         """
         :param index: index of the feature in the full data frame.
         :param name: name of the feature.
@@ -51,31 +59,33 @@ def __init__(self, index: int, name: str, data_type: str | None, values: Iterabl
 
     def is_categorical(self, strict: bool = True) -> bool:
         if strict:
-            return self.data_type == 'category'
+            return self.data_type == "category"
         return self.data_type is not None and not self.is_numerical()
 
     def is_numerical(self) -> bool:
-        return self.data_type in ['int', 'float', 'number']
+        return self.data_type in ["int", "float", "number"]
 
     @lazy_property
     def label_encoder(self) -> Encoder:
-        return Encoder('label' if self.values is not None else 'no-op',
-                       target=self.is_target,
-                       encoded_type=int if self.is_target and not self.is_numerical() else float,
-                       missing_values=[None, np.nan, pd.NA],
-                       missing_policy='mask' if self.has_missing_values else 'ignore',
-                       normalize_fn=Feature.normalize
-                       ).fit(self.values)
+        return Encoder(
+            "label" if self.values is not None else "no-op",
+            target=self.is_target,
+            encoded_type=int if self.is_target and not self.is_numerical() else float,
+            missing_values=[None, np.nan, pd.NA],
+            missing_policy="mask" if self.has_missing_values else "ignore",
+            normalize_fn=Feature.normalize,
+        ).fit(self.values)
 
     @lazy_property
     def one_hot_encoder(self) -> Encoder:
-        return Encoder('one-hot' if self.values is not None else 'no-op',
-                       target=self.is_target,
-                       encoded_type=int if self.is_target and not self.is_numerical() else float,
-                       missing_values=[None, np.nan, pd.NA],
-                       missing_policy='mask' if self.has_missing_values else 'ignore',
-                       normalize_fn=Feature.normalize
-                       ).fit(self.values)
+        return Encoder(
+            "one-hot" if self.values is not None else "no-op",
+            target=self.is_target,
+            encoded_type=int if self.is_target and not self.is_numerical() else float,
+            missing_values=[None, np.nan, pd.NA],
+            missing_policy="mask" if self.has_missing_values else "ignore",
+            normalize_fn=Feature.normalize,
+        ).fit(self.values)
 
     @staticmethod
     def normalize(arr: Iterable[str]) -> np.ndarray:
@@ -87,14 +97,15 @@ def values(self) -> list[str] | None:
 
     @values.setter
     def values(self, values: Iterable[str]) -> None:
-        self._values = Feature.normalize(values).tolist() if values is not None else None
+        self._values = (
+            Feature.normalize(values).tolist() if values is not None else None
+        )
 
     def __repr__(self) -> str:
-        return repr_def(self, 'all')
+        return repr_def(self, "all")
 
 
 class Datasplit(ABC):
-
     def __init__(self, dataset: Dataset, file_format: str):
         """
         :param file_format: the default format of the data file, obtained through the 'path' property.
@@ -143,10 +154,13 @@ def y(self) -> DF:
     @lazy_property
     @profile(logger=log)
     def data_enc(self) -> AM:
-        encoded_cols = [f.label_encoder.transform(self.data.iloc[:, f.index]) for f in self.dataset.features]
+        encoded_cols = [
+            f.label_encoder.transform(self.data.iloc[:, f.index])
+            for f in self.dataset.features
+        ]
         # optimize mem usage : frameworks use either raw data or encoded ones,
         # so we can clear the cached raw data once they've been encoded
-        self.release(['data', 'X', 'y'])
+        self.release(["data", "X", "y"])
         return np.hstack(tuple(col.reshape(-1, 1) for col in encoded_cols))
 
     @lazy_property
@@ -177,7 +191,6 @@ class DatasetType(Enum):
 
 
 class Dataset(ABC):
-
     def __init__(self) -> None:
         super().__init__()
 
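For context, a hedged sketch of the lazily built encoders on the reformatted `Feature` class above (the feature is illustrative, and `Encoder` behavior beyond the constructor arguments shown in the diff is an assumption):

from amlb.data import Feature

color = Feature(
    index=0,
    name="color",
    data_type="category",
    values=["red", "green", "blue"],
    has_missing_values=False,
    is_target=True,
)
# label_encoder is a lazy_property: constructed on first access, then cached.
# As a non-numerical target, its encoded_type is int (per the diff above).
encoded = color.label_encoder.transform(["green", "red"])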
5 changes: 3 additions & 2 deletions amlb/datasets/__init__.py
@@ -11,9 +11,10 @@ class DataSourceType(Enum):
 
 
 class DataLoader:
-
     def __init__(self, config):
-        self.openml_loader = OpenmlLoader(api_key=config.openml.apikey, cache_dir=config.input_dir)
+        self.openml_loader = OpenmlLoader(
+            api_key=config.openml.apikey, cache_dir=config.input_dir
+        )
         self.file_loader = FileLoader(cache_dir=config.input_dir)
 
     def load(self, source: DataSourceType, *args, **kwargs):