diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 90a010053..18bf3cc4d 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -654,7 +654,6 @@ def handle_unfulfilled(message, on_auto="warn"):
 
 
 class BenchmarkTask:
-
     def __init__(self, benchmark: Benchmark, task_def, fold):
         """
 
@@ -705,7 +704,6 @@ def load_data(self):
                 "Loaded OpenML dataset for task_id %s.", self._task_def.openml_task_id
             )
         elif hasattr(self._task_def, "openml_dataset_id"):
-            # TODO
            raise NotImplementedError(
                "OpenML datasets without task_id are not supported yet."
            )
diff --git a/amlb/data.py b/amlb/data.py
index e7d05d83f..f65648c63 100644
--- a/amlb/data.py
+++ b/amlb/data.py
@@ -167,9 +167,6 @@ def data_enc(self) -> AM:
 
     @lazy_property
     @profile(logger=log)
     def X_enc(self) -> AM:
-        # TODO: should we use one_hot_encoder here instead?
-        # encoded_cols = [p.label_encoder.transform(self.data[:, p.index]) for p in self.dataset.predictors]
-        # return np.hstack(tuple(col.reshape(-1, 1) for col in encoded_cols))
         predictors_ind = [p.index for p in self.dataset.predictors]
         return self.data_enc[:, predictors_ind]
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index a053bcfe4..22db525c2 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -217,9 +217,6 @@ def __init__(
             If None, defaults to a feature with name "class" or "target",
             or the last feature otherwise.
         features: list[ns | str]
-            #TODO: DEADCODE?
-            I don't see this accessed anywhere, and `features` property is retrieved
-            from split metadata, which also do not reference this.
         type: str, optional
             A valid DatasetType. If not specified, it is inferred by the properties
             of the target column.
@@ -342,7 +339,6 @@ def __repr__(self):
 
 class ArffDataset(FileDataset):
     def __init__(self, train_path, test_path, target=None, features=None, type=None):
-        # todo: handle auto-split (if test_path is None): requires loading the training set, split, save
         super().__init__(
             ArffDatasplit(self, train_path),
             ArffDatasplit(self, test_path),
@@ -422,7 +418,6 @@ def release(self, properties=None):
 
 class CsvDataset(FileDataset):
     def __init__(self, train_path, test_path, target=None, features=None, type=None):
-        # todo: handle auto-split (if test_path is None): requires loading the training set, split, save
         super().__init__(None, None, target=target, features=features, type=type)
         self._train = CsvDatasplit(self, train_path)
         self._test = CsvDatasplit(self, test_path)
diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 18d775c02..9f5839224 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -339,7 +339,7 @@ def _get_split_paths(self, ext=None):
 
 class OpenmlDatasplit(Datasplit):
     def __init__(self, dataset: OpenmlDataset):
-        super().__init__(dataset, "arff")  # TODO: fix format
+        super().__init__(dataset, "arff")
         self._data: dict[str, AM | DF | str] = {}
 
     def data_path(self, format):
diff --git a/amlb/datautils.py b/amlb/datautils.py
index fa1a74394..0cda7d479 100644
--- a/amlb/datautils.py
+++ b/amlb/datautils.py
@@ -141,8 +141,7 @@ def reorder_dataset(
     if os.path.isfile(reordered_path):
         if save:  # reordered file already exists, returning it as there's no data to load here
             return reordered_path
-        else:  # reordered file already exists, use it to load reordered data
-            path = reordered_path
+        path = reordered_path
 
     with open(path) as file:
         df = arff.load(file)
@@ -181,9 +180,6 @@ def reorder_dataset(
            },
            file,
        )
-    # TODO: provide the possibility to return data even if save is set to false,
-    # as the client code doesn't want to have to load the data again,
-    # and may want to benefit from the caching of reordered data for future runs.
     return reordered_path
 
 
@@ -337,8 +333,6 @@ def return_value(v: np.ndarray) -> np.ndarray | str:
         if self._mask_missing or self._encode_missing:
             mask = [v in self.missing_values for v in vector]
             if any(mask):
-                # if self._mask_missing:
-                # missing = vector[mask]
                 vector[mask] = self.missing_replaced_by
         if self.normalize_fn:
             vector = self.normalize_fn(vector)
@@ -369,7 +363,6 @@ def inverse_transform(
         if not self.delegate:
             return vec
 
-        # TODO: handle mask
         vec = np.asarray(vec).astype(self.encoded_type, copy=False)
         return self.delegate.inverse_transform(vec, **params)
 
diff --git a/amlb/resources.py b/amlb/resources.py
index 0e37b240d..d9a763ba1 100644
--- a/amlb/resources.py
+++ b/amlb/resources.py
@@ -144,7 +144,6 @@ def framework_definition(self, name, tag=None):
         frameworks = self._frameworks[tag]
         log.debug("Available framework definitions:\n%s", frameworks)
         framework = next((f for n, f in frameworks if n.lower() == lname), None)
-        # TODO: Clean up this workflow and error messaging as part of #518
         base_framework = next(
             (f for n, f in self._frameworks[default_tag] if n.lower() == lname), None
         )
@@ -334,12 +333,13 @@ def from_configs(*configs: Namespace):
 
 
 def get() -> Resources:
     if __INSTANCE__ is None:
-        # TODO: Instead why not do normal lazy loading pattern?
         raise RuntimeError("No configuration has been loaded yet.")
     return __INSTANCE__
 
 
 def config():
+    if __INSTANCE__ is None:
+        raise RuntimeError("No configuration has been loaded yet.")
     return __INSTANCE__.config
 
diff --git a/amlb/runners/aws.py b/amlb/runners/aws.py
index ca7655186..4446e01ce 100644
--- a/amlb/runners/aws.py
+++ b/amlb/runners/aws.py
@@ -799,9 +799,6 @@ def _start_instance(
                 datetime_iso(micros=True, time_sep="."),
             ).lower()
         )
-        # TODO: don't know if it would be considerably faster to reuse previously stopped instances sometimes
-        # instead of always creating a new one:
-        # would still need to set a new UserData though before restarting the instance.
         ec2_config = rconfig().aws.ec2
         try:
             if ec2_config.subnet_id:
@@ -1621,52 +1618,3 @@ def _ec2_startup_script(self, instance_key, script_params="", timeout_secs=-1):
             if timeout_secs > 0
             else rconfig().aws.max_timeout_seconds,
         )
-
-
-class AWSRemoteBenchmark(Benchmark):
-    # TODO: idea is to handle results progressively on the remote side and push results as soon as they're generated
-    # this would allow to safely run multiple tasks on single AWS instance
-
-    def __init__(
-        self,
-        framework_name,
-        benchmark_name,
-        constraint_name,
-        region=None,
-        job_history: str | None = None,
-    ):
-        self.region = region
-        self.s3 = boto3.resource("s3", region_name=self.region)
-        self.bucket = self._init_bucket()
-        self._download_resources()
-        super().__init__(
-            framework_name, benchmark_name, constraint_name, job_history=job_history
-        )
-
-    def run(self, save_scores=False):
-        super().run(save_scores)
-        self._upload_results()
-
-    def _make_job(self, task_name=None, folds=None):
-        job = super()._make_job(task_name, folds)
-        super_run = job._run
-
-        def new_run():
-            super_run()
-            # self._upload_result()
-
-        job._run = new_run
-        return job
-
-    def _init_bucket(self):
-        pass
-
-    def _download_resources(self):
-        root_key = str_def(rconfig().aws.s3.root_key)
-        benchmark_basename = os.path.basename(self.benchmark_path)
-        self.bucket.upload_file(
-            self.benchmark_path, root_key + ("/".join(["input", benchmark_basename]))
-        )
-
-    def _upload_results(self):
-        pass
diff --git a/amlb/runners/container.py b/amlb/runners/container.py
index 9a7e1a798..94d283b9f 100644
--- a/amlb/runners/container.py
+++ b/amlb/runners/container.py
@@ -92,7 +92,6 @@ def setup(self, mode, upload=False):
             self._upload_image(self.image)
 
     def cleanup(self):
-        # TODO: remove generated script? anything else?
         pass
 
     def run(
@@ -136,7 +135,6 @@ def _run():
                     else rconfig().seed,
                 )
             )
-            # TODO: would be nice to reload generated scores and return them
 
         job = Job(
             rconfig().token_separator.join(