Remove TODOs from the code base, use issue tracker instead #679

Merged
merged 1 commit on Dec 14, 2024
2 changes: 0 additions & 2 deletions amlb/benchmark.py
@@ -654,7 +654,6 @@ def handle_unfulfilled(message, on_auto="warn"):


class BenchmarkTask:

def __init__(self, benchmark: Benchmark, task_def, fold):
"""

@@ -705,7 +704,6 @@ def load_data(self):
"Loaded OpenML dataset for task_id %s.", self._task_def.openml_task_id
)
elif hasattr(self._task_def, "openml_dataset_id"):
# TODO
raise NotImplementedError(
"OpenML datasets without task_id are not supported yet."
)
3 changes: 0 additions & 3 deletions amlb/data.py
@@ -167,9 +167,6 @@ def data_enc(self) -> AM:
@lazy_property
@profile(logger=log)
def X_enc(self) -> AM:
# TODO: should we use one_hot_encoder here instead?
# encoded_cols = [p.label_encoder.transform(self.data[:, p.index]) for p in self.dataset.predictors]
# return np.hstack(tuple(col.reshape(-1, 1) for col in encoded_cols))
predictors_ind = [p.index for p in self.dataset.predictors]
return self.data_enc[:, predictors_ind]

5 changes: 0 additions & 5 deletions amlb/datasets/file.py
@@ -217,9 +217,6 @@ def __init__(
If None, defaults to a feature with name "class" or "target", or the last
feature otherwise.
features: list[ns | str]
#TODO: DEADCODE?
I don't see this accessed anywhere, and `features` property is retrieved
from split metadata, which also do not reference this.
type: str, optional
A valid DatasetType. If not specified, it is inferred by the properties of the
target column.
@@ -342,7 +339,6 @@ def __repr__(self):

class ArffDataset(FileDataset):
def __init__(self, train_path, test_path, target=None, features=None, type=None):
# todo: handle auto-split (if test_path is None): requires loading the training set, split, save
super().__init__(
ArffDatasplit(self, train_path),
ArffDatasplit(self, test_path),
@@ -422,7 +418,6 @@ def release(self, properties=None):

class CsvDataset(FileDataset):
def __init__(self, train_path, test_path, target=None, features=None, type=None):
# todo: handle auto-split (if test_path is None): requires loading the training set, split, save
super().__init__(None, None, target=target, features=features, type=type)
self._train = CsvDatasplit(self, train_path)
self._test = CsvDatasplit(self, test_path)
2 changes: 1 addition & 1 deletion amlb/datasets/openml.py
@@ -339,7 +339,7 @@ def _get_split_paths(self, ext=None):

class OpenmlDatasplit(Datasplit):
def __init__(self, dataset: OpenmlDataset):
super().__init__(dataset, "arff") # TODO: fix format
super().__init__(dataset, "arff")
self._data: dict[str, AM | DF | str] = {}

def data_path(self, format):
9 changes: 1 addition & 8 deletions amlb/datautils.py
@@ -141,8 +141,7 @@ def reorder_dataset(
if os.path.isfile(reordered_path):
if save: # reordered file already exists, returning it as there's no data to load here
return reordered_path
else: # reordered file already exists, use it to load reordered data
path = reordered_path
path = reordered_path

with open(path) as file:
df = arff.load(file)
@@ -181,9 +180,6 @@ def reorder_dataset(
},
file,
)
# TODO: provide the possibility to return data even if save is set to false,
# as the client code doesn't want to have to load the data again,
# and may want to benefit from the caching of reordered data for future runs.
return reordered_path


@@ -337,8 +333,6 @@ def return_value(v: np.ndarray) -> np.ndarray | str:
if self._mask_missing or self._encode_missing:
mask = [v in self.missing_values for v in vector]
if any(mask):
# if self._mask_missing:
# missing = vector[mask]
vector[mask] = self.missing_replaced_by
if self.normalize_fn:
vector = self.normalize_fn(vector)
@@ -369,7 +363,6 @@ def inverse_transform(
if not self.delegate:
return vec

# TODO: handle mask
vec = np.asarray(vec).astype(self.encoded_type, copy=False)
return self.delegate.inverse_transform(vec, **params)

4 changes: 2 additions & 2 deletions amlb/resources.py
@@ -144,7 +144,6 @@ def framework_definition(self, name, tag=None):
frameworks = self._frameworks[tag]
log.debug("Available framework definitions:\n%s", frameworks)
framework = next((f for n, f in frameworks if n.lower() == lname), None)
# TODO: Clean up this workflow and error messaging as part of #518
base_framework = next(
(f for n, f in self._frameworks[default_tag] if n.lower() == lname), None
)
@@ -334,12 +333,13 @@ def from_configs(*configs: Namespace):

def get() -> Resources:
if __INSTANCE__ is None:
# TODO: Instead why not do normal lazy loading pattern?
raise RuntimeError("No configuration has been loaded yet.")
return __INSTANCE__


def config():
if __INSTANCE__ is None:
raise RuntimeError("No configuration has been loaded yet.")
return __INSTANCE__.config


52 changes: 0 additions & 52 deletions amlb/runners/aws.py
@@ -799,9 +799,6 @@ def _start_instance(
datetime_iso(micros=True, time_sep="."),
).lower()
)
# TODO: don't know if it would be considerably faster to reuse previously stopped instances sometimes
# instead of always creating a new one:
# would still need to set a new UserData though before restarting the instance.
ec2_config = rconfig().aws.ec2
try:
if ec2_config.subnet_id:
@@ -1621,52 +1618,3 @@ def _ec2_startup_script(self, instance_key, script_params="", timeout_secs=-1):
if timeout_secs > 0
else rconfig().aws.max_timeout_seconds,
)


class AWSRemoteBenchmark(Benchmark):
Collaborator Author:

I think the idea is to create an AWS benchmark which saves results during execution, to avoid losing data when one task errors while you run multiple tasks? I'm not entirely sure what Seb meant. Running multiple tasks is assumed to be sequential (in parallel we can't guarantee fair resource usage), which also means extra cleanup between runs and so on. There hasn't been a request for this in the 6 years since it was written, and I'm not 100% sure what is meant, so this is not converted to an issue.

# TODO: idea is to handle results progressively on the remote side and push results as soon as they're generated
# this would allow to safely run multiple tasks on single AWS instance

def __init__(
self,
framework_name,
benchmark_name,
constraint_name,
region=None,
job_history: str | None = None,
):
self.region = region
self.s3 = boto3.resource("s3", region_name=self.region)
self.bucket = self._init_bucket()
self._download_resources()
super().__init__(
framework_name, benchmark_name, constraint_name, job_history=job_history
)

def run(self, save_scores=False):
super().run(save_scores)
self._upload_results()

def _make_job(self, task_name=None, folds=None):
job = super()._make_job(task_name, folds)
super_run = job._run

def new_run():
super_run()
# self._upload_result()

job._run = new_run
return job

def _init_bucket(self):
pass

def _download_resources(self):
root_key = str_def(rconfig().aws.s3.root_key)
benchmark_basename = os.path.basename(self.benchmark_path)
self.bucket.upload_file(
self.benchmark_path, root_key + ("/".join(["input", benchmark_basename]))
)

def _upload_results(self):
pass
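
For context, a minimal sketch of the progressive-upload idea behind the removed TODO and the comment above: run tasks sequentially and push each result file to S3 as soon as it is produced, so earlier results survive a failure in a later task. The task API, result-file layout, and bucket/key scheme below are assumptions for illustration only, not part of amlb or of this PR.

    import os

    import boto3


    def run_tasks_with_progressive_upload(tasks, bucket_name, root_key):
        """Run tasks one by one and upload each result file right away,
        so results produced so far survive a failure in a later task."""
        bucket = boto3.resource("s3").Bucket(bucket_name)
        for task in tasks:
            result_path = task.run()  # assumed: returns the path of a local results file
            key = "/".join([root_key, "output", os.path.basename(result_path)])
            bucket.upload_file(result_path, key)  # push immediately, not only at the very end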
2 changes: 0 additions & 2 deletions amlb/runners/container.py
@@ -92,7 +92,6 @@ def setup(self, mode, upload=False):
self._upload_image(self.image)

def cleanup(self):
# TODO: remove generated script? anything else?
pass

def run(
@@ -136,7 +135,6 @@ def _run():
else rconfig().seed,
)
)
# TODO: would be nice to reload generated scores and return them
Collaborator Author:
While that would be consistent with how other job results are handled, results currently get printed and saved to the logs just fine, so I am not 100% sure what the added value is. I might be missing something; I didn't take a close look.
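
For illustration, a rough sketch of what reloading the generated scores could look like: read the results file the run wrote into the mounted output directory. The results.csv name and location are an assumption here, not verified against this code.

    from pathlib import Path

    import pandas as pd


    def load_scores(output_dir):
        """Return scores from a finished run as a DataFrame, or None if no results file exists."""
        results_file = Path(output_dir) / "results.csv"  # assumed name/location of the results file
        if not results_file.exists():
            return None
        return pd.read_csv(results_file)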


job = Job(
rconfig().token_separator.join(