From d806571066cf4a6b61008ae02ba2fa7ce35ae0ab Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 27 Mar 2024 12:19:06 -0700 Subject: [PATCH 1/6] graph and tree random length initial pipeline --- tpot2/search_spaces/pipelines/graph.py | 30 ++++++++++++++++++++------ 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index c8a5280f..0ebe7092 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -303,7 +303,7 @@ def _crossover_swap_branch(self, G2, rng=None): pair_gen = select_nodes_randomly(self.graph, G2.graph, rng=rng) for node1, node2 in pair_gen: - #TODO: if root is in inner_config_dict, then do use it? + #TODO: if root is in inner_search_space, then do use it? if node1 is self.root or node2 is G2.root: #dont want to add root as inner node continue @@ -354,7 +354,7 @@ def _crossover_take_branch(self, G2, rng=None): pair_gen = select_nodes_randomly(self.graph, G2.graph, rng=rng) for node1, node2 in pair_gen: - #TODO: if root is in inner_config_dict, then do use it? + #TODO: if root is in inner_search_space, then do use it? 
if node2 is G2.root: #dont want to add root as inner node continue @@ -365,7 +365,7 @@ def _crossover_take_branch(self, G2, rng=None): #icheck if node2 is graph individual # if isinstance(node2,GraphIndividual): - # if not ((isinstance(node2,GraphIndividual) and ("Recursive" in self.inner_config_dict or "Recursive" in self.leaf_search_space))): + # if not ((isinstance(node2,GraphIndividual) and ("Recursive" in self.inner_search_space or "Recursive" in self.leaf_search_space))): # continue #isolating the branch @@ -624,9 +624,8 @@ class GraphPipeline(SklearnIndividualGenerator): def __init__(self, root_search_space : SklearnIndividualGenerator, leaf_search_space : SklearnIndividualGenerator = None, inner_search_space : SklearnIndividualGenerator =None, - max_size: int = 10, - crossover_same_depth=False, - rng=None) -> None: + max_size: int = np.inf, + crossover_same_depth=False) -> None: """ Generates a directed acyclic graph of variable size. Search spaces for root, leaf, and inner nodes can be defined separately if desired. 
@@ -642,4 +641,21 @@ def __init__(self, root_search_space : SklearnIndividualGenerator, self.crossover_same_depth = crossover_same_depth def generate(self, rng=None): - return GraphPipelineIndividual(self.search_space, self.leaf_search_space, self.inner_search_space, self.max_size, self.crossover_same_depth, rng=rng) \ No newline at end of file + rng = np.random.default_rng(rng) + ind = GraphPipelineIndividual(self.search_space, self.leaf_search_space, self.inner_search_space, self.max_size, self.crossover_same_depth, rng=rng) + # if user specified limit, grab a random number between that limit + + n_nodes = min(rng.integers(1, self.max_size), 5) + starting_ops = [] + if self.inner_search_space is not None: + starting_ops.append(ind._mutate_insert_inner_node) + if self.leaf_search_space is not None or self.inner_search_space is not None: + starting_ops.append(ind._mutate_insert_leaf) + n_nodes -= 1 + + if len(starting_ops) > 0: + for _ in range(n_nodes-1): + func = rng.choice(starting_ops) + func(rng=rng) + + return ind \ No newline at end of file From 3e2a3c4d6007da7decd78774a8cd21a97b23f13f Mon Sep 17 00:00:00 2001 From: perib Date: Fri, 29 Mar 2024 15:57:44 -0700 Subject: [PATCH 2/6] flatten pipelines to graph, graphpipe params --- Tutorial/2_Search_Spaces.ipynb | 192 +++++++++++++--------- tpot2/graphsklearn.py | 4 +- tpot2/search_spaces/base.py | 104 +++++++++++- tpot2/search_spaces/pipelines/__init__.py | 4 +- tpot2/search_spaces/pipelines/graph.py | 142 +++++++++++++--- 5 files changed, 343 insertions(+), 103 deletions(-) diff --git a/Tutorial/2_Search_Spaces.ipynb b/Tutorial/2_Search_Spaces.ipynb index c4aa8ab2..8e0af2b9 100644 --- a/Tutorial/2_Search_Spaces.ipynb +++ b/Tutorial/2_Search_Spaces.ipynb @@ -31,7 +31,7 @@ "output_type": "stream", "text": [ "sampled hyperparameters\n", - "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 5, 'p': 3, 'weights': 'uniform'}\n" + "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 6, 'p': 3, 'weights': 
'distance'}\n" ] } ], @@ -154,9 +154,9 @@ "output_type": "stream", "text": [ "sampled hyperparameters\n", - "{'metric': 'minkowski', 'n_jobs': 1, 'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}\n", + "{'metric': 'minkowski', 'n_jobs': 1, 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}\n", "mutated hyperparameters\n", - "{'metric': 'minkowski', 'n_jobs': 1, 'n_neighbors': 4, 'p': 3, 'weights': 'distance'}\n" + "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}\n" ] } ], @@ -187,14 +187,14 @@ "output_type": "stream", "text": [ "original hyperparameters for individual 1\n", - "{'metric': 'minkowski', 'n_jobs': 1, 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}\n", + "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}\n", "original hyperparameters for individual 2\n", - "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}\n", + "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 4, 'p': 1, 'weights': 'uniform'}\n", "\n", "post crossover hyperparameters for individual 1\n", - "{'metric': 'minkowski', 'n_jobs': 1, 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}\n", + "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}\n", "post crossover hyperparameters for individual 2\n", - "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}\n" + "{'metric': 'euclidean', 'n_jobs': 1, 'n_neighbors': 4, 'p': 1, 'weights': 'uniform'}\n" ] } ], @@ -637,10 +637,10 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
KNeighborsClassifier(n_jobs=1, n_neighbors=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=9)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "KNeighborsClassifier(n_jobs=1, n_neighbors=7)" + "KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=9)" ] }, "execution_count": 5, @@ -676,7 +676,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 6, @@ -1194,10 +1194,13 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
LogisticRegression(C=99.0450142669678, class_weight='balanced', dual=True,\n",
+       "                   max_iter=1000, n_jobs=1, solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=3)" + "LogisticRegression(C=99.0450142669678, class_weight='balanced', dual=True,\n", + " max_iter=1000, n_jobs=1, solver='liblinear')" ] }, "execution_count": 7, @@ -1631,10 +1634,13 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=1, p=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=4, p=3,\n",
+       "                     weights='distance')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=1, p=1)" + "KNeighborsClassifier(metric='euclidean', n_jobs=1, n_neighbors=4, p=3,\n", + " weights='distance')" ] }, "execution_count": 8, @@ -2085,13 +2091,13 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
DecisionTreeClassifier(criterion='entropy', max_depth=22, max_features=1.0,\n",
-       "                       min_samples_leaf=16, min_samples_split=20)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
DecisionTreeClassifier(criterion='entropy', max_depth=2, max_features='log2',\n",
+       "                       min_samples_leaf=4, min_samples_split=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "DecisionTreeClassifier(criterion='entropy', max_depth=22, max_features=1.0,\n", - " min_samples_leaf=16, min_samples_split=20)" + "DecisionTreeClassifier(criterion='entropy', max_depth=2, max_features='log2',\n", + " min_samples_leaf=4, min_samples_split=10)" ] }, "execution_count": 9, @@ -2526,10 +2532,13 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
KNeighborsClassifier(n_jobs=1, n_neighbors=4)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
DecisionTreeClassifier(criterion='entropy', max_depth=25, max_features='log2',\n",
+       "                       min_samples_leaf=6, min_samples_split=13)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "KNeighborsClassifier(n_jobs=1, n_neighbors=4)" + "DecisionTreeClassifier(criterion='entropy', max_depth=25, max_features='log2',\n", + " min_samples_leaf=6, min_samples_split=13)" ] }, "execution_count": 10, @@ -2961,13 +2970,10 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
MLPClassifier(alpha=0.09935758704160183,\n",
-       "              learning_rate_init=0.004466259151092733)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
BernoulliNB(alpha=1.1043626639293316)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "MLPClassifier(alpha=0.09935758704160183,\n", - " learning_rate_init=0.004466259151092733)" + "BernoulliNB(alpha=1.1043626639293316)" ] }, "execution_count": 11, @@ -3402,13 +3408,10 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
DecisionTreeClassifier(criterion='entropy', max_depth=11, max_features=1.0,\n",
-       "                       min_samples_leaf=12, min_samples_split=8)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
SVC(C=0.007250294080496579, degree=2, max_iter=3000, probability=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "DecisionTreeClassifier(criterion='entropy', max_depth=11, max_features=1.0,\n", - " min_samples_leaf=12, min_samples_split=8)" + "SVC(C=0.007250294080496579, degree=2, max_iter=3000, probability=True)" ] }, "execution_count": 12, @@ -3849,26 +3852,19 @@ " /* fitted */\n", " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", - "
Pipeline(steps=[('selectfwe', SelectFwe(alpha=0.007682074361801758)),\n",
-       "                ('fastica', FastICA(n_components=64)),\n",
-       "                ('randomforestclassifier',\n",
-       "                 RandomForestClassifier(bootstrap=False, criterion='entropy',\n",
-       "                                        min_samples_leaf=10,\n",
-       "                                        min_samples_split=6))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
Pipeline(steps=[('selectfwe', SelectFwe(alpha=0.0004402567631974485)),\n",
+       "                ('rbfsampler',\n",
+       "                 RBFSampler(gamma=0.5507862784926447, n_components=4)),\n",
+       "                ('multinomialnb', MultinomialNB(alpha=0.019703201853925403))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ - "Pipeline(steps=[('selectfwe', SelectFwe(alpha=0.007682074361801758)),\n", - " ('fastica', FastICA(n_components=64)),\n", - " ('randomforestclassifier',\n", - " RandomForestClassifier(bootstrap=False, criterion='entropy',\n", - " min_samples_leaf=10,\n", - " min_samples_split=6))])" + "Pipeline(steps=[('selectfwe', SelectFwe(alpha=0.0004402567631974485)),\n", + " ('rbfsampler',\n", + " RBFSampler(gamma=0.5507862784926447, n_components=4)),\n", + " ('multinomialnb', MultinomialNB(alpha=0.019703201853925403))])" ] }, "execution_count": 13, @@ -4308,27 +4304,79 @@ " background-color: var(--sklearn-color-fitted-level-3);\n", "}\n", "
Pipeline(steps=[('selectpercentile',\n",
-       "                 SelectPercentile(percentile=75.04535288452273)),\n",
+       "                 SelectPercentile(percentile=1.0089148758394795)),\n",
        "                ('nystroem',\n",
-       "                 Nystroem(gamma=0.4607961332716787, kernel='laplacian',\n",
-       "                          n_components=90)),\n",
-       "                ('bernoullinb',\n",
-       "                 BernoulliNB(alpha=2.4816194955956314, fit_prior=False))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SelectPercentile(percentile=1.0089148758394795)
Nystroem(gamma=0.2371171340711561, kernel='cosine', n_components=73)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
+       "              colsample_bylevel=None, colsample_bynode=None,\n",
+       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
+       "              gamma=None, grow_policy=None, importance_type=None,\n",
+       "              interaction_constraints=None, learning_rate=0.003591562007988768,\n",
+       "              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,\n",
+       "              max_delta_step=None, max_depth=8, max_leaves=None,\n",
+       "              min_child_weight=1, missing=nan, monotone_constraints=None,\n",
+       "              multi_strategy=None, n_estimators=100, n_jobs=1,\n",
+       "              num_parallel_tree=None, random_state=None, ...)
" ], "text/plain": [ "Pipeline(steps=[('selectpercentile',\n", - " SelectPercentile(percentile=75.04535288452273)),\n", + " SelectPercentile(percentile=1.0089148758394795)),\n", " ('nystroem',\n", - " Nystroem(gamma=0.4607961332716787, kernel='laplacian',\n", - " n_components=90)),\n", - " ('bernoullinb',\n", - " BernoulliNB(alpha=2.4816194955956314, fit_prior=False))])" + " Nystroem(gamma=0.2371171340711561, kernel='cosine',\n", + " n_components=73)),\n", + " ('xgbclassifier',\n", + " XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None,\n", + " early_stopping_rounds=None,\n", + " enab...\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None,\n", + " learning_rate=0.003591562007988768, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=8,\n", + " max_leaves=None, min_child_weight=1, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None,\n", + " n_estimators=100, n_jobs=1,\n", + " num_parallel_tree=None, random_state=None, ...))])" ] }, "execution_count": 14, @@ -4359,11 +4407,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generation: 0%| | 0/5 [00:00
TPOTEstimator(classification=True, generations=5, max_eval_time_seconds=300,\n",
        "              population_size=10, scorers=['roc_auc'], scorers_weights=[1],\n",
-       "              search_space=<tpot2.search_spaces.pipelines.graph.GraphPipeline object at 0x77c026a110c0>,\n",
+       "              search_space=<tpot2.search_spaces.pipelines.graph.GraphPipeline object at 0x7544c5ab8f40>,\n",
        "              verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "TPOTEstimator(classification=True, generations=5, max_eval_time_seconds=300,\n", " population_size=10, scorers=['roc_auc'], scorers_weights=[1],\n", - " search_space=,\n", + " search_space=,\n", " verbose=2)" ] }, @@ -4845,7 +4889,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "auroc score 0.9876518024288388\n" + "auroc score 0.9501489525273881\n" ] } ], @@ -4865,7 +4909,7 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] diff --git a/tpot2/graphsklearn.py b/tpot2/graphsklearn.py index e0d500ae..7c519af5 100644 --- a/tpot2/graphsklearn.py +++ b/tpot2/graphsklearn.py @@ -231,7 +231,7 @@ def __init__( graph, cross_val_predict_cv=0, #signature function(estimator, X, y=none) method='auto', - memory=None, #TODO memory caching like sklearn.pipeline + memory=None, use_label_encoder=False, **kwargs, ): @@ -252,7 +252,7 @@ def __init__( The prediction method to use for the inner classifiers or regressors. If 'auto', it will try to use predict_proba, decision_function, or predict in that order. memory: str or object with the joblib.Memory interface, optional - Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. + Used to cache the input and outputs of nodes to prevent refitting or computationally heavy transformations. By default, no caching is performed. If a string is given, it is the path to the caching directory. use_label_encoder: bool, optional If True, the label encoder is used to encode the labels to be 0 to N. If False, the label encoder is not used. diff --git a/tpot2/search_spaces/base.py b/tpot2/search_spaces/base.py index 88955ba7..09fc61e5 100644 --- a/tpot2/search_spaces/base.py +++ b/tpot2/search_spaces/base.py @@ -6,7 +6,10 @@ from typing import Generator, List, Tuple, Union import random from sklearn.base import BaseEstimator - +import sklearn +import networkx as nx +from . 
import graph_utils +from typing import final class SklearnIndividual(tpot2.BaseIndividual): @@ -25,10 +28,107 @@ def export_pipeline(self) -> BaseEstimator: def unique_id(self): return self + @final + def export_flattened_graphpipeline(self) -> tpot2.GraphPipeline: + return flatten_to_graphpipeline(self.export_pipeline()) class SklearnIndividualGenerator(): def __init__(self,): pass def generate(self, rng=None) -> SklearnIndividual: - pass \ No newline at end of file + pass + + + + + + +def flatten_graphpipeline(est): + flattened_full_graph = est.graph.copy() + + #put ests into the node label from the attributes + + flattened_full_graph = nx.relabel_nodes(flattened_full_graph, {n: flattened_full_graph.nodes[n]['instance'] for n in flattened_full_graph.nodes}) + + + remove_list = [] + for node in flattened_full_graph.nodes: + if isinstance(node, nx.DiGraph): + flattened = flatten_any(node) + + roots = graph_utils.get_roots(flattened) + leaves = graph_utils.get_leaves(flattened) + + n1_s = flattened_full_graph.successors(node) + n1_p = flattened_full_graph.predecessors(node) + + remove_list.append(node) + + flattened_full_graph = nx.compose(flattened_full_graph, flattened) + + + flattened_full_graph.add_edges_from([ (n2, n) for n in n1_s for n2 in leaves]) + flattened_full_graph.add_edges_from([ (n, n2) for n in n1_p for n2 in roots]) + + for node in remove_list: + flattened_full_graph.remove_node(node) + + return flattened_full_graph + +def flatten_pipeline(est): + graph = nx.DiGraph() + steps = [flatten_any(s[1]) for s in est.steps] + + #add steps to graph and connect them + for s in steps: + graph = nx.compose(graph, s) + + #connect leaves of each step to the roots of the next step + for i in range(len(steps)-1): + roots = graph_utils.get_roots(steps[i]) + leaves = graph_utils.get_leaves(steps[i+1]) + graph.add_edges_from([ (l,r) for l in leaves for r in roots]) + + + return graph + + + +def flatten_estimator(est): + graph = nx.DiGraph() + graph.add_node(est) + 
return graph + +def flatten_any(est): + if isinstance(est, tpot2.GraphPipeline): + return flatten_graphpipeline(est) + elif isinstance(est, sklearn.pipeline.Pipeline): + return flatten_pipeline(est) + else: + return flatten_estimator(est) + + +def flatten_to_graphpipeline(est): + #rename nodes to string representation of the instance and put the instance in the node attributes + flattened_full_graph = flatten_any(est) + + instance_to_label = {} + label_to_instance = {} + for node in flattened_full_graph.nodes: + found_unique_label = False + i=1 + while not found_unique_label: + new_label = f"{node.__class__.__name__}_{i}" + if new_label not in label_to_instance: + found_unique_label = True + i+=1 + label_to_instance[new_label] = node + instance_to_label[node] = new_label + + flattened_full_graph = nx.relabel_nodes(flattened_full_graph, instance_to_label) + + for label, instance in label_to_instance.items(): + flattened_full_graph.nodes[label]["instance"] = instance + + return tpot2.GraphPipeline(flattened_full_graph) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/__init__.py b/tpot2/search_spaces/pipelines/__init__.py index ec90eb0e..b0c2c74d 100644 --- a/tpot2/search_spaces/pipelines/__init__.py +++ b/tpot2/search_spaces/pipelines/__init__.py @@ -3,4 +3,6 @@ from .sequential import * from .graph import * from .tree import * -from .wrapper import * \ No newline at end of file +from .wrapper import * + +from . 
import graph_utils \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index 0ebe7092..9f70577f 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -1,10 +1,6 @@ import tpot2 import numpy as np -import pandas as pd -import sklearn -from tpot2 import config from typing import Generator, List, Tuple, Union -import random from ..base import SklearnIndividual, SklearnIndividualGenerator import networkx as nx import copy @@ -12,20 +8,67 @@ import itertools from .graph_utils import * from ..nodes.estimator_node import EstimatorNodeIndividual - +from typing import Union, Callable +import sklearn class GraphPipelineIndividual(SklearnIndividual): - def __init__(self, - root_search_space : SklearnIndividualGenerator, - leaf_search_space : SklearnIndividualGenerator = None, - inner_search_space : SklearnIndividualGenerator =None, - max_size: int = 10, - crossover_same_depth=False, - rng=None) -> None: - """ - Generates a tree shaped pipeline individual. Can be used to export a sklearn Pipeline that uses feature unions to merge branches of the pipeline. + """ + Defines a search space of pipelines in the shape of a Directed Acyclic Graphs. The search spaces for root, leaf, and inner nodes can be defined separately if desired. + Each graph will have a single root serving as the final estimator which is drawn from the `root_search_space`. If the `leaf_search_space` is defined, all leaves + in the pipeline will be drawn from that search space. If the `leaf_search_space` is not defined, all leaves will be drawn from the `inner_search_space`. + Nodes that are not leaves or roots will be drawn from the `inner_search_space`. If the `inner_search_space` is not defined, there will be no inner nodes. 
+ + `cross_val_predict_cv`, `method`, `memory`, and `use_label_encoder` are passed to the GraphPipeline object when the pipeline is exported and not directly used in the search space. + + Exports to a GraphPipeline object. + + Parameters + ---------- + + root_search_space: SklearnIndividualGenerator + The search space for the root node of the graph. This node will be the final estimator in the pipeline. + inner_search_space: SklearnIndividualGenerator, optional + The search space for the inner nodes of the graph. If not defined, there will be no inner nodes. + + leaf_search_space: SklearnIndividualGenerator, optional + The search space for the leaf nodes of the graph. If not defined, the leaf nodes will be drawn from the inner_search_space. + + crossover_same_depth: bool, optional + If True, crossover will only occur between nodes at the same depth in the graph. If False, crossover will occur between nodes at any depth. + + cross_val_predict_cv: int, cross-validation generator or an iterable, optional + Determines the cross-validation splitting strategy used in inner classifiers or regressors + + method: str, optional + The prediction method to use for the inner classifiers or regressors. If 'auto', it will try to use predict_proba, decision_function, or predict in that order. + + memory: str or object with the joblib.Memory interface, optional + Used to cache the input and outputs of nodes to prevent refitting or computationally heavy transformations. By default, no caching is performed. If a string is given, it is the path to the caching directory. + + use_label_encoder: bool, optional + If True, the label encoder is used to encode the labels to be 0 to N. If False, the label encoder is not used. + Mainly useful for classifiers (XGBoost) that require labels to be ints from 0 to N. + Can also be a sklearn.preprocessing.LabelEncoder object. If so, that label encoder is used. 
+ + rng: int, RandomState instance or None, optional + Seed for sampling the first graph instance. + """ + + def __init__( + self, + root_search_space: SklearnIndividualGenerator, + leaf_search_space: SklearnIndividualGenerator = None, + inner_search_space: SklearnIndividualGenerator = None, + max_size: int = np.inf, + crossover_same_depth: bool = False, + cross_val_predict_cv: Union[int, Callable] = 0, #signature function(estimator, X, y=none) + method: str = 'auto', + memory=None, + use_label_encoder: bool = False, + rng=None): + super().__init__() self.__debug = False @@ -38,6 +81,11 @@ def __init__(self, self.max_size = max_size self.crossover_same_depth = crossover_same_depth + self.cross_val_predict_cv = cross_val_predict_cv + self.method = method + self.memory = memory + self.use_label_encoder = use_label_encoder + self.root = self.root_search_space.generate(rng) self.graph = nx.DiGraph() self.graph.add_node(self.root) @@ -535,7 +583,7 @@ def _merge_duplicated_nodes(self): return graph_changed - def export_pipeline(self, **graph_pipeline_args): + def export_pipeline(self): estimator_graph = self.graph.copy() #mapping = {node:node.method_class(**node.hyperparameters) for node in estimator_graph} @@ -561,7 +609,7 @@ def export_pipeline(self, **graph_pipeline_args): for label, instance in label_to_instance.items(): estimator_graph.nodes[label]["instance"] = instance - return tpot2.GraphPipeline(graph=estimator_graph, **graph_pipeline_args) + return tpot2.GraphPipeline(graph=estimator_graph, memory=self.memory, use_label_encoder=self.use_label_encoder, method=self.method, cross_val_predict_cv=self.cross_val_predict_cv) def plot(self): @@ -621,28 +669,74 @@ def unique_id(self): class GraphPipeline(SklearnIndividualGenerator): - def __init__(self, root_search_space : SklearnIndividualGenerator, - leaf_search_space : SklearnIndividualGenerator = None, - inner_search_space : SklearnIndividualGenerator =None, - max_size: int = np.inf, - crossover_same_depth=False) -> 
None: + def __init__(self, + root_search_space: SklearnIndividualGenerator, + leaf_search_space: SklearnIndividualGenerator = None, + inner_search_space: SklearnIndividualGenerator = None, + max_size: int = np.inf, + crossover_same_depth: bool = False, + cross_val_predict_cv: Union[int, Callable] = 0, #signature function(estimator, X, y=none) + method: str = 'auto', + memory=None, + use_label_encoder: bool = False,): """ - Generates a directed acyclic graph of variable size. Search spaces for root, leaf, and inner nodes can be defined separately if desired. + Defines a search space of pipelines in the shape of a Directed Acyclic Graphs. The search spaces for root, leaf, and inner nodes can be defined separately if desired. + Each graph will have a single root serving as the final estimator which is drawn from the `root_search_space`. If the `leaf_search_space` is defined, all leaves + in the pipeline will be drawn from that search space. If the `leaf_search_space` is not defined, all leaves will be drawn from the `inner_search_space`. + Nodes that are not leaves or roots will be drawn from the `inner_search_space`. If the `inner_search_space` is not defined, there will be no inner nodes. + + `cross_val_predict_cv`, `method`, `memory`, and `use_label_encoder` are passed to the GraphPipeline object when the pipeline is exported and not directly used in the search space. + Exports to a GraphPipeline object. + Parameters + ---------- + + root_search_space: SklearnIndividualGenerator + The search space for the root node of the graph. This node will be the final estimator in the pipeline. + + inner_search_space: SklearnIndividualGenerator, optional + The search space for the inner nodes of the graph. If not defined, there will be no inner nodes. + + leaf_search_space: SklearnIndividualGenerator, optional + The search space for the leaf nodes of the graph. If not defined, the leaf nodes will be drawn from the inner_search_space. 
+ + crossover_same_depth: bool, optional + If True, crossover will only occur between nodes at the same depth in the graph. If False, crossover will occur between nodes at any depth. + + cross_val_predict_cv: int, cross-validation generator or an iterable, optional + Determines the cross-validation splitting strategy used in inner classifiers or regressors + + method: str, optional + The prediction method to use for the inner classifiers or regressors. If 'auto', it will try to use predict_proba, decision_function, or predict in that order. + + memory: str or object with the joblib.Memory interface, optional + Used to cache the input and outputs of nodes to prevent refitting or computationally heavy transformations. By default, no caching is performed. If a string is given, it is the path to the caching directory. + + use_label_encoder: bool, optional + If True, the label encoder is used to encode the labels to be 0 to N. If False, the label encoder is not used. + Mainly useful for classifiers (XGBoost) that require labels to be ints from 0 to N. + Can also be a sklearn.preprocessing.LabelEncoder object. If so, that label encoder is used. 
+ """ - self.search_space = root_search_space + self.root_search_space = root_search_space self.leaf_search_space = leaf_search_space self.inner_search_space = inner_search_space self.max_size = max_size self.crossover_same_depth = crossover_same_depth + self.cross_val_predict_cv = cross_val_predict_cv + self.method = method + self.memory = memory + self.use_label_encoder = use_label_encoder + def generate(self, rng=None): rng = np.random.default_rng(rng) - ind = GraphPipelineIndividual(self.search_space, self.leaf_search_space, self.inner_search_space, self.max_size, self.crossover_same_depth, rng=rng) + ind = GraphPipelineIndividual(self.root_search_space, self.leaf_search_space, self.inner_search_space, self.max_size, self.crossover_same_depth, + self.cross_val_predict_cv, self.method, self.memory, self.use_label_encoder, rng=rng) # if user specified limit, grab a random number between that limit n_nodes = min(rng.integers(1, self.max_size), 5) From 59dad61b6854c160220e4986bf19d2e8fcf3229b Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 10 Apr 2024 11:42:04 -0700 Subject: [PATCH 3/6] flatten to graphpipeline, steadystate --- tpot2/config/get_configspace.py | 13 ++- tpot2/evolvers/__init__.py | 2 +- tpot2/evolvers/steady_state_evolver.py | 6 - tpot2/search_spaces/base.py | 8 +- .../{pipelines => }/graph_utils.py | 0 .../nodes/genetic_feature_selection.py | 6 +- tpot2/search_spaces/pipelines/__init__.py | 4 +- tpot2/search_spaces/pipelines/graph.py | 5 +- tpot2/search_spaces/pipelines/tree.py | 2 +- tpot2/tpot_estimator/estimator.py | 44 +++++--- tpot2/tpot_estimator/estimator_utils.py | 104 ++++-------------- .../tpot_estimator/steady_state_estimator.py | 79 ++++++------- 12 files changed, 111 insertions(+), 162 deletions(-) rename tpot2/search_spaces/{pipelines => }/graph_utils.py (100%) diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 2c4485bf..44892278 100644 --- a/tpot2/config/get_configspace.py +++ 
b/tpot2/config/get_configspace.py @@ -18,7 +18,8 @@ from . import classifiers_sklearnex from . import regressors_sklearnex - +from ConfigSpace import ConfigurationSpace +from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal #autoqtl_builtins from tpot2.builtin_modules import genetic_encoders @@ -163,7 +164,7 @@ "classifiers" : ["LogisticRegression", "DecisionTreeClassifier", "KNeighborsClassifier", "GradientBoostingClassifier", "ExtraTreesClassifier", "RandomForestClassifier", "SGDClassifier", "GaussianNB", "BernoulliNB", "MultinomialNB", "XGBClassifier", "SVC", "MLPClassifier"], "regressors" : ["ElasticNetCV", "ExtraTreesRegressor", "GradientBoostingRegressor", "AdaBoostRegressor", "DecisionTreeRegressor", "KNeighborsRegressor", "LassoLarsCV", "SVR", "RandomForestRegressor", "RidgeCV", "XGBRegressor", "SGDRegressor" ], "transformers": ["Binarizer", "Normalizer", "PCA", "ZeroCount", "OneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler"], - "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer", "ZeroTransformer", "OneTransformer", "NTransformer"], + "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"], "imputers": [], "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"], "genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"], @@ -286,7 +287,13 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta case "OneTransformer": return {} case "NTransformer": - return {} + return ConfigurationSpace( + + space = { + + 'n': Float("n", bounds=(-1e3, 
1e3), log=True), + } + ) #imputers.py diff --git a/tpot2/evolvers/__init__.py b/tpot2/evolvers/__init__.py index 1d6af1a9..cf130f80 100644 --- a/tpot2/evolvers/__init__.py +++ b/tpot2/evolvers/__init__.py @@ -1,2 +1,2 @@ from .base_evolver import * -#from .steady_state_evolver import * \ No newline at end of file +from .steady_state_evolver import * \ No newline at end of file diff --git a/tpot2/evolvers/steady_state_evolver.py b/tpot2/evolvers/steady_state_evolver.py index 5db3e502..1aa457c8 100644 --- a/tpot2/evolvers/steady_state_evolver.py +++ b/tpot2/evolvers/steady_state_evolver.py @@ -1,17 +1,11 @@ #All abstract methods in the Evolutionary_Optimization module - -from abc import abstractmethod import tpot2 import typing import tqdm -from tpot2.individual_representations import BaseIndividual import time import numpy as np -import copy -import scipy import os import pickle -import statistics from tqdm.dask import TqdmCallback import distributed from dask.distributed import Client diff --git a/tpot2/search_spaces/base.py b/tpot2/search_spaces/base.py index 09fc61e5..80388708 100644 --- a/tpot2/search_spaces/base.py +++ b/tpot2/search_spaces/base.py @@ -29,8 +29,8 @@ def unique_id(self): return self @final - def export_flattened_graphpipeline(self) -> tpot2.GraphPipeline: - return flatten_to_graphpipeline(self.export_pipeline()) + def export_flattened_graphpipeline(self, **graphpipeline_kwargs) -> tpot2.GraphPipeline: + return flatten_to_graphpipeline(self.export_pipeline(), **graphpipeline_kwargs) class SklearnIndividualGenerator(): def __init__(self,): @@ -109,7 +109,7 @@ def flatten_any(est): return flatten_estimator(est) -def flatten_to_graphpipeline(est): +def flatten_to_graphpipeline(est, **graphpipeline_kwargs): #rename nodes to string representation of the instance and put the instance in the node attributes flattened_full_graph = flatten_any(est) @@ -131,4 +131,4 @@ def flatten_to_graphpipeline(est): for label, instance in label_to_instance.items(): 
flattened_full_graph.nodes[label]["instance"] = instance - return tpot2.GraphPipeline(flattened_full_graph) \ No newline at end of file + return tpot2.GraphPipeline(flattened_full_graph, **graphpipeline_kwargs) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/graph_utils.py b/tpot2/search_spaces/graph_utils.py similarity index 100% rename from tpot2/search_spaces/pipelines/graph_utils.py rename to tpot2/search_spaces/graph_utils.py diff --git a/tpot2/search_spaces/nodes/genetic_feature_selection.py b/tpot2/search_spaces/nodes/genetic_feature_selection.py index e51ff8ba..1894026a 100644 --- a/tpot2/search_spaces/nodes/genetic_feature_selection.py +++ b/tpot2/search_spaces/nodes/genetic_feature_selection.py @@ -157,7 +157,7 @@ def __init__(self, crossover_rate = 0.5, mutation_rate_rate = 0, crossover_rate_rate = 0, - rng=None,): + ): self.n_features = n_features self.start_p = start_p @@ -165,7 +165,7 @@ def __init__(self, self.crossover_rate = crossover_rate self.mutation_rate_rate = mutation_rate_rate self.crossover_rate_rate = crossover_rate_rate - self.rng = rng + def generate(self, rng=None) -> SklearnIndividual: return GeneticFeatureSelectorIndividual( mask=self.n_features, @@ -174,5 +174,5 @@ def generate(self, rng=None) -> SklearnIndividual: crossover_rate=self.crossover_rate, mutation_rate_rate=self.mutation_rate_rate, crossover_rate_rate=self.crossover_rate_rate, - rng=self.rng + rng=rng ) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/__init__.py b/tpot2/search_spaces/pipelines/__init__.py index b0c2c74d..ec90eb0e 100644 --- a/tpot2/search_spaces/pipelines/__init__.py +++ b/tpot2/search_spaces/pipelines/__init__.py @@ -3,6 +3,4 @@ from .sequential import * from .graph import * from .tree import * -from .wrapper import * - -from . 
import graph_utils \ No newline at end of file +from .wrapper import * \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index 9f70577f..5c6668b9 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -6,7 +6,7 @@ import copy import matplotlib.pyplot as plt import itertools -from .graph_utils import * +from ..graph_utils import * from ..nodes.estimator_node import EstimatorNodeIndividual from typing import Union, Callable import sklearn @@ -360,7 +360,8 @@ def _crossover_swap_branch(self, G2, rng=None): node1_is_leaf = len(list(self.graph.successors(node1))) == 0 node2_is_leaf = len(list(G2.graph.successors(node2))) == 0 #if not ((node1_is_leaf and node1_is_leaf) or (not node1_is_leaf and not node2_is_leaf)): #if node1 is a leaf - if (node1_is_leaf and (not node2_is_leaf)) or ( (not node1_is_leaf) and node2_is_leaf): + #if (node1_is_leaf and (not node2_is_leaf)) or ( (not node1_is_leaf) and node2_is_leaf): + if not node1_is_leaf: #only continue if node1 and node2 are both leaves or both not leaves continue diff --git a/tpot2/search_spaces/pipelines/tree.py b/tpot2/search_spaces/pipelines/tree.py index de4c2aef..813a59e1 100644 --- a/tpot2/search_spaces/pipelines/tree.py +++ b/tpot2/search_spaces/pipelines/tree.py @@ -13,7 +13,7 @@ from .graph import GraphPipelineIndividual, GraphPipeline -from .graph_utils import * +from ..graph_utils import * class TreePipelineIndividual(GraphPipelineIndividual): def __init__(self, diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 7465564c..999dbffe 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -29,7 +29,9 @@ def set_dask_settings(): #TODO inherit from _BaseComposition? 
class TPOTEstimator(BaseEstimator): - def __init__(self, scorers, + def __init__(self, + search_space, + scorers, scorers_weights, classification, cv = 5, @@ -38,13 +40,12 @@ def __init__(self, scorers, objective_function_names = None, bigger_is_better = True, - search_space = None, - - + export_graphpipeline = False, cross_val_predict_cv = 0, + memory = None, + categorical_features = None, subsets = None, - memory = None, preprocessing = False, population_size = 50, initial_population_size = None, @@ -87,7 +88,7 @@ def __init__(self, scorers, #dask parameters n_jobs=1, - memory_limit = "4GB", + memory_limit = None, client = None, processes = True, @@ -369,10 +370,17 @@ def __init__(self, scorers, self.search_space = search_space + self.export_graphpipeline = export_graphpipeline self.cross_val_predict_cv = cross_val_predict_cv + self.memory = memory + + if self.cross_val_predict_cv !=0 or self.memory is not None: + if not self.export_graphpipeline: + raise ValueError("cross_val_predict_cv and memory parameters are parameters for GraphPipeline. To enable these options export_graphpipeline to be True. 
Otherwise these can be passed into the relevant Search spaces as parameters.") + self.categorical_features = categorical_features self.subsets = subsets - self.memory = memory + self.preprocessing = preprocessing self.validation_strategy = validation_strategy self.validation_fraction = validation_fraction @@ -600,6 +608,7 @@ def objective_function(pipeline_individual, scorers= self._scorers, cv=self.cv_gen, other_objective_functions=self.other_objective_functions, + export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, **kwargs): @@ -611,6 +620,7 @@ def objective_function(pipeline_individual, scorers= scorers, cv=cv, other_objective_functions=other_objective_functions, + export_graphpipeline=export_graphpipeline, memory=memory, cross_val_predict_cv=cross_val_predict_cv, **kwargs, @@ -713,6 +723,7 @@ def ind_generator(rng): scorers= self._scorers, cv=self.cv_gen, other_objective_functions=self.other_objective_functions, + export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, @@ -724,6 +735,7 @@ def ind_generator(rng): scorers= scorers, cv=cv, other_objective_functions=other_objective_functions, + export_graphpipeline=export_graphpipeline, memory=memory, cross_val_predict_cv=cross_val_predict_cv, **kwargs, @@ -738,7 +750,8 @@ def ind_generator(rng): self.objective_names_for_selection = val_objective_names self.evaluated_individuals.loc[best_pareto_front_idx,val_objective_names] = val_scores - self.evaluated_individuals["Validation_Pareto_Front"] = tpot2.utils.get_pareto_front(self.evaluated_individuals, val_objective_names, self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) + self.evaluated_individuals["Validation_Pareto_Front"] = tpot2.utils.get_pareto_frontier(self.evaluated_individuals, column_names=val_objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) + elif 
validation_strategy == 'split': @@ -765,6 +778,7 @@ def ind_generator(rng): y_val, scorers= self._scorers, other_objective_functions=self.other_objective_functions, + export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, **kwargs: val_objective_function_generator( @@ -775,6 +789,7 @@ def ind_generator(rng): y_val, scorers= scorers, other_objective_functions=other_objective_functions, + export_graphpipeline=export_graphpipeline, memory=memory, cross_val_predict_cv=cross_val_predict_cv, **kwargs, @@ -787,11 +802,11 @@ def ind_generator(rng): val_objective_names = ['validation_'+name for name in self.objective_names] self.objective_names_for_selection = val_objective_names self.evaluated_individuals.loc[best_pareto_front_idx,val_objective_names] = val_scores - self.evaluated_individuals["Validation_Pareto_Front"] = tpot2.utils.get_pareto_front(self.evaluated_individuals, val_objective_names, self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) + self.evaluated_individuals["Validation_Pareto_Front"] = tpot2.utils.get_pareto_frontier(self.evaluated_individuals, column_names=val_objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) else: self.objective_names_for_selection = self.objective_names - - val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) + + val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isna().all(1)][self.objective_names_for_selection] weighted_scores = val_scores*self.objective_function_weights if self.bigger_is_better: @@ -805,7 +820,10 @@ def ind_generator(rng): #TODO #best_individual_pipeline = best_individual.export_pipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv) - best_individual_pipeline = 
best_individual.export_pipeline() + if self.export_graphpipeline: + best_individual_pipeline = best_individual.export_flattened_graphpipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv) + else: + best_individual_pipeline = best_individual.export_pipeline() if self.preprocessing: self.fitted_pipeline_ = sklearn.pipeline.make_pipeline(sklearn.base.clone(self._preprocessing_pipeline), best_individual_pipeline ) @@ -888,7 +906,7 @@ def make_evaluated_individuals(self): self.evaluated_individuals = self.evaluated_individuals.set_index(self.evaluated_individuals.index.map(object_to_int)) self.evaluated_individuals['Parents'] = self.evaluated_individuals['Parents'].apply(lambda row: convert_parents_tuples_to_integers(row, object_to_int)) - self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline)) + self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline, export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv)) return self.evaluated_individuals diff --git a/tpot2/tpot_estimator/estimator_utils.py b/tpot2/tpot_estimator/estimator_utils.py index c0b79739..7be96e26 100644 --- a/tpot2/tpot_estimator/estimator_utils.py +++ b/tpot2/tpot_estimator/estimator_utils.py @@ -13,97 +13,33 @@ def convert_parents_tuples_to_integers(row, object_to_int): return np.nan #TODO add kwargs -def apply_make_pipeline(graphindividual, preprocessing_pipeline=None): +def apply_make_pipeline(graphindividual, preprocessing_pipeline=None, export_graphpipeline=False, **pipeline_kwargs): try: - if preprocessing_pipeline is None: - return graphindividual.export_pipeline() - else: - return sklearn.pipeline.make_pipeline(sklearn.base.clone(preprocessing_pipeline), 
graphindividual.export_pipeline()) - except: - return None - -def get_configuration_dictionary(options, n_samples, n_features, classification, random_state=None, cv=None, subsets=None, feature_names=None, n_classes=None): - if options is None: - return options - - if isinstance(options, dict): - return recursive_with_defaults(options, n_samples, n_features, classification, random_state=None, cv=None, subsets=subsets, feature_names=feature_names, n_classes=n_classes) - - if not isinstance(options, list): - options = [options] - - config_dict = {} - - for option in options: - - if option == "selectors": - config_dict.update(tpot2.config.make_selector_config_dictionary(random_state=random_state, classifier=classification)) - - elif option == "classifiers": - config_dict.update(tpot2.config.make_classifier_config_dictionary(random_state=random_state, n_samples=n_samples, n_classes=n_classes)) - - elif option == "classifiers_sklearnex": - config_dict.update(tpot2.config.make_sklearnex_classifier_config_dictionary(random_state=random_state, n_samples=n_samples, n_classes=n_classes)) - - elif option == "regressors": - config_dict.update(tpot2.config.make_regressor_config_dictionary(random_state=random_state, cv=cv, n_samples=n_samples)) - - elif option == "regressors_sklearnex": - config_dict.update(tpot2.config.make_sklearnex_regressor_config_dictionary(random_state=random_state, n_samples=n_samples)) - - elif option == "transformers": - config_dict.update(tpot2.config.make_transformer_config_dictionary(random_state=random_state, n_features=n_features)) - - elif option == "arithmetic_transformer": - config_dict.update(tpot2.config.make_arithmetic_transformer_config_dictionary()) - - elif option == "feature_set_selector": - config_dict.update(tpot2.config.make_FSS_config_dictionary(subsets, n_features, feature_names=feature_names)) - - elif option == "skrebate": - config_dict.update(tpot2.config.make_skrebate_config_dictionary(n_features=n_features)) - - elif option == 
"MDR": - config_dict.update(tpot2.config.make_MDR_config_dictionary()) - - elif option == "continuousMDR": - config_dict.update(tpot2.config.make_ContinuousMDR_config_dictionary()) - - elif option == "FeatureEncodingFrequencySelector": - config_dict.update(tpot2.config.make_FeatureEncodingFrequencySelector_config_dictionary()) - - elif option == "genetic encoders": - config_dict.update(tpot2.config.make_genetic_encoders_config_dictionary()) - - elif option == "passthrough": - config_dict.update(tpot2.config.make_passthrough_config_dictionary()) - + if export_graphpipeline: + est = graphindividual.export_flattened_graphpipeline(**pipeline_kwargs) else: - config_dict.update(recursive_with_defaults(option, n_samples, n_features, classification, random_state, cv, subsets=subsets, feature_names=feature_names, n_classes=n_classes)) - - if len(config_dict) == 0: - raise ValueError("No valid configuration options were provided. Please check the options you provided and try again.") + est = graphindividual.export_pipeline() - return config_dict -def recursive_with_defaults(config_dict, n_samples, n_features, classification, random_state=None, cv=None, subsets=None, feature_names=None, n_classes=None): + if preprocessing_pipeline is None: + return est + else: + return sklearn.pipeline.make_pipeline(sklearn.base.clone(preprocessing_pipeline), est) + except: + return None - for key in 'leaf_config_dict', 'root_config_dict', 'inner_config_dict', 'Recursive': - if key in config_dict: - value = config_dict[key] - if key=="Resursive": - config_dict[key] = recursive_with_defaults(value, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None) - else: - config_dict[key] = get_configuration_dictionary(value, n_samples, n_features, classification, random_state, cv, subsets, feature_names, n_classes) - return config_dict -def objective_function_generator(pipeline, x,y, scorers, cv, other_objective_functions, step=None, budget=None, 
generation=1, is_classification=True, **pipeline_kwargs): +def objective_function_generator(pipeline, x,y, scorers, cv, other_objective_functions, step=None, budget=None, generation=1, is_classification=True, export_graphpipeline=False, **pipeline_kwargs): #pipeline = pipeline.export_pipeline(**pipeline_kwargs) - pipeline = pipeline.export_pipeline() + if export_graphpipeline: + pipeline = pipeline.export_flattened_graphpipeline(**pipeline_kwargs) + else: + pipeline = pipeline.export_pipeline() + if budget is not None and budget < 1: if is_classification: x,y = sklearn.utils.resample(x,y, stratify=y, n_samples=int(budget*len(x)), replace=False, random_state=1) @@ -129,9 +65,13 @@ def objective_function_generator(pipeline, x,y, scorers, cv, other_objective_fun return np.concatenate([cv_obj_scores,other_scores]) -def val_objective_function_generator(pipeline, X_train, y_train, X_test, y_test, scorers, other_objective_functions, **pipeline_kwargs): +def val_objective_function_generator(pipeline, X_train, y_train, X_test, y_test, scorers, other_objective_functions, export_graphpipeline=False, **pipeline_kwargs): #subsample the data - pipeline = pipeline.export_pipeline(**pipeline_kwargs) + if export_graphpipeline: + pipeline = pipeline.export_flattened_graphpipeline(**pipeline_kwargs) + else: + pipeline = pipeline.export_pipeline() + fitted_pipeline = sklearn.base.clone(pipeline) fitted_pipeline.fit(X_train, y_train) diff --git a/tpot2/tpot_estimator/steady_state_estimator.py b/tpot2/tpot_estimator/steady_state_estimator.py index 777c8cad..c73584b6 100644 --- a/tpot2/tpot_estimator/steady_state_estimator.py +++ b/tpot2/tpot_estimator/steady_state_estimator.py @@ -27,7 +27,9 @@ def set_dask_settings(): #TODO inherit from _BaseComposition? 
class TPOTEstimatorSteadyState(BaseEstimator): - def __init__(self, scorers= [], + def __init__(self, + search_space, + scorers= [], scorers_weights = [], classification = False, cv = 5, @@ -35,15 +37,14 @@ def __init__(self, scorers= [], other_objective_functions_weights = [], objective_function_names = None, bigger_is_better = True, - max_size = np.inf, - linear_pipeline = False, - root_config_dict= 'Auto', - inner_config_dict=["selectors", "transformers"], - leaf_config_dict= None, + + + export_graphpipeline = False, cross_val_predict_cv = 0, + memory = None, + categorical_features = None, subsets = None, - memory = None, preprocessing = False, validation_strategy = "none", validation_fraction = .2, @@ -77,7 +78,6 @@ def __init__(self, scorers= [], stepwise_steps = 5, warm_start = False, - subset_column = None, verbose = 0, periodic_checkpoint_folder = None, @@ -364,8 +364,6 @@ def __init__(self, scorers= [], warm_start : bool, default=False If True, will use the continue the evolutionary algorithm from the last generation of the previous run. - subset_column : str or int, default=None - EXPERIMENTAL The column to use for the subset selection. Must also pass in unique_subset_values to GraphIndividual to function. verbose : int, default=1 How much information to print during the optimization process. Higher values include the information from lower values. @@ -422,6 +420,7 @@ def __init__(self, scorers= [], # sklearn BaseEstimator must have a corresponding attribute for each parameter. # These should not be modified once set. 
+ self.search_space = search_space self.scorers = scorers self.scorers_weights = scorers_weights self.classification = classification @@ -430,15 +429,18 @@ def __init__(self, scorers= [], self.other_objective_functions_weights = other_objective_functions_weights self.objective_function_names = objective_function_names self.bigger_is_better = bigger_is_better - self.max_size = max_size - self.linear_pipeline = linear_pipeline - self.root_config_dict= root_config_dict - self.inner_config_dict= inner_config_dict - self.leaf_config_dict= leaf_config_dict + + self.export_graphpipeline = export_graphpipeline self.cross_val_predict_cv = cross_val_predict_cv + self.memory = memory + + if self.cross_val_predict_cv !=0 or self.memory is not None: + if not self.export_graphpipeline: + raise ValueError("cross_val_predict_cv and memory parameters are parameters for GraphPipeline. To enable these options export_graphpipeline to be True. Otherwise these can be passed into the relevant Search spaces as parameters.") + + self.categorical_features = categorical_features self.subsets = subsets - self.memory = memory self.preprocessing = preprocessing self.validation_strategy = validation_strategy self.validation_fraction = validation_fraction @@ -468,7 +470,6 @@ def __init__(self, scorers= [], self.stepwise_steps = stepwise_steps self.warm_start = warm_start - self.subset_column = subset_column self.verbose = verbose self.periodic_checkpoint_folder = periodic_checkpoint_folder @@ -660,17 +661,6 @@ def fit(self, X, y): else: self.feature_names = None - if self.root_config_dict == 'Auto': - if self.classification: - n_classes = len(np.unique(y)) - root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) - else: - root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification, 
self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) - else: - root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets,feature_names=self.feature_names) - - inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) - leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) @@ -681,9 +671,9 @@ def objective_function(pipeline_individual, scorers= self._scorers, cv=self.cv_gen, other_objective_functions=self.other_objective_functions, + export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, **kwargs): return objective_function_generator( pipeline_individual, @@ -693,19 +683,16 @@ def objective_function(pipeline_individual, scorers= scorers, cv=cv, other_objective_functions=other_objective_functions, + export_graphpipeline=export_graphpipeline, memory=memory, cross_val_predict_cv=cross_val_predict_cv, - subset_column=subset_column, **kwargs, ) - self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( - inner_config_dict=inner_config_dict, - root_config_dict=root_config_dict, - leaf_config_dict=leaf_config_dict, - max_size = self.max_size, - linear_pipeline=self.linear_pipeline, - ) + def ind_generator(rng): + rng = np.random.default_rng(rng) + while True: + yield self.search_space.generate(rng) @@ -718,7 +705,7 @@ def objective_function(pipeline_individual, #If warm start and we have an evolver instance, use the existing one if not(self.warm_start and 
self._evolver_instance is not None): - self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, + self._evolver_instance = self._evolver( individual_generator=ind_generator(self.rng), objective_functions= [objective_function], objective_function_weights = self.objective_function_weights, objective_names=self.objective_names, @@ -805,9 +792,10 @@ def objective_function(pipeline_individual, scorers= self._scorers, cv=self.cv_gen, other_objective_functions=self.other_objective_functions, + export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + **kwargs: objective_function_generator( ind, X, @@ -816,9 +804,9 @@ def objective_function(pipeline_individual, scorers= scorers, cv=cv, other_objective_functions=other_objective_functions, + export_graphpipeline=export_graphpipeline, memory=memory, cross_val_predict_cv=cross_val_predict_cv, - subset_column=subset_column, **kwargs, )] @@ -858,9 +846,9 @@ def objective_function(pipeline_individual, y_val, scorers= self._scorers, other_objective_functions=self.other_objective_functions, + export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, **kwargs: val_objective_function_generator( ind, X, @@ -869,9 +857,9 @@ def objective_function(pipeline_individual, y_val, scorers= scorers, other_objective_functions=other_objective_functions, + export_graphpipeline=export_graphpipeline, memory=memory, cross_val_predict_cv=cross_val_predict_cv, - subset_column=subset_column, **kwargs, )] @@ -898,7 +886,10 @@ def objective_function(pipeline_individual, self.selected_best_score = self.evaluated_individuals.loc[best_idx] - best_individual_pipeline = best_individual.export_pipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, subset_column=self.subset_column) + if self.export_graphpipeline: 
+ best_individual_pipeline = best_individual.export_flattened_graphpipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv) + else: + best_individual_pipeline = best_individual.export_pipeline() if self.preprocessing: self.fitted_pipeline_ = sklearn.pipeline.make_pipeline(sklearn.base.clone(self._preprocessing_pipeline), best_individual_pipeline ) @@ -979,7 +970,7 @@ def make_evaluated_individuals(self): self.evaluated_individuals = self.evaluated_individuals.set_index(self.evaluated_individuals.index.map(object_to_int)) self.evaluated_individuals['Parents'] = self.evaluated_individuals['Parents'].apply(lambda row: convert_parents_tuples_to_integers(row, object_to_int)) - self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline)) + self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline, export_graphpipeline=self.export_graphpipeline, memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv)) return self.evaluated_individuals From d2dab4eadec600e5b4addaa634ade88936936750 Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 17 Apr 2024 21:23:44 -0700 Subject: [PATCH 4/6] lots of edits to configuration spaces --- README.md | 10 - Tutorial/2_Search_Spaces.ipynb | 455 +++++++++++++++++- .../builtin_modules/column_one_hot_encoder.py | 1 + tpot2/config/classifiers.py | 370 ++++++++++---- tpot2/config/classifiers_sklearnex.py | 10 +- tpot2/config/get_configspace.py | 199 ++++++-- tpot2/config/mdr_configs.py | 8 +- tpot2/config/regressors.py | 387 ++++++++++----- tpot2/config/regressors_sklearnex.py | 14 +- tpot2/config/special_configs.py | 51 -- tpot2/config/tests/__init__.py | 0 tpot2/config/tests/test_get_configspace.py | 26 + tpot2/config/transformers.py | 35 ++ tpot2/search_spaces/nodes/estimator_node.py | 
61 ++- 14 files changed, 1271 insertions(+), 356 deletions(-) create mode 100644 tpot2/config/tests/__init__.py create mode 100644 tpot2/config/tests/test_get_configspace.py diff --git a/README.md b/README.md index 6f30b08d..f7551551 100644 --- a/README.md +++ b/README.md @@ -159,16 +159,6 @@ Setting `verbose` to 5 can be helpful during debugging as it will print out the We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT2, please file a new issue so we can discuss it. -### Known issues -* TPOT2 uses the func_timeout package to terminate long running pipelines. The early termination signal may fail on particular estimators and cause TPOT2 to run for longer than intended. If you are using your own custom configuration dictionaries, and are noticing that TPOT2 is running for longer than intended, this may be the issue. We are currently looking into it. Sometimes restarting TPOT2 resolves the issue. -* Periodic checkpoint folder may not correctly resume if using budget and/or initial_population size. -* Population class is slow to add new individuals. The Population class needs to be updated to use a dictionary for storage rather than a pandas dataframe. -* Crossover may sometimes go over the size restrictions. -* Memory caching with GraphPipeline may miss some nodes where the ordering on inputs happens to be different between two nodes. - - - - ### Support for TPOT2 TPOT2 was developed in the [Artificial Intelligence Innovation (A2I) Lab](http://epistasis.org/) at Cedars-Sinai with funding from the [NIH](http://www.nih.gov/) under grants U01 AG066833 and R01 LM010098. We are incredibly grateful for the support of the NIH and the Cedars-Sinai during the development of this project. 
diff --git a/Tutorial/2_Search_Spaces.ipynb b/Tutorial/2_Search_Spaces.ipynb index 8e0af2b9..51d2aff7 100644 --- a/Tutorial/2_Search_Spaces.ipynb +++ b/Tutorial/2_Search_Spaces.ipynb @@ -141,7 +141,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can sample generate an individual with the generate() function. This individual samples from the search space as well as provides mutation and crossover functions to modify the current sample." + "You can sample generate an individual with the generate() function. This individual samples from the search space as well as provides mutation and crossover functions to modify the current sample.\n", + "\n", + "Note that ConfigurationSpace does not support None as a parameter. Instead, use the special string \"\\<NONE\\>\". TPOT will automatically replace instances of this string with the Python None." ] }, { @@ -652,6 +654,455 @@ "knn_individual1.export_pipeline()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If a dictionary of parameters is passed instead of a ConfigSpace, then the hyperparameters will be fixed and not learned." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
KNeighborsClassifier(n_neighbors=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "KNeighborsClassifier(n_neighbors=10)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tpot2\n", + "from ConfigSpace import ConfigurationSpace\n", + "from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n", + "space = {\n", + "\n", + " 'n_neighbors':10,\n", + "}\n", + "\n", + "knn_node = tpot2.search_spaces.nodes.EstimatorNode(\n", + " method = KNeighborsClassifier,\n", + " space = space,\n", + ")\n", + "\n", + "knn_node.generate().export_pipeline()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1658,7 +2109,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "TPOT2 also comes with predefined search spaces. the helper function `tpot2.config.get_search_space` takes in a string or a list of strings, and returns either a EstimatorNode or a ChoicePipeline,respectively. \n", + "TPOT2 also comes with predefined search spaces. The current search spaces were adapted from a combination of the original TPOT package as well as the search spaces used in [AutoSklearn](https://github.com/automl/auto-sklearn/tree/development/autosklearn/pipeline/components). The helper function `tpot2.config.get_search_space` takes in a string or a list of strings, and returns either a EstimatorNode or a ChoicePipeline,respectively. \n", "\n", "strings can correspond to individual methods. Tehre are also special strings that return predefined lists of methods. 
\n", "\n", diff --git a/tpot2/builtin_modules/column_one_hot_encoder.py b/tpot2/builtin_modules/column_one_hot_encoder.py index 4f3843bf..34c3320e 100644 --- a/tpot2/builtin_modules/column_one_hot_encoder.py +++ b/tpot2/builtin_modules/column_one_hot_encoder.py @@ -44,6 +44,7 @@ def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_out ---------- columns : str, list, default='auto' + Determines which columns to onehot encode with sklearn.preprocessing.OneHotEncoder. - 'auto' : Automatically select categorical features based on columns with less than 10 unique values - 'categorical' : Automatically select categorical features - 'numeric' : Automatically select numeric features diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index 14649f61..6423f328 100644 --- a/tpot2/config/classifiers.py +++ b/tpot2/config/classifiers.py @@ -1,31 +1,42 @@ from ConfigSpace import ConfigurationSpace from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal +from ConfigSpace import EqualsCondition, OrConjunction, NotEqualsCondition, InCondition +from ..search_spaces.nodes.estimator_node import NONE_SPECIAL_STRING, TRUE_SPECIAL_STRING, FALSE_SPECIAL_STRING +import numpy as np + #TODO Conditional search space to prevent invalid combinations of hyperparameters -def get_LogisticRegression_ConfigurationSpace(random_state=None): - - space = { - 'solver': Categorical('solver', ['saga','liblinear']), - 'penalty': Categorical("penalty", ['elasticnet','l1', 'l2']), #TODO workaround to support None option? 
- 'dual': Categorical("dual", [True, False]), - 'C': Float("C", bounds=(1e-4, 1e4), log=True), - - #TODO workaround for including None as a value for class_weight - 'class_weight': Categorical("class_weight", ['balanced']), - 'n_jobs': 1, - 'max_iter': 1000, - } +def get_LogisticRegression_ConfigurationSpace(n_samples, n_features, random_state): + + dual = n_samples<=n_features + + dual = TRUE_SPECIAL_STRING if dual else FALSE_SPECIAL_STRING + + space = {"solver":"saga", + "max_iter":1000, + "n_jobs":1, + "dual":dual, + } + + penalty = Categorical('penalty', ['l1', 'l2',"elasticnet"], default='l2') + C = Float('C', (0.01, 1e5), log=True) + l1_ratio = Float('l1_ratio', (0.0, 1.0)) + + l1_ratio_condition = EqualsCondition(l1_ratio, penalty, 'elasticnet') if random_state is not None: #This is required because configspace doesn't allow None as a value space['random_state'] = random_state - return ConfigurationSpace( - space = space - ) + + cs = ConfigurationSpace(space) + cs.add_hyperparameters([penalty, C, l1_ratio]) + cs.add_conditions([l1_ratio_condition]) + + return cs -def get_KNeighborsClassifier_ConfigurationSpace(n_samples=10): +def get_KNeighborsClassifier_ConfigurationSpace(n_samples): return ConfigurationSpace( space = { @@ -39,14 +50,14 @@ def get_KNeighborsClassifier_ConfigurationSpace(n_samples=10): ) -def get_DecisionTreeClassifier_ConfigurationSpace(random_state=None, n_featues=20): +def get_DecisionTreeClassifier_ConfigurationSpace(n_featues, random_state): space = { 'criterion': Categorical("criterion", ['gini', 'entropy']), - 'max_depth': Integer("max_depth", bounds=(1, 2*n_featues)), - 'min_samples_split': Integer("min_samples_split", bounds=(2, 20)), + 'max_depth': Integer("max_depth", bounds=(1, 2*n_featues)), #max of 20? log scale? 
+ 'min_samples_split': Integer("min_samples_split", bounds=(1, 20)), 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 20)), - 'max_features': Categorical("max_features", [1.0, 'sqrt', 'log2']), + 'max_features': Categorical("max_features", [NONE_SPECIAL_STRING, 'sqrt', 'log2']), 'min_weight_fraction_leaf': 0.0, } @@ -58,54 +69,66 @@ def get_DecisionTreeClassifier_ConfigurationSpace(random_state=None, n_featues=2 space = space ) +#TODO Conditional search spaces +def get_LinearSVC_ConfigurationSpace(random_state): + space = {"dual":"auto"} + + penalty = Categorical('penalty', ['l1', 'l2']) + C = Float('C', (0.01, 1e5), log=True) + loss = Categorical('loss', ['hinge', 'squared_hinge']) -def get_SVC_ConfigurationSpace(random_state=None): + loss_condition = EqualsCondition(loss, penalty, 'l2') - space = { - 'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']), - 'C': Float("C", bounds=(1e-4, 25), log=True), - 'degree': Integer("degree", bounds=(1, 4)), - - #'class_weight': Categorical("class_weight", [None, 'balanced']), #TODO add class_weight. configspace doesn't allow None as a value. - 'max_iter': 3000, - 'tol': 0.001, - 'probability': Categorical("probability", [True]), # configspace doesn't allow bools as a default value? 
but does allow them as a value inside a Categorical - } if random_state is not None: #This is required because configspace doesn't allow None as a value space['random_state'] = random_state - - return ConfigurationSpace( - space = space - ) -#TODO Conditional search spaces -def get_LinearSVC_ConfigurationSpace(random_state=None,): + + cs = ConfigurationSpace(space) + cs.add_hyperparameters([penalty, C, loss]) + cs.add_conditions([loss_condition]) + + return cs + + +def get_SVC_ConfigurationSpace(random_state): + space = { - 'penalty': Categorical("penalty", ['l1', 'l2']), - 'loss': Categorical("loss", ['hinge', 'squared_hinge']), - 'dual': Categorical("dual", [True, False]), - 'C': Float("C", bounds=(1e-4, 25), log=True), - } - + 'max_iter': 3000, + 'probability':TRUE_SPECIAL_STRING} + + kernel = Categorical("kernel", ['poly', 'rbf', 'sigmoid']) + C = Float('C', (0.01, 1e5), log=True) + degree = Integer("degree", bounds=(1, 5)) + gamma = Float("gamma", bounds=(1e-5, 8), log=True) + shrinking = Categorical("shrinking", [True, False]) + coef0 = Float("coef0", bounds=(-1, 1)) + + degree_condition = EqualsCondition(degree, kernel, 'poly') + gamma_condition = InCondition(gamma, kernel, ['rbf', 'poly']) + coef0_condition = InCondition(coef0, kernel, ['poly', 'sigmoid']) + if random_state is not None: #This is required because configspace doesn't allow None as a value space['random_state'] = random_state - - return ConfigurationSpace( - space = space - ) + cs = ConfigurationSpace(space) + cs.add_hyperparameters([kernel, C, coef0, degree, gamma, shrinking]) + cs.add_conditions([degree_condition, gamma_condition, coef0_condition]) + return cs -def get_RandomForestClassifier_ConfigurationSpace(random_state=None): + +def get_RandomForestClassifier_ConfigurationSpace(n_features, random_state): space = { - 'n_estimators': 100, + 'n_estimators': 128, #as recommended by Oshiro et al. 
(2012 + 'max_features': Integer("max_features", bounds=(1, max(1, n_features))), #log scale like autosklearn? 'criterion': Categorical("criterion", ['gini', 'entropy']), 'min_samples_split': Integer("min_samples_split", bounds=(2, 20)), 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 20)), 'bootstrap': Categorical("bootstrap", [True, False]), + 'class_weight': Categorical("class_weight", [NONE_SPECIAL_STRING, 'balanced']), } if random_state is not None: #This is required because configspace doesn't allow None as a value @@ -115,46 +138,21 @@ def get_RandomForestClassifier_ConfigurationSpace(random_state=None): space = space ) -def get_GradientBoostingClassifier_ConfigurationSpace(random_state=None, n_classes=None): - - if n_classes is not None and n_classes > 2: - loss = 'log_loss' - else: - loss = Categorical("loss", ['log_loss', 'exponential']) - - space = { - 'n_estimators': 100, - 'loss': loss, - 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), - 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)), - 'min_samples_split': Integer("min_samples_split", bounds=(2, 20)), - 'subsample': Float("subsample", bounds=(0.1, 1.0)), - 'max_features': Float("max_features", bounds=(0.1, 1.0)), - 'max_depth': Integer("max_depth", bounds=(1, 10)), - - #TODO include max leaf nodes? - #TODO validation fraction + n_iter_no_change? 
maybe as conditional - - 'tol': 1e-4, - } - - if random_state is not None: #This is required because configspace doesn't allow None as a value - space['random_state'] = random_state - - return ConfigurationSpace( - space = space - ) - -def get_XGBClassifier_ConfigurationSpace(random_state=None,): +def get_XGBClassifier_ConfigurationSpace(random_state,): space = { 'n_estimators': 100, 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), - 'subsample': Float("subsample", bounds=(0.1, 1.0)), + 'subsample': Float("subsample", bounds=(0.5, 1.0)), 'min_child_weight': Integer("min_child_weight", bounds=(1, 21)), - 'max_depth': Integer("max_depth", bounds=(1, 11)), + 'gamma': Float("gamma", bounds=(1e-4, 20), log=True), + 'max_depth': Integer("max_depth", bounds=(3, 18)), + 'reg_alpha': Float("reg_alpha", bounds=(1e-4, 100), log=True), + 'reg_lambda': Float("reg_lambda", bounds=(1e-4, 1), log=True), 'n_jobs': 1, + 'nthread': 1, + 'verbosity': 0, } if random_state is not None: #This is required because configspace doesn't allow None as a value @@ -164,7 +162,7 @@ def get_XGBClassifier_ConfigurationSpace(random_state=None,): space = space ) -def get_LGBMClassifier_ConfigurationSpace(random_state=None,): +def get_LGBMClassifier_ConfigurationSpace(random_state,): space = { 'objective': 'binary', @@ -184,7 +182,7 @@ def get_LGBMClassifier_ConfigurationSpace(random_state=None,): ) -def get_ExtraTreesClassifier_ConfigurationSpace(random_state=None): +def get_ExtraTreesClassifier_ConfigurationSpace(random_state): space = { 'n_estimators': 100, 'criterion': Categorical("criterion", ["gini", "entropy"]), @@ -204,41 +202,36 @@ def get_ExtraTreesClassifier_ConfigurationSpace(random_state=None): -def get_SGDClassifier_ConfigurationSpace(random_state=None): +def get_SGDClassifier_ConfigurationSpace(random_state): space = { - 'loss': Categorical("loss", ['log_loss', 'modified_huber']), + 'loss': Categorical("loss", ['squared_hinge', 'modified_huber']), #don't include 
hinge because we have LinearSVC, don't include log because we have LogisticRegression 'penalty': 'elasticnet', 'alpha': Float("alpha", bounds=(1e-5, 0.01), log=True), - 'learning_rate': Categorical("learning_rate", ['invscaling', 'constant']), 'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)), 'eta0': Float("eta0", bounds=(0.01, 1.0)), - 'power_t': Float("power_t", bounds=(1e-5, 100.0), log=True), 'n_jobs': 1, 'fit_intercept': Categorical("fit_intercept", [True]), + 'class_weight': Categorical("class_weight", [NONE_SPECIAL_STRING, 'balanced']), } if random_state is not None: #This is required because configspace doesn't allow None as a value space['random_state'] = random_state - return ConfigurationSpace( + power_t = Float("power_t", bounds=(1e-5, 100.0), log=True) + learning_rate = Categorical("learning_rate", ['invscaling', 'constant', "optimal"]) + powertcond = EqualsCondition(power_t, learning_rate, 'invscaling') + + + cs = ConfigurationSpace( space = space ) + cs.add_hyperparameters([power_t, learning_rate]) + cs.add_conditions([powertcond]) + return cs -def get_MLPClassifier_ConfigurationSpace(random_state=None): - space = { - 'alpha': Float("alpha", bounds=(1e-4, 1e-1), log=True), - 'learning_rate_init': Float("learning_rate_init", bounds=(1e-3, 1.), log=True), - } - - if random_state is not None: #This is required because configspace doesn't allow None as a value - space['random_state'] = random_state - - return ConfigurationSpace( - space = space - ) GaussianNB_ConfigurationSpace = {} @@ -261,12 +254,11 @@ def get_MultinomialNB_ConfigurationSpace(): -def get_AdaBoostClassifier_ConfigurationSpace(random_state=None): +def get_AdaBoostClassifier_ConfigurationSpace(random_state): space = { 'n_estimators': Integer("n_estimators", bounds=(50, 500)), 'learning_rate': Float("learning_rate", bounds=(0.01, 2), log=True), 'algorithm': Categorical("algorithm", ['SAMME', 'SAMME.R']), - 'max_depth': Integer("max_depth", bounds=(1, 10)), } if random_state is not None: 
#This is required because configspace doesn't allow None as a value @@ -274,4 +266,172 @@ def get_AdaBoostClassifier_ConfigurationSpace(random_state=None): return ConfigurationSpace( space = space - ) \ No newline at end of file + ) + + +def get_QuadraticDiscriminantAnalysis_ConfigurationSpace(): + return ConfigurationSpace( + space = { + 'reg_param': Float("reg_param", bounds=(0, 1)), + } + ) + +def get_PassiveAggressiveClassifier_ConfigurationSpace(random_state): + space = { + 'C': Float("C", bounds=(1e-5, 10), log=True), + 'loss': Categorical("loss", ['hinge', 'squared_hinge']), + 'average': Categorical("average", [True, False]), + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + return ConfigurationSpace( + space = space + ) +#TODO support auto shrinkage when solver is svd. may require custom node +def get_LinearDiscriminantAnalysis_ConfigurationSpace(): + + solver = Categorical("solver", ['svd', 'lsqr', 'eigen']), + shrinkage = Float("shrinkage", bounds=(0, 1)), + + shrinkcond = NotEqualsCondition(shrinkage, solver, 'svd') + + cs = ConfigurationSpace() + cs.add_hyperparameters([solver, shrinkage]) + cs.add_conditions([shrinkcond]) + + return + + + +#### Gradient Boosting Classifiers + +def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state): + early_stop = Categorical("early_stop", ["off", "valid", "train"]) + n_iter_no_change = Integer("n_iter_no_change",bounds=(1,20)) + validation_fraction = Float("validation_fraction", bounds=(0.01, 0.4)) + + n_iter_no_change_cond = InCondition(n_iter_no_change, early_stop, ["valid", "train"] ) + validation_fraction_cond = EqualsCondition(validation_fraction, early_stop, "valid") + + space = { + 'loss': Categorical("loss", ['log_loss', 'exponential']), + 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), + 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)), + 
'min_samples_split': Integer("min_samples_split", bounds=(2, 20)), + 'subsample': Float("subsample", bounds=(0.1, 1.0)), + 'max_features': Integer("max_features", bounds=(1, max(1, n_features))), + 'max_leaf_nodes': Integer("max_leaf_nodes", bounds=(3, 2047)), + 'max_depth': Integer("max_depth", bounds=(1, 2*n_features)), + 'tol': 1e-4, + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace( + space = space + ) + cs.add_hyperparameters([n_iter_no_change, validation_fraction, early_stop ]) + cs.add_conditions([validation_fraction_cond, n_iter_no_change_cond]) + return cs + + + + +#only difference is l2_regularization +def get_HistGradientBoostingClassifier_ConfigurationSpace(n_features, random_state): + early_stopping = Categorical("early_stopping", ["off", "valid", "train"]) + n_iter_no_change = Integer("n_iter_no_change",bounds=(1,20)) + validation_fraction = Float("validation_fraction", bounds=(0.01, 0.4)) + + n_iter_no_change_cond = InCondition(n_iter_no_change, early_stopping, ["valid", "train"] ) + validation_fraction_cond = EqualsCondition(validation_fraction, early_stopping, "valid") + + space = { + 'loss': Categorical("loss", ['log_loss', 'exponential']), + 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), + 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)), + 'max_features': Float("max_features", bounds=(0.1,1.0)), + 'max_leaf_nodes': Integer("max_leaf_nodes", bounds=(3, 2047)), + 'max_depth': Integer("max_depth", bounds=(1, 2*n_features)), + 'l2_regularization': Float("l2_regularization", bounds=(1e-10, 1), log=True), + 'tol': 1e-4, + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace( + space = space + ) + cs.add_hyperparameters([n_iter_no_change, validation_fraction, early_stopping ]) + 
cs.add_conditions([validation_fraction_cond, n_iter_no_change_cond]) + + return cs + +def GradientBoostingClassifier_hyperparameter_parser(params): + + final_params = { + 'loss': params['loss'], + 'learning_rate': params['learning_rate'], + 'min_samples_leaf': params['min_samples_leaf'], + 'min_samples_split': params['min_samples_split'], + 'subsample': params['subsample'], + 'max_features': params['max_features'], + 'max_leaf_nodes': params['max_leaf_nodes'], + 'max_depth': params['max_depth'], + 'tol': params['tol'], + } + + if "l2_regularization" in params: + final_params['l2_regularization'] = params['l2_regularization'] + + if params['early_stop'] == 'off': + final_params['n_iter_no_change'] = None + final_params['validation_fraction'] = None + elif params['early_stop'] == 'valid': + final_params['n_iter_no_change'] = params['n_iter_no_change'] + final_params['validation_fraction'] = params['validation_fraction'] + elif params['early_stop'] == 'train': + final_params['n_iter_no_change'] = params['n_iter_no_change'] + final_params['validation_fraction'] = None + + + return final_params + + +### + +def get_MLPClassifier_ConfigurationSpace(random_state): + space = {"n_iter_no_change":32} + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace( + space = space + ) + + n_hidden_layers = Integer("n_hidden_layers", bounds=(1, 3)) + n_nodes_per_layer = Integer("n_nodes_per_layer", bounds=(16, 512)) + activation = Categorical("activation", ['tanh', 'relu']) + alpha = Float("alpha", bounds=(1e-7, 1e-1), log=True) + learning_rate = Float("learning_rate", bounds=(1e-4, 1e-1), log=True) + early_stopping = Categorical("early_stopping", [True,False]) + + cs.add_hyperparameters([n_hidden_layers, n_nodes_per_layer, activation, alpha, learning_rate, early_stopping]) + + return cs + +def MLPClassifier_hyperparameter_parser(params): + hyperparameters = { + 
'n_iter_no_change': params['n_iter_no_change'], + 'hidden_layer_sizes' : [params['n_nodes_per_layer']]*params['n_hidden_layers'], + 'activation': params['activation'], + 'alpha': params['alpha'], + 'learning_rate': params['learning_rate'], + 'early_stopping': params['early_stopping'], + } + return hyperparameters \ No newline at end of file diff --git a/tpot2/config/classifiers_sklearnex.py b/tpot2/config/classifiers_sklearnex.py index a158a9a6..ad581898 100644 --- a/tpot2/config/classifiers_sklearnex.py +++ b/tpot2/config/classifiers_sklearnex.py @@ -2,7 +2,7 @@ from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal -def get_RandomForestClassifier_ConfigurationSpace(random_state=None): +def get_RandomForestClassifier_ConfigurationSpace(random_state): space = { 'n_estimators': 100, #TODO make this a higher number? learned? 'bootstrap': Categorical("bootstrap", [True, False]), @@ -19,7 +19,7 @@ def get_RandomForestClassifier_ConfigurationSpace(random_state=None): space = space ) -def get_KNeighborsClassifier_ConfigurationSpace(n_samples=10): +def get_KNeighborsClassifier_ConfigurationSpace(n_samples): return ConfigurationSpace( space = { 'n_neighbors': Integer("n_neighbors", bounds=(1, max(n_samples, 100)), log=True), @@ -29,7 +29,7 @@ def get_KNeighborsClassifier_ConfigurationSpace(n_samples=10): #TODO add conditionals -def get_LogisticRegression_ConfigurationSpace(random_state=None): +def get_LogisticRegression_ConfigurationSpace(random_state): space = { 'solver': Categorical("solver", ['liblinear', 'sag', 'saga']), 'penalty': Categorical("penalty", ['l1', 'l2']), @@ -45,7 +45,7 @@ def get_LogisticRegression_ConfigurationSpace(random_state=None): space = space ) -def get_SVC_ConfigurationSpace(random_state=None): +def get_SVC_ConfigurationSpace(random_state): space = { 'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']), 'C': Float("C", bounds=(1e-4, 25), log=True), @@ -62,7 +62,7 @@ def 
get_SVC_ConfigurationSpace(random_state=None): space = space ) -def get_NuSVC_ConfigurationSpace(random_state=None): +def get_NuSVC_ConfigurationSpace(random_state): space = { 'nu': Float("nu", bounds=(0.05, 1.0)), 'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']), diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 44892278..cf75cd47 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -52,6 +52,7 @@ from sklearn.kernel_approximation import RBFSampler from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import PowerTransformer, QuantileTransformer from sklearn.feature_selection import SelectFwe @@ -62,13 +63,12 @@ import sklearn.feature_selection - +#TODO create a selectomixin using these? from sklearn.feature_selection import f_classif from sklearn.feature_selection import f_regression from sklearn.linear_model import SGDRegressor -from sklearn.linear_model import LinearRegression from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso from sklearn.linear_model import ElasticNet @@ -76,21 +76,23 @@ from sklearn.linear_model import LassoLars, LassoLarsCV from sklearn.linear_model import RidgeCV +from sklearn.svm import SVR, SVC +from sklearn.svm import LinearSVR, LinearSVC -from sklearn.svm import SVR -from sklearn.svm import LinearSVR - -from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor,RandomForestRegressor +from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor from sklearn.ensemble import BaggingRegressor from sklearn.ensemble import ExtraTreesRegressor +from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import ElasticNetCV -from xgboost 
import XGBRegressor +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from tpot2.builtin_modules import RFE_ExtraTreesClassifier, SelectFromModel_ExtraTreesClassifier, RFE_ExtraTreesRegressor, SelectFromModel_ExtraTreesRegressor +from sklearn.gaussian_process import GaussianProcessRegressor + +from xgboost import XGBRegressor from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer @@ -99,8 +101,11 @@ #MDR -all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, RFE_ExtraTreesClassifier, SelectFromModel_ExtraTreesClassifier, RFE_ExtraTreesRegressor, SelectFromModel_ExtraTreesRegressor, ZeroCount, OneHotEncoder, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, RFE, SelectFromModel, f_classif, f_regression, SGDRegressor, LinearRegression, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, +all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, 
LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, OneHotEncoder, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, + AdaBoostClassifier, + GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer, + PowerTransformer, QuantileTransformer, ] @@ -118,36 +123,21 @@ all_methods.append(MultiSURF) if 'sklearnex' in sys.modules: - from sklearnex.linear_model import LinearRegression - from sklearnex.linear_model import Ridge - from sklearnex.linear_model import Lasso - from sklearnex.linear_model import ElasticNet - from sklearnex.svm import SVR - from sklearnex.svm import NuSVR - from sklearnex.ensemble import RandomForestRegressor - from sklearnex.neighbors import KNeighborsRegressor - - from sklearnex.ensemble import RandomForestClassifier - from sklearnex.neighbors import KNeighborsClassifier - from sklearnex.svm import SVC - from sklearnex.svm import NuSVC - from sklearnex.linear_model import LogisticRegression - - - all_methods.append(LinearRegression) - all_methods.append(Ridge) - all_methods.append(Lasso) - all_methods.append(ElasticNet) - all_methods.append(SVR) - 
all_methods.append(NuSVR) - all_methods.append(RandomForestRegressor) - all_methods.append(KNeighborsRegressor) - - all_methods.append(RandomForestClassifier) - all_methods.append(KNeighborsClassifier) - all_methods.append(SVC) - all_methods.append(NuSVC) - all_methods.append(LogisticRegression) + import sklearnex + + all_methods.append(sklearnex.linear_model.LinearRegression) + all_methods.append(sklearnex.linear_model.Ridge) + all_methods.append(sklearnex.linear_model.Lasso) + all_methods.append(sklearnex.linear_model.ElasticNet) + all_methods.append(sklearnex.svm.SVR) + all_methods.append(sklearnex.svm.NuSVR) + all_methods.append(sklearnex.ensemble.RandomForestRegressor) + all_methods.append(sklearnex.neighbors.KNeighborsRegressor) + all_methods.append(sklearnex.ensemble.RandomForestClassifier) + all_methods.append(sklearnex.neighbors.KNeighborsClassifier) + all_methods.append(sklearnex.svm.SVC) + all_methods.append(sklearnex.svm.NuSVC) + all_methods.append(sklearnex.linear_model.LogisticRegression) STRING_TO_CLASS = { @@ -155,15 +145,18 @@ } - +from sklearn.linear_model import PassiveAggressiveClassifier +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis +from sklearn.linear_model import ARDRegression +from sklearn.gaussian_process import GaussianProcessRegressor GROUPNAMES = { "selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",], "selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"], "selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"], - "classifiers" : ["LogisticRegression", "DecisionTreeClassifier", "KNeighborsClassifier", "GradientBoostingClassifier", "ExtraTreesClassifier", "RandomForestClassifier", "SGDClassifier", "GaussianNB", "BernoulliNB", "MultinomialNB", "XGBClassifier", "SVC", "MLPClassifier"], - "regressors" : ["ElasticNetCV", "ExtraTreesRegressor", 
"GradientBoostingRegressor", "AdaBoostRegressor", "DecisionTreeRegressor", "KNeighborsRegressor", "LassoLarsCV", "SVR", "RandomForestRegressor", "RidgeCV", "XGBRegressor", "SGDRegressor" ], - "transformers": ["Binarizer", "Normalizer", "PCA", "ZeroCount", "OneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler"], + "classifiers" : ['AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "PassiveAggressiveClassifier", "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'], + "regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearDiscriminantAnalysis', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'], + "transformers": ["Binarizer", "Normalizer", "PCA", "ZeroCount", "OneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"], "imputers": [], "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"], @@ -194,20 +187,24 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta #classifiers.py + case "AdaBoostClassifier": + return classifiers.get_AdaBoostClassifier_ConfigurationSpace(random_state=random_state) case "LogisticRegression": - return classifiers.get_LogisticRegression_ConfigurationSpace(random_state=random_state) + return classifiers.get_LogisticRegression_ConfigurationSpace(n_samples=n_samples, n_features=n_features, 
random_state=random_state) case "KNeighborsClassifier": return classifiers.get_KNeighborsClassifier_ConfigurationSpace(n_samples=n_samples) case "DecisionTreeClassifier": - return classifiers.get_DecisionTreeClassifier_ConfigurationSpace(random_state=random_state) + return classifiers.get_DecisionTreeClassifier_ConfigurationSpace(n_featues=n_features, random_state=random_state) case "SVC": return classifiers.get_SVC_ConfigurationSpace(random_state=random_state) case "LinearSVC": return classifiers.get_LinearSVC_ConfigurationSpace(random_state=random_state) case "RandomForestClassifier": - return classifiers.get_RandomForestClassifier_ConfigurationSpace(random_state=random_state) + return classifiers.get_RandomForestClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state) case "GradientBoostingClassifier": - return classifiers.get_GradientBoostingClassifier_ConfigurationSpace(n_classes=n_classes) + return classifiers.get_GradientBoostingClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state) + case "HistGradientBoostingClassifier": + return classifiers.get_HistGradientBoostingClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state) case "XGBClassifier": return classifiers.get_XGBClassifier_ConfigurationSpace(random_state=random_state) case "LGBMClassifier": @@ -224,7 +221,63 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta return classifiers.get_MultinomialNB_ConfigurationSpace() case "GaussianNB": return {} - + case "LassoLarsCV": + return {} + case "ElasticNetCV": + return regressors.ElasticNetCV_configspace + case "RidgeCV": + return {} + + #regressors.py + case "RandomForestRegressor": + return regressors.get_RandomForestRegressor_ConfigurationSpace(random_state=random_state) + case "SGDRegressor": + return regressors.get_SGDRegressor_ConfigurationSpace(random_state=random_state) + case "Ridge": + return 
regressors.get_Ridge_ConfigurationSpace(random_state=random_state) + case "Lasso": + return regressors.get_Lasso_ConfigurationSpace(random_state=random_state) + case "ElasticNet": + return regressors.get_ElasticNet_ConfigurationSpace(random_state=random_state) + case "Lars": + return regressors.get_Lars_ConfigurationSpace(random_state=random_state) + case "OthogonalMatchingPursuit": + return regressors.get_OthogonalMatchingPursuit_ConfigurationSpace() + case "BayesianRidge": + return regressors.get_BayesianRidge_ConfigurationSpace() + case "LassoLars": + return regressors.get_LassoLars_ConfigurationSpace(random_state=random_state) + case "BaggingRegressor": + return regressors.get_BaggingRegressor_ConfigurationSpace(random_state=random_state) + case "ARDRegression": + return regressors.get_ARDRegression_ConfigurationSpace() + case "TheilSenRegressor": + return regressors.get_TheilSenRegressor_ConfigurationSpace(random_state=random_state) + case "Perceptron": + return regressors.get_Perceptron_ConfigurationSpace(random_state=random_state) + case "DecisionTreeRegressor": + return regressors.get_DecisionTreeRegressor_ConfigurationSpace(n_features=n_features, random_state=random_state) + case "LinearSVR": + return regressors.get_LinearSVR_ConfigurationSpace(random_state=random_state) + case "SVR": + return regressors.get_SVR_ConfigurationSpace() + case "XGBRegressor": + return regressors.get_XGBRegressor_ConfigurationSpace(random_state=random_state) + case "AdaBoostRegressor": + return regressors.get_AdaBoostRegressor_ConfigurationSpace(random_state=random_state) + case "ExtraTreesRegressor": + return regressors.get_ExtraTreesRegressor_ConfigurationSpace(random_state=random_state) + case "GradientBoostingRegressor": + return regressors.get_GradientBoostingRegressor_ConfigurationSpace(n_features=n_features, random_state=random_state) + case "HistGradientBoostingRegressor": + return regressors.get_HistGradientBoostingRegressor_ConfigurationSpace(n_features=n_features, 
random_state=random_state) + case "MLPRegressor": + return regressors.get_MLPRegressor_ConfigurationSpace(random_state=random_state) + case "KNeighborsRegressor": + return regressors.get_KNeighborsRegressor_ConfigurationSpace(n_samples=n_samples) + case "GaussianProcessRegressor": + return regressors.get_GaussianProcessRegressor_ConfigurationSpace(n_features=n_features, random_state=random_state) + #transformers.py case "Binarizer": return transformers.Binarizer_configspace @@ -244,7 +297,23 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta return transformers.get_Nystroem_configspace(n_features=n_features, random_state=random_state) case "RBFSampler": return transformers.get_RBFSampler_configspace(n_features=n_features, random_state=random_state) - + case "MinMaxScaler": + return {} + case "PowerTransformer": + return {} + case "QuantileTransformer": + return transformers.get_QuantileTransformer_configspace(random_state=random_state) + case "RobustScaler": + return transformers.RobustScaler_configspace + case "ColumnOneHotEncoder": + return {} + case "MaxAbsScaler": + return {} + case "PolynomialFeatures": + return transformers.PolynomialFeatures_configspace + case "StandardScaler": + return {} + #selectors.py case "SelectFwe": return selectors.SelectFwe_configspace @@ -256,6 +325,7 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta return selectors.RFE_configspace_part case "SelectFromModel": return selectors.SelectFromModel_configspace_part + #special_configs.py case "AddTransformer": @@ -291,7 +361,7 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta space = { - 'n': Float("n", bounds=(-1e3, 1e3), log=True), + 'n': Float("n", bounds=(-1e3, 1e3)), } ) @@ -341,7 +411,8 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta case "KNeighborsRegressor_sklearnex": return 
regressors_sklearnex.get_KNeighborsRegressor_ConfigurationSpace(n_samples=n_samples) - return {} + #raise error + raise ValueError(f"Could not find configspace for {name}") def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None): @@ -359,9 +430,12 @@ def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_st return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) if name is None: + warnings.warn(f"name is None") return None if name not in STRING_TO_CLASS: + print("FOOO ", name) + warnings.warn(f"Could not find class for {name}") return None return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) @@ -370,6 +444,8 @@ def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_st def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None): #these are wrappers that take in another estimator as a parameter + # TODO Add AdaBoostRegressor, AdaBoostClassifier as wrappers? wrap a decision tree with different params? + # TODO add other meta-estimators? 
if name == "RFE_classification": rfe_sp = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) ext = get_node("ExtraTreesClassifier", n_classes=n_classes, n_samples=n_samples, random_state=random_state) @@ -386,7 +462,26 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None sfm_sp = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) ext = get_node("ExtraTreesRegressor", n_classes=n_classes, n_samples=n_samples, random_state=random_state) return WrapperPipeline(nodegen=ext, method=SelectFromModel, configspace=sfm_sp) - + + #these are nodes that have special search spaces which require custom parsing of the hyperparameters + if name == "RobustScaler": + configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser) + if name == "GradientBoostingClassifier" or name == "HistGradientBoosting": + configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser) + if name == "GradientBoostingRegressor" or name == "HistGradientBoostingRegressor": + configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser) + if name == "MLPClassifier": + configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser) + if name == "MLPRegressor": + configspace = get_configspace(name, 
n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser) + if name == "GaussianProcessRegressor": + configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser) configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) if configspace is None: diff --git a/tpot2/config/mdr_configs.py b/tpot2/config/mdr_configs.py index b99ec81e..df92cd17 100644 --- a/tpot2/config/mdr_configs.py +++ b/tpot2/config/mdr_configs.py @@ -14,7 +14,7 @@ -def get_skrebate_ReliefF_config_space(n_features=10): +def get_skrebate_ReliefF_config_space(n_features): return ConfigurationSpace( space = { 'n_features_to_select': Integer('n_features_to_select', bounds=(1, n_features), log=True), @@ -23,7 +23,7 @@ def get_skrebate_ReliefF_config_space(n_features=10): ) -def get_skrebate_SURF_config_space(n_features=10): +def get_skrebate_SURF_config_space(n_features): return ConfigurationSpace( space = { 'n_features_to_select': Integer('n_features_to_select', bounds=(1, n_features), log=True), @@ -31,13 +31,13 @@ def get_skrebate_SURF_config_space(n_features=10): ) -def get_skrebate_SURFstar_config_space(n_features=10): +def get_skrebate_SURFstar_config_space(n_features): return ConfigurationSpace( space = { 'n_features_to_select': Integer('n_features_to_select', bounds=(1, n_features), log=True), } ) -def get_skrebate_MultiSURF_config_space(n_features=10): +def get_skrebate_MultiSURF_config_space(n_features): return ConfigurationSpace( space = { 'n_features_to_select': Integer('n_features_to_select', bounds=(1, n_features), log=True), diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 
845f9ff1..e87e9eda 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -1,37 +1,21 @@ -from sklearn.linear_model import SGDRegressor -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import Ridge -from sklearn.linear_model import Lasso -from sklearn.linear_model import ElasticNet -from sklearn.linear_model import Lars -from sklearn.linear_model import LassoLars, LassoLarsCV -from sklearn.linear_model import RidgeCV - - -from sklearn.svm import SVR -from sklearn.svm import LinearSVR - -from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor,RandomForestRegressor -from sklearn.ensemble import BaggingRegressor -from sklearn.ensemble import ExtraTreesRegressor -from sklearn.tree import DecisionTreeRegressor -from sklearn.neighbors import KNeighborsRegressor -from sklearn.linear_model import ElasticNetCV - -from xgboost import XGBRegressor -from functools import partial - - +import sklearn from ConfigSpace import ConfigurationSpace from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal - - +from ConfigSpace import EqualsCondition, OrConjunction, NotEqualsCondition, InCondition +from ..search_spaces.nodes.estimator_node import NONE_SPECIAL_STRING, TRUE_SPECIAL_STRING, FALSE_SPECIAL_STRING +import numpy as np #TODO: fill in remaining #TODO check for places were we could use log scaling -def get_RandomForestRegressor_ConfigurationSpace(random_state=None): + +ElasticNetCV_configspace = { + "l1_ratio" : np.arange(0.0, 1.01, 0.05), +} + +def get_RandomForestRegressor_ConfigurationSpace(random_state): space = { 'n_estimators': 100, + 'criterion': Categorical("criterion", ['mse', 'mae', "friedman_mse"]), 'max_features': Float("max_features", bounds=(0.05, 1.0)), 'bootstrap': Categorical("bootstrap", [True, False]), 'min_samples_split': Integer("min_samples_split", bounds=(2, 21)), @@ -46,27 +30,49 @@ def get_RandomForestRegressor_ConfigurationSpace(random_state=None): ) -def 
get_SGDRegressor_ConfigurationSpace(random_state=None): +def get_SGDRegressor_ConfigurationSpace(random_state): space = { - 'loss': Categorical("loss", ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']), - 'penalty': 'elasticnet', - 'alpha': Float("alpha", bounds=(1e-5, 0.01), log=True), - 'learning_rate': Categorical("learning_rate", ['invscaling', 'constant']), - 'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)), - 'eta0': Float("eta0", bounds=(0.01, 1.0)), - 'power_t': Float("power_t", bounds=(1e-5, 100.0), log=True), + 'alpha': Float("alpha", bounds=(1e-7, 1e-1), log=True), + 'average': Categorical("average", [True, False]), 'fit_intercept': Categorical("fit_intercept", [True]), } if random_state is not None: #This is required because configspace doesn't allow None as a value space['random_state'] = random_state - return ConfigurationSpace( + cs = ConfigurationSpace( space = space ) + l1_ratio = Float("l1_ratio", bounds=(1e-7, 1.0), log=True) + penalty = Categorical("penalty", ["l1", "l2", "elasticnet"]) + epsilon = Float("epsilon", bounds=(1e-5, 1e-1), log=True) + loss = Categorical("loss", ["squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive",]) + eta0 = Float("eta0", bounds=(1e-7, 1e-1), log=True) + learning_rate = Categorical("learning_rate", ['optimal', 'invscaling', 'constant']) + power_t = Float("power_t", bounds=(1e-5, 1.0), log=True) + + elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet") + epsilon_condition = InCondition( + epsilon, + loss, + ["huber", "epsilon_insensitive", "squared_epsilon_insensitive"], + ) + + eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling", "constant"]) + power_t_condition = EqualsCondition(power_t, learning_rate, "invscaling") + + cs.add_hyperparameters( + [l1_ratio, penalty, epsilon, loss, eta0, learning_rate, power_t] + ) + cs.add_conditions( + [elasticnet, epsilon_condition, power_t_condition, eta0_in_inv_con] + ) + + return cs -def 
get_Ridge_ConfigurationSpace(random_state=None): + +def get_Ridge_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'fit_intercept': Categorical("fit_intercept", [True]), @@ -81,7 +87,7 @@ def get_Ridge_ConfigurationSpace(random_state=None): space = space ) -def get_Lasso_ConfigurationSpace(random_state=None): +def get_Lasso_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'fit_intercept': Categorical("fit_intercept", [True]), @@ -95,7 +101,7 @@ def get_Lasso_ConfigurationSpace(random_state=None): space = space ) -def get_ElasticNet_ConfigurationSpace(random_state=None): +def get_ElasticNet_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)), @@ -109,7 +115,7 @@ def get_ElasticNet_ConfigurationSpace(random_state=None): ) -def get_Lars_ConfigurationSpace(random_state=None): +def get_Lars_ConfigurationSpace(random_state): space = { } @@ -138,7 +144,7 @@ def get_BayesianRidge_ConfigurationSpace(): ) -def get_LassoLars_ConfigurationSpace(random_state=None): +def get_LassoLars_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'eps': Float("eps", bounds=(1e-5, 1e-1), log=True), @@ -151,15 +157,8 @@ def get_LassoLars_ConfigurationSpace(random_state=None): space = space ) -def get_LassoLarsCV_ConfigurationSpace(cv): - return ConfigurationSpace( - space = { - 'cv': cv, - } - ) - -def get_BaggingRegressor_ConfigurationSpace(random_state=None): +def get_BaggingRegressor_ConfigurationSpace(random_state): space = { 'max_samples': Float("max_samples", bounds=(0.05, 1.00)), 'max_features': Float("max_features", bounds=(0.05, 1.00)), @@ -178,19 +177,19 @@ def get_ARDRegression_ConfigurationSpace(): return ConfigurationSpace( space = { - 'alpha_1': Float("alpha_1", bounds=(1e-6, 1e-1), log=True), - 'alpha_2': Float("alpha_2", bounds=(1e-6, 1e-1), log=True), - 'lambda_1': 
Float("lambda_1", bounds=(1e-6, 1e-1), log=True), - 'lambda_2': Float("lambda_2", bounds=(1e-6, 1e-1), log=True), - 'threshold_lambda': Integer("threshold_lambda", bounds=(100, 1000)), + 'alpha_1': Float("alpha_1", bounds=(1e-10, 1e-3), log=True), + 'alpha_2': Float("alpha_2", bounds=(1e-10, 1e-3), log=True), + 'lambda_1': Float("lambda_1", bounds=(1e-10, 1e-3), log=True), + 'lambda_2': Float("lambda_2", bounds=(1e-10, 1e-3), log=True), + 'threshold_lambda': Integer("threshold_lambda", bounds=(1e3, 1e5)), } ) -def get_TheilSenRegressor_ConfigurationSpace(random_state=None): +def get_TheilSenRegressor_ConfigurationSpace(random_state): space = { - 'n_subsamples': Integer("n_subsamples", bounds=(10, 100)), - 'max_subpopulation': Integer("max_subpopulation", bounds=(100, 1000)), + 'n_subsamples': Integer("n_subsamples", bounds=(10, 10000)), + 'max_subpopulation': Integer("max_subpopulation", bounds=(10, 1000)), } if random_state is not None: #This is required because configspace doesn't allow None as a value @@ -201,21 +200,10 @@ def get_TheilSenRegressor_ConfigurationSpace(random_state=None): ) -def get_SVR_ConfigurationSpace(): - return ConfigurationSpace( - space = { - 'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']), - 'C': Float("C", bounds=(1e-4, 25), log=True), - 'degree': Integer("degree", bounds=(1, 4)), - 'max_iter': 3000, - 'tol': 0.005, - } - ) - -def get_Perceptron_ConfigurationSpace(random_state=None): +def get_Perceptron_ConfigurationSpace(random_state): space = { - 'penalty': Categorical("penalty", [None, 'l2', 'l1', 'elasticnet']), + 'penalty': Categorical("penalty", [NONE_SPECIAL_STRING, 'l2', 'l1', 'elasticnet']), 'alpha': Float("alpha", bounds=(1e-5, 1e-1), log=True), 'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)), 'learning_rate': Categorical("learning_rate", ['constant', 'optimal', 'invscaling']), @@ -229,36 +217,12 @@ def get_Perceptron_ConfigurationSpace(random_state=None): space = space ) -def 
get_MLPRegressor_ConfigurationSpace(random_state=None): - space = { - 'alpha': Float("alpha", bounds=(1e-4, 1e-1), log=True), - 'learning_rate_init': Float("learning_rate_init", bounds=(1e-3, 1.), log=True), - } - - if random_state is not None: #This is required because configspace doesn't allow None as a value - space['random_state'] = random_state - - return ConfigurationSpace( - space = space - ) - - -def get_GradientBoostingRegressor_ConfigurationSpace(random_state=None): - space = { - 'n_estimators': 100, - 'loss': Categorical("loss", ['ls', 'lad', 'huber', 'quantile']), - 'learning_rate': Float("learning_rate", bounds=(1e-4, 1), log=True), - 'max_depth': Integer("max_depth", bounds=(1, 11)), - 'min_samples_split': Integer("min_samples_split", bounds=(2, 21)), - 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 21)), - 'subsample': Float("subsample", bounds=(0.05, 1.00)), - 'max_features': Float("max_features", bounds=(0.05, 1.00)), - } -def get_DecisionTreeRegressor_ConfigurationSpace(random_state=None): +def get_DecisionTreeRegressor_ConfigurationSpace(n_features, random_state): space = { - 'max_depth': Integer("max_depth", bounds=(1, 11)), + 'criterion': Categorical("criterion", ['squared_error', 'friedman_mse', 'mae']), + 'max_depth': Integer("max_depth", bounds=(1, n_features*2)), 'min_samples_split': Integer("min_samples_split", bounds=(2, 21)), 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 21)), } @@ -268,21 +232,22 @@ def get_DecisionTreeRegressor_ConfigurationSpace(random_state=None): ) -def get_KNeighborsRegressor_ConfigurationSpace(n_samples=100): +def get_KNeighborsRegressor_ConfigurationSpace(n_samples): return ConfigurationSpace( space = { - 'n_neighbors': Integer("n_neighbors", bounds=(1, n_samples)), + 'n_neighbors': Integer("n_neighbors", bounds=(1, min(100,n_samples))), 'weights': Categorical("weights", ['uniform', 'distance']), 'p': Integer("p", bounds=(1, 3)), 'metric': Categorical("metric", ['minkowski', 'euclidean', 
'manhattan']), } ) -def get_LinearSVR_ConfigurationSpace(random_state=None): + +def get_LinearSVR_ConfigurationSpace(random_state): space = { 'epsilon': Float("epsilon", bounds=(1e-4, 1.0), log=True), - 'C': Float("C", bounds=(1e-4, 25.0), log=True), - 'dual': Categorical("dual", [True, False]), + 'C': Float('C', (0.01, 1e5), log=True), + 'dual': "auto", 'loss': Categorical("loss", ['epsilon_insensitive', 'squared_epsilon_insensitive']), } @@ -293,14 +258,49 @@ def get_LinearSVR_ConfigurationSpace(random_state=None): space = space ) +#add coef0? +def get_SVR_ConfigurationSpace(): + space = { + 'epislon': Float("epsilon", bounds=(1e-4, 1.0), log=True), + 'shrinking': Categorical("shrinking", [True, False]), + 'C': Float('C', (0.01, 1e5), log=True), + 'max_iter': 3000, + 'tol': 0.005, + } + + cs = ConfigurationSpace( + space = space + ) + + kernel = Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']) + degree = Integer("degree", bounds=(1, 5)) + gamma = Float("gamma", bounds=(1e-5, 10.0), log=True) + coef0 = Float("coef0", bounds=(-1, 1)) + + + degree_condition = EqualsCondition(degree, kernel, 'poly') + gamma_condition = InCondition(gamma, kernel, ['poly', 'rbf',]) + coef0_condition = InCondition(coef0, kernel, ['poly', 'sigmoid']) + + cs.add_hyperparameters([kernel, degree, gamma, coef0]) + cs.add_conditions([degree_condition,gamma_condition]) + + return cs + + -def get_XGBRegressor_ConfigurationSpace(random_state=None): + +def get_XGBRegressor_ConfigurationSpace(random_state): space = { + 'n_estimators': 100, 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), - 'subsample': Float("subsample", bounds=(0.05, 1.0)), + 'subsample': Float("subsample", bounds=(0.5, 1.0)), 'min_child_weight': Integer("min_child_weight", bounds=(1, 21)), - 'n_estimators': 100, - 'max_depth': Integer("max_depth", bounds=(1, 11)), + 'gamma': Float("gamma", bounds=(1e-4, 20), log=True), + 'max_depth': Integer("max_depth", bounds=(3, 18)), + 'reg_alpha': 
Float("reg_alpha", bounds=(1e-4, 100), log=True), + 'reg_lambda': Float("reg_lambda", bounds=(1e-4, 1), log=True), + 'n_jobs': 1, 'nthread': 1, 'verbosity': 0, 'objective': 'reg:squarederror', @@ -314,11 +314,11 @@ def get_XGBRegressor_ConfigurationSpace(random_state=None): ) -def get_AdaBoostRegressor_ConfigurationSpace(random_state=None): +def get_AdaBoostRegressor_ConfigurationSpace(random_state): space = { - 'n_estimators': Integer("n_estimators", bounds=(50, 100)), - 'learning_rate': Float("learning_rate", bounds=(1e-3, 1.0), log=True), + 'n_estimators': Integer("n_estimators", bounds=(50, 500)), + 'learning_rate': Float("learning_rate", bounds=(1e-3, 2.0), log=True), 'loss': Categorical("loss", ['linear', 'square', 'exponential']), } @@ -330,9 +330,10 @@ def get_AdaBoostRegressor_ConfigurationSpace(random_state=None): space = space ) -def get_ExtraTreesRegressor_ConfigurationSpace(random_state=None): +def get_ExtraTreesRegressor_ConfigurationSpace(random_state): space = { 'n_estimators': 100, + 'criterion': Categorical("criterion", ["squared_error", "friedman_mse", "mae"]), 'max_features': Float("max_features", bounds=(0.05, 1.0)), 'min_samples_split': Integer("min_samples_split", bounds=(2, 21)), 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 21)), @@ -344,4 +345,168 @@ def get_ExtraTreesRegressor_ConfigurationSpace(random_state=None): return ConfigurationSpace( space = space - ) \ No newline at end of file + ) +### + +def get_GaussianProcessRegressor_ConfigurationSpace(n_features, random_state): + space = { + 'n_features': n_features, + 'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True), + 'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True), + 'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True), + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + return ConfigurationSpace( + space = space + ) + +def 
GaussianProcessRegressor_hyperparameter_parser(params): + kernel = sklearn.gaussian_process.kernels.RBF( + length_scale = [1.0]*params['n_features'], + length_scale_bounds=[(params['thetaL'], params['thetaU'])] * params['n_features'], + ) + final_params = {"kernel": kernel, + "alpha": params['alpha'], + "n_restarts_optimizer": 10, + "optimizer": "fmin_l_bfgs_b", + "normalize_y": True, + "copy_X_train": True, + } + + if "random_state" in params: + final_params['random_state'] = params['random_state'] + + return final_params + +### +def get_GradientBoostingRegressor_ConfigurationSpace(n_features, random_state): + early_stop = Categorical("early_stop", ["off", "valid", "train"]) + n_iter_no_change = Integer("n_iter_no_change",bounds=(1,20)) + validation_fraction = Float("validation_fraction", bounds=(0.01, 0.4)) + + n_iter_no_change_cond = InCondition(n_iter_no_change, early_stop, ["valid", "train"] ) + validation_fraction_cond = EqualsCondition(validation_fraction, early_stop, "valid") + + space = { + 'loss': Categorical("loss", ['log_loss', 'exponential']), + 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), + 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)), + 'min_samples_split': Integer("min_samples_split", bounds=(2, 20)), + 'subsample': Float("subsample", bounds=(0.1, 1.0)), + 'max_features': Integer("max_features", bounds=(1, max(1, n_features))), + 'max_leaf_nodes': Integer("max_leaf_nodes", bounds=(3, 2047)), + 'max_depth': Integer("max_depth", bounds=(1, 2*n_features)), + 'tol': 1e-4, + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace( + space = space + ) + cs.add_hyperparameters([n_iter_no_change, validation_fraction, early_stop ]) + cs.add_conditions([validation_fraction_cond, n_iter_no_change_cond]) + return cs + +#only difference is l2_regularization +def 
get_HistGradientBoostingRegressor_ConfigurationSpace(n_features, random_state): + early_stop = Categorical("early_stop", ["off", "valid", "train"]) + n_iter_no_change = Integer("n_iter_no_change",bounds=(1,20)) + validation_fraction = Float("validation_fraction", bounds=(0.01, 0.4)) + + n_iter_no_change_cond = InCondition(n_iter_no_change, early_stop, ["valid", "train"] ) + validation_fraction_cond = EqualsCondition(validation_fraction, early_stop, "valid") + + space = { + 'loss': Categorical("loss", ['log_loss', 'exponential']), + 'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True), + 'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)), + 'max_features': Float("max_features", bounds=(0.1,1.0)), + 'max_leaf_nodes': Integer("max_leaf_nodes", bounds=(3, 2047)), + 'max_depth': Integer("max_depth", bounds=(1, 2*n_features)), + 'l2_regularization': Float("l2_regularization", bounds=(1e-10, 1), log=True), + 'tol': 1e-4, + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace( + space = space + ) + cs.add_hyperparameters([n_iter_no_change, validation_fraction, early_stop ]) + cs.add_conditions([validation_fraction_cond, n_iter_no_change_cond]) + + return cs + +def GradientBoostingRegressor_hyperparameter_parser(params): + + final_params = { + 'loss': params['loss'], + 'learning_rate': params['learning_rate'], + 'min_samples_leaf': params['min_samples_leaf'], + 'max_features': params['max_features'], + 'max_leaf_nodes': params['max_leaf_nodes'], + 'max_depth': params['max_depth'], + 'tol': params['tol'], + } + + if "l2_regularization" in params: + final_params['l2_regularization'] = params['l2_regularization'] + + if params['early_stop'] == 'off': + final_params['n_iter_no_change'] = None + final_params['validation_fraction'] = None + elif params['early_stop'] == 'valid': + final_params['n_iter_no_change'] = 
params['n_iter_no_change'] + final_params['validation_fraction'] = params['validation_fraction'] + elif params['early_stop'] == 'train': + final_params['n_iter_no_change'] = params['n_iter_no_change'] + final_params['validation_fraction'] = None + + + return final_params + + + +### + +def get_MLPRegressor_ConfigurationSpace(random_state): + space = {"n_iter_no_change":32} + + if random_state is not None: #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace( + space = space + ) + + n_hidden_layers = Integer("n_hidden_layers", bounds=(1, 3)) + n_nodes_per_layer = Integer("n_nodes_per_layer", bounds=(16, 512)) + activation = Categorical("activation", ['tanh', 'relu']) + alpha = Float("alpha", bounds=(1e-7, 1e-1), log=True) + learning_rate = Float("learning_rate", bounds=(1e-4, 1e-1), log=True) + early_stopping = Categorical("early_stopping", [True,False]) + + cs.add_hyperparameters([n_hidden_layers, n_nodes_per_layer, activation, alpha, learning_rate, early_stopping]) + + return cs + +def MLPRegressor_hyperparameter_parser(params): + hyperparameters = { + 'n_iter_no_change': params['n_iter_no_change'], + 'hidden_layer_sizes' : [params['n_nodes_per_layer']]*params['n_hidden_layers'], + 'activation': params['activation'], + 'alpha': params['alpha'], + 'learning_rate': params['learning_rate'], + 'early_stopping': params['early_stopping'], + } + return hyperparameters + + + + \ No newline at end of file diff --git a/tpot2/config/regressors_sklearnex.py b/tpot2/config/regressors_sklearnex.py index 3473de56..7346a7c3 100644 --- a/tpot2/config/regressors_sklearnex.py +++ b/tpot2/config/regressors_sklearnex.py @@ -3,7 +3,7 @@ -def get_RandomForestRegressor_ConfigurationSpace(random_state=None): +def get_RandomForestRegressor_ConfigurationSpace(random_state): space = { 'n_estimators': 100, 'max_features': Float("max_features", bounds=(0.05, 1.0)), @@ -20,7 +20,7 @@ def 
get_RandomForestRegressor_ConfigurationSpace(random_state=None): ) -def get_KNeighborsRegressor_ConfigurationSpace(n_samples=100): +def get_KNeighborsRegressor_ConfigurationSpace(n_samples): return ConfigurationSpace( space = { 'n_neighbors': Integer("n_neighbors", bounds=(1, max(n_samples, 100))), @@ -29,7 +29,7 @@ def get_KNeighborsRegressor_ConfigurationSpace(n_samples=100): ) -def get_Ridge_ConfigurationSpace(random_state=None): +def get_Ridge_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'fit_intercept': Categorical("fit_intercept", [True]), @@ -43,7 +43,7 @@ def get_Ridge_ConfigurationSpace(random_state=None): space = space ) -def get_Lasso_ConfigurationSpace(random_state=None): +def get_Lasso_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'fit_intercept': Categorical("fit_intercept", [True]), @@ -60,7 +60,7 @@ def get_Lasso_ConfigurationSpace(random_state=None): space = space ) -def get_ElasticNet_ConfigurationSpace(random_state=None): +def get_ElasticNet_ConfigurationSpace(random_state): space = { 'alpha': Float("alpha", bounds=(0.0, 1.0)), 'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)), @@ -74,7 +74,7 @@ def get_ElasticNet_ConfigurationSpace(random_state=None): ) -def get_SVR_ConfigurationSpace(random_state=None): +def get_SVR_ConfigurationSpace(random_state): space = { 'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']), 'C': Float("C", bounds=(1e-4, 25), log=True), @@ -90,7 +90,7 @@ def get_SVR_ConfigurationSpace(random_state=None): space = space ) -def get_NuSVR_ConfigurationSpace(random_state=None): +def get_NuSVR_ConfigurationSpace(random_state): space = { 'nu': Float("nu", bounds=(0.05, 1.0)), 'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']), diff --git a/tpot2/config/special_configs.py b/tpot2/config/special_configs.py index 38545f6c..5d22dfad 100644 --- a/tpot2/config/special_configs.py +++ b/tpot2/config/special_configs.py @@ 
-30,54 +30,3 @@ def get_ArithmeticTransformer_ConfigurationSpace(): # MinTransformer: {} # MaxTransformer: {} - - -def get_FeatureSetSelector_ConfigurationSpace(names_list = None, subset_dict=None): - return ConfigurationSpace( - space = { - 'name': Categorical("name", names_list), - } - ) - - -def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None): - """Create the search space of parameters for FeatureSetSelector. - - Parameters - ---------- - subsets: Sets the subsets to select from. - - str : If a string, it is assumed to be a path to a csv file with the subsets. - The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - - n_features: int the number of features in the dataset. - If subsets is None, each column will be treated as a subset. One column will be selected per subset. - """ - - #require at least of of the parameters - if subsets is None and n_features is None: - raise ValueError('At least one of the parameters must be provided') - - if isinstance(subsets, str): - df = pd.read_csv(subsets,header=None,index_col=0) - df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1) - subset_dict = {} - for row in df.index: - subset_dict[row] = df.loc[row]['features'] - elif isinstance(subsets, dict): - subset_dict = subsets - elif isinstance(subsets, list) or isinstance(subsets, np.ndarray): - subset_dict = {str(i):subsets[i] for i in range(len(subsets))} - else: - if feature_names is None: - subset_dict = {str(i):i for i in range(n_features)} - else: - subset_dict = {str(i):feature_names[i] for i in range(len(feature_names))} - - names_list = list(subset_dict.keys()) - - return ConfigurationSpace({ - 'name': Categorical("name", names_list), - 'subset_dict': Categorical("subset", subset_dict), - }) - diff --git a/tpot2/config/tests/__init__.py 
b/tpot2/config/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tpot2/config/tests/test_get_configspace.py b/tpot2/config/tests/test_get_configspace.py new file mode 100644 index 00000000..a2ebcb59 --- /dev/null +++ b/tpot2/config/tests/test_get_configspace.py @@ -0,0 +1,26 @@ +import pytest +import tpot2 +from sklearn.datasets import load_iris +import random +import sklearn + +import tpot2.config + +from ..get_configspace import STRING_TO_CLASS + +def test_loop_through_all_hyperparameters(): + + n_classes=3 + n_samples=100 + n_features=100 + random_state=None + + for class_name, _ in STRING_TO_CLASS.items(): + estnode_gen = tpot2.config.get_search_space(class_name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) + + #generate 10 random hyperparameters and make sure they are all valid + for i in range(10): + estnode = estnode_gen.generate() + est = estnode.export_pipeline() + + \ No newline at end of file diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index f74d5e18..04180ac4 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -18,6 +18,13 @@ ZeroCount_configspace = {} +PolynomialFeatures_configspace = ConfigurationSpace( + space = { + 'degree': Integer('degree', bounds=(2, 3)), + 'interaction_only': Categorical('interaction_only', [True, False]), + } +) + OneHotEncoder_configspace = {} #TODO include the parameter for max unique values def get_FastICA_configspace(n_features=100, random_state=None): @@ -76,3 +83,31 @@ def get_RBFSampler_configspace(n_features=100, random_state=None): space = space ) + + +def get_QuantileTransformer_configspace(random_state=None): + + space = { + 'n_quantiles': Integer('n_quantiles', bounds=(10, 2000)), + 'output_distribution': Categorical('output_distribution', ['uniform', 'normal']), + } + + if random_state is not None: #This is required because configspace doesn't allow None as a value + 
space['random_state'] = random_state + + return ConfigurationSpace( + space = space + + ) + + + +### ROBUST SCALER + +RobustScaler_configspace = ConfigurationSpace({ + "q_min": Float("q_min", bounds=(0.001, 0.3)), + "q_max": Float("q_max", bounds=(0.7, 0.999)), + }) + +def robust_scaler_hyperparameter_parser(params): + return {"quantile_range": (params["q_min"], params["q_max"])} \ No newline at end of file diff --git a/tpot2/search_spaces/nodes/estimator_node.py b/tpot2/search_spaces/nodes/estimator_node.py index 6e084b59..0ec71e98 100644 --- a/tpot2/search_spaces/nodes/estimator_node.py +++ b/tpot2/search_spaces/nodes/estimator_node.py @@ -1,22 +1,45 @@ # try https://automl.github.io/ConfigSpace/main/api/hyperparameters.html -import tpot2 + import numpy as np -import pandas as pd -import sklearn -from tpot2 import config -from typing import Generator, List, Tuple, Union -import random from ..base import SklearnIndividual, SklearnIndividualGenerator from ConfigSpace import ConfigurationSpace +from typing import final + +NONE_SPECIAL_STRING = "" +TRUE_SPECIAL_STRING = "" +FALSE_SPECIAL_STRING = "" + + +def default_hyperparameter_parser(params:dict) -> dict: + return params + class EstimatorNodeIndividual(SklearnIndividual): + """ + Note that ConfigurationSpace does not support None as a parameter. Instead, use the special string "". TPOT will automatically replace instances of this string with the Python None. + + Parameters + ---------- + method : type + The class of the estimator to be used + + space : ConfigurationSpace|dict + The hyperparameter space to be used. If a dict is passed, hyperparameters are fixed and not learned. + + """ def __init__(self, method: type, space: ConfigurationSpace|dict, #TODO If a dict is passed, hyperparameters are fixed and not learned. Is this confusing? Should we make a second node type? 
+ hyperparameter_parser: callable = None, rng=None) -> None: super().__init__() self.method = method self.space = space + if hyperparameter_parser is None: + self.hyperparameter_parser = default_hyperparameter_parser + else: + self.hyperparameter_parser = hyperparameter_parser + if isinstance(space, dict): self.hyperparameters = space else: @@ -24,6 +47,8 @@ def __init__(self, method: type, self.space.seed(rng.integers(0, 2**32)) self.hyperparameters = self.space.sample_configuration().get_dictionary() + self.check_hyperparameters_for_None() + def mutate(self, rng=None): if isinstance(self.space, dict): return False @@ -32,6 +57,7 @@ def mutate(self, rng=None): self.space.seed(rng.integers(0, 2**32)) self.hyperparameters = self.space.sample_configuration().get_dictionary() + self.check_hyperparameters_for_None() return True def crossover(self, other, rng=None): @@ -48,17 +74,34 @@ def crossover(self, other, rng=None): if hyperparameter in other.hyperparameters: self.hyperparameters[hyperparameter] = other.hyperparameters[hyperparameter] + self.check_hyperparameters_for_None() + + return True + + def check_hyperparameters_for_None(self): + for key, value in self.hyperparameters.items(): + #if string + if isinstance(value, str): + if value == NONE_SPECIAL_STRING: + self.hyperparameters[key] = None + elif value == TRUE_SPECIAL_STRING: + self.hyperparameters[key] = True + elif value == FALSE_SPECIAL_STRING: + self.hyperparameters[key] = False + + @final #this method should not be overridden, instead override hyperparameter_parser def export_pipeline(self, **kwargs): - return self.method(**self.hyperparameters) + return self.method(**self.hyperparameter_parser(self.hyperparameters)) def unique_id(self): #return a dictionary of the method and the hyperparameters return (self.method, self.hyperparameters) class EstimatorNode(SklearnIndividualGenerator): - def __init__(self, method, space): + def __init__(self, method, space, 
hyperparameter_parser=default_hyperparameter_parser): self.method = method self.space = space + self.hyperparameter_parser = hyperparameter_parser def generate(self, rng=None): - return EstimatorNodeIndividual(self.method, self.space) \ No newline at end of file + return EstimatorNodeIndividual(self.method, self.space, hyperparameter_parser=self.hyperparameter_parser, rng=rng) \ No newline at end of file From a66ff10bf5f36ce7ebc64e44c72bb44b39bf6103 Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 17 Apr 2024 21:47:53 -0700 Subject: [PATCH 5/6] edits --- tpot2/config/get_configspace.py | 7 +- tpot2/config/tests/test.ipynb | 264 +++++++++++++++++++++ tpot2/config/tests/test_get_configspace.py | 3 +- tpot2/tpot_estimator/tests/__init__.py | 0 4 files changed, 271 insertions(+), 3 deletions(-) create mode 100644 tpot2/config/tests/test.ipynb create mode 100644 tpot2/tpot_estimator/tests/__init__.py diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index cf75cd47..5706c4f6 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -156,7 +156,12 @@ "selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"], "classifiers" : ['AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "PassiveAggressiveClassifier", "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'], "regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearDiscriminantAnalysis', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'], - "transformers": ["Binarizer", "Normalizer", "PCA", "ZeroCount", 
"OneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], + + + "transformers": ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], + "scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ], + "all_transformers" : ["transformers", "scalers"], + "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"], "imputers": [], "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"], diff --git a/tpot2/config/tests/test.ipynb b/tpot2/config/tests/test.ipynb new file mode 100644 index 00000000..97580f08 --- /dev/null +++ b/tpot2/config/tests/test.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import importlib.util\n", + "import sys\n", + "import numpy as np\n", + "import warnings\n", + "\n", + "\n", + "\n", + "from ConfigSpace import ConfigurationSpace\n", + "from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal\n", + "\n", + "#autoqtl_builtins\n", + "from tpot2.builtin_modules import genetic_encoders\n", + "from tpot2.builtin_modules import feature_encoding_frequency_selector\n", + "\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from xgboost import XGBClassifier\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression\n", + "from lightgbm 
import LGBMClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB\n", + "from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier\n", + "\n", + "\n", + "from tpot2.builtin_modules import ZeroCount, OneHotEncoder, ColumnOneHotEncoder\n", + "from sklearn.preprocessing import Binarizer\n", + "from sklearn.decomposition import FastICA\n", + "from sklearn.cluster import FeatureAgglomeration\n", + "from sklearn.preprocessing import MaxAbsScaler\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.preprocessing import Normalizer\n", + "from sklearn.kernel_approximation import Nystroem\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn.kernel_approximation import RBFSampler\n", + "from sklearn.preprocessing import RobustScaler\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import PowerTransformer, QuantileTransformer\n", + "\n", + "\n", + "from sklearn.feature_selection import SelectFwe\n", + "from sklearn.feature_selection import SelectPercentile\n", + "from sklearn.feature_selection import VarianceThreshold\n", + "from sklearn.feature_selection import RFE\n", + "from sklearn.feature_selection import SelectFromModel\n", + "\n", + "import sklearn.feature_selection\n", + "\n", + "#TODO create a selectomixin using these?\n", + "from sklearn.feature_selection import f_classif\n", + "from sklearn.feature_selection import f_regression\n", + "\n", + "\n", + "from sklearn.linear_model import SGDRegressor\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.linear_model import Lasso\n", + "from sklearn.linear_model import ElasticNet\n", + "from sklearn.linear_model import Lars\n", + "from sklearn.linear_model import LassoLars, LassoLarsCV\n", + "from sklearn.linear_model import RidgeCV\n", 
+ "\n", + "from sklearn.svm import SVR, SVC\n", + "from sklearn.svm import LinearSVR, LinearSVC\n", + "\n", + "from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor\n", + "from sklearn.ensemble import BaggingRegressor\n", + "from sklearn.ensemble import ExtraTreesRegressor\n", + "from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.linear_model import ElasticNetCV\n", + "\n", + "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", + "\n", + "\n", + "from sklearn.gaussian_process import GaussianProcessRegressor\n", + "\n", + "from xgboost import XGBRegressor\n", + "\n", + "\n", + "from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer\n", + "\n", + "\n", + "#MDR\n", + "\n", + "\n", + "all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, OneHotEncoder, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, LinearRegression, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, 
GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,\n", + " AdaBoostClassifier,\n", + " GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor,\n", + " AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer,\n", + " PowerTransformer, QuantileTransformer,\n", + " ]\n", + "\n", + "\n", + "#if mdr is installed\n", + "if 'mdr' in sys.modules:\n", + " from mdr import MDR, ContinuousMDR\n", + " all_methods.append(MDR)\n", + " all_methods.append(ContinuousMDR)\n", + "\n", + "if 'skrebate' in sys.modules:\n", + " from skrebate import ReliefF, SURF, SURFstar, MultiSURF\n", + " all_methods.append(ReliefF)\n", + " all_methods.append(SURF)\n", + " all_methods.append(SURFstar)\n", + " all_methods.append(MultiSURF)\n", + "\n", + "if 'sklearnex' in sys.modules:\n", + " from sklearnex.linear_model import LinearRegression\n", + " from sklearnex.linear_model import Ridge\n", + " from sklearnex.linear_model import Lasso\n", + " from sklearnex.linear_model import ElasticNet\n", + " from sklearnex.svm import SVR\n", + " from sklearnex.svm import NuSVR\n", + " from sklearnex.ensemble import RandomForestRegressor\n", + " from sklearnex.neighbors import KNeighborsRegressor\n", + "\n", + " from sklearnex.ensemble import RandomForestClassifier\n", + " from sklearnex.neighbors import KNeighborsClassifier\n", + " from sklearnex.svm import SVC\n", + " from sklearnex.svm import NuSVC\n", + " from sklearnex.linear_model import LogisticRegression\n", + "\n", + "\n", + " all_methods.append(LinearRegression)\n", + " all_methods.append(Ridge)\n", + " all_methods.append(Lasso)\n", + " all_methods.append(ElasticNet)\n", + " all_methods.append(SVR)\n", + " all_methods.append(NuSVR)\n", 
+ " all_methods.append(RandomForestRegressor)\n", + " all_methods.append(KNeighborsRegressor)\n", + " KNeighborsClassifier\n", + " all_methods.append(RandomForestClassifier)\n", + " all_methods.append(KNeighborsClassifier)\n", + " all_methods.append(SVC)\n", + " all_methods.append(NuSVC)\n", + " all_methods.append(LogisticRegression)\n", + "\n", + "\n", + "STRING_TO_CLASS = {\n", + " t.__name__: t for t in all_methods\n", + "}\n", + "\n", + "\n", + "from sklearn.linear_model import PassiveAggressiveClassifier\n", + "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n", + "from sklearn.linear_model import ARDRegression\n", + "from sklearn.gaussian_process import GaussianProcessRegressor\n", + "\n", + "GROUPNAMES = {\n", + " \"selectors\": [\"SelectFwe\", \"SelectPercentile\", \"VarianceThreshold\",],\n", + " \"selectors_classification\": [\"SelectFwe\", \"SelectPercentile\", \"VarianceThreshold\", \"RFE_classification\", \"SelectFromModel_classification\"],\n", + " \"selectors_regression\": [\"SelectFwe\", \"SelectPercentile\", \"VarianceThreshold\", \"RFE_regression\", \"SelectFromModel_regression\"],\n", + " \"classifiers\" : ['AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier', 'LogisticRegression', \"LinearSVC\", \"SVC\", 'MLPClassifier', 'MultinomialNB', \"PassiveAggressiveClassifier\", \"QuadraticDiscriminantAnalysis\", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],\n", + " \"regressors\" : ['AdaBoostRegressor', \"ARDRegression\", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearDiscriminantAnalysis', 'LinearSVR', \"MLPRegressor\", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],\n", + " \"transformers\": [\"Binarizer\", \"Normalizer\", \"PCA\", \"ZeroCount\", \"OneHotEncoder\", \"FastICA\", \"FeatureAgglomeration\", 
\"Nystroem\", \"RBFSampler\", \"QuantileTransformer\", \"PowerTransformer\"],\n", + " \"arithmatic\": [\"AddTransformer\", \"mul_neg_1_Transformer\", \"MulTransformer\", \"SafeReciprocalTransformer\", \"EQTransformer\", \"NETransformer\", \"GETransformer\", \"GTTransformer\", \"LETransformer\", \"LTTransformer\", \"MinTransformer\", \"MaxTransformer\"],\n", + " \"imputers\": [],\n", + " \"skrebate\": [\"ReliefF\", \"SURF\", \"SURFstar\", \"MultiSURF\"],\n", + " \"genetic_encoders\": [\"DominantEncoder\", \"RecessiveEncoder\", \"HeterosisEncoder\", \"UnderDominanceEncoder\", \"OverDominanceEncoder\"],\n", + "\n", + " \"classifiers_sklearnex\" : [\"RandomForestClassifier_sklearnex\", \"LogisticRegression_sklearnex\", \"KNeighborsClassifier_sklearnex\", \"SVC_sklearnex\",\"NuSVC_sklearnex\"],\n", + " \"regressors_sklearnex\" : [\"LinearRegression_sklearnex\", \"Ridge_sklearnex\", \"Lasso_sklearnex\", \"ElasticNet_sklearnex\", \"SVR_sklearnex\", \"NuSVR_sklearnex\", \"RandomForestRegressor_sklearnex\", \"KNeighborsRegressor_sklearnex\"],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "RFE.__init__() missing 1 required positional argument: 'estimator'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 30\u001b[0m\n\u001b[1;32m 26\u001b[0m estnode \u001b[38;5;241m=\u001b[39m estnode_gen\u001b[38;5;241m.\u001b[39mgenerate()\n\u001b[1;32m 27\u001b[0m est \u001b[38;5;241m=\u001b[39m estnode\u001b[38;5;241m.\u001b[39mexport_pipeline()\n\u001b[0;32m---> 30\u001b[0m \u001b[43mtest_loop_through_all_hyperparameters\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[2], line 27\u001b[0m, in 
\u001b[0;36mtest_loop_through_all_hyperparameters\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m1\u001b[39m):\n\u001b[1;32m 26\u001b[0m estnode \u001b[38;5;241m=\u001b[39m estnode_gen\u001b[38;5;241m.\u001b[39mgenerate()\n\u001b[0;32m---> 27\u001b[0m est \u001b[38;5;241m=\u001b[39m \u001b[43mestnode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexport_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/common/Projects/TPOT_Dev/tpot2/tpot2/search_spaces/nodes/estimator_node.py:92\u001b[0m, in \u001b[0;36mEstimatorNodeIndividual.export_pipeline\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;129m@final\u001b[39m \u001b[38;5;66;03m#this method should not be overridden, instead override hyperparameter_parser\u001b[39;00m\n\u001b[1;32m 91\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexport_pipeline\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 92\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhyperparameter_parser\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhyperparameters\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mTypeError\u001b[0m: RFE.__init__() missing 1 required positional argument: 'estimator'" + ] + } + ], + "source": [ + "import pytest\n", + "import tpot2\n", + "from sklearn.datasets import load_iris\n", + "import random\n", + "import sklearn\n", + "\n", + "import tpot2.config\n", + "\n", + "import importlib.util\n", + "import sys\n", + 
"import numpy as np\n", + "import warnings\n", + "\n", + "def test_loop_through_all_hyperparameters():\n", + "\n", + " n_classes=3\n", + " n_samples=100\n", + " n_features=100\n", + " random_state=None\n", + "\n", + " for class_name, _ in STRING_TO_CLASS.items():\n", + " estnode_gen = tpot2.config.get_search_space(class_name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)\n", + "\n", + " #generate 10 random hyperparameters and make sure they are all valid\n", + " for i in range(1):\n", + " estnode = estnode_gen.generate()\n", + " est = estnode.export_pipeline()\n", + " \n", + "\n", + "test_loop_through_all_hyperparameters()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tpot2.config.get_search_space(\"SGDClassifier\", n_classes=3, n_samples=100, n_features=5, random_state=5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tpot2env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tpot2/config/tests/test_get_configspace.py b/tpot2/config/tests/test_get_configspace.py index a2ebcb59..bccb349f 100644 --- a/tpot2/config/tests/test_get_configspace.py +++ b/tpot2/config/tests/test_get_configspace.py @@ -19,8 +19,7 @@ def test_loop_through_all_hyperparameters(): estnode_gen = tpot2.config.get_search_space(class_name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) #generate 10 random hyperparameters and make sure they are all valid - for i in range(10): + for i in range(1): estnode = estnode_gen.generate() est = estnode.export_pipeline() - \ No newline at end of file diff --git 
a/tpot2/tpot_estimator/tests/__init__.py b/tpot2/tpot_estimator/tests/__init__.py new file mode 100644 index 00000000..e69de29b From ca42398b0952c5812ebe6679aeaa63f7fbbb5ca9 Mon Sep 17 00:00:00 2001 From: perib Date: Thu, 18 Apr 2024 10:54:26 -0700 Subject: [PATCH 6/6] edits --- tpot2/config/get_configspace.py | 16 ++++++++-------- tpot2/tests/test_estimators.py | 7 ++++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 5706c4f6..473233ea 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -88,7 +88,10 @@ from sklearn.linear_model import ElasticNetCV from sklearn.discriminant_analysis import LinearDiscriminantAnalysis - +from sklearn.linear_model import PassiveAggressiveClassifier +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis +from sklearn.linear_model import ARDRegression +from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process import GaussianProcessRegressor @@ -105,7 +108,7 @@ AdaBoostClassifier, GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer, - PowerTransformer, QuantileTransformer, + PowerTransformer, QuantileTransformer,ARDRegression, QuadraticDiscriminantAnalysis, PassiveAggressiveClassifier, LinearDiscriminantAnalysis, ] @@ -145,17 +148,14 @@ } -from sklearn.linear_model import PassiveAggressiveClassifier -from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis -from sklearn.linear_model import ARDRegression -from sklearn.gaussian_process import GaussianProcessRegressor + GROUPNAMES = { "selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",], 
"selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"], "selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"], - "classifiers" : ['AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "PassiveAggressiveClassifier", "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'], - "regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearDiscriminantAnalysis', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'], + "classifiers" : ['AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "PassiveAggressiveClassifier", "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'], + "regressors" : ['AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'], "transformers": ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], diff --git a/tpot2/tests/test_estimators.py b/tpot2/tests/test_estimators.py index 5c6f47ba..98b607e0 100644 --- a/tpot2/tests/test_estimators.py +++ 
b/tpot2/tests/test_estimators.py @@ -7,7 +7,8 @@ #standard test @pytest.fixture def tpot_estimator(): - return tpot2.TPOTEstimator( population_size=50, + return tpot2.TPOTEstimator( population_size=10, + generations=5, scorers=['roc_auc_ovr'], scorers_weights=[1], classification=True, @@ -81,11 +82,11 @@ def test_tpot_estimator_config_dict_type(): @pytest.fixture def tpot_classifier(): - return tpot2.tpot_estimator.templates.TPOTClassifier(max_time_seconds=300,verbose=3) + return tpot2.tpot_estimator.templates.TPOTClassifier(max_time_seconds=10,verbose=3) @pytest.fixture def tpot_regressor(): - return tpot2.tpot_estimator.templates.TPOTRegressor(max_time_seconds=300,verbose=3) + return tpot2.tpot_estimator.templates.TPOTRegressor(max_time_seconds=10,verbose=3) def test_tpot_classifier_fit(tpot_classifier,sample_dataset): #load iris dataset