Merge pull request #120 from perib/dev
Dev
perib authored Mar 27, 2024
2 parents 14922f6 + 4681389 commit ef2a9a1
Showing 4 changed files with 26 additions and 12 deletions.
30 changes: 22 additions & 8 deletions tpot2/builtin_modules/column_one_hot_encoder.py
@@ -11,16 +11,13 @@



-def auto_select_categorical_features(X):
+def auto_select_categorical_features(X, min_unique=10,):

-    if not isinstance(X, pd.DataFrame):
-        return []
-
-    feature_mask = []
-    for column in X.columns:
-        feature_mask.append(not is_numeric_dtype(X[column]))
+    if isinstance(X, pd.DataFrame):
+        return [col for col in X.columns if len(X[col].unique()) < min_unique]
+    else:
+        return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) < min_unique]

-    return feature_mask


def _X_selected(X, selected):
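
For context, a hedged sketch of how the reworked selector behaves. The toy DataFrame below is illustrative and not part of the commit, and the import path assumes the function stays module-level in `column_one_hot_encoder.py`.

```python
# Illustrative only: selection is now cardinality-based (fewer than min_unique
# distinct values), no longer dtype-based as in the old version.
import pandas as pd
from tpot2.builtin_modules.column_one_hot_encoder import auto_select_categorical_features  # assumed import path

df = pd.DataFrame({
    "color": ["red", "blue", "red", "green"],  # 3 unique values
    "height": [1.2, 3.4, 5.6, 7.8],            # 4 unique values, numeric dtype
})

auto_select_categorical_features(df)                # ['color', 'height']  (both have < 10 uniques)
auto_select_categorical_features(df, min_unique=4)  # ['color']            (only color has < 4 uniques)
```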
@@ -41,6 +38,21 @@ class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):


    def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None,max_categories=None):
+        '''
+        Parameters
+        ----------
+        columns : str, list, default='auto'
+            - 'auto' : Automatically select categorical features based on columns with less than 10 unique values
+            - 'categorical' : Automatically select categorical features
+            - 'numeric' : Automatically select numeric features
+            - 'all' : Select all features
+            - list : A list of columns to select
+        drop, handle_unknown, sparse_output, min_frequency, max_categories : see sklearn.preprocessing.OneHotEncoder
+        '''

        self.columns = columns
        self.drop = drop
@@ -73,6 +85,8 @@ def fit(self, X, y=None):
            self.columns_ = list(X.select_dtypes(exclude='number').columns)
        elif self.columns == "numeric":
            self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
+        elif self.columns == "auto":
+            self.columns_ = auto_select_categorical_features(X)
        elif self.columns == "all":
            if isinstance(X, pd.DataFrame):
                self.columns_ = X.columns
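
A minimal usage sketch of the new 'auto' column selection; the data below is made up, and the attributes shown are only those visible in the hunks above.

```python
# Sketch: with columns='auto', fit() delegates to auto_select_categorical_features,
# so any column with fewer than 10 unique values is selected for one-hot encoding.
import pandas as pd
from tpot2.builtin_modules import ColumnOneHotEncoder

X = pd.DataFrame({
    "city": ["NY", "LA", "NY", "SF"],     # low cardinality -> selected
    "income": [55.0, 72.5, 61.3, 80.1],   # numeric, but also < 10 uniques in this tiny frame
})

enc = ColumnOneHotEncoder(columns="auto")
enc.fit(X)
enc.columns_   # ['city', 'income'] for this toy input
```

Note that under the default threshold a low-cardinality numeric column is also treated as categorical, which is the main behavioral difference from the previous dtype-based selection.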
4 changes: 2 additions & 2 deletions tpot2/config/transformers.py
@@ -1,7 +1,7 @@
from functools import partial
import numpy as np

-from tpot2.builtin_modules import ZeroCount, OneHotEncoder
+from tpot2.builtin_modules import ZeroCount, OneHotEncoder, ColumnOneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import FastICA
from sklearn.cluster import FeatureAgglomeration
@@ -99,5 +99,5 @@ def make_transformer_config_dictionary(random_state=None, n_features=10):
            RobustScaler: {},
            StandardScaler: {},
            ZeroCount: params_tpot_builtins_ZeroCount,
-           OneHotEncoder: params_tpot_builtins_OneHotEncoder,
+           ColumnOneHotEncoder: params_tpot_builtins_OneHotEncoder,
            }
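
For reference, this registry is a plain class-to-parameter-space mapping; a trimmed sketch of the swapped entry is below. The placeholder dict stands in for `params_tpot_builtins_OneHotEncoder`, which this commit reuses unchanged.

```python
# Minimal sketch (not TPOT2's real search space): the transformer config maps
# an estimator class to the hyperparameter space used when it is instantiated.
from tpot2.builtin_modules import ColumnOneHotEncoder

params_one_hot = {}  # placeholder for params_tpot_builtins_OneHotEncoder

transformer_config_fragment = {
    ColumnOneHotEncoder: params_one_hot,  # same parameter space, new column-aware class
}
```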
@@ -54,7 +54,7 @@ def estimator_graph_individual_generator(
        starting_ops = []
        if inner_config_dict is not None:
            starting_ops.append(ind._mutate_insert_inner_node)
-        if leaf_config_dict is not None:
+        if leaf_config_dict is not None or inner_config_dict is not None:
            starting_ops.append(ind._mutate_insert_leaf)
        n_nodes -= 1

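
A self-contained sketch of the corrected branch; the names mirror the hunk, while the dummy individual and the dict value are stand-ins rather than TPOT2's real objects.

```python
# Sketch of the fix: when only an inner_config_dict is supplied, leaf insertion
# is now still offered as a starting mutation (previously it was skipped).
def collect_starting_ops(ind, inner_config_dict, leaf_config_dict):
    starting_ops = []
    if inner_config_dict is not None:
        starting_ops.append(ind._mutate_insert_inner_node)
    if leaf_config_dict is not None or inner_config_dict is not None:
        starting_ops.append(ind._mutate_insert_leaf)
    return starting_ops

class _DummyIndividual:
    def _mutate_insert_inner_node(self): ...
    def _mutate_insert_leaf(self): ...

ops = collect_starting_ops(_DummyIndividual(), inner_config_dict={"SomeOp": {}}, leaf_config_dict=None)
# Before the change ops would contain only the inner-node mutation; now it has both.
```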
2 changes: 1 addition & 1 deletion tpot2/objectives/complexity.py
@@ -142,7 +142,7 @@ def MultinomialNB_Complexity(model):

def calculate_model_complexity(est):
    if isinstance(est, sklearn.pipeline.Pipeline) or isinstance(est, sklearn.pipeline.FeatureUnion):
-        return sum(calculate_model_complexity(estimator) for estimator in est.steps)
+        return sum(calculate_model_complexity(estimator) for _,estimator in est.steps)
    if isinstance(est, GraphPipeline):
        return sum(calculate_model_complexity(est.graph.nodes[node]['instance']) for node in est.graph.nodes)

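
A short illustration of why the unpacking matters: sklearn's `Pipeline.steps` is a list of `(name, estimator)` tuples, so the old comprehension recursed on tuples instead of estimators. The pipeline below is illustrative only.

```python
# Pipeline.steps yields (name, estimator) pairs; the fix unpacks the pair and
# recurses on the estimator only.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])

for _, estimator in pipe.steps:       # the fixed iteration pattern
    print(type(estimator).__name__)   # StandardScaler, LogisticRegression
```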
