Skip to content

Latest commit

 

History

History
911 lines (750 loc) · 16 KB

feature_scaling.md

File metadata and controls

911 lines (750 loc) · 16 KB

Note: This is a markdown export generated from the Jupyter notebook feature_scaling.ipynb. You can also view the notebook with Jupyter's nbviewer.

Feature scaling

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from sklearn import preprocessing, pipeline, datasets
# Load the iris data set and wrap the measurements in a DataFrame
# with one named column per feature.
iris = datasets.load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Index the array of class names with the per-sample class codes to get
# one readable label per row.
y = iris.target_names[iris.target]
def plot_data(X, y):
    """Draw a seaborn pair plot of the columns of ``X``, coloured by ``y``.

    The values of ``X`` are placed in a fresh frame (same column names,
    default index) and ``y`` is attached as a ``labels`` column, so no
    column is added to the caller's frame. Nothing is returned.
    """
    labelled = pd.DataFrame(X.values, columns=X.columns).assign(labels=y)

    # The grid object returned by seaborn is deliberately discarded.
    _ = sns.pairplot(labelled, hue='labels')
# Summary statistics of the raw, unscaled features.
X.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
# Pair plot of the unscaled features as a baseline for the scalers below.
plot_data(X, y)

png

Min / Max

# Rescale every feature linearly so that its minimum maps to 0 and its
# maximum to 1.
scaler = preprocessing.MinMaxScaler()

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 0.428704 0.440556 0.467458 0.458056
std 0.230018 0.181611 0.299203 0.317599
min 0.000000 0.000000 0.000000 0.000000
25% 0.222222 0.333333 0.101695 0.083333
50% 0.416667 0.416667 0.567797 0.500000
75% 0.583333 0.541667 0.694915 0.708333
max 1.000000 1.000000 1.000000 1.000000
# Pair plot of the min-max-scaled features.
plot_data(X_scaled, y)

png

MaxAbsScaler

# Divide every feature by its largest absolute value. All iris
# measurements are positive, so the result lies in (0, 1].
scaler = preprocessing.MaxAbsScaler()

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 0.739662 0.694848 0.544638 0.479733
std 0.104818 0.099061 0.255840 0.304895
min 0.544304 0.454545 0.144928 0.040000
25% 0.645570 0.636364 0.231884 0.120000
50% 0.734177 0.681818 0.630435 0.520000
75% 0.810127 0.750000 0.739130 0.720000
max 1.000000 1.000000 1.000000 1.000000
# Pair plot of the max-abs-scaled features.
plot_data(X_scaled, y)

png

RobustScaler

# With default settings each feature is centred on its median and divided
# by its interquartile range, which limits the influence of outliers.
scaler = preprocessing.RobustScaler()

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 1.500000e+02 150.000000
mean 0.033333 0.114667 -1.691429e-01 -0.067111
std 0.636974 0.871733 5.043709e-01 0.508158
min -1.153846 -2.000000 -9.571429e-01 -0.800000
25% -0.538462 -0.400000 -7.857143e-01 -0.666667
50% 0.000000 0.000000 1.266348e-16 0.000000
75% 0.461538 0.600000 2.142857e-01 0.333333
max 1.615385 2.800000 7.285714e-01 0.800000
# Pair plot of the robust-scaled features.
plot_data(X_scaled, y)

png

QuantileTransformer (normal)

# Map each feature through its empirical quantiles (estimated at 10
# points) onto a normal distribution.
scaler = preprocessing.QuantileTransformer(
    n_quantiles=10,
    output_distribution='normal',
)

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean -0.029423 -0.035698 0.012632 -0.062428
std 1.110881 1.054759 1.077012 1.478357
min -5.199338 -5.199338 -5.199338 -5.199338
25% -0.764710 -0.764710 -0.732191 -0.715053
50% -0.039002 -0.139710 0.034842 -0.139710
75% 0.589456 0.589456 0.654452 0.654452
max 5.199338 5.199338 5.199338 5.199338
# Pair plot of the quantile-transformed (normal output) features.
plot_data(X_scaled, y)

png

QuantileTransformer (uniform)

# Map each feature through its empirical quantiles (estimated at 10
# points) onto a uniform distribution on [0, 1].
scaler = preprocessing.QuantileTransformer(
    n_quantiles=10,
    output_distribution='uniform',
)

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 0.490118 0.487531 0.501493 0.492239
std 0.289002 0.283368 0.286721 0.298009
min 0.000000 0.000000 0.000000 0.000000
25% 0.222222 0.222222 0.232026 0.237288
50% 0.484444 0.444444 0.513889 0.444444
75% 0.722222 0.722222 0.743590 0.743590
max 1.000000 1.000000 1.000000 1.000000
# Pair plot of the quantile-transformed (uniform output) features.
plot_data(X_scaled, y)

png

PowerTransformer

Apply a power transform featurewise to make data more Gaussian-like.

# Fit a per-feature power transform with the library's default settings;
# the output below is centred on zero with unit spread.
scaler = preprocessing.PowerTransformer()

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 1.500000e+02 1.500000e+02 1.500000e+02 1.500000e+02
mean -5.447494e-15 6.300146e-15 2.842171e-16 1.089499e-15
std 1.003350e+00 1.003350e+00 1.003350e+00 1.003350e+00
min -2.137770e+00 -2.759144e+00 -1.545592e+00 -1.476845e+00
25% -8.956896e-01 -5.614702e-01 -1.224374e+00 -1.189599e+00
50% 2.642955e-02 -8.191725e-02 3.225908e-01 1.596788e-01
75% 7.222371e-01 5.958605e-01 7.598052e-01 7.964903e-01
max 2.176957e+00 2.743175e+00 1.828818e+00 1.658549e+00
# Pair plot of the power-transformed features.
plot_data(X_scaled, y)

png

Normalize samples individually to unit norm

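Scale input vectors (samples, i.e. rows) individually to unit norm (vector length). In the pipeline below the features are first min-max scaled, so that no single feature dominates the norm of a sample.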

# Two-step pipeline: min-max scale every feature, then rescale each
# sample (row) to unit norm.
mm = pipeline.make_pipeline(
    preprocessing.MinMaxScaler(),
    preprocessing.Normalizer(),
)
# Both names refer to the same pipeline object.
scaler = mm

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 0.409314 0.518059 0.430511 0.413522
std 0.123306 0.302146 0.219147 0.233686
min 0.000000 0.000000 0.000000 0.000000
25% 0.334888 0.296698 0.167761 0.124490
50% 0.427741 0.374264 0.544644 0.520427
75% 0.491244 0.914248 0.594792 0.595049
max 0.612540 0.999174 0.708205 0.738046
# Pair plot of the min-max-scaled, row-normalized features.
plot_data(X_scaled, y)

png

Standardization

Standardize features by removing the mean and scaling to unit variance.

# Subtract each feature's mean and divide by its standard deviation.
scaler = preprocessing.StandardScaler()

# fit_transform returns a plain array; re-attach the original labels.
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
# describe() reports std = 1.00335 rather than exactly 1: pandas uses the
# sample estimate (ddof=1) while the scaler divides by the population
# standard deviation, and sqrt(150 / 149) = 1.00335.
X_scaled.describe()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 1.500000e+02 1.500000e+02 1.500000e+02 1.500000e+02
mean -1.468455e-15 -1.823726e-15 -1.610564e-15 -9.473903e-16
std 1.003350e+00 1.003350e+00 1.003350e+00 1.003350e+00
min -1.870024e+00 -2.433947e+00 -1.567576e+00 -1.447076e+00
25% -9.006812e-01 -5.923730e-01 -1.226552e+00 -1.183812e+00
50% -5.250608e-02 -1.319795e-01 3.364776e-01 1.325097e-01
75% 6.745011e-01 5.586108e-01 7.627583e-01 7.906707e-01
max 2.492019e+00 3.090775e+00 1.785832e+00 1.712096e+00
# Pair plot of the standardized features.
plot_data(X_scaled, y)

png