Note: This is a generated markdown export from the Jupyter notebook file feature_scaling.ipynb. You can also view the notebook with the nbviewer from Jupyter.

Feature scaling

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from sklearn import preprocessing, pipeline, datasets

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Feature matrix as a DataFrame whose columns are named after iris.feature_names.
X = pd.DataFrame(iris.data, columns=iris.feature_names)
# Index target_names with the class codes in iris.target to get one readable label per sample.
y = iris.target_names[iris.target]

def plot_data(X, y):
    """Draw a seaborn pair plot of the features in ``X``, coloured by ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its ``.values`` and ``.columns`` are read.
    y : array-like
        Labels stored in a ``'labels'`` column, which is used as the
        plot's ``hue``.

    Returns
    -------
    None
        The pair plot is produced as a side effect.
    """
    # Build a new frame from the raw values so the extra 'labels' column is
    # added to a fresh object rather than to the caller's X.
    labelled = pd.DataFrame(X.values, columns=X.columns).assign(labels=y)
    sns.pairplot(labelled, hue='labels')

X.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.057333	3.758000	1.199333
std	0.828066	0.435866	1.765298	0.762238
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

plot_data(X, y)

Min / Max

# MinMaxScaler rescales each feature (column) independently; with the default
# feature_range the fitted data ends up in [0, 1].
scaler = preprocessing.MinMaxScaler()

# Re-wrap the transformed values with X's index and column names so the
# result can be described and plotted like the original frame.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the scaled features (displayed as the cell output).
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	150.000000	150.000000
mean	0.428704	0.440556	0.467458	0.458056
std	0.230018	0.181611	0.299203	0.317599
min	0.000000	0.000000	0.000000	0.000000
25%	0.222222	0.333333	0.101695	0.083333
50%	0.416667	0.416667	0.567797	0.500000
75%	0.583333	0.541667	0.694915	0.708333
max	1.000000	1.000000	1.000000	1.000000

plot_data(X_scaled, y)

MaxAbsScaler

# MaxAbsScaler divides each feature by its maximum absolute value, so the
# largest magnitude becomes 1.0; the data is not shifted or centered.
scaler = preprocessing.MaxAbsScaler()

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the scaled features (displayed as the cell output).
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	150.000000	150.000000
mean	0.739662	0.694848	0.544638	0.479733
std	0.104818	0.099061	0.255840	0.304895
min	0.544304	0.454545	0.144928	0.040000
25%	0.645570	0.636364	0.231884	0.120000
50%	0.734177	0.681818	0.630435	0.520000
75%	0.810127	0.750000	0.739130	0.720000
max	1.000000	1.000000	1.000000	1.000000

plot_data(X_scaled, y)

RobustScaler

# RobustScaler (default settings) removes each feature's median and scales by
# its interquartile range, which makes it less sensitive to outliers than
# mean/variance scaling.
scaler = preprocessing.RobustScaler()

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the scaled features; the 50% row is ~0 because the
# median has been subtracted.
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	1.500000e+02	150.000000
mean	0.033333	0.114667	-1.691429e-01	-0.067111
std	0.636974	0.871733	5.043709e-01	0.508158
min	-1.153846	-2.000000	-9.571429e-01	-0.800000
25%	-0.538462	-0.400000	-7.857143e-01	-0.666667
50%	0.000000	0.000000	1.266348e-16	0.000000
75%	0.461538	0.600000	2.142857e-01	0.333333
max	1.615385	2.800000	7.285714e-01	0.800000

plot_data(X_scaled, y)

QuantileTransformer (normal)

# Map each feature onto a normal distribution via its empirical quantiles.
# n_quantiles=10 sets how many quantile landmarks are used to approximate the
# cumulative distribution; the library default (1000) exceeds the 150 samples
# available here.
scaler = preprocessing.QuantileTransformer(output_distribution='normal', n_quantiles=10)

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the transformed features.
# NOTE(review): the identical min/max of +/-5.199338 in every column look like
# the transformer's clipping of the extreme quantiles — confirm against the
# scikit-learn documentation.
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	150.000000	150.000000
mean	-0.029423	-0.035698	0.012632	-0.062428
std	1.110881	1.054759	1.077012	1.478357
min	-5.199338	-5.199338	-5.199338	-5.199338
25%	-0.764710	-0.764710	-0.732191	-0.715053
50%	-0.039002	-0.139710	0.034842	-0.139710
75%	0.589456	0.589456	0.654452	0.654452
max	5.199338	5.199338	5.199338	5.199338

plot_data(X_scaled, y)

QuantileTransformer (uniform)

# Map each feature onto a uniform distribution via its empirical quantiles,
# using 10 quantile landmarks to approximate the cumulative distribution.
scaler = preprocessing.QuantileTransformer(output_distribution='uniform', n_quantiles=10)

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the transformed features (displayed as the cell output).
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	150.000000	150.000000
mean	0.490118	0.487531	0.501493	0.492239
std	0.289002	0.283368	0.286721	0.298009
min	0.000000	0.000000	0.000000	0.000000
25%	0.222222	0.222222	0.232026	0.237288
50%	0.484444	0.444444	0.513889	0.444444
75%	0.722222	0.722222	0.743590	0.743590
max	1.000000	1.000000	1.000000	1.000000

plot_data(X_scaled, y)

PowerTransformer

Apply a power transform feature-wise to make the data more Gaussian-like.

# PowerTransformer with default arguments.
# NOTE(review): per the scikit-learn docs the defaults are the Yeo-Johnson
# method followed by standardization to zero mean / unit variance — confirm
# for the installed version.
scaler = preprocessing.PowerTransformer()

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the transformed features (displayed as the cell output).
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	1.500000e+02	1.500000e+02	1.500000e+02	1.500000e+02
mean	-5.447494e-15	6.300146e-15	2.842171e-16	1.089499e-15
std	1.003350e+00	1.003350e+00	1.003350e+00	1.003350e+00
min	-2.137770e+00	-2.759144e+00	-1.545592e+00	-1.476845e+00
25%	-8.956896e-01	-5.614702e-01	-1.224374e+00	-1.189599e+00
50%	2.642955e-02	-8.191725e-02	3.225908e-01	1.596788e-01
75%	7.222371e-01	5.958605e-01	7.598052e-01	7.964903e-01
max	2.176957e+00	2.743175e+00	1.828818e+00	1.658549e+00

plot_data(X_scaled, y)

Normalize samples individually to unit norm

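Scale input vectors individually to unit norm (vector length). In the example below, each feature is first rescaled with MinMaxScaler, and then every sample (row) is normalized to unit length.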

# Two-step pipeline: MinMaxScaler first rescales every feature (column), then
# Normalizer rescales every sample (row) to unit norm.
# NOTE(review): the extra `mm` alias is not referenced anywhere else in the
# visible notebook — confirm whether it is still needed.
scaler = mm = pipeline.make_pipeline(preprocessing.MinMaxScaler(), preprocessing.Normalizer())

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the transformed features (displayed as the cell output).
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	150.000000	150.000000	150.000000	150.000000
mean	0.409314	0.518059	0.430511	0.413522
std	0.123306	0.302146	0.219147	0.233686
min	0.000000	0.000000	0.000000	0.000000
25%	0.334888	0.296698	0.167761	0.124490
50%	0.427741	0.374264	0.544644	0.520427
75%	0.491244	0.914248	0.594792	0.595049
max	0.612540	0.999174	0.708205	0.738046

plot_data(X_scaled, y)

Standardization

Standardize features by removing the mean and scaling to unit variance, i.e. z = (x - mean) / std, computed separately for each feature.

# StandardScaler subtracts each feature's mean and divides by its standard
# deviation.
scaler = preprocessing.StandardScaler()

# Re-wrap the transformed values with X's index and column names.
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
# Summary statistics of the standardized features. The reported std is
# 1.00335 rather than exactly 1 because describe() uses the sample standard
# deviation (ddof=1) while the scaler divides by the population one (ddof=0):
# sqrt(150 / 149) ~= 1.00335.
X_scaled.describe()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
count	1.500000e+02	1.500000e+02	1.500000e+02	1.500000e+02
mean	-1.468455e-15	-1.823726e-15	-1.610564e-15	-9.473903e-16
std	1.003350e+00	1.003350e+00	1.003350e+00	1.003350e+00
min	-1.870024e+00	-2.433947e+00	-1.567576e+00	-1.447076e+00
25%	-9.006812e-01	-5.923730e-01	-1.226552e+00	-1.183812e+00
50%	-5.250608e-02	-1.319795e-01	3.364776e-01	1.325097e-01
75%	6.745011e-01	5.586108e-01	7.627583e-01	7.906707e-01
max	2.492019e+00	3.090775e+00	1.785832e+00	1.712096e+00

plot_data(X_scaled, y)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feature_scaling.md

feature_scaling.md

Feature scaling

Min / Max

MaxAbsScaler

RobustScaler

QuantileTransformer (normal)

QuantileTransformer (uniform)

PowerTransformer

Normalize samples individually to unit norm

Standardization

Files

feature_scaling.md

Latest commit

History

feature_scaling.md

File metadata and controls

Feature scaling

Min / Max

MaxAbsScaler

RobustScaler

QuantileTransformer (normal)

QuantileTransformer (uniform)

PowerTransformer

Normalize samples individually to unit norm

Standardization