From 37168c29327e794bcfcab2011fe2a80f982f4ece Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 16:55:34 +0300 Subject: [PATCH 01/17] :hammer: Fix necessary imports --- sklift/viz/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklift/viz/base.py b/sklift/viz/base.py index 2d2bb75..dbdc63f 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -1,5 +1,7 @@ import matplotlib.pyplot as plt import numpy as np +from sklearn.utils.validation import check_consistent_length +import warnings from ..metrics import uplift_curve, auuc, qini_curve, auqc, response_rate_by_percentile, treatment_balance_curve From 61f68a1a2194fc9a6f611813c049aab8d6ebd94f Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 22:23:43 +0300 Subject: [PATCH 02/17] :cop: Refactor models module --- sklift/models/models.py | 51 +++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/sklift/models/models.py b/sklift/models/models.py index 2b8b676..c49e4d3 100644 --- a/sklift/models/models.py +++ b/sklift/models/models.py @@ -44,7 +44,6 @@ class SoloModel(BaseEstimator): .. _SoloModel in documentation: https://scikit-uplift.readthedocs.io/en/latest/api/models.html#one-model-with-treatment-as-feature - """ def __init__(self, estimator): @@ -54,8 +53,7 @@ def __init__(self, estimator): self._type_of_target = None def fit(self, X, y, treatment, estimator_fit_params=None): - """ - Fit the model according to the given training data. + """Fit the model according to the given training data. For each test example calculate predictions on new set twice: by the first and second models. After that calculate uplift as a delta between these predictions. @@ -63,8 +61,8 @@ def fit(self, X, y, treatment, estimator_fit_params=None): Return delta of predictions for each example. 
Args: - X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples and - n_features is the number of features. + X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of + samples and n_features is the number of features. y (array-like, shape (n_samples,)): Target vector relative to X. treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X. estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator. @@ -80,7 +78,7 @@ def fit(self, X, y, treatment, estimator_fit_params=None): if isinstance(X, np.ndarray): X_mod = np.column_stack((X, treatment)) - elif isinstance(X, pd.core.frame.DataFrame): + elif isinstance(X, pd.DataFrame): X_mod = X.assign(treatment=treatment) else: raise TypeError("Expected numpy.ndarray or pandas.DataFrame in training vector X, got %s" % type(X)) @@ -93,8 +91,7 @@ def fit(self, X, y, treatment, estimator_fit_params=None): return self def predict(self, X): - """ - Perform uplift on samples in X. + """Perform uplift on samples in X. Args: X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples @@ -106,7 +103,7 @@ def predict(self, X): if isinstance(X, np.ndarray): X_mod_trmnt = np.column_stack((X, np.ones(X.shape[0]))) X_mod_ctrl = np.column_stack((X, np.zeros(X.shape[0]))) - elif isinstance(X, pd.core.frame.DataFrame): + elif isinstance(X, pd.DataFrame): X_mod_trmnt = X.assign(treatment=np.ones(X.shape[0])) X_mod_ctrl = X.assign(treatment=np.zeros(X.shape[0])) else: @@ -167,8 +164,7 @@ def __init__(self, estimator): self._type_of_target = None def fit(self, X, y, treatment, estimator_fit_params=None): - """ - Fit the model according to the given training data. + """Fit the model according to the given training data. 
Args: X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples and @@ -187,7 +183,6 @@ def fit(self, X, y, treatment, estimator_fit_params=None): if self._type_of_target != 'binary': raise ValueError("This approach is only suitable for binary classification problem") - # TODO: Заменить raise на Warning _, treatment_counts = np.unique(treatment, return_counts=True) if treatment_counts[0] != treatment_counts[1]: warnings.warn( @@ -204,8 +199,7 @@ def fit(self, X, y, treatment, estimator_fit_params=None): return self def predict(self, X): - """ - Perform uplift on samples in X. + """Perform uplift on samples in X. Args: X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples @@ -220,6 +214,7 @@ def predict(self, X): class TwoModels(BaseEstimator): """aka naïve approach, or difference score method, or double classifier approach. + Fit two separate models: on the treatment data and on the control data. See more details about `TwoModels in documentation`_. @@ -293,8 +288,7 @@ def __init__(self, estimator_trmnt, estimator_ctrl, method='vanilla'): raise ValueError('Control and Treatment estimators should be different objects.') def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_fit_params=None): - """ - Fit the model according to the given training data. + """Fit the model according to the given training data. For each test example calculate predictions on new set twice: by the first and second models. After that calculate uplift as a delta between these predictions. @@ -302,12 +296,14 @@ def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_f Return delta of predictions for each example. Args: - X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples and - n_features is the number of features. 
+ X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number + of samples and n_features is the number of features. y (array-like, shape (n_samples,)): Target vector relative to X. treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X. - estimator_trmnt_fit_params (dict, optional): Parameters to pass to the fit method of the treatment estimator. - estimator_ctrl_fit_params (dict, optional): Parameters to pass to the fit method of the control estimator. + estimator_trmnt_fit_params (dict, optional): Parameters to pass to the fit method + of the treatment estimator. + estimator_ctrl_fit_params (dict, optional): Parameters to pass to the fit method + of the control estimator. Returns: object: self @@ -343,7 +339,7 @@ def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_f if isinstance(X_trmnt, np.ndarray): X_trmnt_mod = np.column_stack((X_trmnt, ddr_control)) - elif isinstance(X_trmnt, pd.core.frame.DataFrame): + elif isinstance(X_trmnt, pd.DataFrame): X_trmnt_mod = X_trmnt.assign(ddr_control=ddr_control) else: raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_trmnt)) @@ -363,7 +359,7 @@ def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_f if isinstance(X_ctrl, np.ndarray): X_ctrl_mod = np.column_stack((X_ctrl, ddr_treatment)) - elif isinstance(X_trmnt, pd.core.frame.DataFrame): + elif isinstance(X_trmnt, pd.DataFrame): X_ctrl_mod = X_ctrl.assign(ddr_treatment=ddr_treatment) else: raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_ctrl)) @@ -375,8 +371,7 @@ def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_f return self def predict(self, X): - """ - Perform uplift on samples in X. + """Perform uplift on samples in X. 
Args: X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples @@ -394,10 +389,10 @@ def predict(self, X): if isinstance(X, np.ndarray): X_mod = np.column_stack((X, self.ctrl_preds_)) - elif isinstance(X, pd.core.frame.DataFrame): + elif isinstance(X, pd.DataFrame): X_mod = X.assign(ddr_control=self.ctrl_preds_) else: - raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_mod)) + raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X)) self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X_mod)[:, 1] elif self.method == 'ddr_treatment': @@ -408,10 +403,10 @@ def predict(self, X): if isinstance(X, np.ndarray): X_mod = np.column_stack((X, self.trmnt_preds_)) - elif isinstance(X, pd.core.frame.DataFrame): + elif isinstance(X, pd.DataFrame): X_mod = X.assign(ddr_treatment=self.trmnt_preds_) else: - raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_mod)) + raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X)) self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X_mod)[:, 1] else: From 794a07786c8f89093681d3ff5203d9db44ffedad Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 22:35:31 +0300 Subject: [PATCH 03/17] :cop: Refactor metrics module --- sklift/metrics/metrics.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index efbf185..a1651ed 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -25,9 +25,7 @@ def uplift_curve(y_true, uplift, treatment): :func:`plot_uplift_qini_curves`: Plot Uplift and Qini curves. 
""" - # ToDo: Добавить проверки на наличие обоих классов в столбце treatment - # ToDo: Добавить проверку на наличие обоих классов в y_true для каждого уникального значения из столбца treatment - + # TODO: check the treatment is binary y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1] y_true, uplift, treatment = y_true[desc_score_indices], uplift[desc_score_indices], treatment[desc_score_indices] @@ -79,9 +77,7 @@ def qini_curve(y_true, uplift, treatment): :func:`plot_uplift_qini_curves`: Plot Uplift and Qini curves. """ - # ToDo: Добавить проверки на наличие обоих классов в столбце treatment - # ToDo: Добавить проверку на наличие обоих классов в столбце y_true для каждого уникального значения из столбца treatment - + # TODO: check the treatment is binary y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1] @@ -127,7 +123,8 @@ def uplift_auc_score(y_true, uplift, treatment): Returns: float: Area Under the Uplift Curve. """ - # ToDO: Добавить бейзлайн + # ToDO: Add normalization + # ToDO: Add baseline return auc(*uplift_curve(y_true, uplift, treatment)) @@ -147,7 +144,6 @@ def auuc(y_true, uplift, treatment): Metric `auuc` was renamed to :func:`uplift_auc_score` in version 0.1.0 and will be removed in 0.2.0 """ - # ToDO: Добавить бейзлайн warnings.warn( 'Metric `auuc` was renamed to `uplift_auc_score`' 'in version 0.1.0 and will be removed in 0.2.0', @@ -167,7 +163,8 @@ def qini_auc_score(y_true, uplift, treatment): Returns: float: Area Under the Qini Curve. 
""" - # ToDO: Добавить бейзлайн + # ToDO: Add normalization + # ToDO: Add baseline return auc(*qini_curve(y_true, uplift, treatment)) @@ -187,7 +184,6 @@ def auqc(y_true, uplift, treatment): Metric `auqc` was renamed to :func:`qini_auc_score` in version 0.1.0 and will be removed in 0.2.0 """ - # ToDO: Добавить бейзлайн warnings.warn( 'Metric `auqc` was renamed to `qini_auc_score`' 'in version 0.1.0 and will be removed in 0.2.0', @@ -259,7 +255,7 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): else: n_size = k - # ToDo: _checker_ there are obervations among two groups among first k + # ToDo: _checker_ there are observations among two groups among first k score_ctrl = y_true[order][:n_size][treatment[order][:n_size] == 0].mean() score_trmnt = y_true[order][:n_size][treatment[order][:n_size] == 1].mean() @@ -331,7 +327,7 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy, bins f' got {strategy}.') if not isinstance(bins, int) or bins <= 0: - raise ValueError(f'bins should be positive integer.' + raise ValueError(f'Bins should be positive integer.' f' Invalid value bins: {bins}') if bins >= n_samples: From 66345d0fa4a7eca7090aa4a74aaee2f1ea597683 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Tue, 28 Apr 2020 22:50:46 +0300 Subject: [PATCH 04/17] :cop: Refactor viz module --- sklift/viz/base.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sklift/viz/base.py b/sklift/viz/base.py index dbdc63f..bca69bc 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -20,7 +20,12 @@ def plot_uplift_preds(trmnt_preds, ctrl_preds, log=False, bins=100): Returns: Object that stores computed values. """ - # ToDo: Добавить квантиль как параметр + # ToDo: Add k as parameter: vertical line on plots + check_consistent_length(trmnt_preds, ctrl_preds) + + if not isinstance(bins, int) or bins <= 0: + raise ValueError(f'Bins should be positive integer. 
Invalid value for bins: {bins}') + if log: trmnt_preds = np.log(trmnt_preds + 1) ctrl_preds = np.log(ctrl_preds + 1) @@ -58,6 +63,9 @@ def plot_uplift_qini_curves(y_true, uplift, treatment, random=True, perfect=Fals Returns: Object that stores computed values. """ + check_consistent_length(y_true, uplift, treatment) + y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) + x_up, y_up = uplift_curve(y_true, uplift, treatment) x_qi, y_qi = qini_curve(y_true, uplift, treatment) @@ -67,11 +75,11 @@ def plot_uplift_qini_curves(y_true, uplift, treatment, random=True, perfect=Fals axes[1].plot(x_qi, y_qi, label='Model', color='b') if random: - up_ratio_random = y_true[treatment == 1].sum() / len(y_true[treatment == 1]) - \ - y_true[treatment == 0].sum() / len(y_true[treatment == 0]) + up_ratio_random = (y_true[treatment == 1].sum() / len(y_true[treatment == 1]) - + y_true[treatment == 0].sum() / len(y_true[treatment == 0])) y_up_random = x_up * up_ratio_random - qi_ratio_random = (y_true[treatment == 1].sum() - len(y_true[treatment == 1]) * \ + qi_ratio_random = (y_true[treatment == 1].sum() - len(y_true[treatment == 1]) * y_true[treatment == 0].sum() / len(y_true[treatment == 0])) / len(y_true) y_qi_random = x_qi * qi_ratio_random @@ -137,9 +145,8 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, bins=10): f' got {strategy}.') if not isinstance(bins, int) or bins <= 0: - raise ValueError(f'bins should be positive integer.' - f' Invalid value bins: {bins}') - + raise ValueError(f'Bins should be positive integer. 
Invalid value bins: {bins}') + if bins >= n_samples: raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') From 3dfe235508b45f91e28fd4ad8505a870dfc7e10e Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Thu, 30 Apr 2020 01:23:48 +0300 Subject: [PATCH 05/17] :hammer: Fix typo in predict method in ddr_control regr model --- sklift/models/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklift/models/models.py b/sklift/models/models.py index c49e4d3..7592715 100644 --- a/sklift/models/models.py +++ b/sklift/models/models.py @@ -335,7 +335,7 @@ def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_f if self._type_of_target == 'binary': ddr_control = self.estimator_ctrl.predict_proba(X_trmnt)[:, 1] else: - ddr_control = self.estimator_ctrl.predict_(X_trmnt) + ddr_control = self.estimator_ctrl.predict(X_trmnt) if isinstance(X_trmnt, np.ndarray): X_trmnt_mod = np.column_stack((X_trmnt, ddr_control)) From 1781f7a3a87dc2d12723137d9bef69ba627cffe6 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Thu, 30 Apr 2020 01:32:38 +0300 Subject: [PATCH 06/17] Add changes in changelog --- docs/changelog.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index 977495d..729a75a 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,6 +8,23 @@ * 🔨 something that previously didn’t work as documentated – or according to reasonable expectations – should now work. * ❗️ you will need to change your code to have the same effect in the future; or a feature will be removed in the future. +## Version 0.1.2 +_in development_ + +### [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) + +* 🔨 Fix bug in [TwoModels](https://scikit-uplift.readthedocs.io/en/latest/api/models.html#sklift.models.models.TwoModels) (ddr_control) in regression models. +* 📝 Minor code refactoring. 
+ +### [sklift.metrics](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html) + +* 📝 Minor code refactoring. + +### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) + +* 🔨 Fix bug in [plot_uplift_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_uplift_by_percentile). +* 📝 Minor code refactoring. + ## Version 0.1.1 ### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) From 4652f9444551c3a93df2dad7094193d274a2ed1a Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Thu, 30 Apr 2020 18:11:04 +0300 Subject: [PATCH 07/17] :hammer: Fix Two models approach for regression problem --- sklift/models/models.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sklift/models/models.py b/sklift/models/models.py index 7592715..8cdc438 100644 --- a/sklift/models/models.py +++ b/sklift/models/models.py @@ -355,7 +355,7 @@ def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_f if self._type_of_target == 'binary': ddr_treatment = self.estimator_trmnt.predict_proba(X_ctrl)[:, 1] else: - ddr_treatment = self.estimator_trmnt.predict(X_ctrl)[:, 1] + ddr_treatment = self.estimator_trmnt.predict(X_ctrl) if isinstance(X_ctrl, np.ndarray): X_ctrl_mod = np.column_stack((X_ctrl, ddr_treatment)) @@ -393,13 +393,17 @@ def predict(self, X): X_mod = X.assign(ddr_control=self.ctrl_preds_) else: raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X)) - self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X_mod)[:, 1] + + if self._type_of_target == 'binary': + self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X_mod)[:, 1] + else: + self.trmnt_preds_ = self.estimator_trmnt.predict(X_mod) elif self.method == 'ddr_treatment': if self._type_of_target == 'binary': self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X)[:, 1] else: - self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X)[:, 1] + 
self.trmnt_preds_ = self.estimator_trmnt.predict(X) if isinstance(X, np.ndarray): X_mod = np.column_stack((X, self.trmnt_preds_)) @@ -407,7 +411,11 @@ def predict(self, X): X_mod = X.assign(ddr_treatment=self.trmnt_preds_) else: raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X)) - self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X_mod)[:, 1] + + if self._type_of_target == 'binary': + self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X_mod)[:, 1] + else: + self.ctrl_preds_ = self.estimator_ctrl.predict(X_mod) else: if self._type_of_target == 'binary': From 27173ff5a5dc72d23856e9d78c8d6b4b294f15dc Mon Sep 17 00:00:00 2001 From: Irina Elisova Date: Sat, 2 May 2020 18:06:45 +0300 Subject: [PATCH 08/17] Bar plot added in plot_uplift_by_percentile function (#11) Add kind == 'bar' in viz.plot_uplift_by_percentile Fix docstring params to look nice in rtd :sparkles: --- sklift/metrics/metrics.py | 57 ++++++++++-------- sklift/viz/base.py | 122 +++++++++++++++++++++++++------------- 2 files changed, 112 insertions(+), 67 deletions(-) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index a1651ed..5fdc46d 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -286,83 +286,90 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): def response_rate_by_percentile(y_true, uplift, treatment, group, strategy, bins=10): - """Compute response rate (target mean in the control or treatment group) at each percentile. - + """Compute response rate and its variance at each percentile. + + Response rate ia a target mean in the group. + Args: y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. group (string, ['treatment', 'control']): Group type for computing response rate: treatment or control. + * ``'treatment'``: - Values equal 1 in the treatment column. + Values equal 1 in the treatment column. 
+ * ``'control'``: - Values equal 0 in the treatment column. - strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + Values equal 0 in the treatment column. + + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + * ``'overall'``: The first step is taking the first k observations of all test data ordered by uplift prediction (overall both groups - control and treatment) and conversions in treatment and control groups calculated only on them. Then the difference between these conversions is calculated. + * ``'by_group'``: Separately calculates conversions in top k observations in each group (control and treatment) - sorted by uplift predictions. Then the difference between these conversions is calculated - bins (int): Determines the number of bins (and relative percentile) in the test data. - + sorted by uplift predictions. Then the difference between these conversions is calculated. + + bins (int): Determines а number of bins (and а relative percentile) in the test data. Default is 10. + Returns: array: Response rate at each percentile for control or treatment group - array: Variance of the response rate at each percentile + array: Variance of the response rate at each percentile """ - + group_types = ['treatment', 'control'] strategy_methods = ['overall', 'by_group'] - + n_samples = len(y_true) check_consistent_length(y_true, uplift, treatment) - + if group not in group_types: raise ValueError(f'Response rate supports only group types in {group_types},' - f' got {group}.') + f' got {group}.') if strategy not in strategy_methods: raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},' f' got {strategy}.') - + if not isinstance(bins, int) or bins <= 0: - raise ValueError(f'Bins should be positive integer.' - f' Invalid value bins: {bins}') - + raise ValueError(f'Bins should be positive integer. 
Invalid value bins: {bins}') + if bins >= n_samples: raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') - + if bins == 1: warnings.warn(f'You will get the only one bin of {n_samples} samples' f' which is the length of y_true.' f'\nPlease consider using uplift_at_k function instead', UserWarning) - + y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment) order = np.argsort(uplift, kind='mergesort')[::-1] - + if group == 'treatment': trmnt_flag = 1 else: # group == 'control' trmnt_flag = 0 - + if strategy == 'overall': y_true_bin = np.array_split(y_true[order], bins) trmnt_bin = np.array_split(treatment[order], bins) - + group_size = np.array([len(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)]) response_rate = np.array([np.mean(y[trmnt == trmnt_flag]) for y, trmnt in zip(y_true_bin, trmnt_bin)]) else: # strategy == 'by_group' y_bin = np.array_split(y_true[order][treatment[order] == trmnt_flag], bins) - + group_size = np.array([len(y) for y in y_bin]) response_rate = np.array([np.mean(y) for y in y_bin]) variance = np.multiply(response_rate, np.divide((1 - response_rate), group_size)) - - return response_rate, variance + + return response_rate, variance def treatment_balance_curve(uplift, treatment, winsize): diff --git a/sklift/viz/base.py b/sklift/viz/base.py index bca69bc..c45135e 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -1,7 +1,7 @@ -import matplotlib.pyplot as plt import numpy as np -from sklearn.utils.validation import check_consistent_length import warnings +import matplotlib.pyplot as plt +from sklearn.utils.validation import check_consistent_length from ..metrics import uplift_curve, auuc, qini_curve, auqc, response_rate_by_percentile, treatment_balance_curve @@ -20,7 +20,7 @@ def plot_uplift_preds(trmnt_preds, ctrl_preds, log=False, bins=100): Returns: Object that stores computed values. 
""" - # ToDo: Add k as parameter: vertical line on plots + # TODO: Add k as parameter: vertical line on plots check_consistent_length(trmnt_preds, ctrl_preds) if not isinstance(bins, int) or bins <= 0: @@ -112,78 +112,116 @@ def plot_uplift_qini_curves(y_true, uplift, treatment, random=True, perfect=Fals return axes -def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, bins=10): - """Plot Uplift score at each percentile, - Treatment response rate (target mean in the treatment group) - and Control response rate (target mean in the control group) at each percentile. - +def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, kind='line', bins=10): + """Plot uplift score, treatment response rate and control response rate at each percentile. + + Treatment response rate ia a target mean in the treatment group. + Control response rate is a target mean in the control group. + Uplift score is a difference between treatment response rate and control response rate. + Args: y_true (1d array-like): Correct (true) target values. uplift (1d array-like): Predicted uplift, as returned by a model. treatment (1d array-like): Treatment labels. - strategy (string, ['overall', 'by_group']): Determines the calculating strategy. Defaults to 'first'. + strategy (string, ['overall', 'by_group']): Determines the calculating strategy. + * ``'overall'``: The first step is taking the first k observations of all test data ordered by uplift prediction (overall both groups - control and treatment) and conversions in treatment and control groups calculated only on them. Then the difference between these conversions is calculated. + * ``'by_group'``: Separately calculates conversions in top k observations in each group (control and treatment) - sorted by uplift predictions. Then the difference between these conversions is calculated - bins (int): Determines the number of bins (and relative percentile) in the test data. - + sorted by uplift predictions. 
Then the difference between these conversions is calculated. + + kind (string, ['line', 'bar']): The type of plot to draw. Default is 'line'. + + * ``'line'``: + Generates a line plot. + + * ``'bar'``: + Generates a traditional bar-style plot. + + bins (int): Determines а number of bins (and а relative percentile) in the test data. Default is 10. + Returns: Object that stores computed values. """ - + strategy_methods = ['overall', 'by_group'] - + kind_methods = ['line', 'bar'] + n_samples = len(y_true) check_consistent_length(y_true, uplift, treatment) - + if strategy not in strategy_methods: raise ValueError(f'Response rate supports only calculating methods in {strategy_methods},' f' got {strategy}.') - + + if kind not in kind_methods: + raise ValueError(f'Function supports only types of plots in {kind_methods},' + f' got {kind}.') + if not isinstance(bins, int) or bins <= 0: raise ValueError(f'Bins should be positive integer. Invalid value bins: {bins}') if bins >= n_samples: raise ValueError(f'Number of bins = {bins} should be smaller than the length of y_true {n_samples}') - - if bins == 1: - warnings.warn(f'You will get the only one bin of {n_samples} samples' - f' which is the length of y_true.' 
- f'\nPlease consider using uplift_at_k function instead', - UserWarning) - + rspns_rate_trmnt, var_trmnt = response_rate_by_percentile(y_true, uplift, treatment, group='treatment', strategy=strategy, bins=bins) - + rspns_rate_ctrl, var_ctrl = response_rate_by_percentile(y_true, uplift, treatment, group='control', strategy=strategy, bins=bins) uplift_score, uplift_variance = np.subtract(rspns_rate_trmnt, rspns_rate_ctrl), np.add(var_trmnt, var_ctrl) - + percentiles = [p * 100 / bins for p in range(1, bins + 1)] - - _, axes = plt.subplots(ncols=1, nrows=1, figsize=(8, 6)) - - axes.errorbar(percentiles, uplift_score, yerr=np.sqrt(uplift_variance), - linewidth=2, color='red', label='uplift') - axes.errorbar(percentiles, rspns_rate_trmnt, yerr=np.sqrt(var_trmnt), - linewidth=2, color='forestgreen', label='treatment\nresponse rate') - axes.errorbar(percentiles, rspns_rate_ctrl, yerr=np.sqrt(var_ctrl), - linewidth=2, color='orange', label='control\nresponse rate') - axes.fill_between(percentiles, rspns_rate_ctrl, rspns_rate_trmnt, alpha=0.1, color='red') - - axes.set_xticks(percentiles) - axes.legend(loc='upper right') - axes.set_title('Uplift by percentile') - axes.set_xlabel('Percentile') - axes.set_ylabel('Uplift = treatment response rate - control response rate') - + + if kind == 'line': + _, axes = plt.subplots(ncols=1, nrows=1, figsize=(8, 6)) + axes.errorbar(percentiles, uplift_score, yerr=np.sqrt(uplift_variance), + linewidth=2, color='red', label='uplift') + axes.errorbar(percentiles, rspns_rate_trmnt, yerr=np.sqrt(var_trmnt), + linewidth=2, color='forestgreen', label='treatment\nresponse rate') + axes.errorbar(percentiles, rspns_rate_ctrl, yerr=np.sqrt(var_ctrl), + linewidth=2, color='orange', label='control\nresponse rate') + axes.fill_between(percentiles, rspns_rate_ctrl, rspns_rate_trmnt, alpha=0.1, color='red') + + if np.amin(uplift_score) < 0: + axes.axhline(y=0, color='black', linewidth=1) + axes.set_xticks(percentiles) + axes.legend(loc='upper right') + 
axes.set_title('Uplift by percentile') + axes.set_xlabel('Percentile') + axes.set_ylabel('Uplift = treatment response rate - control response rate') + + else: # kind == 'bar' + delta = percentiles[0] + fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(8, 6), sharex=True, sharey=True) + fig.text(0.04, 0.5, 'Uplift = treatment response rate - control response rate', + va='center', ha='center', rotation='vertical') + + axes[0].bar(np.array(percentiles), uplift_score, delta / 1.5, + yerr=np.sqrt(uplift_variance), color='red', label='uplift') + axes[1].bar(np.array(percentiles) - delta / 6, rspns_rate_trmnt, delta / 3, + yerr=np.sqrt(var_trmnt), color='forestgreen', label='treatment\nresponse rate') + axes[1].bar(np.array(percentiles) + delta / 6, rspns_rate_ctrl, delta / 3, + yerr=np.sqrt(var_ctrl), color='orange', label='control\nresponse rate') + + axes[0].legend(loc='upper right') + axes[0].tick_params(axis='x', bottom=False) + axes[0].axhline(y=0, color='black', linewidth=1) + axes[0].set_title('Uplift by percentile') + + axes[1].set_xticks(percentiles) + axes[1].legend(loc='upper right') + axes[1].axhline(y=0, color='black', linewidth=1) + axes[1].set_xlabel('Percentile') + axes[1].set_title('Response rate by percentile') + return axes From 253112384c129200681aa940332851efbf954071 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 18:19:21 +0300 Subject: [PATCH 09/17] :memo: Add changes to changelog --- docs/changelog.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 729a75a..2bd61d7 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -13,7 +13,7 @@ _in development_ ### [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) -* 🔨 Fix bug in [TwoModels](https://scikit-uplift.readthedocs.io/en/latest/api/models.html#sklift.models.models.TwoModels) (ddr_control) in regression models. 
+* 🔨 Fix bugs in [TwoModels](https://scikit-uplift.readthedocs.io/en/latest/api/models.html#sklift.models.models.TwoModels) for regression problem. * 📝 Minor code refactoring. ### [sklift.metrics](https://scikit-uplift.readthedocs.io/en/latest/api/metrics.html) @@ -22,6 +22,7 @@ _in development_ ### [sklift.viz](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html) +* 💥 Add bar plot in [plot_uplift_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_uplift_by_percentile) by [@ElisovaIra](https://github.com/ElisovaIra). * 🔨 Fix bug in [plot_uplift_by_percentile](https://scikit-uplift.readthedocs.io/en/latest/api/viz.html#sklift.viz.base.plot_uplift_by_percentile). * 📝 Minor code refactoring. From a7d28b3cabb1b983b5ceee12bb8b0da289ca15cc Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 18:22:08 +0300 Subject: [PATCH 10/17] :memo: Prettify docs/conf.py --- docs/conf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index b2406a2..44dcb23 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -33,7 +33,7 @@ def get_version(): # -- Project information ----------------------------------------------------- project = 'scikit-uplift' -author = 'Maksim Shevchenko' +author = 'Maksim Shevchenko and Contributors' copyright = "{}, {}".format(datetime.datetime.now().year, author) # The full version, including alpha/beta/rc tags @@ -46,8 +46,11 @@ def get_version(): # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - "sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.mathjax", "sphinx.ext.napoleon", - "recommonmark" + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", + "recommonmark", ] master_doc = 'index' @@ -58,7 +61,7 @@ def get_version(): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'Readme.rst'] # -- Options for HTML output ------------------------------------------------- From b10ff872c8a933bc15bf43ed70e2034956dcd552 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 19:33:44 +0300 Subject: [PATCH 11/17] :sparkles: Fix docstring in TwoModels --- sklift/models/models.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklift/models/models.py b/sklift/models/models.py index 8cdc438..e34fed2 100644 --- a/sklift/models/models.py +++ b/sklift/models/models.py @@ -222,10 +222,11 @@ class TwoModels(BaseEstimator): Args: estimator_trmnt (estimator object implementing 'fit'): The object to use to fit the treatment data. estimator_ctrl (estimator object implementing 'fit'): The object to use to fit the control data. 
- method (string, ‘vanilla’, ’ddr_control’ or ‘ddr_treatment’, default='vanilla'): Specifies the approach: - * ‘vanilla’ - two independent models - * ’ddr_control’ - dependent data representation (First train control estimator) - * ’ddr_treatment’ - dependent data representation (First train treatment estimator) + method (string, 'vanilla', 'ddr_control' or 'ddr_treatment', default='vanilla'): Specifies the approach: + + * ``'vanilla'`` - two independent models; + * ``'ddr_control'`` - dependent data representation (First train control estimator); + * ``'ddr_treatment'`` - dependent data representation (First train treatment estimator). Attributes: trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when treatment. From 4fc5ee858ca41a712a9ed7c6632f194fdc0e9353 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 19:36:25 +0300 Subject: [PATCH 12/17] :sparkles: Fix docstring in uplift_at_k, response_rate_by_percentile --- sklift/metrics/metrics.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklift/metrics/metrics.py b/sklift/metrics/metrics.py index 5fdc46d..2f2ee9e 100644 --- a/sklift/metrics/metrics.py +++ b/sklift/metrics/metrics.py @@ -207,10 +207,9 @@ def uplift_at_k(y_true, uplift, treatment, strategy, k=0.3): The first step is taking the first k observations of all test data ordered by uplift prediction (overall both groups - control and treatment) and conversions in treatment and control groups calculated only on them. Then the difference between these conversions is calculated. - * ``'by_group'``: Separately calculates conversions in top k observations in each group (control and treatment) - sorted by uplift predictions. Then the difference between these conversions is calculated + sorted by uplift predictions. Then the difference between these conversions is calculated. .. 
versionchanged:: 0.1.0 @@ -298,7 +297,6 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy, bins * ``'treatment'``: Values equal 1 in the treatment column. - * ``'control'``: Values equal 0 in the treatment column. @@ -308,7 +306,6 @@ def response_rate_by_percentile(y_true, uplift, treatment, group, strategy, bins The first step is taking the first k observations of all test data ordered by uplift prediction (overall both groups - control and treatment) and conversions in treatment and control groups calculated only on them. Then the difference between these conversions is calculated. - * ``'by_group'``: Separately calculates conversions in top k observations in each group (control and treatment) sorted by uplift predictions. Then the difference between these conversions is calculated. From 5097b81cb8763e279c0e92ef763fb24c34720598 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 19:38:58 +0300 Subject: [PATCH 13/17] :sparkles: Fix docstring in TwoModels --- sklift/models/models.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklift/models/models.py b/sklift/models/models.py index e34fed2..ea43c66 100644 --- a/sklift/models/models.py +++ b/sklift/models/models.py @@ -224,9 +224,12 @@ class TwoModels(BaseEstimator): estimator_ctrl (estimator object implementing 'fit'): The object to use to fit the control data. method (string, 'vanilla', 'ddr_control' or 'ddr_treatment', default='vanilla'): Specifies the approach: - * ``'vanilla'`` - two independent models; - * ``'ddr_control'`` - dependent data representation (First train control estimator); - * ``'ddr_treatment'`` - dependent data representation (First train treatment estimator). + * ``'vanilla'``: + Two independent models. + * ``'ddr_control'``: + Dependent data representation (First train control estimator). + * ``'ddr_treatment'``: + Dependent data representation (First train treatment estimator). 
Attributes: trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when treatment. From 40cc7967347052e9b73fbfa9115de84b154e0a47 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 19:40:54 +0300 Subject: [PATCH 14/17] :sparkles: Fix docstring in plot_uplift_by_percentile --- sklift/viz/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklift/viz/base.py b/sklift/viz/base.py index c45135e..41753a9 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -129,7 +129,6 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, kind='line', The first step is taking the first k observations of all test data ordered by uplift prediction (overall both groups - control and treatment) and conversions in treatment and control groups calculated only on them. Then the difference between these conversions is calculated. - * ``'by_group'``: Separately calculates conversions in top k observations in each group (control and treatment) sorted by uplift predictions. Then the difference between these conversions is calculated. @@ -138,7 +137,6 @@ def plot_uplift_by_percentile(y_true, uplift, treatment, strategy, kind='line', * ``'line'``: Generates a line plot. - * ``'bar'``: Generates a traditional bar-style plot. From ab70f0e11c92a854c01e10df8204defbc6fce9bb Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 19:55:48 +0300 Subject: [PATCH 15/17] :rocket: Bump version to 0.1.2 --- docs/changelog.md | 1 - sklift/__init__.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 2bd61d7..ab46e1a 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,7 +9,6 @@ * ❗️ you will need to change your code to have the same effect in the future; or a feature will be removed in the future. 
## Version 0.1.2 -_in development_ ### [sklift.models](https://scikit-uplift.readthedocs.io/en/latest/api/models.html) diff --git a/sklift/__init__.py b/sklift/__init__.py index df9144c..10939f0 100644 --- a/sklift/__init__.py +++ b/sklift/__init__.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.1.2' From a44ff91c56d1bd25d0d119e64d5d6aa3b2835a77 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 20:10:56 +0300 Subject: [PATCH 16/17] :rocket: Update notebooks to 0.1.2 --- notebooks/RetailHero.ipynb | 71 +++++++++++++++++-------------- notebooks/RetailHero_EN.ipynb | 69 ++++++++++++++++-------------- notebooks/pipeline_usage_EN.ipynb | 36 ++++++++-------- notebooks/pipeline_usage_RU.ipynb | 36 ++++++++-------- 4 files changed, 111 insertions(+), 101 deletions(-) diff --git a/notebooks/RetailHero.ipynb b/notebooks/RetailHero.ipynb index e2c424c..40e7eb6 100644 --- a/notebooks/RetailHero.ipynb +++ b/notebooks/RetailHero.ipynb @@ -107,8 +107,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:37.492036Z", - "start_time": "2020-04-28T09:24:37.488584Z" + "end_time": "2020-05-02T17:07:22.372375Z", + "start_time": "2020-05-02T17:07:22.368436Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -126,7 +126,7 @@ "urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", "\n", "!unzip /content/retail_hero.zip\n", - "!pip install scikit-uplift==0.1.1 catboost=0.22" + "!pip install scikit-uplift==0.1.2 catboost=0.22" ] }, { @@ -144,8 +144,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:40.783897Z", - "start_time": "2020-04-28T09:24:37.503470Z" + "end_time": "2020-05-02T17:07:25.384054Z", + "start_time": "2020-05-02T17:07:22.383222Z" }, "colab": {}, "colab_type": "code", @@ -196,8 +196,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:42.100096Z", - "start_time": "2020-04-28T09:24:40.786498Z" + "end_time": 
"2020-05-02T17:07:26.483716Z", + "start_time": "2020-05-02T17:07:25.386480Z" }, "colab": {}, "colab_type": "code", @@ -251,8 +251,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:44.483576Z", - "start_time": "2020-04-28T09:24:42.102707Z" + "end_time": "2020-05-02T17:07:28.491581Z", + "start_time": "2020-05-02T17:07:26.486312Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -313,8 +313,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:44.511016Z", - "start_time": "2020-04-28T09:24:44.486035Z" + "end_time": "2020-05-02T17:07:28.514717Z", + "start_time": "2020-05-02T17:07:28.494500Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -484,8 +484,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:45.715602Z", - "start_time": "2020-04-28T09:24:44.514353Z" + "end_time": "2020-05-02T17:07:29.570605Z", + "start_time": "2020-05-02T17:07:28.518362Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -545,8 +545,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:47.941480Z", - "start_time": "2020-04-28T09:24:45.719641Z" + "end_time": "2020-05-02T17:07:31.489869Z", + "start_time": "2020-05-02T17:07:29.572733Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -632,8 +632,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:50.571779Z", - "start_time": "2020-04-28T09:24:47.944822Z" + "end_time": "2020-05-02T17:07:33.865281Z", + "start_time": "2020-05-02T17:07:31.494251Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -694,8 +694,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:52.942803Z", - "start_time": "2020-04-28T09:24:50.576741Z" + "end_time": "2020-05-02T17:07:36.709646Z", + "start_time": "2020-05-02T17:07:33.871512Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -758,8 +758,8 @@ 
"execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:52.964396Z", - "start_time": "2020-04-28T09:24:52.945544Z" + "end_time": "2020-05-02T17:07:36.726223Z", + "start_time": "2020-05-02T17:07:36.712564Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -858,8 +858,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:56.505700Z", - "start_time": "2020-04-28T09:24:53.019392Z" + "end_time": "2020-05-02T17:07:39.436995Z", + "start_time": "2020-05-02T17:07:36.729508Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -874,7 +874,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/Maksim/Library/Python/3.6/lib/python/site-packages/ipykernel_launcher.py:2: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", + "/Users/Maksim/Library/Python/3.6/lib/python/site-packages/ipykernel_launcher.py:6: UserWarning: It is recommended to use this approach on treatment balanced data. 
Current sample size is unbalanced.\n", " \n" ] }, @@ -891,10 +891,15 @@ } ], "source": [ - "cm_full = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))\n", - "cm_full = cm_full.fit(X_train_full, y_train_full, treat_train_full, estimator_fit_params={'cat_features': [1]})\n", + "ct_full = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))\n", + "ct_full = ct_full.fit(\n", + " X_train_full, \n", + " y_train_full, \n", + " treat_train_full, \n", + " estimator_fit_params={'cat_features': cat_features}\n", + ")\n", "\n", - "X_test.loc[:, 'uplift'] = cm_full.predict(X_test.values)\n", + "X_test.loc[:, 'uplift'] = ct_full.predict(X_test.values)\n", "\n", "sub = X_test[['uplift']].to_csv('sub1.csv')\n", "\n", @@ -906,8 +911,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:24:56.560018Z", - "start_time": "2020-04-28T09:24:56.508541Z" + "end_time": "2020-05-02T17:07:39.478855Z", + "start_time": "2020-05-02T17:07:39.440546Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -988,12 +993,12 @@ } ], "source": [ - "cm_full_fi = pd.DataFrame({\n", - " 'feature_name': cm_full.estimator.feature_names_,\n", - " 'feature_score': cm_full.estimator.feature_importances_\n", + "ct_full_fi = pd.DataFrame({\n", + " 'feature_name': ct_full.estimator.feature_names_,\n", + " 'feature_score': ct_full.estimator.feature_importances_\n", "}).sort_values('feature_score', ascending=False).reset_index(drop=True)\n", "\n", - "cm_full_fi" + "ct_full_fi" ] }, { diff --git a/notebooks/RetailHero_EN.ipynb b/notebooks/RetailHero_EN.ipynb index 45eacf9..465abb5 100644 --- a/notebooks/RetailHero_EN.ipynb +++ b/notebooks/RetailHero_EN.ipynb @@ -98,8 +98,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:26:59.973637Z", - "start_time": "2020-04-28T09:26:59.969856Z" + "end_time": "2020-05-02T17:03:42.686542Z", + "start_time": 
"2020-05-02T17:03:42.682766Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -117,7 +117,7 @@ "urllib.request.urlretrieve(url, '/content/retail_hero.zip')\n", "\n", "!unzip /content/retail_hero.zip\n", - "!pip install scikit-uplift==0.1.1 catboost=0.22" + "!pip install scikit-uplift==0.1.2 catboost=0.22" ] }, { @@ -135,8 +135,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:03.353098Z", - "start_time": "2020-04-28T09:26:59.984369Z" + "end_time": "2020-05-02T17:03:45.696618Z", + "start_time": "2020-05-02T17:03:42.697098Z" }, "colab": {}, "colab_type": "code", @@ -187,8 +187,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:04.563554Z", - "start_time": "2020-04-28T09:27:03.355432Z" + "end_time": "2020-05-02T17:03:46.792933Z", + "start_time": "2020-05-02T17:03:45.698939Z" }, "colab": {}, "colab_type": "code", @@ -241,8 +241,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:06.789462Z", - "start_time": "2020-04-28T09:27:04.570306Z" + "end_time": "2020-05-02T17:03:48.848922Z", + "start_time": "2020-05-02T17:03:46.795574Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -305,8 +305,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:06.813310Z", - "start_time": "2020-04-28T09:27:06.792837Z" + "end_time": "2020-05-02T17:03:48.871869Z", + "start_time": "2020-05-02T17:03:48.851856Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -476,8 +476,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:07.960855Z", - "start_time": "2020-04-28T09:27:06.816440Z" + "end_time": "2020-05-02T17:03:49.905064Z", + "start_time": "2020-05-02T17:03:48.875206Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -538,8 +538,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:10.062915Z", - "start_time": "2020-04-28T09:27:07.972200Z" + 
"end_time": "2020-05-02T17:03:51.782971Z", + "start_time": "2020-05-02T17:03:49.907481Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -627,8 +627,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:12.177386Z", - "start_time": "2020-04-28T09:27:10.065653Z" + "end_time": "2020-05-02T17:03:53.820694Z", + "start_time": "2020-05-02T17:03:51.785856Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -689,8 +689,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:14.459477Z", - "start_time": "2020-04-28T09:27:12.181749Z" + "end_time": "2020-05-02T17:03:56.085589Z", + "start_time": "2020-05-02T17:03:53.824592Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -753,8 +753,8 @@ "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:14.480794Z", - "start_time": "2020-04-28T09:27:14.463302Z" + "end_time": "2020-05-02T17:03:56.140144Z", + "start_time": "2020-05-02T17:03:56.091196Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -853,8 +853,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:17.097949Z", - "start_time": "2020-04-28T09:27:14.483691Z" + "end_time": "2020-05-02T17:03:59.864286Z", + "start_time": "2020-05-02T17:03:56.151623Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -886,10 +886,15 @@ } ], "source": [ - "cm_full = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))\n", - "cm_full = cm_full.fit(X_train_full, y_train_full, treat_train_full, estimator_fit_params={'cat_features': [1]})\n", + "ct_full = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))\n", + "ct_full = ct_full.fit(\n", + " X_train_full, \n", + " y_train_full, \n", + " treat_train_full, \n", + " estimator_fit_params={'cat_features': cat_features}\n", + ")\n", "\n", - "X_test.loc[:, 'uplift'] = 
cm_full.predict(X_test.values)\n", + "X_test.loc[:, 'uplift'] = ct_full.predict(X_test.values)\n", "\n", "sub = X_test[['uplift']].to_csv('sub1.csv')\n", "\n", @@ -901,8 +906,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:27:17.138755Z", - "start_time": "2020-04-28T09:27:17.101433Z" + "end_time": "2020-05-02T17:03:59.898275Z", + "start_time": "2020-05-02T17:03:59.868331Z" }, "colab": { "base_uri": "https://localhost:8080/", @@ -983,12 +988,12 @@ } ], "source": [ - "cm_full_fi = pd.DataFrame({\n", - " 'feature_name': cm_full.estimator.feature_names_,\n", - " 'feature_score': cm_full.estimator.feature_importances_\n", + "ct_full = pd.DataFrame({\n", + " 'feature_name': ct_full.estimator.feature_names_,\n", + " 'feature_score': ct_full.estimator.feature_importances_\n", "}).sort_values('feature_score', ascending=False).reset_index(drop=True)\n", "\n", - "cm_full_fi" + "ct_full" ] }, { diff --git a/notebooks/pipeline_usage_EN.ipynb b/notebooks/pipeline_usage_EN.ipynb index 5627162..c295bae 100644 --- a/notebooks/pipeline_usage_EN.ipynb +++ b/notebooks/pipeline_usage_EN.ipynb @@ -50,13 +50,13 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:10.044500Z", - "start_time": "2020-04-28T09:14:10.037045Z" + "end_time": "2020-05-02T16:59:19.201984Z", + "start_time": "2020-05-02T16:59:19.196511Z" } }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.1 xgboost==1.0.2 category_encoders==2.1.0" + "!pip install scikit-uplift==0.1.2 xgboost==1.0.2 category_encoders==2.1.0" ] }, { @@ -71,15 +71,15 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:14.101729Z", - "start_time": "2020-04-28T09:14:10.048276Z" + "end_time": "2020-05-02T16:59:34.848414Z", + "start_time": "2020-05-02T16:59:19.206449Z" } }, "outputs": [ { "data": { "text/plain": [ - "('./content/Hilstorm.csv', )" + "('./content/Hilstorm.csv', )" ] }, "execution_count": 2, @@ -113,8 +113,8 @@ 
"execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:14.638574Z", - "start_time": "2020-04-28T09:14:14.106920Z" + "end_time": "2020-05-02T16:59:35.421058Z", + "start_time": "2020-05-02T16:59:34.851896Z" } }, "outputs": [ @@ -281,8 +281,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:15.419422Z", - "start_time": "2020-04-28T09:14:14.642252Z" + "end_time": "2020-05-02T16:59:36.153705Z", + "start_time": "2020-05-02T16:59:35.425904Z" } }, "outputs": [], @@ -313,8 +313,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:15.432871Z", - "start_time": "2020-04-28T09:14:15.421629Z" + "end_time": "2020-05-02T16:59:36.173267Z", + "start_time": "2020-05-02T16:59:36.155955Z" } }, "outputs": [ @@ -343,8 +343,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:15.562848Z", - "start_time": "2020-04-28T09:14:15.437086Z" + "end_time": "2020-05-02T16:59:36.269784Z", + "start_time": "2020-05-02T16:59:36.179904Z" } }, "outputs": [], @@ -377,8 +377,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:16.816735Z", - "start_time": "2020-04-28T09:14:15.568337Z" + "end_time": "2020-05-02T16:59:37.436032Z", + "start_time": "2020-05-02T16:59:36.275110Z" } }, "outputs": [ @@ -416,8 +416,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:14:16.960683Z", - "start_time": "2020-04-28T09:14:16.819117Z" + "end_time": "2020-05-02T16:59:37.583995Z", + "start_time": "2020-05-02T16:59:37.438748Z" } }, "outputs": [ diff --git a/notebooks/pipeline_usage_RU.ipynb b/notebooks/pipeline_usage_RU.ipynb index 5e306b5..5277541 100644 --- a/notebooks/pipeline_usage_RU.ipynb +++ b/notebooks/pipeline_usage_RU.ipynb @@ -44,13 +44,13 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:06.767010Z", - "start_time": "2020-04-28T09:23:06.762871Z" + "end_time": 
"2020-05-02T17:00:49.841653Z", + "start_time": "2020-05-02T17:00:49.835556Z" } }, "outputs": [], "source": [ - "!pip install scikit-uplift==0.1.1 xgboost==1.0.2 category_encoders==2.1.0" + "!pip install scikit-uplift==0.1.2 xgboost==1.0.2 category_encoders==2.1.0" ] }, { @@ -70,15 +70,15 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:10.820274Z", - "start_time": "2020-04-28T09:23:06.776057Z" + "end_time": "2020-05-02T17:01:04.035180Z", + "start_time": "2020-05-02T17:00:49.846452Z" } }, "outputs": [ { "data": { "text/plain": [ - "('./content/Hilstorm.csv', )" + "('./content/Hilstorm.csv', )" ] }, "execution_count": 2, @@ -112,8 +112,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:11.319111Z", - "start_time": "2020-04-28T09:23:10.823302Z" + "end_time": "2020-05-02T17:01:04.476684Z", + "start_time": "2020-05-02T17:01:04.038420Z" } }, "outputs": [ @@ -280,8 +280,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:11.919143Z", - "start_time": "2020-04-28T09:23:11.324058Z" + "end_time": "2020-05-02T17:01:05.037943Z", + "start_time": "2020-05-02T17:01:04.480545Z" } }, "outputs": [], @@ -312,8 +312,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:11.936024Z", - "start_time": "2020-04-28T09:23:11.921716Z" + "end_time": "2020-05-02T17:01:05.052632Z", + "start_time": "2020-05-02T17:01:05.040798Z" } }, "outputs": [ @@ -342,8 +342,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:12.019176Z", - "start_time": "2020-04-28T09:23:11.939728Z" + "end_time": "2020-05-02T17:01:05.144900Z", + "start_time": "2020-05-02T17:01:05.059079Z" } }, "outputs": [], @@ -381,8 +381,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:13.244343Z", - "start_time": "2020-04-28T09:23:12.021559Z" + "end_time": "2020-05-02T17:01:06.298101Z", + "start_time": 
"2020-05-02T17:01:05.149361Z" } }, "outputs": [ @@ -415,8 +415,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2020-04-28T09:23:13.383577Z", - "start_time": "2020-04-28T09:23:13.246513Z" + "end_time": "2020-05-02T17:01:06.418688Z", + "start_time": "2020-05-02T17:01:06.300295Z" } }, "outputs": [ From fd17ee2ed0898dc84b68d61b7363a9e96ee552c8 Mon Sep 17 00:00:00 2001 From: Maksim Shevchenko Date: Sat, 2 May 2020 20:20:47 +0300 Subject: [PATCH 17/17] :memo: Remove unused import --- sklift/viz/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklift/viz/base.py b/sklift/viz/base.py index 41753a9..91397b6 100644 --- a/sklift/viz/base.py +++ b/sklift/viz/base.py @@ -1,5 +1,4 @@ import numpy as np -import warnings import matplotlib.pyplot as plt from sklearn.utils.validation import check_consistent_length from ..metrics import uplift_curve, auuc, qini_curve, auqc, response_rate_by_percentile, treatment_balance_curve