Skip to content

Commit

Permalink
closes #3: added simple inspection method
Browse files Browse the repository at this point in the history
  • Loading branch information
Stefan Heid committed Aug 10, 2023
1 parent 0915684 commit e86a2bf
Show file tree
Hide file tree
Showing 3 changed files with 249 additions and 7 deletions.
23 changes: 20 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,32 @@ pip install scikit-psl
```

# Usage

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from skpsl import ProbabilisticScoringList

# Generating synthetic data with continuous features and a binary target variable
X, y = make_classification(random_state=42)
# Binarize the features by thresholding every column at 0.5
X = (X > .5).astype(int)

# 5-fold cross-validation; each fold reports the estimator's own score()
clf = ProbabilisticScoringList([-1, 1, 2])
print(cross_val_score(clf, X, y, cv=5))

# A single shuffled 80/20 split: fit on the training part, score on the rest
for train, test in ShuffleSplit(1, test_size=.2, random_state=42).split(X):
    psl = ProbabilisticScoringList([-1, 1, 2])
    psl.fit(X[train], y[train])
    print(f"Brier score: {psl.score(X[test], y[test]):.4f}")
    #> Brier score: 0.1924 (lower is better)

# Tabulate the fitted model up to (and including) stage 5
df = psl.inspect(5)
print(df.to_string(index=False, na_rep="-", justify="center", float_format=lambda x: f"{x:.2f}"))
#> Stage Score T = -3 T = -2 T = -1 T = 0 T = 1 T = 2 T = 3
#> 0 - - - - 0.54 - - -
#> 1 2.00 - - - 0.18 - 0.97 -
#> 2 -1.00 - - 0.00 0.28 0.91 1.00 -
#> 3 -1.00 - 0.00 0.07 0.86 0.91 1.00 -
#> 4 1.00 - 0.00 0.00 0.29 0.92 1.00 1.00
#> 5 -1.00 0.00 0.00 0.00 0.40 1.00 1.00 1.00
```
203 changes: 203 additions & 0 deletions scratch/psl_describe.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "adc6c663-b073-4451-9821-216c578bbd69",
"metadata": {},
"outputs": [],
"source": [
"from skpsl import ProbabilisticScoringList\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.model_selection import cross_val_score, ShuffleSplit\n",
"from functools import reduce\n",
"from operator import or_\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "764414e5-261c-4a77-b14b-2235472b9baf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Brier score: 0.1924\n"
]
}
],
"source": [
"# Generating synthetic data with continuous features and a binary target variable\n",
"X, y = make_classification(random_state=42)\n",
"X = (X > .5).astype(int)\n",
"\n",
"for train, test in ShuffleSplit(1, test_size=.2, random_state=42).split(X):\n",
" psl = ProbabilisticScoringList([-1, 1, 2])\n",
" psl.fit(X[train], y[train])\n",
" print(f\"Brier score: {psl.score(X[test], y[test]):.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "26a044d2-d3a1-43eb-bbef-b8e7050ce568",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Stage</th>\n",
" <th>Score</th>\n",
" <th>T = -3</th>\n",
" <th>T = -2</th>\n",
" <th>T = -1</th>\n",
" <th>T = 0</th>\n",
" <th>T = 1</th>\n",
" <th>T = 2</th>\n",
" <th>T = 3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>0.5375</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2.0</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>0.1818</td>\n",
" <td>-</td>\n",
" <td>0.9722</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>-1.0</td>\n",
" <td>-</td>\n",
" <td>-</td>\n",
" <td>0.0</td>\n",
" <td>0.2759</td>\n",
" <td>0.9091</td>\n",
" <td>1.0</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>-1.0</td>\n",
" <td>-</td>\n",
" <td>0.0</td>\n",
" <td>0.069</td>\n",
" <td>0.8571</td>\n",
" <td>0.9091</td>\n",
" <td>1.0</td>\n",
" <td>-</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>-</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.2857</td>\n",
" <td>0.9167</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>-1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.4000</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Stage Score T = -3 T = -2 T = -1 T = 0 T = 1 T = 2 T = 3\n",
"0 0 - - - - 0.5375 - - -\n",
"1 1 2.0 - - - 0.1818 - 0.9722 -\n",
"2 2 -1.0 - - 0.0 0.2759 0.9091 1.0 -\n",
"3 3 -1.0 - 0.0 0.069 0.8571 0.9091 1.0 -\n",
"4 4 1.0 - 0.0 0.0 0.2857 0.9167 1.0 1.0\n",
"5 5 -1.0 0.0 0.0 0.0 0.4000 1.0 1.0 1.0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = psl.inspect(5)\n",
"\n",
"pd.set_option(\"display.precision\", 4)\n",
"df.fillna(\"-\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
30 changes: 26 additions & 4 deletions skpsl/probabilistic_scoring_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.stats import entropy as stats_entropy
from sklearn.base import BaseEstimator, ClassifierMixin
Expand Down Expand Up @@ -105,7 +106,7 @@ def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) ->
:param n_jobs: passed to joblib for parallelization
:param predef_features:
:param predef_scores:
:return:
:return: The fitted classifier
"""

number_features = X.shape[1]
Expand Down Expand Up @@ -162,15 +163,36 @@ def predict_proba(self, X, k=-1):

return self.stage_clfs[k].predict_proba(X)

def score(self, X, y, k=-1, sample_weight=None):
    """
    Calculates the Brier score of the model

    The score is the Brier loss computed on column 1 of ``predict_proba``,
    so lower values are better.

    :param X: samples to evaluate, passed on to ``predict_proba``
    :param y: true labels for ``X``
    :param k: Classifier stage to use for prediction (``-1`` selects the last stage)
    :param sample_weight: optional per-sample weights, forwarded to ``brier_score_loss``
    :return: the Brier score loss of the selected stage
    """
    positive_proba = self.predict_proba(X, k=k)[:, 1]
    # Forward the weights instead of silently dropping them; None keeps the
    # unweighted behaviour.
    return brier_score_loss(y, positive_proba, sample_weight=sample_weight)

def inspect(self, k=None, feature_names=None) -> pd.DataFrame:
    """
    Returns a dataframe that visualizes the internal model

    Every row describes one stage (0..k). The ``T = <t>`` columns hold the
    value stored in that stage's ``probabilities`` mapping for total score
    ``t`` (NaN where the stage has no entry for ``t``). The ``Score`` column
    lists the scores of stage ``k``, preceded by NaN for stage 0.

    :param k: maximum stage to include in the visualization (default: all stages).
        Negative values count from the last stage, as with list indexing.
    :param feature_names: names of the features. Any sequence is accepted;
        missing names are padded with NaN.
    :return: dataframe with the columns ``Stage``, optionally ``Feature``,
        ``Score`` and one ``T = <t>`` column per total score
    :raises IndexError: if ``k`` does not refer to an existing stage
    """
    n_stages = len(self.stage_clfs)
    # Compare against None explicitly: ``k or default`` would turn a
    # requested stage 0 into "all stages".
    if k is None:
        k = n_stages - 1
    elif k < 0:
        k += n_stages
    if not 0 <= k < n_stages:
        raise IndexError(f"stage {k} is out of range for a model with {n_stages} stages")

    last_stage = self.stage_clfs[k]
    pmfs = [clf.probabilities for clf in self.stage_clfs[:k + 1]]
    # Union of all total scores reachable in any of the included stages
    all_total_scores = sorted(set().union(*pmfs))
    data = [[pmf.get(t_, np.nan) for t_ in all_total_scores] for pmf in pmfs]

    df = pd.DataFrame(columns=[f"T = {t_}" for t_ in all_total_scores], data=data)
    # Unpack instead of ``[nan] + scores`` so that array-like scores are not
    # broadcast-added to NaN.
    df.insert(0, "Score", [np.nan, *last_stage.scores])
    if feature_names is not None:
        # NOTE(review): names are taken positionally (the first k entries);
        # confirm that callers pass them in the order the stages use them.
        names = list(feature_names)[:k]
        padding = [np.nan] * (k - len(names))
        df.insert(0, "Feature", [np.nan, *names, *padding])
    return df.reset_index(names=["Stage"])

@property
def features(self):
Expand Down

0 comments on commit e86a2bf

Please sign in to comment.