Skip to content

Commit

Permalink
l-step lookahead evaluated using synthetic datasets
Browse files Browse the repository at this point in the history
changed psl scoring to brier score
  • Loading branch information
Stefan Heid committed Aug 9, 2023
1 parent 9991a4d commit a29390b
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 6 deletions.
14 changes: 10 additions & 4 deletions experiments/performance.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,29 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0.9604\n",
"CPU times: user 157 ms, sys: 108 ms, total: 266 ms\n",
"Wall time: 18 s\n"
"0.0274\n",
"CPU times: user 33.9 ms, sys: 87 ms, total: 121 ms\n",
"Wall time: 3.03 s\n"
]
}
],
"source": [
"%%time\n",
"clf = ProbabilisticScoringList([-1, 1, 2])\n",
"print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=1), cv=ShuffleSplit(100, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
"print(f\"{cross_val_score(clf, X, y, fit_params=dict(l=1), cv=ShuffleSplit(5, test_size=.2, random_state=42), n_jobs=-1).mean():.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "eb1d191d-38b6-4f1c-8740-7101c9bd192c",
"metadata": {},
"source": [
"non-neg 0.2349\n",
"l1 0.0265\n",
"l2 0.0276 20\n",
"l2 0.0315 5\n",
"l3 0.0320 5\n",
"\n",
"### 0.2.0\n",
"- l=2 14min 11s, 0.9599\n",
"- l=1 30s, 0.9604\n",
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ python = ">=3.9,<3.13"
scikit-learn = "^1.3.0"
numpy = "^1.25.2"
scipy = "^1.11.1"
joblib = "^1.3.1"
joblib = "^1.3.2"
pandas = "^2.0.3"

sphinx = { version = "^7.1", optional = true }
sphinx_rtd_theme = { version = "^1.2", optional = true }
Expand Down
Empty file added skpsl/data/__init__.py
Empty file.
52 changes: 52 additions & 0 deletions skpsl/data/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import numpy as np
from itertools import permutations

from sklearn.model_selection import ShuffleSplit

from skpsl.probabilistic_scoring_list import _ClassifierAtK
from skpsl import ProbabilisticScoringList


def lookahead_example():
    """
    Search helper: report a dataset on which 2-step lookahead beats greedy fitting.

    Reads the module-level names ``s`` (score set), ``f`` (feature indices),
    ``w`` (current sample-weight permutation) and ``X_``/``y_`` (the weighted
    dataset); these are only defined when the module is run as a script.

    Example of a reported line:

    scores→      [2,1]  [1,2]
    (2, 5, 1, 4, 3) 0.7839 0.7783 l2_psl.scores=[1, 2] l2_psl.stage_clfs[-1].score(X_)=0.7783 score_l1=0.1944 score_l2=0.1926
    """
    psl = ProbabilisticScoringList(s)
    psl.fit(X_, y_)
    if psl.features == [0, 1] and psl.scores == [2, 1]:
        # the psl really ordered the features so that feature 0 is the better one;
        # now test whether the performance changes when the scores are inverted.
        # Permute the two selected scores (not the whole score set ``s``): this
        # always yields exactly two assignments, (2, 1) and (1, 2), even when
        # ``s`` contains more than two candidate scores.
        score, invscore = [
            _ClassifierAtK(features=f, scores=s_).fit(X_, y_).score(X_)
            for s_ in permutations(psl.scores)
        ]
        if score > invscore:
            score_l1 = psl.score(X_, y_)
            l2_psl = ProbabilisticScoringList(s).fit(X_, y_, l=2)
            score_l2 = l2_psl.score(X_, y_)
            # PSL.score is the Brier score, so a smaller value means lookahead did better
            if score_l2 < score_l1:
                print(
                    f"{w} {score:.4f} {invscore:.4f} {l2_psl.scores=} {l2_psl.stage_clfs[-1].score(X_)=} {score_l1=} {score_l2=}")
                # Stack the features and the label column-wise, then transpose so
                # every row is one variable, as np.corrcoef expects by default.
                print(np.corrcoef(np.hstack([X_, y_.reshape(-1, 1)]).T))


if __name__ == '__main__':
    # Base dataset: six distinct (x, y) patterns over two binary features.
    X = np.array([[1, 0], [0, 1], [1, 1], [1, 0], [0, 1], [1, 1]])
    y = np.array([0, 0, 0, 1, 1, 1])
    f = [0, 1]
    # s = [2, 1]
    s = [2, 1, -1]

    # The splitter is loop-invariant: a fixed integer random_state yields the
    # same sequence of splits on every call to split().
    rs = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    # Each permutation w assigns a repetition count to every base pattern,
    # producing a differently weighted synthetic dataset.
    for w in permutations(range(1, X.shape[0] + 1)):
        X_ = np.repeat(X, np.array(w), axis=0)
        y_ = np.repeat(y, np.array(w), axis=0)

        # Split the weighted dataset itself: the indices are applied to X_/y_,
        # so splitting the 6-row base matrix X would only ever sample the
        # first six replicated rows.
        for (train_index, test_index) in rs.split(X_):
            l1_psl = ProbabilisticScoringList(s).fit(X_[train_index], y_[train_index])
            l1_out = l1_psl.score(X_[test_index], y_[test_index])
            l1_in = l1_psl.score(X_[train_index], y_[train_index])

            l2_psl = ProbabilisticScoringList(s).fit(X_[train_index], y_[train_index], l=2)
            l2_out = l2_psl.score(X_[test_index], y_[test_index])
            l2_in = l2_psl.score(X_[train_index], y_[train_index])
            # score() is the Brier score, so this reports splits where the
            # 2-step lookahead fits the training data better than greedy l=1
            if l1_in > l2_in:
                print(f"{l1_in=} {l2_in=} {l1_out=} {l2_out=}")
12 changes: 11 additions & 1 deletion skpsl/probabilistic_scoring_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import brier_score_loss


class _ClassifierAtK(BaseEstimator, ClassifierMixin):
Expand Down Expand Up @@ -97,7 +98,6 @@ def __init__(self, score_set, entropy_threshold=-1):
self.total_scores_at_k = []
self.probabilities_at_k = []
self.stage_clfs = []
self.entropy_at_k = []
self._stage_clf = _ClassifierAtK

def fit(self, X, y, l=1, n_jobs=1, predef_features=None, predef_scores=None) -> "ProbabilisticScoringList":
Expand Down Expand Up @@ -179,6 +179,16 @@ def predict_proba(self, X, k=-1):

return self.stage_clfs[k].predict_proba(X)

def score(self, X, y, sample_weight=None):
    """
    Calculates the Brier score of the model

    Note that the Brier score is a loss: smaller values indicate better
    calibrated predictions.

    :param X: samples to score; passed unchanged to ``predict_proba``
    :param y: true labels of the samples in ``X``
    :param sample_weight: optional per-sample weights, forwarded to ``brier_score_loss``
    :return: Brier score loss of the probabilities in column 1 of ``predict_proba(X)``
        (presumably the positive class -- confirm against ``predict_proba``)
    """
    # Forward sample_weight instead of silently ignoring it
    return brier_score_loss(y, self.predict_proba(X)[:, 1], sample_weight=sample_weight)

@staticmethod
def _optimize(features, feature_extension, scores, score_extension, clfcls, X, y):
clf = clfcls(features=features + feature_extension, scores=scores + score_extension).fit(X, y)
Expand Down

0 comments on commit a29390b

Please sign in to comment.