Commit 89e82d5

Merge pull request #105 from PNNL-CompBio/model_version

Model version

christinehc authored Mar 7, 2023
2 parents def47da + 31ccb37 commit 89e82d5
Showing 7 changed files with 128 additions and 94 deletions.
3 changes: 2 additions & 1 deletion snekmer/__init__.py
@@ -7,7 +7,8 @@
 from . import cluster
 from . import vectorize
 from . import report
+from . import _version

 # from . import walk

-__version__ = "1.0.3"
+__version__ = _version.__version__
1 change: 1 addition & 0 deletions snekmer/_version.py
@@ -0,0 +1 @@
+__version__ = "1.0.4"
64 changes: 38 additions & 26 deletions snekmer/model.py
@@ -6,14 +6,16 @@
 # imports
 import pandas as pd
 from typing import Any, Dict, List, Optional
+from ._version import __version__
 from .vectorize import KmerBasis
 from numpy.typing import NDArray
-from sklearn.base import ClassifierMixin
+from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 from sklearn.linear_model import LogisticRegression  # LogisticRegressionCV
 from sklearn.model_selection import GridSearchCV, cross_validate
 from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.svm import SVC

 # define default gridsearch param dict
 MODEL_PARAMS = {
@@ -25,15 +27,16 @@
 }

 NAME2MODEL = {
-    "decisiontree": DecisionTreeClassifier(),
-    "randomforest": RandomForestClassifier(),
-    "adaboost": AdaBoostClassifier(),
-    "logistic": LogisticRegression(),
+    "decisiontree": DecisionTreeClassifier,
+    "randomforest": RandomForestClassifier,
+    "adaboost": AdaBoostClassifier,
+    "logistic": LogisticRegression,
+    "svc": SVC,
 }


 # classification models for protein families
-class SnekmerModel(ClassifierMixin):
+class SnekmerModel(ClassifierMixin, BaseEstimator):
     """Classify a protein family using kmer vectors as input.

     Attributes
@@ -62,7 +65,8 @@ class SnekmerModel(ClassifierMixin):
     def __init__(
         self,
         scaler: Optional[Any] = None,
-        model: str = "decisiontree",
+        model: str = "logistic",
+        model_params: Optional[Dict[Any, Any]] = {},
         params: Dict[str, List] = MODEL_PARAMS,
         step_name="clf",
     ):
@@ -80,15 +84,19 @@ def __init__(
             Optional custom name for classifier pipeline step.
         """
+        # always compute svc probability
+        if model == "svc":
+            model_params["probability"] = True
+
         self.scaler = scaler
         self.params = params
-        self.model = NAME2MODEL[model]
+        self.model = NAME2MODEL[model](**model_params)
         self.step_name = step_name
+        self.snekmer_version = __version__

         self.pipeline = None
         self.search = None

-    def fit(self, X: NDArray, y: NDArray, verbose: bool = True):
+    def fit(
+        self, X: NDArray, y: NDArray, gridsearch: bool = False, verbose: bool = True
+    ):
         """Train model using gridsearch-tuned hyperparameters.

         Parameters
@@ -99,8 +107,11 @@ def fit(self, X: NDArray, y: NDArray, verbose: bool = True):
             Target values (class labels) as integers or strings.
         scores : array-like of shape (n_features,) or (n_features, n_outputs)
             NOT IMPLEMENTED YET-- for KmerScaler integration.
+        gridsearch: bool (default: False)
+            If True, uses grid search to determine hyperparameters.
         verbose : bool (default: True)
             If True, prints best gridsearch CV results to console.
+            Only active if `gridsearch=True`.

         Returns
         -------
@@ -109,25 +120,26 @@
         """
         # define pipeline and gridsearch
-        self.pipeline = Pipeline(
-            steps=[("scaler", self.scaler), (self.step_name, self.model)]
-        )
-        self.search = GridSearchCV(self.pipeline, self.params)
-
-        # use gridsearch on training data to find best parameter set
-        self.search.fit(X, y)
-        if verbose:
-            print(
-                "best mean cross-validation score: {:.3f}".format(
-                    self.search.best_score_
-                )
-            )
-            print("best parameters:", self.search.best_params_)
+        if gridsearch:
+            self.pipeline = Pipeline(
+                steps=[("scaler", self.scaler), (self.step_name, self.model)]
+            )
+            self.search = GridSearchCV(self.pipeline, self.params)
+
+            # use gridsearch on training data to find best parameter set
+            self.search.fit(X, y)
+            if verbose:
+                print(
+                    "best mean cross-validation score: {:.3f}".format(
+                        self.search.best_score_
+                    )
+                )
+                print("best parameters:", self.search.best_params_)

         # fit model with best gridsearch parameters
         # self.scaler.fit(scores)
         self.model.fit(X, y)
-        return
+        return self

     def score(self, X: NDArray, y: NDArray):
         """Score model on test set.
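The model.py changes adjust the public API in three ways: NAME2MODEL now maps names to classes rather than pre-built instances, so constructor keyword arguments can be forwarded per model; grid search becomes opt-in via fit(..., gridsearch=True); and fit() returns self, following scikit-learn convention. A usage sketch against the new signatures in this diff (X_train, y_train, and the C value are illustrative placeholders, not part of the commit):

    import numpy as np
    from snekmer.model import SnekmerModel

    X_train = np.random.rand(20, 8)        # placeholder kmer vectors
    y_train = np.random.randint(0, 2, 20)  # placeholder labels

    # kwargs are forwarded to sklearn.svm.SVC; probability=True is
    # forced automatically whenever model="svc"
    model = SnekmerModel(model="svc", model_params={"C": 1.0})
    model.fit(X_train, y_train)  # plain fit; no grid search by default
    # model.fit(X_train, y_train, gridsearch=True)  # opt-in tuning via GridSearchCV
    print(model.snekmer_version)  # snekmer release recorded on the model

Because fit() now returns self, calls can also be chained, e.g. SnekmerModel().fit(X_train, y_train).score(X_train, y_train).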
114 changes: 62 additions & 52 deletions snekmer/plot.py
@@ -4,45 +4,48 @@
 """
 # imports
+from typing import Optional
+
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
+from numpy.typing import ArrayLike
 from sklearn.metrics import (
     accuracy_score,
+    PrecisionRecallDisplay,
+    RocCurveDisplay,
     auc,
     average_precision_score,
-    roc_curve,
     precision_recall_curve,
 )
 from sklearn.model_selection import (
     KFold,
     train_test_split,
     RandomizedSearchCV,
     StratifiedKFold,
 )
 from sklearn.svm import SVC
-from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE


-def cv_roc_curve(clf, X, y, title="ROC Curve", ax=None, dpi=400):
+def cv_roc_curve(
+    clf,
+    X: ArrayLike,
+    y: ArrayLike,
+    title: str = "ROC Curve",
+    ax: Optional[plt.Axes] = None,
+    dpi: int = 400,
+):
     """Plot cross-validated receiver operator characteristics curve.
     Adapted from example in sklearn documentation [1].
     [1]: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

     Parameters
     ----------
-    clf : dict of Classifier objects
+    clf : object
         Classifier object (e.g. DecisionTreeClassifier()).
-    X : dict of arrays
-        Feature arrays.
-    y : dict of arrays
-        Response arrays.
+    X : numpy.typing.ArrayLike
+        Feature array.
+    y : numpy.typing.ArrayLike
+        Response array.
     title : str
         Plot title.
-    ax : matplotlib.axes.Axes object or None (default: None)
+    ax : matplotlib.axes.Axes or None (default: None)
         Axis object, if already created.
     dpi : int (default: 400)
         Figure resolution.
@@ -64,21 +67,23 @@ def cv_roc_curve(clf, X, y, title="ROC Curve", ax=None, dpi=400):

     # take each cv result
     for i in X.keys():
-        probabilities = clf.fit(X[i]["train"], y[i]["train"]).predict_proba(
-            X[i]["test"]
-        )
-
-        # calculate roc curve/area, and interpolate values
-        fpr, tpr, _ = roc_curve(y[i]["test"], probabilities[:, 1])
-        tprs.append(np.interp(mean_fpr, fpr, tpr))
-
-        tprs[-1][0] = 0.0
-        roc_auc = auc(fpr, tpr)
-        auc_rocs.append(roc_auc)
-        ax.plot(fpr, tpr, lw=1, alpha=0.3, label=f"ROC fold {i} (AUC = {roc_auc:0.2f})")
+        clf.fit(X[i]["train"], y[i]["train"])
+        viz = RocCurveDisplay.from_estimator(
+            clf.model,
+            X[i]["test"],
+            y[i]["test"],
+            name=f"ROC fold {i}",
+            alpha=0.3,
+            lw=1,
+            ax=ax,
+        )
+        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
+        interp_tpr[0] = 0.0
+        tprs.append(interp_tpr)
+        auc_rocs.append(viz.roc_auc)

-    # plot y = x (50% chance) reference line
-    ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
+    # plot random chance reference line
+    ax.plot([0, 1], [0, 1], "k--", label="Chance level (AUC = 0.5)")

     # calculate mean and stdev roc_auc and show in plot
     mean_tpr = np.mean(tprs, axis=0)
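This hunk replaces the hand-rolled roc_curve/auc bookkeeping with scikit-learn's RocCurveDisplay.from_estimator, which scores and plots in one call and stores fpr, tpr, and roc_auc on the returned Display object. A self-contained sketch of that pattern outside snekmer (assumes scikit-learn >= 1.0; the dataset and split are placeholders):

    import matplotlib.pyplot as plt
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import RocCurveDisplay

    # toy data: 150 training rows, 50 held out for the curve
    X, y = make_classification(n_samples=200, random_state=0)
    clf = LogisticRegression().fit(X[:150], y[:150])

    fig, ax = plt.subplots()
    viz = RocCurveDisplay.from_estimator(clf, X[150:], y[150:], ax=ax)
    print(viz.roc_auc)  # AUC is computed and kept on the Display object

The same Display-object pattern drives the PrecisionRecallDisplay change in cv_pr_curve below.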
@@ -118,20 +123,27 @@ def cv_roc_curve(clf, X, y, title="ROC Curve", ax=None, dpi=400):
     return fig, ax, auc_rocs


-def cv_pr_curve(clf, X, y, title="PR Curve", ax=None, dpi=400):
+def cv_pr_curve(
+    clf,
+    X: ArrayLike,
+    y: ArrayLike,
+    title: str = "PR Curve",
+    ax: Optional[plt.Axes] = None,
+    dpi: int = 400,
+):
     """Plot cross-validated precision-recall curve.

     Parameters
     ----------
     clf : Classifer object
         Classifier object (e.g. DecisionTreeClassifier()).
-    X : dict
-        Feature dataframe.
-    y : dict
-        Response dataframe.
+    X : numpy.typing.ArrayLike
+        Feature array.
+    y : numpy.typing.ArrayLike
+        Response array.
     title : str
         Plot title.
-    ax : matplotlib.axes.Axes object or None (default: None)
+    ax : matplotlib.axes.Axes or None (default: None)
         Axis object, if already created.
     dpi : int (default: 400)
         Figure resolution.
@@ -141,7 +153,6 @@ def cv_pr_curve(clf, X, y, title="PR Curve", ax=None, dpi=400):
     fig, ax, pr_aucs
     """
-
     # initialize figure and define all axes
     fig = plt.figure(
         dpi=dpi, figsize=(8, 4), facecolor="white", constrained_layout=True
@@ -152,38 +163,37 @@
     y_real, y_proba, pr_aucs = [], [], []

     for i in X.keys():
-        probabilities = clf.fit(X[i]["train"], y[i]["train"]).predict_proba(
-            X[i]["test"]
-        )
-        # Compute ROC curve and area the curve
-        precision, recall, _ = precision_recall_curve(y[i]["test"], probabilities[:, 1])
-        avg_precision = average_precision_score(y[i]["test"], probabilities[:, 1])
-
-        # Plotting each individual PR Curve
-        ax.plot(
-            recall,
-            precision,
-            lw=1,
-            alpha=0.3,
-            label=f"PR fold {i} (AUC = {avg_precision:0.2f})",
-        )
+        clf.fit(X[i]["train"], y[i]["train"])
+        probabilities = clf.model.predict_proba(X[i]["test"])
+        precision, recall, _ = precision_recall_curve(
+            y[i]["test"], clf.predict(X[i]["test"])
+        )
+        viz = PrecisionRecallDisplay.from_estimator(
+            clf.model,
+            X[i]["test"],
+            y[i]["test"],
+            name=f"ROC fold {i}",
+            alpha=0.3,
+            lw=1,
+            ax=ax,
+        )

         y_real.append(y[i]["test"])
         y_proba.append(probabilities[:, 1])
-        pr_aucs.append(avg_precision)
+        pr_aucs.append(viz.average_precision)

     y_real = np.concatenate(y_real)
     y_proba = np.concatenate(y_proba)

     precision, recall, _ = precision_recall_curve(y_real, y_proba)
-    avg_total_precision = average_precision_score(y_real, y_proba)
+    # avg_total_precision = average_precision_score(y_real, y_proba)

     # plot average p-r curve
     ax.plot(
         recall,
         precision,
         color="b",
-        label=f"Precision-Recall (AUC = {avg_total_precision:0.2f})",
+        label=f"Precision-Recall (AUC = {viz.average_precision:0.2f})",
         lw=2,
         alpha=0.8,
     )
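Although the new annotations declare X and y as ArrayLike, both loop bodies still index X[i]["train"] and X[i]["test"], so the plotting helpers effectively expect per-fold dictionaries. A usage sketch for building that structure and calling cv_roc_curve (the fold construction and variable names are assumptions for illustration, not part of this commit):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    from snekmer.model import SnekmerModel
    from snekmer.plot import cv_roc_curve

    X_all = np.random.rand(100, 8)        # placeholder kmer vectors
    y_all = np.random.randint(0, 2, 100)  # placeholder labels

    # build the {fold: {"train": ..., "test": ...}} structure the loops expect
    X, y = {}, {}
    splitter = StratifiedKFold(n_splits=3)
    for i, (tr, te) in enumerate(splitter.split(X_all, y_all)):
        X[i] = {"train": X_all[tr], "test": X_all[te]}
        y[i] = {"train": y_all[tr], "test": y_all[te]}

    # cv_roc_curve calls clf.fit(...) and reads clf.model, which matches SnekmerModel
    fig, ax, aucs = cv_roc_curve(SnekmerModel(model="logistic"), X, y)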
