from __future__ import print_function, unicode_literals

import logging
import math

import numpy as np
from scipy.stats import kendalltau, spearmanr, pearsonr
from six import string_types
from six.moves import xrange as range
from sklearn.metrics import confusion_matrix, f1_score, SCORERS
# from sklearn.metrics import mean_squared_error

# Constants
_CORRELATION_METRICS = frozenset(['kendall_tau', 'spearman', 'pearson'])

def kappa(y_true, y_pred, weights=None, allow_off_by_one=False):
    """
    Calculates the kappa inter-rater agreement between the gold standard
    and the predicted ratings. Potential values range from -1 (representing
    complete disagreement) to 1 (representing complete agreement). A kappa
    value of 0 is expected if all agreement is due to chance.

    In the course of calculating kappa, all items in `y_true` and `y_pred`
    will first be converted to floats and then rounded to integers.

    It is assumed that y_true and y_pred contain the complete range of
    possible ratings.

    This function contains a combination of code from yorchopolis's
    kappa-stats and Ben Hamner's Metrics projects on GitHub.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float
    :param weights: Specifies the weight matrix for the calculation.
                    Options are:

                        - None = unweighted-kappa
                        - 'quadratic' = quadratic-weighted kappa
                        - 'linear' = linear-weighted kappa
                        - two-dimensional numpy array = a custom matrix of
                          weights. Each weight corresponds to the
                          :math:`w_{ij}` values in the Wikipedia description
                          of how to calculate weighted Cohen's kappa.
    :type weights: str or numpy array
    :param allow_off_by_one: If True, ratings that are off by one are counted
                             as equal, and all other differences are reduced
                             by one. For example, 1 and 2 will be considered
                             to be equal, whereas 1 and 3 will have a
                             difference of 1 when building the weights matrix.
    :type allow_off_by_one: bool
    """
    logger = logging.getLogger(__name__)

    # Ensure that the lists are both the same length
    assert len(y_true) == len(y_pred)

    # This rather crazy looking typecast is intended to work as follows:
    # If an input is an int, the operations will have no effect.
    # If it is a float, it will be rounded and then converted to an int
    # because the ml_metrics package requires ints.
    # If it is a str like "1", then it will be converted to a (rounded) int.
    # If it is a str that can't be typecast, then the user is
    # given a hopefully useful error message.
    # Note: numpy and python 3.3 use bankers' rounding.
    try:
        y_true = [int(np.round(float(y))) for y in y_true]
        y_pred = [int(np.round(float(y))) for y in y_pred]
    except ValueError as e:
        logger.error("For kappa, the labels should be integers or strings "
                     "that can be converted to ints (E.g., '4.0' or '3').")
        raise e

    # Figure out normalized expected values
    min_rating = min(min(y_true), min(y_pred))
    max_rating = max(max(y_true), max(y_pred))

    # Shift the values so that the lowest value is 0
    # (to support scales that include negative values)
    y_true = [y - min_rating for y in y_true]
    y_pred = [y - min_rating for y in y_pred]

    # Build the observed/confusion matrix
    num_ratings = max_rating - min_rating + 1
    observed = confusion_matrix(y_true, y_pred,
                                labels=list(range(num_ratings)))
    num_scored_items = float(len(y_true))

    # Build the weight array if we weren't passed one
    if isinstance(weights, string_types):
        wt_scheme = weights
        weights = None
    else:
        wt_scheme = ''
    if weights is None:
        weights = np.empty((num_ratings, num_ratings))
        for i in range(num_ratings):
            for j in range(num_ratings):
                diff = abs(i - j)
                if allow_off_by_one and diff:
                    diff -= 1
                if wt_scheme == 'linear':
                    weights[i, j] = diff
                elif wt_scheme == 'quadratic':
                    weights[i, j] = diff ** 2
                elif not wt_scheme:  # unweighted
                    weights[i, j] = bool(diff)
                else:
                    raise ValueError('Invalid weight scheme specified for '
                                     'kappa: {}'.format(wt_scheme))

    hist_true = np.bincount(y_true, minlength=num_ratings)
    hist_true = hist_true[:num_ratings] / num_scored_items
    hist_pred = np.bincount(y_pred, minlength=num_ratings)
    hist_pred = hist_pred[:num_ratings] / num_scored_items
    expected = np.outer(hist_true, hist_pred)

    # Normalize the observed array
    observed = observed / num_scored_items

    # If all weights are zero, that means no disagreements matter.
    k = 1.0
    if np.count_nonzero(weights):
        k -= (sum(sum(weights * observed)) / sum(sum(weights * expected)))

    return k
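
# Illustrative usage of ``kappa`` (a documentation sketch, not part of the
# original module; the rating values below are hypothetical):
#
#     gold = [1, 2, 2, 3, 4]
#     pred = [1, 2, 3, 3, 4]
#     kappa(gold, pred)                         # unweighted kappa
#     kappa(gold, pred, weights='linear')       # linear-weighted kappa
#     kappa(gold, pred, weights='quadratic')    # quadratic-weighted kappa
#     kappa(gold, pred, allow_off_by_one=True)  # off-by-one ratings count as equal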


def kendall_tau(y_true, y_pred):
    """
    Calculate Kendall's tau between ``y_true`` and ``y_pred``.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float

    :returns: Kendall's tau if well-defined, else 0
    """
    ret_score = kendalltau(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0


def spearman(y_true, y_pred):
    """
    Calculate Spearman's rank correlation coefficient between ``y_true`` and
    ``y_pred``.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float

    :returns: Spearman's rank correlation coefficient if well-defined, else 0
    """
    ret_score = spearmanr(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0


def pearson(y_true, y_pred):
    """
    Calculate the Pearson product-moment correlation coefficient between
    ``y_true`` and ``y_pred``.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float

    :returns: Pearson product-moment correlation coefficient if well-defined,
              else 0
    """
    ret_score = pearsonr(y_true, y_pred)[0]
    return ret_score if not np.isnan(ret_score) else 0.0
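
# Illustrative usage of the correlation metrics (a documentation sketch, not
# part of the original module; the score lists are hypothetical). Each
# function returns the corresponding correlation coefficient, or 0.0 when the
# underlying SciPy routine reports NaN (for example, when one of the inputs
# is constant):
#
#     gold = [1.0, 2.0, 3.0, 4.0]
#     pred = [1.5, 2.0, 2.5, 4.5]
#     kendall_tau(gold, pred)
#     spearman(gold, pred)
#     pearson(gold, pred)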


def f1_score_least_frequent(y_true, y_pred):
    """
    Calculate the F1 score of the least frequent label/class in ``y_true``
    for ``y_pred``.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float

    :returns: F1 score of the least frequent label
    """
    least_frequent = np.bincount(y_true).argmin()
    return f1_score(y_true, y_pred, average=None)[least_frequent]
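
# Illustrative usage of ``f1_score_least_frequent`` (a documentation sketch,
# not part of the original module; the labels are hypothetical). With the
# labels below, class 0 is the least frequent class in ``gold``, so its F1
# score is the one returned:
#
#     gold = [0, 1, 1, 1, 2, 2]
#     pred = [0, 1, 2, 1, 2, 2]
#     f1_score_least_frequent(gold, pred)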


def use_score_func(func_name, y_true, y_pred):
    """
    Call the scoring function in `sklearn.metrics.SCORERS` with the given
    name. This takes care of handling keyword arguments that were
    pre-specified when the scorer was created, and applies any sign-flipping
    that was specified via `make_scorer`.
    """
    scorer = SCORERS[func_name]
    return scorer._sign * scorer._score_func(y_true, y_pred, **scorer._kwargs)
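
# Illustrative usage of ``use_score_func`` (a documentation sketch, not part
# of the original module). It assumes the scorer names 'f1' and 'accuracy'
# exist in ``sklearn.metrics.SCORERS`` for the installed scikit-learn
# version; the labels are hypothetical:
#
#     gold = [0, 1, 1, 0]
#     pred = [0, 1, 0, 0]
#     use_score_func('f1', gold, pred)
#     use_score_func('accuracy', gold, pred)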


def mean_square_error(y_true, y_pred):
    """
    Calculate the mean square error between predictions and true scores.

    :param y_true: The true score list.
    :param y_pred: The predicted score list.

    :returns: The mean square error value.
    """
    # return mean_squared_error(y_true, y_pred)  # use the sklearn default function
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    mse = ((y_true - y_pred) ** 2).mean(axis=0)
    return float(mse)


def root_mean_square_error(y_true, y_pred):
    """
    Calculate the root mean square error between predictions and true scores.

    :param y_true: The true score list.
    :param y_pred: The predicted score list.

    :returns: The root mean square error value.
    """
    # return math.sqrt(mean_squared_error(y_true, y_pred))  # use the sklearn default function
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    mse = ((y_true - y_pred) ** 2).mean(axis=0)
    return float(math.sqrt(mse))
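
# Illustrative usage of the error metrics (a documentation sketch, not part
# of the original module; the score lists are hypothetical).
# ``root_mean_square_error`` is simply the square root of
# ``mean_square_error`` for the same inputs:
#
#     gold = [1.0, 2.0, 3.0, 4.0]
#     pred = [1.0, 2.5, 2.5, 4.0]
#     mean_square_error(gold, pred)
#     root_mean_square_error(gold, pred)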