From 608cac3946311df5cca069a76aaf33d82cc080ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Monlla=C3=B3?=
Date: Tue, 31 Dec 2019 09:34:54 +0800
Subject: [PATCH 1/2] MDL-67040 Dataset-dependent hyperparameters

A few logging fixes sneaked in as well.
---
 moodlemlbackend/model/tensor.py        |  5 +-
 moodlemlbackend/processor/estimator.py | 72 +++++++++++++++++++------
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/moodlemlbackend/model/tensor.py b/moodlemlbackend/model/tensor.py
index e016772..e9b2393 100644
--- a/moodlemlbackend/model/tensor.py
+++ b/moodlemlbackend/model/tensor.py
@@ -21,7 +21,10 @@ def __init__(self, n_features, n_classes, n_epoch, batch_size,
         self.batch_size = batch_size
         self.starter_learning_rate = starter_learning_rate
         self.n_features = n_features
-        self.n_hidden = 10
+
+        # Scale the hidden layer with the number of features, keeping a
+        # reasonable minimum.
+        self.n_hidden = max(4, int(n_features / 3))
         self.n_classes = n_classes

         self.tensor_logdir = tensor_logdir

diff --git a/moodlemlbackend/processor/estimator.py b/moodlemlbackend/processor/estimator.py
index c4892d2..c1e35d6 100644
--- a/moodlemlbackend/processor/estimator.py
+++ b/moodlemlbackend/processor/estimator.py
@@ -68,6 +68,9 @@ def __init__(self, modelid, directory):

         # Logging.
         logfile = os.path.join(self.logsdir, 'info.log')
+        # Remove existing handlers; basicConfig is a no-op otherwise.
+        for handler in logging.root.handlers[:]:
+            logging.root.removeHandler(handler)
         logging.basicConfig(filename=logfile, level=logging.DEBUG)
         warnings.showwarning = self.warnings_to_log

@@ -229,9 +231,32 @@ def __init__(self, modelid, directory, dataset=None):
     def get_classifier(self, X, y, initial_weights=False):
         """Gets the classifier"""

-        n_epoch = 50
-        batch_size = 1000
+        try:
+            n_rows = X.shape[0]
+        except AttributeError:
+            # There is no X available during model import; the exact
+            # n_rows value does not really matter in that case.
+            n_rows = 1
+
+        if n_rows < 1000:
+            batch_size = n_rows
+        else:
+            # Use 10% of the rows, with a minimum batch size of 1000.
+            x_tenpercent = int(n_rows / 10)
+            batch_size = max(1000, x_tenpercent)
+
+        # We need ~10,000 iterations for the 0.5 starter learning rate
+        # to decay to 0.01 at a 0.96 decay rate. We use 12,000 so that
+        # the algorithm has some extra time to train below lr 0.01.
         starter_learning_rate = 0.5
+        if n_rows > batch_size:
+            n_epoch = int(12000 / (n_rows / batch_size))
+        else:
+            # Fewer than 1000 rows (1000 is the minimum batch size we
+            # defined). We don't need that many iterations with so few
+            # records; starting from 0.5, the learning rate reaches
+            # ~0.05 within 5000 epochs.
+            n_epoch = 5000

         n_classes = self.n_classes
         n_features = self.n_features

@@ -383,6 +408,7 @@ def evaluate_dataset(self, filepath, min_score=0.6,

         logging.info("AUC: %.2f%%", result['auc'])
         logging.info("AUC standard deviation: %.4f",
                      result['auc_deviation'])
+        logging.info("Accuracy: %.2f%%", result['accuracy'] * 100)
         logging.info("Precision (predicted elements that are real): %.2f%%",
                      result['precision'] * 100)

@@ -405,21 +431,29 @@ def rate_prediction(self, classifier, X_test, y_test):
         y_test = y_test.T[0]

         if self.is_binary:
-            # ROC curve calculations.
-            fpr, tpr, _ = roc_curve(y_test, y_score)
-            # When the amount of samples is small we can randomly end up
-            # having just one class instead of examples of each, which
-            # triggers a "UndefinedMetricWarning: No negative samples in
-            # y_true, false positive value should be meaningless"
-            # and returning NaN.
-            if math.isnan(fpr[0]) or math.isnan(tpr[0]):
-                return
+            try:
+                # ROC curve calculations.
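+                # roc_curve() returns the false/true positive rates at
+                # each score threshold; auc() then integrates the curve.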
+                fpr, tpr, _ = roc_curve(y_test, y_score)
+
+                # When the number of samples is small we can randomly
+                # end up having just one class instead of examples of
+                # each, which triggers a "UndefinedMetricWarning: No
+                # negative samples in y_true, false positive value
+                # should be meaningless" and makes roc_curve() return NaN.
+                if math.isnan(fpr[0]) or math.isnan(tpr[0]):
+                    return
+
+                self.aucs.append(auc(fpr, tpr))

-            self.aucs.append(auc(fpr, tpr))
+                # Draw it.
+                self.roc_curve_plot.add(fpr, tpr, 'Positives')

-            # Draw it.
-            self.roc_curve_plot.add(fpr, tpr, 'Positives')
+            except Exception:
+                # ROC metrics can fail on degenerate samples; skip them.
+                pass

         # Calculate accuracy, sensitivity and specificity.
         [acc, prec, rec, f1score] = self.calculate_metrics(y_test, y_pred)

@@ -473,8 +505,13 @@ def get_evaluation_results(self, min_score, accepted_deviation):

         result = dict()
         if self.is_binary and len(self.aucs) > 0:
-            result['auc'] = np.mean(self.aucs)
-            result['auc_deviation'] = np.std(self.aucs)
+            try:
+                result['auc'] = np.mean(self.aucs)
+                result['auc_deviation'] = np.std(self.aucs)
+            except Exception:
+                # Fall back to pessimistic defaults.
+                result['auc'] = 0.0
+                result['auc_deviation'] = 1.0

         result['accuracy'] = avg_accuracy
         result['precision'] = avg_precision

From 91db1e9457140b336e1d19ea4ef7424ad9d96fc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Monlla=C3=B3?=
Date: Thu, 2 Jan 2020 12:21:27 +0800
Subject: [PATCH 2/2] MDL-67040 Regularization for the loss function

---
 moodlemlbackend/VERSION         | 2 +-
 moodlemlbackend/model/tensor.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/moodlemlbackend/VERSION b/moodlemlbackend/VERSION
index 2bf1c1c..197c4d5 100644
--- a/moodlemlbackend/VERSION
+++ b/moodlemlbackend/VERSION
@@ -1 +1 @@
-2.3.1
+2.4.0

diff --git a/moodlemlbackend/model/tensor.py b/moodlemlbackend/model/tensor.py
index e9b2393..2753667 100644
--- a/moodlemlbackend/model/tensor.py
+++ b/moodlemlbackend/model/tensor.py
@@ -144,6 +144,12 @@ def build_graph(self, initial_weights=False):

             loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                 logits=self.probs, labels=self.y_))
+
+            # Penalise large weights (L2) to reduce overfitting.
+            regularizer = (tf.nn.l2_loss(W['input-hidden']) * 0.01) + \
+                (tf.nn.l2_loss(W['hidden-output']) * 0.01)
+            loss = loss + regularizer
+
             tf.summary.scalar("loss", loss)

         with tf.name_scope('accuracy'):
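
Note (not part of either patch): a quick standalone sketch of the
iteration budget behind the batch_size/n_epoch heuristic in patch 1.
The batch_and_epochs() helper is hypothetical, and decay_steps=100 is
an assumption for illustration only; the real decay configuration lives
in tensor.py and is not visible in these hunks.

    import math

    def batch_and_epochs(n_rows):
        # Mirrors the heuristic in get_classifier().
        if n_rows < 1000:
            batch_size = n_rows
        else:
            # 10% of the rows, with a minimum batch size of 1000.
            batch_size = max(1000, n_rows // 10)
        if n_rows > batch_size:
            # epochs * batches_per_epoch == ~12,000 total iterations.
            n_epoch = int(12000 / (n_rows / batch_size))
        else:
            n_epoch = 5000
        return batch_size, n_epoch

    # Decay steps needed for 0.5 * 0.96**k to reach 0.01:
    k = math.log(0.01 / 0.5) / math.log(0.96)
    print(round(k))        # ~96 decay steps
    print(round(k * 100))  # ~9583 iterations, assuming decay_steps=100

    for n_rows in (500, 1000, 50000, 1000000):
        print(n_rows, batch_and_epochs(n_rows))

Above ~10,000 rows the batch size grows with the dataset, so the total
iteration count stays near 12,000 regardless of dataset size, which is
what keeps the learning rate schedule consistent across datasets.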