From 608cac3946311df5cca069a76aaf33d82cc080ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Monlla=C3=B3?=
Date: Tue, 31 Dec 2019 09:34:54 +0800
Subject: [PATCH 1/2] MDL-67040 Dataset-dependent hyperparameters

A few logging fixes sneaked in as well.
---
 moodlemlbackend/model/tensor.py        |  5 +-
 moodlemlbackend/processor/estimator.py | 72 +++++++++++++++++++------
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/moodlemlbackend/model/tensor.py b/moodlemlbackend/model/tensor.py
index e016772..e9b2393 100644
--- a/moodlemlbackend/model/tensor.py
+++ b/moodlemlbackend/model/tensor.py
@@ -21,7 +21,10 @@ def __init__(self, n_features, n_classes, n_epoch, batch_size,
         self.batch_size = batch_size
         self.starter_learning_rate = starter_learning_rate
         self.n_features = n_features
-        self.n_hidden = 10
+
+        # Scale the hidden layer with the number of features, keeping a
+        # reasonable minimum.
+        self.n_hidden = max(4, int(n_features / 3))
         self.n_classes = n_classes

         self.tensor_logdir = tensor_logdir

diff --git a/moodlemlbackend/processor/estimator.py b/moodlemlbackend/processor/estimator.py
index c4892d2..c1e35d6 100644
--- a/moodlemlbackend/processor/estimator.py
+++ b/moodlemlbackend/processor/estimator.py
@@ -68,6 +68,9 @@ def __init__(self, modelid, directory):

         # Logging.
         logfile = os.path.join(self.logsdir, 'info.log')
+        # Remove existing handlers; basicConfig is a no-op otherwise.
+        for handler in logging.root.handlers[:]:
+            logging.root.removeHandler(handler)
         logging.basicConfig(filename=logfile, level=logging.DEBUG)
         warnings.showwarning = self.warnings_to_log

@@ -229,9 +231,32 @@ def __init__(self, modelid, directory, dataset=None):
     def get_classifier(self, X, y, initial_weights=False):
         """Gets the classifier"""

-        n_epoch = 50
-        batch_size = 1000
+        try:
+            n_rows = X.shape[0]
+        except AttributeError:
+            # There is no X available during model import; the exact
+            # n_rows value does not really matter in that case.
+            n_rows = 1
+
+        if n_rows < 1000:
+            batch_size = n_rows
+        else:
+            # Use 10% of the rows, with a minimum batch size of 1000.
+            x_tenpercent = int(n_rows / 10)
+            batch_size = max(1000, x_tenpercent)
+
+        # We need ~10,000 iterations for the 0.5 starter learning rate
+        # to decay to 0.01 at a 0.96 decay rate. We use 12,000 so that
+        # the algorithm has some extra time to train below lr 0.01.
         starter_learning_rate = 0.5
+        if n_rows > batch_size:
+            n_epoch = int(12000 / (n_rows / batch_size))
+        else:
+            # Fewer than 1000 rows (1000 is the minimum batch size we
+            # defined). We don't need that many iterations with so few
+            # records; starting from 0.5, the learning rate reaches
+            # ~0.05 within 5000 epochs.
+            n_epoch = 5000

         n_classes = self.n_classes
         n_features = self.n_features

@@ -383,6 +408,7 @@ def evaluate_dataset(self, filepath, min_score=0.6,

         logging.info("AUC: %.2f%%", result['auc'])
         logging.info("AUC standard deviation: %.4f",
                      result['auc_deviation'])
+        logging.info("Accuracy: %.2f%%", result['accuracy'] * 100)
         logging.info("Precision (predicted elements that are real): %.2f%%",
                      result['precision'] * 100)

@@ -405,21 +431,29 @@ def rate_prediction(self, classifier, X_test, y_test):
         y_test = y_test.T[0]

         if self.is_binary:
-            # ROC curve calculations.
-            fpr, tpr, _ = roc_curve(y_test, y_score)
-            # When the amount of samples is small we can randomly end up
-            # having just one class instead of examples of each, which
-            # triggers a "UndefinedMetricWarning: No negative samples in
-            # y_true, false positive value should be meaningless"
-            # and returning NaN.
-            if math.isnan(fpr[0]) or math.isnan(tpr[0]):
-                return
+            try:
+                # ROC curve calculations.
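+                # roc_curve() returns the false/true positive rates at
+                # each score threshold; auc() then integrates the curve.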
+                fpr, tpr, _ = roc_curve(y_test, y_score)
+
+                # When the number of samples is small we can randomly
+                # end up having just one class instead of examples of
+                # each, which triggers a "UndefinedMetricWarning: No
+                # negative samples in y_true, false positive value
+                # should be meaningless" and makes roc_curve() return NaN.
+                if math.isnan(fpr[0]) or math.isnan(tpr[0]):
+                    return
+
+                self.aucs.append(auc(fpr, tpr))

-            self.aucs.append(auc(fpr, tpr))
+                # Draw it.
+                self.roc_curve_plot.add(fpr, tpr, 'Positives')

-            # Draw it.
-            self.roc_curve_plot.add(fpr, tpr, 'Positives')
+            except Exception:
+                # ROC metrics can fail on degenerate samples; skip them.
+                pass

         # Calculate accuracy, sensitivity and specificity.
         [acc, prec, rec, f1score] = self.calculate_metrics(y_test, y_pred)

@@ -473,8 +505,13 @@ def get_evaluation_results(self, min_score, accepted_deviation):

         result = dict()
         if self.is_binary and len(self.aucs) > 0:
-            result['auc'] = np.mean(self.aucs)
-            result['auc_deviation'] = np.std(self.aucs)
+            try:
+                result['auc'] = np.mean(self.aucs)
+                result['auc_deviation'] = np.std(self.aucs)
+            except Exception:
+                # Fall back to pessimistic defaults.
+                result['auc'] = 0.0
+                result['auc_deviation'] = 1.0

         result['accuracy'] = avg_accuracy
         result['precision'] = avg_precision

From 91db1e9457140b336e1d19ea4ef7424ad9d96fc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Monlla=C3=B3?=
Date: Thu, 2 Jan 2020 12:21:27 +0800
Subject: [PATCH 2/2] MDL-67040 Regularization for the loss function

---
 moodlemlbackend/VERSION         | 2 +-
 moodlemlbackend/model/tensor.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/moodlemlbackend/VERSION b/moodlemlbackend/VERSION
index 2bf1c1c..197c4d5 100644
--- a/moodlemlbackend/VERSION
+++ b/moodlemlbackend/VERSION
@@ -1 +1 @@
-2.3.1
+2.4.0

diff --git a/moodlemlbackend/model/tensor.py b/moodlemlbackend/model/tensor.py
index e9b2393..2753667 100644
--- a/moodlemlbackend/model/tensor.py
+++ b/moodlemlbackend/model/tensor.py
@@ -144,6 +144,12 @@ def build_graph(self, initial_weights=False):

             loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                 logits=self.probs, labels=self.y_))
+
+            # Penalise large weights (L2) to reduce overfitting.
+            regularizer = (tf.nn.l2_loss(W['input-hidden']) * 0.01) + \
+                (tf.nn.l2_loss(W['hidden-output']) * 0.01)
+            loss = loss + regularizer
+
             tf.summary.scalar("loss", loss)

         with tf.name_scope('accuracy'):
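
Note (not part of either patch): a quick standalone sketch of the
iteration budget behind the batch_size/n_epoch heuristic in patch 1.
The batch_and_epochs() helper is hypothetical, and decay_steps=100 is
an assumption for illustration only; the real decay configuration lives
in tensor.py and is not visible in these hunks.

    import math

    def batch_and_epochs(n_rows):
        # Mirrors the heuristic in get_classifier().
        if n_rows < 1000:
            batch_size = n_rows
        else:
            # 10% of the rows, with a minimum batch size of 1000.
            batch_size = max(1000, n_rows // 10)
        if n_rows > batch_size:
            # epochs * batches_per_epoch == ~12,000 total iterations.
            n_epoch = int(12000 / (n_rows / batch_size))
        else:
            n_epoch = 5000
        return batch_size, n_epoch

    # Decay steps needed for 0.5 * 0.96**k to reach 0.01:
    k = math.log(0.01 / 0.5) / math.log(0.96)
    print(round(k))        # ~96 decay steps
    print(round(k * 100))  # ~9583 iterations, assuming decay_steps=100

    for n_rows in (500, 1000, 50000, 1000000):
        print(n_rows, batch_and_epochs(n_rows))

Above ~10,000 rows the batch size grows with the dataset, so the total
iteration count stays near 12,000 regardless of dataset size, which is
what keeps the learning rate schedule consistent across datasets.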