From 342eb587528d7b7500cc4c2696dae1242bc970d1 Mon Sep 17 00:00:00 2001
From: Hallvard Lavik
Date: Wed, 28 Aug 2024 13:01:10 +0200
Subject: [PATCH] Fix batched weight update bug. Completed: #12 #13 #15.

---
 Cargo.toml                    |   2 +-
 README.md                     |  28 ++--
 examples/example_benchmark.rs | 165 +++++++++++++++-------------------
 examples/example_mnist.rs     |  20 ++---
 src/network.rs                | 103 ++++++++++-----------
 5 files changed, 153 insertions(+), 165 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a95ff81..e3dd9e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "neurons"
-version = "1.1.0"
+version = "2.0.0"
 edition = "2021"
 description = "Neural networks from scratch, in Rust."
 authors = ["Hallvard Høyland Lavik"]
diff --git a/README.md b/README.md
index 897a19e..3538be7 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,18 @@ Create modular neural networks in Rust with ease!
 Releases

+## 2.0.0 (Fix batched weight updates)
+
+* Weight updates are now batched correctly.
+  - See `network::Network::learn` for details.
+
+### Benchmarking examples/example_benchmark.rs (MNIST version)
+
+```raw
+batched (128): 17.268632412s (4.82x speedup)
+unbatched (1): 83.347593292s
+```
+
 ## 1.1.0 (Improved optimizer step)

 * Optimizer step is now more intuitive and easier to read.

@@ -85,7 +97,7 @@ Create modular neural networks in Rust with ease!
 * Batched training (`network::Network::learn`)
 * Parallelization of batches (`rayon`)

-### Benchmarking example/example_benchmark.rs
+### Benchmarking examples/example_benchmark.rs (Iris version)

 ```raw
 v0.3.0: 0.318811179s (6.95x speedup)
@@ -164,6 +176,7 @@ Create modular neural networks in Rust with ease!
 - [x] Feedforward (dubbed `Network`)
 - [x] Convolutional
 - [ ] Recurrent
+- [ ] Skip connections
 - [ ] Feedback connections
   - [x] Dense to Dense
   - [ ] Dense to Convolutional
@@ -173,7 +186,7 @@ Create modular neural networks in Rust with ease!
 ## Regularization
 - [x] Dropout
 - [ ] Batch normalization
-- [ ] Early stopping
+- [x] Early stopping

 ## Parallelization
 - [x] Parallelization of batches
@@ -186,10 +199,10 @@ Create modular neural networks in Rust with ease!
 - [x] Thorough testing of objective functions
 - [x] Thorough testing of optimization techniques
 - [ ] Thorough testing of feedback scaling (wrt. gradients)
-- [ ] Integration tests
+- [x] Integration tests
   - [x] Network forward pass
   - [x] Network backward pass
-  - [ ] Network training (i.e., weight updates)
+  - [x] Network training (i.e., weight updates)

 ## Examples
 - [x] XOR
@@ -220,7 +233,6 @@ Create modular neural networks in Rust with ease!
 - [ ] General data loading functionality
 - [x] Custom icon/image for documentation
 - [x] Custom stylesheet for documentation
-- [ ] Type conversion (e.g. f32, f64)
 - [ ] Network type specification (e.g. f32, f64)
 - [ ] Saving and loading
   - [ ] Single layer weights
@@ -242,9 +254,9 @@ Create modular neural networks in Rust with ease!
 * [Adam](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html)
 * [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html)
 * [RMSprop](https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html)
-* [backpropagation convolution 1](https://deeplearning.cs.cmu.edu/F21/document/recitation/Recitation5/CNN_Backprop_Recitation_5_F21.pdf)
-* [backpropagation convolution 2](https://www.jefkine.com/general/2016/09/05/backpropagation-in-convolutional-neural-networks/)
-* [backpropagation convolution 3](https://sites.cc.gatech.edu/classes/AY2021/cs7643_spring/assets/L11_CNNs.pdf)
+* [convolution 1](https://deeplearning.cs.cmu.edu/F21/document/recitation/Recitation5/CNN_Backprop_Recitation_5_F21.pdf)
+* [convolution 2](https://www.jefkine.com/general/2016/09/05/backpropagation-in-convolutional-neural-networks/)
+* [convolution 3](https://sites.cc.gatech.edu/classes/AY2021/cs7643_spring/assets/L11_CNNs.pdf)

 ### Tools used

diff --git a/examples/example_benchmark.rs b/examples/example_benchmark.rs
index 4a4fa62..731979c 100644
--- a/examples/example_benchmark.rs
+++ b/examples/example_benchmark.rs
@@ -1,118 +1,101 @@
 // Copyright (C) 2024 Hallvard Høyland Lavik

-use neurons::{activation, network, objective, optimizer, random, tensor};
+use neurons::{activation, network, objective, optimizer, tensor};
+use std::fs::File;
+use std::io::{BufReader, Read, Result};
 use std::time;

-extern crate csv;
-
-fn data(path: &str) -> (Vec<tensor::Tensor>, Vec<tensor::Tensor>) {
-    let mut reader = csv::Reader::from_path(path).unwrap();
-
-    let mut x: Vec<Vec<f32>> = Vec::new();
-    let mut y: Vec<Vec<f32>> = Vec::new();
-
-    reader.records().for_each(|record| {
-        let record = record.unwrap();
-        x.push(vec![
-            record.get(1).unwrap().parse::<f32>().unwrap(),
-            record.get(2).unwrap().parse::<f32>().unwrap(),
-            record.get(3).unwrap().parse::<f32>().unwrap(),
-            record.get(4).unwrap().parse::<f32>().unwrap(),
-        ]);
-        y.push(match record.get(5).unwrap() {
-            "Iris-setosa" => vec![1.0, 0.0, 0.0],
-            "Iris-versicolor" => vec![0.0, 1.0, 0.0],
-            "Iris-virginica" => vec![0.0, 0.0, 1.0],
-            // "Iris-setosa" => vec![0.0],
-            // "Iris-versicolor" => vec![1.0],
-            // "Iris-virginica" => vec![2.0],
-            _ => panic!("Unknown class"),
-        });
-    });
-
-    let mut generator = random::Generator::create(12345);
-    let mut indices: Vec<usize> = (0..x.len()).collect();
-    generator.shuffle(&mut indices);
-
-    let x: Vec<tensor::Tensor> = indices
-        .iter()
-        .map(|&i| tensor::Tensor::vector(x[i].clone()))
-        .collect();
-    let y: Vec<tensor::Tensor> = indices
-        .iter()
-        .map(|&i| tensor::Tensor::vector(y[i].clone()))
-        .collect();
-    (x, y)
+fn read(reader: &mut dyn Read) -> Result<u32> {
+    let mut buffer = [0; 4];
+    reader.read_exact(&mut buffer)?;
+    Ok(u32::from_be_bytes(buffer))
+}
+
+fn load_images(path: &str) -> Result<Vec<tensor::Tensor>> {
+    let mut reader = BufReader::new(File::open(path)?);
+    let mut images: Vec<tensor::Tensor> = Vec::new();
+
+    let _magic_number = read(&mut reader)?;
+    let num_images = read(&mut reader)?;
+    let num_rows = read(&mut reader)?;
+    let num_cols = read(&mut reader)?;
+
+    for _ in 0..num_images {
+        let mut image: Vec<Vec<f32>> = Vec::new();
+        for _ in 0..num_rows {
+            let mut row: Vec<f32> = Vec::new();
+            for _ in 0..num_cols {
+                let mut pixel = [0];
+                reader.read_exact(&mut pixel)?;
+                row.push(pixel[0] as f32 / 255.0);
+            }
+            image.push(row);
+        }
+        images.push(tensor::Tensor::tensor(vec![image]).resize(tensor::Shape::Tensor(1, 14, 14)));
+    }
+
+    Ok(images)
+}
+
+fn load_labels(file_path: &str, numbers: f32) -> Result<Vec<tensor::Tensor>> {
+    let mut reader = BufReader::new(File::open(file_path)?);
+    let _magic_number = read(&mut reader)?;
+    let num_labels = read(&mut reader)?;
+
+    let mut _labels = vec![0; num_labels as usize];
+    reader.read_exact(&mut _labels)?;
+
+    Ok(_labels
+        .iter()
+        .map(|&x| tensor::Tensor::one_hot(x as f32, numbers))
+        .collect())
 }

 fn main() {
-    // Load the iris dataset
-    let (x, y) = data("./datasets/iris.csv");
-
-    let split = (x.len() as f32 * 0.8) as usize;
-    let x = x.split_at(split);
-    let y = y.split_at(split);
-
-    let x_train: Vec<&tensor::Tensor> = x.0.iter().collect();
-    let y_train: Vec<&tensor::Tensor> = y.0.iter().collect();
-    let x_test: Vec<&tensor::Tensor> = x.1.iter().collect();
-    let y_test: Vec<&tensor::Tensor> = y.1.iter().collect();
-
-    let (x_train, y_train, x_test, y_test) = (
-        x_train.to_vec(),
-        y_train.to_vec(),
-        x_test.to_vec(),
-        y_test.to_vec(),
-    );
-    println!(
-        "Train data {}x{}: {} => {}",
-        x_train.len(),
-        x_train[0].shape,
-        x_train[0].data,
-        y_train[0].data
-    );
-    println!(
-        "Test data {}x{}: {} => {}",
-        x_test.len(),
-        x_test[0].shape,
-        x_test[0].data,
-        y_test[0].data
-    );
+    let x_train = load_images("./examples/datasets/mnist/train-images-idx3-ubyte").unwrap();
+    let y_train = load_labels("./examples/datasets/mnist/train-labels-idx1-ubyte", 10f32).unwrap();
+
+    let x_train: Vec<&tensor::Tensor> = x_train.iter().collect();
+    let y_train: Vec<&tensor::Tensor> = y_train.iter().collect();

     let mut times: Vec<time::Duration> = Vec::new();
-    for _ in 0..10 {
+    for iteration in 0..10 {
         let start = time::Instant::now();

         // Create the network
-        let mut network = network::Network::new(tensor::Shape::Vector(4));
-
-        network.dense(50, activation::Activation::ReLU, false, Some(0.1));
-        network.dense(50, activation::Activation::ReLU, false, Some(0.1));
-        network.dense(3, activation::Activation::Softmax, false, Some(0.1));
+        let mut network = network::Network::new(tensor::Shape::Tensor(1, 14, 14));
+
+        network.convolution(
+            8,
+            (3, 3),
+            (1, 1),
+            (0, 0),
+            activation::Activation::ReLU,
+            Some(0.05),
+        );
+        network.maxpool((2, 2), (2, 2));
+        network.dense(10, activation::Activation::Softmax, true, None);

-        network.set_optimizer(optimizer::Optimizer::RMSprop(optimizer::RMSprop {
+        network.set_optimizer(optimizer::Optimizer::Adam(optimizer::Adam {
             learning_rate: 0.001,
-            alpha: 0.0,
+            decay: None,
+            beta1: 0.9,
+            beta2: 0.999,
             epsilon: 1e-8,
-
-            decay: Some(0.01),
-            momentum: Some(0.01),
-            centered: Some(true),
-
-            // To be filled by the network:
             velocity: vec![],
-            gradient: vec![],
-            buffer: vec![],
+            momentum: vec![],
         }));
         network.set_objective(
             objective::Objective::CrossEntropy, // Objective function
-            Some((-1f32, 1f32)),                // Gradient clipping
+            None,                               // Gradient clipping
         );

         // Train the network
-        let (_train_loss, _val_loss) = network.learn(&x_train, &y_train, None, 25, 500, Some(50));
+        let (train_loss, _) = network.learn(&x_train, &y_train, None, 128, 10, None);
+
+        println!("Iteration: {}, Loss: {:?}", iteration, train_loss);

         let duration = start.elapsed();
         times.push(duration);
diff --git a/examples/example_mnist.rs b/examples/example_mnist.rs
index 7bd9c5c..6d31826 100644
--- a/examples/example_mnist.rs
+++ b/examples/example_mnist.rs
@@ -31,7 +31,7 @@ fn load_images(path: &str) -> Result<Vec<tensor::Tensor>> {
             }
             image.push(row);
         }
-        images.push(tensor::Tensor::tensor(vec![image]).resize(tensor::Shape::Tensor(1, 14, 14)));
+        images.push(tensor::Tensor::tensor(vec![image]).resize(tensor::Shape::Tensor(1, 10, 10)));
     }

     Ok(images)
@@ -67,7 +67,7 @@ fn main() {
     let x_test: Vec<&tensor::Tensor> = x_test.iter().collect();
     let y_test: Vec<&tensor::Tensor> = y_test.iter().collect();

-    let mut network = network::Network::new(tensor::Shape::Tensor(1, 14, 14));
+    let mut network = network::Network::new(tensor::Shape::Tensor(1, 10, 10));

     network.convolution(
         8,
@@ -78,15 +78,6 @@ fn main() {
         Some(0.05),
     );
     network.maxpool((2, 2), (2, 2));
-    // network.convolution(
-    //     8,
-    //     (3, 3),
-    //     (1, 1),
-    //     (0, 0),
-    //     activation::Activation::ReLU,
-    //     None,
-    // );
-
     network.dense(512, activation::Activation::ReLU, true, Some(0.25));
     network.dense(10, activation::Activation::Softmax, true, None);

 network.set_optimizer(optimizer::Optimizer::Adam(optimizer::Adam {
@@ -106,13 +97,12 @@ fn main() {
     println!("{}", network);

     // Train the network
-    let (train_loss, val_loss) =
-        network.learn(&x_train, &y_train, Some((0.1, 5)), 128, 50, Some(1));
+    let (train_loss, val_loss) = network.learn(&x_train, &y_train, Some((0.1, 5)), 128, 5, Some(1));

     plot::loss(&train_loss, &val_loss, "Loss per epoch", "loss.png");

     // Validate the network
-    let (val_loss, val_acc) = network.validate(&x_test, &y_test, 0.1);
-    println!("1. Validation acc: {}, loss: {}", val_acc, val_loss);
+    let (test_loss, test_acc) = network.validate(&x_test, &y_test, 0.1);
+    println!("1. Test acc: {}, loss: {}", test_acc, test_loss);

     // Use the network
     let prediction = network.predict(x_test.get(0).unwrap());
diff --git a/src/network.rs b/src/network.rs
index d4249a0..82da3ec 100644
--- a/src/network.rs
+++ b/src/network.rs
@@ -515,71 +515,74 @@ impl Network {
             .collect();

         for epoch in 1..epochs + 1 {
-            let results: Vec<_> = batches
-                .par_iter()
-                .map(|(inputs, targets)| {
-                    let mut loss = 0.0f32;
-                    let mut weight_gradients: Vec<tensor::Tensor> = Vec::new();
-                    let mut bias_gradients: Vec<Option<tensor::Tensor>> = Vec::new();
-
-                    for (i, (input, target)) in inputs.iter().zip(targets.iter()).enumerate() {
+            let mut loss_epoch = 0.0;
+            for batch in batches.iter() {
+                // Parallel iteration over the batch.
+                // I.e., parallel forward and backward pass for each sample in the batch.
+                let results: Vec<_> = batch
+                    .into_par_iter()
+                    .map(|(input, target)| {
                         let (unactivated, activated, maxpools) = self.forward(input);
-                        let (_loss, gradient) =
+                        let (loss, gradient) =
                             self.objective.loss(&activated.last().unwrap(), target);
-                        loss += _loss;

                         let (wg, bg) = self.backward(gradient, &unactivated, &activated, maxpools);

-                        if i == 0 {
-                            weight_gradients = wg;
-                            bias_gradients = bg;
-                        } else {
-                            for (gradient, new) in weight_gradients.iter_mut().zip(wg.iter()) {
-                                gradient.add_inplace(new)
-                            }
+                        (wg, bg, loss)
+                    })
+                    .collect();

-                            for (gradient, new) in bias_gradients.iter_mut().zip(bg.iter()) {
-                                match gradient {
-                                    Some(gradient) => match new {
-                                        Some(new) => gradient.add_inplace(new),
-                                        None => panic!("Expected Some, got None."),
-                                    },
-                                    None => match new {
-                                        Some(_) => panic!("Expected None, got Some."),
-                                        None => (),
-                                    },
-                                }
-                            }
-                        }
-                    }
+                let mut weight_gradients: Vec<tensor::Tensor> = Vec::new();
+                let mut bias_gradients: Vec<Option<tensor::Tensor>> = Vec::new();
+                let mut losses: Vec<f32> = Vec::new();

-                    let size = inputs.len() as f32;
+                // Collect the results from the parallel iteration, and sum the gradients and loss.
+                for (wg, bg, loss) in results {
+                    if loss.is_nan() {
+                        panic!("ERROR: Loss is NaN.");
+                    }
+                    losses.push(loss);
+
+                    if weight_gradients.is_empty() {
+                        weight_gradients = wg;
+                        bias_gradients = bg;
+                    } else {
+                        for (gradient, new) in weight_gradients.iter_mut().zip(wg.iter()) {
+                            gradient.add_inplace(new)
+                        }

-                    if size > 1.0 {
-                        weight_gradients
-                            .iter_mut()
-                            .for_each(|gradient| gradient.div_scalar_inplace(size));
-                        bias_gradients.iter_mut().for_each(|gradient| {
-                            if let Some(gradient) = gradient {
-                                gradient.div_scalar_inplace(size);
+                        for (gradient, new) in bias_gradients.iter_mut().zip(bg.iter()) {
+                            match gradient {
+                                Some(gradient) => match new {
+                                    Some(new) => gradient.add_inplace(new),
+                                    None => panic!("Expected Some, got None."),
+                                },
+                                None => match new {
+                                    Some(_) => panic!("Expected None, got Some."),
+                                    None => (),
+                                },
                             }
-                        });
+                        }
                     }
+                }

-                    (loss / size, weight_gradients, bias_gradients)
-                })
-                .collect();
+                // // Average the gradients wrt. batch size.
+                // weight_gradients.iter_mut().for_each(|gradient| {
+                //     gradient.div_scalar_inplace(batch.0.len() as f32);
+                // });
+                // bias_gradients
+                //     .iter_mut()
+                //     .for_each(|gradient| match gradient {
+                //         Some(gradient) => gradient.div_scalar_inplace(batch.0.len() as f32),
+                //         None => (),
+                //     });

-            let mut losses: Vec<f32> = Vec::new();
-            for (loss, weight_gradients, bias_gradients) in results {
-                if loss.is_nan() {
-                    panic!("ERROR: Loss is NaN.");
-                }
+                loss_epoch += losses.iter().sum::<f32>() / losses.len() as f32;

+                // Perform the update step wrt. the summed gradients for the batch.
                 self.update(epoch, weight_gradients, bias_gradients);
-                losses.push(loss);
             }

-            train_loss.push(losses.iter().sum::<f32>() / losses.len() as f32);
+            train_loss.push(loss_epoch / batches.len() as f32);

             let (_val_loss, val_acc) = self.validate(val_inputs, val_targets, 1e-6);
             val_loss.push(_val_loss);
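
The core of the fix in `network::Network::learn` is the reordering above: per-sample gradients are computed in parallel, summed into a single set of gradients, and the optimizer step runs once per batch instead of once per parallel chunk. The sketch below distills that pattern into a standalone program. It is a minimal sketch, not the crate's API: `Gradients`, the dummy `backward`, the plain-`Vec<f32>` layers, and the SGD-style step are all simplified stand-ins, and rayon's `into_par_iter` is replaced by a plain iterator to keep the example dependency-free.

```rust
/// Per-layer gradients for one sample (stand-in for the crate's `tensor::Tensor`).
type Gradients = Vec<Vec<f32>>;

/// Hypothetical per-sample backward pass: returns (per-layer gradients, loss).
fn backward(sample: f32, weights: &[Vec<f32>]) -> (Gradients, f32) {
    let gradients: Gradients = weights
        .iter()
        .map(|layer| layer.iter().map(|w| w * sample).collect())
        .collect();
    (gradients, sample * sample) // Dummy loss, just for the sketch.
}

fn main() {
    // Two "layers" of weights and one batch of four scalar samples.
    let mut weights: Vec<Vec<f32>> = vec![vec![0.5; 4], vec![0.1; 2]];
    let batch: Vec<f32> = vec![0.2, -0.7, 1.1, 0.4];
    let learning_rate = 0.01;

    // 1. Independent per-sample passes (parallelized with rayon in the real code).
    let results: Vec<(Gradients, f32)> = batch
        .iter()
        .map(|&sample| backward(sample, &weights))
        .collect();

    // 2. Sum the per-sample gradients into one set of gradients for the batch.
    let mut summed: Gradients = Vec::new();
    let mut loss = 0.0f32;
    for (gradients, sample_loss) in results {
        assert!(!sample_loss.is_nan(), "ERROR: Loss is NaN.");
        loss += sample_loss;
        if summed.is_empty() {
            summed = gradients; // The first sample initializes the sums.
        } else {
            for (accumulated, new) in summed.iter_mut().zip(&gradients) {
                for (a, n) in accumulated.iter_mut().zip(new) {
                    *a += n;
                }
            }
        }
    }

    // 3. One update step per batch (the crate delegates this to its optimizer).
    for (layer, gradient) in weights.iter_mut().zip(&summed) {
        for (w, g) in layer.iter_mut().zip(gradient) {
            *w -= learning_rate * g;
        }
    }

    println!("mean batch loss: {}", loss / batch.len() as f32);
}
```

As in the patch, the gradients are summed rather than averaged (the averaging code is left commented out in `learn`); with a sum, the effective step size grows with the batch size unless the learning rate or optimizer compensates for it.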