From 342eb587528d7b7500cc4c2696dae1242bc970d1 Mon Sep 17 00:00:00 2001
From: Hallvard Lavik
Date: Wed, 28 Aug 2024 13:01:10 +0200
Subject: [PATCH] Fix batched weight update bug. Completed: #12 #13 #15.

---
 Cargo.toml                    |   2 +-
 README.md                     |  28 ++--
 examples/example_benchmark.rs | 165 +++++++++++++++-------------------
 examples/example_mnist.rs     |  20 ++---
 src/network.rs                | 103 ++++++++++-----------
 5 files changed, 153 insertions(+), 165 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a95ff81..e3dd9e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "neurons"
-version = "1.1.0"
+version = "2.0.0"
 edition = "2021"
 description = "Neural networks from scratch, in Rust."
 authors = ["Hallvard Høyland Lavik"]
diff --git a/README.md b/README.md
index 897a19e..3538be7 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,18 @@ Create modular neural networks in Rust with ease!
 Releases

+## 2.0.0 (Fix batched weight updates)
+
+* Weight updates are now batched correctly.
+  - See `network::Network::learn` for details.
+
+### Benchmarking examples/example_benchmark.rs (MNIST version)
+
+```raw
+batched (128): 17.268632412s (4.82x speedup)
+unbatched (1): 83.347593292s
+```
+
 ## 1.1.0 (Improved optimizer step)

 * Optimizer step is now more intuitive and easier to read.

@@ -85,7 +97,7 @@ Create modular neural networks in Rust with ease!
 * Batched training (`network::Network::learn`)
 * Parallelization of batches (`rayon`)

-### Benchmarking example/example_benchmark.rs
+### Benchmarking examples/example_benchmark.rs (Iris version)

 ```raw
 v0.3.0: 0.318811179s (6.95x speedup)
@@ -164,6 +176,7 @@ Create modular neural networks in Rust with ease!
 - [x] Feedforward (dubbed `Network`)
 - [x] Convolutional
 - [ ] Recurrent
+- [ ] Skip connections
 - [ ] Feedback connections
   - [x] Dense to Dense
   - [ ] Dense to Convolutional
@@ -173,7 +186,7 @@ Create modular neural networks in Rust with ease!
 ## Regularization
 - [x] Dropout
 - [ ] Batch normalization
-- [ ] Early stopping
+- [x] Early stopping

 ## Parallelization
 - [x] Parallelization of batches
@@ -186,10 +199,10 @@ Create modular neural networks in Rust with ease!
 - [x] Thorough testing of objective functions
 - [x] Thorough testing of optimization techniques
 - [ ] Thorough testing of feedback scaling (wrt. gradients)
-- [ ] Integration tests
+- [x] Integration tests
   - [x] Network forward pass
   - [x] Network backward pass
-  - [ ] Network training (i.e., weight updates)
+  - [x] Network training (i.e., weight updates)

 ## Examples
 - [x] XOR
@@ -220,7 +233,6 @@ Create modular neural networks in Rust with ease!
 - [ ] General data loading functionality
 - [x] Custom icon/image for documentation
 - [x] Custom stylesheet for documentation
-- [ ] Type conversion (e.g. f32, f64)
 - [ ] Network type specification (e.g. f32, f64)
 - [ ] Saving and loading
   - [ ] Single layer weights
@@ -242,9 +254,9 @@ Create modular neural networks in Rust with ease!
 * [Adam](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html)
 * [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html)
 * [RMSprop](https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html)
-* [backpropagation convolution 1](https://deeplearning.cs.cmu.edu/F21/document/recitation/Recitation5/CNN_Backprop_Recitation_5_F21.pdf)
-* [backpropagation convolution 2](https://www.jefkine.com/general/2016/09/05/backpropagation-in-convolutional-neural-networks/)
-* [backpropagation convolution 3](https://sites.cc.gatech.edu/classes/AY2021/cs7643_spring/assets/L11_CNNs.pdf)
+* [convolution 1](https://deeplearning.cs.cmu.edu/F21/document/recitation/Recitation5/CNN_Backprop_Recitation_5_F21.pdf)
+* [convolution 2](https://www.jefkine.com/general/2016/09/05/backpropagation-in-convolutional-neural-networks/)
+* [convolution 3](https://sites.cc.gatech.edu/classes/AY2021/cs7643_spring/assets/L11_CNNs.pdf)

 ### Tools used

diff --git a/examples/example_benchmark.rs b/examples/example_benchmark.rs
index 4a4fa62..731979c 100644
--- a/examples/example_benchmark.rs
+++ b/examples/example_benchmark.rs
@@ -1,118 +1,101 @@
 // Copyright (C) 2024 Hallvard Høyland Lavik

-use neurons::{activation, network, objective, optimizer, random, tensor};
+use neurons::{activation, network, objective, optimizer, tensor};
+use std::fs::File;
+use std::io::{BufReader, Read, Result};
 use std::time;

-extern crate csv;
-
-fn data(path: &str) -> (Vec<tensor::Tensor>, Vec<tensor::Tensor>) {
-    let mut reader = csv::Reader::from_path(path).unwrap();
-
-    let mut x: Vec<Vec<f32>> = Vec::new();
-    let mut y: Vec<Vec<f32>> = Vec::new();
-
-    reader.records().for_each(|record| {
-        let record = record.unwrap();
-        x.push(vec![
-            record.get(1).unwrap().parse::<f32>().unwrap(),
-            record.get(2).unwrap().parse::<f32>().unwrap(),
-            record.get(3).unwrap().parse::<f32>().unwrap(),
-            record.get(4).unwrap().parse::<f32>().unwrap(),
-        ]);
-        y.push(match record.get(5).unwrap() {
-            "Iris-setosa" => vec![1.0, 0.0, 0.0],
-            "Iris-versicolor" => vec![0.0, 1.0, 0.0],
-            "Iris-virginica" => vec![0.0, 0.0, 1.0],
-            // "Iris-setosa" => vec![0.0],
-            // "Iris-versicolor" => vec![1.0],
-            // "Iris-virginica" => vec![2.0],
-            _ => panic!("Unknown class"),
-        });
-    });
-
-    let mut generator = random::Generator::create(12345);
-    let mut indices: Vec<usize> = (0..x.len()).collect();
-    generator.shuffle(&mut indices);
-
-    let x: Vec<tensor::Tensor> = indices
-        .iter()
-        .map(|&i| tensor::Tensor::vector(x[i].clone()))
-        .collect();
-    let y: Vec<tensor::Tensor> = indices
-        .iter()
-        .map(|&i| tensor::Tensor::vector(y[i].clone()))
-        .collect();
-    (x, y)
+fn read(reader: &mut dyn Read) -> Result<u32> {
+    let mut buffer = [0; 4];
+    reader.read_exact(&mut buffer)?;
+    Ok(u32::from_be_bytes(buffer))
+}
+
+fn load_images(path: &str) -> Result<Vec<tensor::Tensor>> {
+    let mut reader = BufReader::new(File::open(path)?);
+    let mut images: Vec<tensor::Tensor> = Vec::new();
+
+    let _magic_number = read(&mut reader)?;
+    let num_images = read(&mut reader)?;
+    let num_rows = read(&mut reader)?;
+    let num_cols = read(&mut reader)?;
+
+    for _ in 0..num_images {
+        let mut image: Vec<Vec<f32>> = Vec::new();
+        for _ in 0..num_rows {
+            let mut row: Vec<f32> = Vec::new();
+            for _ in 0..num_cols {
+                let mut pixel = [0];
+                reader.read_exact(&mut pixel)?;
+                row.push(pixel[0] as f32 / 255.0);
+            }
+            image.push(row);
+        }
+        images.push(tensor::Tensor::tensor(vec![image]).resize(tensor::Shape::Tensor(1, 14, 14)));
+    }
+
+    Ok(images)
+}
+
+fn load_labels(file_path: &str, numbers: f32) -> Result<Vec<tensor::Tensor>> {
+    let mut reader = BufReader::new(File::open(file_path)?);
+    let _magic_number = read(&mut reader)?;
+    let num_labels = read(&mut reader)?;
+
+    let mut _labels = vec![0; num_labels as usize];
+    reader.read_exact(&mut _labels)?;
+
+    Ok(_labels
+        .iter()
+        .map(|&x| tensor::Tensor::one_hot(x as f32, numbers))
+        .collect())
 }

 fn main() {
-    // Load the iris dataset
-    let (x, y) = data("./datasets/iris.csv");
-
-    let split = (x.len() as f32 * 0.8) as usize;
-    let x = x.split_at(split);
-    let y = y.split_at(split);
-
-    let x_train: Vec<&tensor::Tensor> = x.0.iter().collect();
-    let y_train: Vec<&tensor::Tensor> = y.0.iter().collect();
-    let x_test: Vec<&tensor::Tensor> = x.1.iter().collect();
-    let y_test: Vec<&tensor::Tensor> = y.1.iter().collect();
-
-    let (x_train, y_train, x_test, y_test) = (
-        x_train.to_vec(),
-        y_train.to_vec(),
-        x_test.to_vec(),
-        y_test.to_vec(),
-    );
-    println!(
-        "Train data {}x{}: {} => {}",
-        x_train.len(),
-        x_train[0].shape,
-        x_train[0].data,
-        y_train[0].data
-    );
-    println!(
-        "Test data {}x{}: {} => {}",
-        x_test.len(),
-        x_test[0].shape,
-        x_test[0].data,
-        y_test[0].data
-    );
+    let x_train = load_images("./examples/datasets/mnist/train-images-idx3-ubyte").unwrap();
+    let y_train = load_labels("./examples/datasets/mnist/train-labels-idx1-ubyte", 10f32).unwrap();
+
+    let x_train: Vec<&tensor::Tensor> = x_train.iter().collect();
+    let y_train: Vec<&tensor::Tensor> = y_train.iter().collect();

     let mut times: Vec<time::Duration> = Vec::new();
-    for _ in 0..10 {
+    for iteration in 0..10 {
         let start = time::Instant::now();

         // Create the network
-        let mut network = network::Network::new(tensor::Shape::Vector(4));
-
-        network.dense(50, activation::Activation::ReLU, false, Some(0.1));
-        network.dense(50, activation::Activation::ReLU, false, Some(0.1));
-        network.dense(3, activation::Activation::Softmax, false, Some(0.1));
+        let mut network = network::Network::new(tensor::Shape::Tensor(1, 14, 14));
+
+        network.convolution(
+            8,
+            (3, 3),
+            (1, 1),
+            (0, 0),
+            activation::Activation::ReLU,
+            Some(0.05),
+        );
+        network.maxpool((2, 2), (2, 2));
+        network.dense(10, activation::Activation::Softmax, true, None);

-        network.set_optimizer(optimizer::Optimizer::RMSprop(optimizer::RMSprop {
+        network.set_optimizer(optimizer::Optimizer::Adam(optimizer::Adam {
             learning_rate: 0.001,
-            alpha: 0.0,
+            decay: None,
+            beta1: 0.9,
+            beta2: 0.999,
             epsilon: 1e-8,
-
-            decay: Some(0.01),
-            momentum: Some(0.01),
-            centered: Some(true),
-
-            // To be filled by the network:
             velocity: vec![],
-            gradient: vec![],
-            buffer: vec![],
+            momentum: vec![],
         }));
         network.set_objective(
             objective::Objective::CrossEntropy, // Objective function
-            Some((-1f32, 1f32)),                // Gradient clipping
+            None,                               // Gradient clipping
         );

         // Train the network
-        let (_train_loss, _val_loss) = network.learn(&x_train, &y_train, None, 25, 500, Some(50));
+        let (train_loss, _) = network.learn(&x_train, &y_train, None, 128, 10, None);
+
+        println!("Iteration: {}, Loss: {:?}", iteration, train_loss);

         let duration = start.elapsed();
         times.push(duration);
diff --git a/examples/example_mnist.rs b/examples/example_mnist.rs
index 7bd9c5c..6d31826 100644
--- a/examples/example_mnist.rs
+++ b/examples/example_mnist.rs
@@ -31,7 +31,7 @@ fn load_images(path: &str) -> Result<Vec<tensor::Tensor>> {
             }
             image.push(row);
         }
-        images.push(tensor::Tensor::tensor(vec![image]).resize(tensor::Shape::Tensor(1, 14, 14)));
+        images.push(tensor::Tensor::tensor(vec![image]).resize(tensor::Shape::Tensor(1, 10, 10)));
     }

     Ok(images)
@@ -67,7 +67,7 @@ fn main() {
     let x_test: Vec<&tensor::Tensor> = x_test.iter().collect();
     let y_test: Vec<&tensor::Tensor> = y_test.iter().collect();

-    let mut network = network::Network::new(tensor::Shape::Tensor(1, 14, 14));
+    let mut network = network::Network::new(tensor::Shape::Tensor(1, 10, 10));

     network.convolution(
         8,
@@ -78,15 +78,6 @@ fn main() {
         Some(0.05),
     );
     network.maxpool((2, 2), (2, 2));
-    // network.convolution(
-    //     8,
-    //     (3, 3),
-    //     (1, 1),
-    //     (0, 0),
-    //     activation::Activation::ReLU,
-    //     None,
-    // );
-
     network.dense(512, activation::Activation::ReLU, true, Some(0.25));
     network.dense(10, activation::Activation::Softmax, true, None);

 network.set_optimizer(optimizer::Optimizer::Adam(optimizer::Adam {
@@ -106,13 +97,12 @@ fn main() {
     println!("{}", network);

     // Train the network
-    let (train_loss, val_loss) =
-        network.learn(&x_train, &y_train, Some((0.1, 5)), 128, 50, Some(1));
+    let (train_loss, val_loss) = network.learn(&x_train, &y_train, Some((0.1, 5)), 128, 5, Some(1));

     plot::loss(&train_loss, &val_loss, "Loss per epoch", "loss.png");

     // Validate the network
-    let (val_loss, val_acc) = network.validate(&x_test, &y_test, 0.1);
-    println!("1. Validation acc: {}, loss: {}", val_acc, val_loss);
+    let (test_loss, test_acc) = network.validate(&x_test, &y_test, 0.1);
+    println!("1. Test acc: {}, loss: {}", test_acc, test_loss);

     // Use the network
     let prediction = network.predict(x_test.get(0).unwrap());
diff --git a/src/network.rs b/src/network.rs
index d4249a0..82da3ec 100644
--- a/src/network.rs
+++ b/src/network.rs
@@ -515,71 +515,74 @@ impl Network {
             .collect();

         for epoch in 1..epochs + 1 {
-            let results: Vec<_> = batches
-                .par_iter()
-                .map(|(inputs, targets)| {
-                    let mut loss = 0.0f32;
-                    let mut weight_gradients: Vec<tensor::Tensor> = Vec::new();
-                    let mut bias_gradients: Vec<Option<tensor::Tensor>> = Vec::new();
-
-                    for (i, (input, target)) in inputs.iter().zip(targets.iter()).enumerate() {
+            let mut loss_epoch = 0.0;
+            for batch in batches.iter() {
+                // Parallel iteration over the batch.
+                // I.e., parallel forward and backward pass for each sample in the batch.
+                let results: Vec<_> = batch
+                    .into_par_iter()
+                    .map(|(input, target)| {
                         let (unactivated, activated, maxpools) = self.forward(input);
-                        let (_loss, gradient) =
+                        let (loss, gradient) =
                             self.objective.loss(&activated.last().unwrap(), target);
-                        loss += _loss;

                         let (wg, bg) = self.backward(gradient, &unactivated, &activated, maxpools);

-                        if i == 0 {
-                            weight_gradients = wg;
-                            bias_gradients = bg;
-                        } else {
-                            for (gradient, new) in weight_gradients.iter_mut().zip(wg.iter()) {
-                                gradient.add_inplace(new)
-                            }
+                        (wg, bg, loss)
+                    })
+                    .collect();

-                            for (gradient, new) in bias_gradients.iter_mut().zip(bg.iter()) {
-                                match gradient {
-                                    Some(gradient) => match new {
-                                        Some(new) => gradient.add_inplace(new),
-                                        None => panic!("Expected Some, got None."),
-                                    },
-                                    None => match new {
-                                        Some(_) => panic!("Expected None, got Some."),
-                                        None => (),
-                                    },
-                                }
-                            }
-                        }
-                    }
+                let mut weight_gradients: Vec<tensor::Tensor> = Vec::new();
+                let mut bias_gradients: Vec<Option<tensor::Tensor>> = Vec::new();
+                let mut losses: Vec<f32> = Vec::new();

-                    let size = inputs.len() as f32;
+                // Collect the results from the parallel iteration, and sum the gradients and loss.
+                for (wg, bg, loss) in results {
+                    if loss.is_nan() {
+                        panic!("ERROR: Loss is NaN.");
+                    }
+                    losses.push(loss);
+
+                    if weight_gradients.is_empty() {
+                        weight_gradients = wg;
+                        bias_gradients = bg;
+                    } else {
+                        for (gradient, new) in weight_gradients.iter_mut().zip(wg.iter()) {
+                            gradient.add_inplace(new)
+                        }

-                    if size > 1.0 {
-                        weight_gradients
-                            .iter_mut()
-                            .for_each(|gradient| gradient.div_scalar_inplace(size));
-                        bias_gradients.iter_mut().for_each(|gradient| {
-                            if let Some(gradient) = gradient {
-                                gradient.div_scalar_inplace(size);
+                        for (gradient, new) in bias_gradients.iter_mut().zip(bg.iter()) {
+                            match gradient {
+                                Some(gradient) => match new {
+                                    Some(new) => gradient.add_inplace(new),
+                                    None => panic!("Expected Some, got None."),
+                                },
+                                None => match new {
+                                    Some(_) => panic!("Expected None, got Some."),
+                                    None => (),
+                                },
                             }
-                        });
+                        }
                     }
+                }

-                    (loss / size, weight_gradients, bias_gradients)
-                })
-                .collect();
+                // // Average the gradients wrt. batch size.
+                // weight_gradients.iter_mut().for_each(|gradient| {
+                //     gradient.div_scalar_inplace(batch.0.len() as f32);
+                // });
+                // bias_gradients
+                //     .iter_mut()
+                //     .for_each(|gradient| match gradient {
+                //         Some(gradient) => gradient.div_scalar_inplace(batch.0.len() as f32),
+                //         None => (),
+                //     });

-            let mut losses: Vec<f32> = Vec::new();
-            for (loss, weight_gradients, bias_gradients) in results {
-                if loss.is_nan() {
-                    panic!("ERROR: Loss is NaN.");
-                }
+                loss_epoch += losses.iter().sum::<f32>() / losses.len() as f32;

+                // Perform the update step wrt. the summed gradients for the batch.
                 self.update(epoch, weight_gradients, bias_gradients);
-                losses.push(loss);
             }

-            train_loss.push(losses.iter().sum::<f32>() / losses.len() as f32);
+            train_loss.push(loss_epoch / batches.len() as f32);

             let (_val_loss, val_acc) = self.validate(val_inputs, val_targets, 1e-6);
             val_loss.push(_val_loss);
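
The core of the fix in `network::Network::learn` is the reordering above: per-sample gradients are computed in parallel, summed into a single set of gradients, and the optimizer step runs once per batch instead of once per parallel chunk. The sketch below distills that pattern into a standalone program. It is a minimal sketch, not the crate's API: `Gradients`, the dummy `backward`, the plain-`Vec<f32>` layers, and the SGD-style step are all simplified stand-ins, and rayon's `into_par_iter` is replaced by a plain iterator to keep the example dependency-free.

```rust
/// Per-layer gradients for one sample (stand-in for the crate's `tensor::Tensor`).
type Gradients = Vec<Vec<f32>>;

/// Hypothetical per-sample backward pass: returns (per-layer gradients, loss).
fn backward(sample: f32, weights: &[Vec<f32>]) -> (Gradients, f32) {
    let gradients: Gradients = weights
        .iter()
        .map(|layer| layer.iter().map(|w| w * sample).collect())
        .collect();
    (gradients, sample * sample) // Dummy loss, just for the sketch.
}

fn main() {
    // Two "layers" of weights and one batch of four scalar samples.
    let mut weights: Vec<Vec<f32>> = vec![vec![0.5; 4], vec![0.1; 2]];
    let batch: Vec<f32> = vec![0.2, -0.7, 1.1, 0.4];
    let learning_rate = 0.01;

    // 1. Independent per-sample passes (parallelized with rayon in the real code).
    let results: Vec<(Gradients, f32)> = batch
        .iter()
        .map(|&sample| backward(sample, &weights))
        .collect();

    // 2. Sum the per-sample gradients into one set of gradients for the batch.
    let mut summed: Gradients = Vec::new();
    let mut loss = 0.0f32;
    for (gradients, sample_loss) in results {
        assert!(!sample_loss.is_nan(), "ERROR: Loss is NaN.");
        loss += sample_loss;
        if summed.is_empty() {
            summed = gradients; // The first sample initializes the sums.
        } else {
            for (accumulated, new) in summed.iter_mut().zip(&gradients) {
                for (a, n) in accumulated.iter_mut().zip(new) {
                    *a += n;
                }
            }
        }
    }

    // 3. One update step per batch (the crate delegates this to its optimizer).
    for (layer, gradient) in weights.iter_mut().zip(&summed) {
        for (w, g) in layer.iter_mut().zip(gradient) {
            *w -= learning_rate * g;
        }
    }

    println!("mean batch loss: {}", loss / batch.len() as f32);
}
```

As in the patch, the gradients are summed rather than averaged (the averaging code is left commented out in `learn`); with a sum, the effective step size grows with the batch size unless the learning rate or optimizer compensates for it.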