From 19c9263fd8e3b9b2155e4cef6fc67ca2ac796e7f Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Fri, 2 Feb 2024 07:44:43 +0000 Subject: [PATCH 01/12] feat: :sparkles: Crosss validation for 100 and under elements working. --- Cargo.toml | 2 +- benches/whittaker.rs | 181 ++--- src/whittaker_smoother.rs | 637 ++++++++++-------- tests/data/original_scripts/whitsm.m | 1 + tests/data/original_scripts/whitsmw.m | 126 ++-- tests/mod.rs | 10 +- tests/validation.rs | 65 ++ whittaker-eilers-py/src/whittaker_smoother.rs | 232 +++---- 8 files changed, 702 insertions(+), 552 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 10ef061..db2e095 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ readme = "README.md" [dependencies] sprs = "0.11.1" sprs-ldl = "0.10.0" - +nalgebra = "0.32.3" [dev-dependencies] plotly = "0.8.4" diff --git a/benches/whittaker.rs b/benches/whittaker.rs index 2742fac..bef8851 100644 --- a/benches/whittaker.rs +++ b/benches/whittaker.rs @@ -1,86 +1,95 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use whittaker_eilers::WhittakerSmoother; - -fn new_y_whittaker(y: &Vec) -> Vec { - WhittakerSmoother::new(2e4, 2, y.len(), None, None) - .unwrap() - .smooth(y) - .unwrap() -} - -fn new_x_y_whittaker(x: &Vec, y: &Vec) -> Vec { - WhittakerSmoother::new(2e4, 2, y.len(), Some(x), None) - .unwrap() - .smooth(y) - .unwrap() -} - -fn new_x_y_weights_whittaker(x: &Vec, y: &Vec, weights: &Vec) -> Vec { - WhittakerSmoother::new(2e4, 2, y.len(), Some(x), Some(weights)) - .unwrap() - .smooth(y) - .unwrap() -} - -fn criterion_benchmark(c: &mut Criterion) { - let wood_data_vec: Vec = WOOD_DATASET.to_vec(); - let wood_x_vec: Vec = (0..wood_data_vec.len()).map(|x| x as f64).collect(); - let weights = vec![1.0; wood_data_vec.len()]; - - let reusable_smoother = WhittakerSmoother::new( - 2e4, - 2, - wood_data_vec.len(), - Some(&wood_x_vec), - Some(&weights), - ) - .unwrap(); - c.bench_function("Whittaker Wood Y Only", |b| { - b.iter(|| new_y_whittaker(black_box(&wood_data_vec))) - }); - c.bench_function("Whittaker Wood X and Y", |b| { - b.iter(|| new_x_y_whittaker(black_box(&wood_x_vec), black_box(&wood_data_vec))) - }); - c.bench_function("Whittaker Wood X, Y, and weights", |b| { - b.iter(|| { - new_x_y_weights_whittaker( - black_box(&wood_x_vec), - black_box(&wood_data_vec), - black_box(&weights), - ) - }) - }); - c.bench_function("Whittaker Wood X, Y, and weights reused", |b| { - b.iter(|| reusable_smoother.smooth(&wood_data_vec).unwrap()) - }); -} -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); - -/// Data from this repo: -/// -pub const WOOD_DATASET: &[f64] = &[ - 106.0, 111.0, 111.0, 107.0, 105.0, 107.0, 110.0, 108.0, 111.0, 119.0, 117.0, 107.0, 105.0, - 107.0, 109.0, 105.0, 104.0, 102.0, 108.0, 113.0, 113.0, 107.0, 103.0, 103.0, 98.0, 102.0, - 103.0, 104.0, 105.0, 105.0, 105.0, 101.0, 103.0, 107.0, 109.0, 104.0, 100.0, 103.0, 100.0, - 105.0, 102.0, 105.0, 106.0, 107.0, 104.0, 107.0, 109.0, 108.0, 111.0, 107.0, 107.0, 106.0, - 107.0, 102.0, 102.0, 101.0, 103.0, 103.0, 103.0, 100.0, 101.0, 101.0, 100.0, 102.0, 101.0, - 96.0, 96.0, 98.0, 104.0, 107.0, 107.0, 102.0, 105.0, 101.0, 105.0, 110.0, 111.0, 111.0, 100.0, - 102.0, 102.0, 107.0, 112.0, 114.0, 113.0, 108.0, 106.0, 103.0, 103.0, 101.0, 103.0, 106.0, - 107.0, 106.0, 107.0, 107.0, 104.0, 111.0, 117.0, 118.0, 115.0, 107.0, 110.0, 117.0, 121.0, - 122.0, 123.0, 119.0, 117.0, 118.0, 115.0, 111.0, 108.0, 107.0, 105.0, 105.0, 105.0, 103.0, - 105.0, 107.0, 109.0, 110.0, 111.0, 108.0, 107.0, 106.0, 108.0, 107.0, 105.0, 102.0, 101.0, - 102.0, 101.0, 97.0, 100.0, 105.0, 108.0, 108.0, 105.0, 103.0, 103.0, 100.0, 103.0, 106.0, - 107.0, 97.0, 98.0, 100.0, 101.0, 97.0, 99.0, 101.0, 104.0, 107.0, 109.0, 111.0, 109.0, 103.0, - 105.0, 102.0, 108.0, 113.0, 113.0, 108.0, 107.0, 102.0, 106.0, 106.0, 106.0, 103.0, 97.0, - 103.0, 107.0, 102.0, 107.0, 111.0, 110.0, 107.0, 103.0, 99.0, 97.0, 99.0, 100.0, 99.0, 100.0, - 99.0, 100.0, 99.0, 99.0, 98.0, 100.0, 102.0, 102.0, 106.0, 112.0, 113.0, 109.0, 107.0, 105.0, - 97.0, 105.0, 110.0, 113.0, 108.0, 101.0, 95.0, 99.0, 100.0, 97.0, 92.0, 98.0, 101.0, 103.0, - 101.0, 92.0, 95.0, 91.0, 86.0, 86.0, 87.0, 93.0, 97.0, 95.0, 91.0, 86.0, 87.0, 88.0, 88.0, - 89.0, 87.0, 90.0, 88.0, 87.0, 89.0, 90.0, 90.0, 87.0, 86.0, 88.0, 83.0, 85.0, 85.0, 87.0, 91.0, - 93.0, 96.0, 95.0, 89.0, 89.0, 85.0, 88.0, 89.0, 92.0, 95.0, 91.0, 87.0, 83.0, 83.0, 82.0, 81.0, - 81.0, 80.0, 81.0, 82.0, 80.0, 76.0, 72.0, 73.0, 75.0, 77.0, 75.0, 80.0, 81.0, 81.0, 81.0, 81.0, - 81.0, 84.0, 86.0, 87.0, 88.0, 86.0, 84.0, 82.0, 80.0, 79.0, 82.0, 82.0, 76.0, 81.0, 83.0, 82.0, - 81.0, 75.0, 78.0, 78.0, 78.0, 79.0, 82.0, 82.0, 84.0, 82.0, 77.0, 77.0, 77.0, 75.0, 77.0, 73.0, - 75.0, 76.0, 80.0, 77.0, 68.0, 71.0, 71.0, 68.0, 67.0, 69.0, 72.0, 82.0, -]; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use whittaker_eilers::WhittakerSmoother; + +fn new_y_whittaker(y: &Vec) -> Vec { + WhittakerSmoother::new(2e4, 2, y.len(), None, None) + .unwrap() + .smooth(y) + .unwrap() +} +fn new_y_whittaker_cross_validate(y: &Vec) -> f64 { + WhittakerSmoother::new(2e4, 2, y.len(), None, None) + .unwrap() + .smooth_and_cross_validate(y) + .unwrap() +} + +fn new_x_y_whittaker(x: &Vec, y: &Vec) -> Vec { + WhittakerSmoother::new(2e4, 2, y.len(), Some(x), None) + .unwrap() + .smooth(y) + .unwrap() +} + +fn new_x_y_weights_whittaker(x: &Vec, y: &Vec, weights: &Vec) -> Vec { + WhittakerSmoother::new(2e4, 2, y.len(), Some(x), Some(weights)) + .unwrap() + .smooth(y) + .unwrap() +} + +fn criterion_benchmark(c: &mut Criterion) { + let wood_data_vec: Vec = WOOD_DATASET.to_vec(); + let wood_x_vec: Vec = (0..wood_data_vec.len()).map(|x| x as f64).collect(); + let weights = vec![1.0; wood_data_vec.len()]; + + let reusable_smoother = WhittakerSmoother::new( + 2e4, + 2, + wood_data_vec.len(), + Some(&wood_x_vec), + Some(&weights), + ) + .unwrap(); + c.bench_function("Whittaker Wood Y Only", |b| { + b.iter(|| new_y_whittaker(black_box(&wood_data_vec))) + }); + c.bench_function("Whittaker Wood Cross validate", |b| { + b.iter(|| new_y_whittaker_cross_validate(black_box(&wood_data_vec))) + }); + c.bench_function("Whittaker Wood X and Y", |b| { + b.iter(|| new_x_y_whittaker(black_box(&wood_x_vec), black_box(&wood_data_vec))) + }); + c.bench_function("Whittaker Wood X, Y, and weights", |b| { + b.iter(|| { + new_x_y_weights_whittaker( + black_box(&wood_x_vec), + black_box(&wood_data_vec), + black_box(&weights), + ) + }) + }); + c.bench_function("Whittaker Wood X, Y, and weights reused", |b| { + b.iter(|| reusable_smoother.smooth(&wood_data_vec).unwrap()) + }); +} +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); + +/// Data from this repo: +/// +pub const WOOD_DATASET: &[f64] = &[ + 106.0, 111.0, 111.0, 107.0, 105.0, 107.0, 110.0, 108.0, 111.0, 119.0, 117.0, 107.0, 105.0, + 107.0, 109.0, 105.0, 104.0, 102.0, 108.0, 113.0, 113.0, 107.0, 103.0, 103.0, 98.0, 102.0, + 103.0, 104.0, 105.0, 105.0, 105.0, 101.0, 103.0, 107.0, 109.0, 104.0, 100.0, 103.0, 100.0, + 105.0, 102.0, 105.0, 106.0, 107.0, 104.0, 107.0, 109.0, 108.0, 111.0, 107.0, 107.0, 106.0, + 107.0, 102.0, 102.0, 101.0, 103.0, 103.0, 103.0, 100.0, 101.0, 101.0, 100.0, 102.0, 101.0, + 96.0, 96.0, 98.0, 104.0, 107.0, 107.0, 102.0, 105.0, 101.0, 105.0, 110.0, 111.0, 111.0, 100.0, + 102.0, 102.0, 107.0, 112.0, 114.0, 113.0, 108.0, 106.0, 103.0, 103.0, 101.0, 103.0, 106.0, + 107.0, 106.0, 107.0, 107.0, 104.0, 111.0, 117.0, 118.0, 115.0, 107.0, 110.0, 117.0, 121.0, + 122.0, 123.0, 119.0, 117.0, 118.0, 115.0, 111.0, 108.0, 107.0, 105.0, 105.0, 105.0, 103.0, + 105.0, 107.0, 109.0, 110.0, 111.0, 108.0, 107.0, 106.0, 108.0, 107.0, 105.0, 102.0, 101.0, + 102.0, 101.0, 97.0, 100.0, 105.0, 108.0, 108.0, 105.0, 103.0, 103.0, 100.0, 103.0, 106.0, + 107.0, 97.0, 98.0, 100.0, 101.0, 97.0, 99.0, 101.0, 104.0, 107.0, 109.0, 111.0, 109.0, 103.0, + 105.0, 102.0, 108.0, 113.0, 113.0, 108.0, 107.0, 102.0, 106.0, 106.0, 106.0, 103.0, 97.0, + 103.0, 107.0, 102.0, 107.0, 111.0, 110.0, 107.0, 103.0, 99.0, 97.0, 99.0, 100.0, 99.0, 100.0, + 99.0, 100.0, 99.0, 99.0, 98.0, 100.0, 102.0, 102.0, 106.0, 112.0, 113.0, 109.0, 107.0, 105.0, + 97.0, 105.0, 110.0, 113.0, 108.0, 101.0, 95.0, 99.0, 100.0, 97.0, 92.0, 98.0, 101.0, 103.0, + 101.0, 92.0, 95.0, 91.0, 86.0, 86.0, 87.0, 93.0, 97.0, 95.0, 91.0, 86.0, 87.0, 88.0, 88.0, + 89.0, 87.0, 90.0, 88.0, 87.0, 89.0, 90.0, 90.0, 87.0, 86.0, 88.0, 83.0, 85.0, 85.0, 87.0, 91.0, + 93.0, 96.0, 95.0, 89.0, 89.0, 85.0, 88.0, 89.0, 92.0, 95.0, 91.0, 87.0, 83.0, 83.0, 82.0, 81.0, + 81.0, 80.0, 81.0, 82.0, 80.0, 76.0, 72.0, 73.0, 75.0, 77.0, 75.0, 80.0, 81.0, 81.0, 81.0, 81.0, + 81.0, 84.0, 86.0, 87.0, 88.0, 86.0, 84.0, 82.0, 80.0, 79.0, 82.0, 82.0, 76.0, 81.0, 83.0, 82.0, + 81.0, 75.0, 78.0, 78.0, 78.0, 79.0, 82.0, 82.0, 84.0, 82.0, 77.0, 77.0, 77.0, 75.0, 77.0, 73.0, + 75.0, 76.0, 80.0, 77.0, 68.0, 71.0, 71.0, 68.0, 67.0, 69.0, 72.0, 82.0, +]; diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index 0d796f6..ebe3786 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -1,281 +1,356 @@ -use sprs::FillInReduction::ReverseCuthillMcKee; -use sprs::SymmetryCheck::CheckSymmetry; -use sprs::{CsMat, CsMatView}; -use sprs_ldl::{Ldl, LdlNumeric}; - -use crate::errors::WhittakerError; -use crate::WHITTAKER_X_EPSILON; - -/// Whitaker-Eilers Smoother and Interpolator -/// -/// The smoother must be created via [WhittakerSmoother::new()] and once created, can be reused to smooth multiple sets of data as -/// efficiently as possible. You can update `lambda`, the smoothness; the order of the smoother `order`; the measurement `weights`; or the sample -/// times/positions `x_input` through the provided functions. They enable you to control the smoother without remaking costly matrices. -/// -pub struct WhittakerSmoother { - lambda: f64, - order: usize, - data_length: usize, - x_input: Option>, - e_mat: CsMat, - d_mat: CsMat, - weights_mat: Option>, - to_solve: CsMat, - ldl: LdlNumeric, -} - -impl WhittakerSmoother { - /// Create a new Whittaker-Eilers smoother and interpolator. - /// - /// The smoother is configured through it's `lambda` and it's `order`. `Lambda` controls the smoothness of the data and `order` controls - /// the order of which the penalities are applied (generally 2 - 4). The smoother can then be configured to weight measurements between 0 and 1 - /// to interpolate (0 weight) or to complete trust (1 weight) the measurement. The smoother can handle equally spaced measurements by simply not passing - /// an `x_input` or unequally spaced data by providing the sampling times/positions as `x_input`. - /// - /// The smoother parameters can be updated using the provided functions to avoid remaking this costly struct. The only time the [WhittakerSmoother] should be - /// remade is when the data length has changed, or a different sampling rate has been provided. - /// - /// # Arguments: - /// * `lambda`: Controls the smoothing strength, the larger, the smoother. - /// * `order`: The order of the filter. - /// * `data_length`: The length of the data which is to be smoothed. - /// * `x_input`: The time/position at which the y measurement was taken. Used to smooth unequally spaced data. Must be monotonically increasing. - /// * `weights`: The weight of each y measurement. - pub fn new( - lambda: f64, - order: usize, - data_length: usize, - x_input: Option<&Vec>, - weights: Option<&Vec>, - ) -> Result { - let e_mat: CsMat = CsMat::eye(data_length); - - if data_length < order { - return Err(WhittakerError::DataTooShort(data_length, order)); - } - - let (d_mat, cloned_vals_x) = match x_input { - Some(x_vec) => { - if data_length != x_vec.len() { - return Err(WhittakerError::LengthMismatch(data_length, x_vec.len())); - } - let mut cloned_x_vals = Vec::with_capacity(data_length); - for i in 0..data_length - 1 { - if x_vec[i] >= x_vec[i + 1] { - return Err(WhittakerError::NotMonotonicallyIncreasing(i)); - } - if (x_vec[i] - x_vec[i + 1]).abs() < WHITTAKER_X_EPSILON { - return Err(WhittakerError::SampleRateError(i)); - } - cloned_x_vals.push(x_vec[i]); - } - cloned_x_vals.push(x_vec[data_length - 1]); - - (ddmat(x_vec, x_vec.len(), order), Some(cloned_x_vals)) - } - None => (diff_no_ddmat(&e_mat, order), None), - }; - - let weights_mat: Option> = match weights { - Some(weights) => { - if data_length != weights.len() { - return Err(WhittakerError::LengthMismatch(data_length, weights.len())); - } - - let diags = (0..weights.len() + 1).collect::>(); - - Some(CsMat::new_csc( - (weights.len(), weights.len()), - diags[..].to_vec(), - diags[..weights.len()].to_vec(), - weights.to_vec(), - )) - } - None => None, - }; - - let to_solve: CsMat = match weights_mat.as_ref() { - Some(weights) => weights + &(&(&d_mat.transpose_view() * &d_mat) * lambda), - None => &e_mat + &(&(&d_mat.transpose_view() * &d_mat) * lambda), - }; - - let ldl = Ldl::new() - .fill_in_reduction(sprs::FillInReduction::ReverseCuthillMcKee) - .check_symmetry(CheckSymmetry) - .check_perm(sprs::CheckPerm) - .numeric(to_solve.view()) - .map_err(|x| WhittakerError::SolverError(x))?; - - return Ok(WhittakerSmoother { - lambda, - order, - data_length, - x_input: cloned_vals_x, - e_mat, - d_mat, - weights_mat, - to_solve, - ldl, - }); - } - - /// Retrieve the smoother's current lambda. - pub fn get_lambda(&self) -> f64 { - self.lambda - } - - /// Retrieve the smoother's current order. - pub fn get_order(&self) -> usize { - self.order - } - - /// Retrieve the length of the smoother's data. - pub fn get_data_length(&self) -> usize { - self.data_length - } - - /// Updates the weights of the data to be smoothed. - /// - /// The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. - /// - /// # Arguments: - /// * `weights`: The weights of the measurements to be smoothed. The smaller the weight the more the measurement will be ignored. Setting a weight to 0 results in interpolation. - pub fn update_weights(&mut self, weights: &Vec) -> Result<(), WhittakerError> { - if self.data_length != weights.len() { - return Err(WhittakerError::LengthMismatch( - self.data_length, - weights.len(), - )); - } - - self.data_length = weights.len(); - - let diags = (0..weights.len() + 1).collect::>(); - - self.weights_mat = Some(CsMat::new_csc( - (weights.len(), weights.len()), - diags[..].to_vec(), - diags[..weights.len()].to_vec(), - weights.clone(), - )); - - self.update_lambda(self.lambda)?; - Ok(()) - } - - /// Updates the order of the Whittaker-Eilers smoother. - /// - /// Efficiently updates the order at which the Whittaker will use to smooth the data. - /// - /// # Arguments: - /// * `order`: The order to smooth. - pub fn update_order(&mut self, order: usize) -> Result<(), WhittakerError> { - if self.data_length < order { - return Err(WhittakerError::DataTooShort(self.data_length, order)); - } - - self.order = order; - - self.d_mat = match &self.x_input { - Some(x) => ddmat(x, x.len(), order), - None => diff_no_ddmat(&self.e_mat, order), - }; - - self.update_lambda(self.lambda)?; - Ok(()) - } - - /// Updates the smoothing constant `lambda` of the Whittaker-Eilers smoother. - /// - /// Efficiently update the target smoothness of the Whittaker smoother. The larger the `lambda`, the smoother the data. - /// - /// # Arguments: - /// * `lambda`: The smoothing constant of the Whittaker-Eilers smoother. - pub fn update_lambda(&mut self, lambda: f64) -> Result<(), WhittakerError> { - self.lambda = lambda; - - self.to_solve = match self.weights_mat.as_ref() { - Some(weights) => weights + &(&(&self.d_mat.transpose_view() * &self.d_mat) * lambda), - None => &self.e_mat + &(&(&self.d_mat.transpose_view() * &self.d_mat) * lambda), - }; - - self.ldl = Ldl::new() - .fill_in_reduction(ReverseCuthillMcKee) - .check_symmetry(CheckSymmetry) - .numeric(self.to_solve.view()) - .map_err(|x| WhittakerError::SolverError(x))?; - - Ok(()) - } - - /// Run Whittaker-Eilers smoothing and interpolation. - /// - /// This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth - /// data of different y values with the sample rate remaining the same, simply call this function with different data. Remaking the `WhittakerSmoother` struct - /// will result in a lot of overhead. - /// - /// # Arguments - /// * `vals_y`: The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. - /// - /// # Returns: - /// The smoothed and interpolated data. - pub fn smooth(&self, y_input: &[f64]) -> Result, WhittakerError> { - if y_input.len() != self.data_length { - return Err(WhittakerError::LengthMismatch( - self.data_length, - y_input.len(), - )); - } - return if self.weights_mat.is_some() { - Ok(self.ldl.solve( - self.weights_mat - .as_ref() - .unwrap() - .data() - .iter() - .zip(y_input) - .map(|(a, b)| a * b) - .collect::>(), - )) - } else { - Ok(self.ldl.solve(y_input)) - }; - } -} - -/// Dividing differencing matrix of order d -/// -/// # Arguments -/// * `x`: Sampling positions. -/// * `size`: Length og the data. -/// * `d`: order of differences. -fn ddmat(x: &[f64], size: usize, d: usize) -> CsMat { - if d == 0 { - return CsMat::eye(size); - } else { - let dx: Vec = x.windows(d + 1).map(|t| 1_f64 / (t[d] - t[0])).collect(); - - let ind: Vec = (0..(size - d) + 1).collect(); - - let v = CsMatView::new((size - d, size - d), &ind, &ind[..(size - d)], &dx); - - let d = &v * &diff(&ddmat(x, size, d - 1)); - - return d; - } -} - -// Finds the difference between adjacent elements of a sparse matrix -fn diff(e: &CsMat) -> CsMat { - let e1 = e.slice_outer(0..e.rows() - 1); - let e2 = e.slice_outer(1..e.rows()); - return &e2 - &e1; -} -// Dividing difference matrix for equally spaced data. -fn diff_no_ddmat(e: &CsMat, d: usize) -> CsMat { - if d == 0 { - return e.clone(); - } else { - return diff_no_ddmat(&diff(e), d - 1); - } -} +use crate::errors::WhittakerError; +use crate::WHITTAKER_X_EPSILON; +use nalgebra::{DMatrix, DVector}; +// use ndarray_inverse::Inverse; +use sprs::FillInReduction::ReverseCuthillMcKee; +use sprs::SymmetryCheck::CheckSymmetry; +use sprs::{CsMat, CsMatView}; +use sprs_ldl::{Ldl, LdlNumeric}; + +/// Whitaker-Eilers Smoother and Interpolator +/// +/// The smoother must be created via [WhittakerSmoother::new()] and once created, can be reused to smooth multiple sets of data as +/// efficiently as possible. You can update `lambda`, the smoothness; the order of the smoother `order`; the measurement `weights`; or the sample +/// times/positions `x_input` through the provided functions. They enable you to control the smoother without remaking costly matrices. +/// +pub struct WhittakerSmoother { + lambda: f64, + order: usize, + data_length: usize, + x_input: Option>, + e_mat: CsMat, + d_mat: CsMat, + weights_mat: Option>, + to_solve: CsMat, + ldl: LdlNumeric, +} + +impl WhittakerSmoother { + /// Create a new Whittaker-Eilers smoother and interpolator. + /// + /// The smoother is configured through it's `lambda` and it's `order`. `Lambda` controls the smoothness of the data and `order` controls + /// the order of which the penalities are applied (generally 2 - 4). The smoother can then be configured to weight measurements between 0 and 1 + /// to interpolate (0 weight) or to complete trust (1 weight) the measurement. The smoother can handle equally spaced measurements by simply not passing + /// an `x_input` or unequally spaced data by providing the sampling times/positions as `x_input`. + /// + /// The smoother parameters can be updated using the provided functions to avoid remaking this costly struct. The only time the [WhittakerSmoother] should be + /// remade is when the data length has changed, or a different sampling rate has been provided. + /// + /// # Arguments: + /// * `lambda`: Controls the smoothing strength, the larger, the smoother. + /// * `order`: The order of the filter. + /// * `data_length`: The length of the data which is to be smoothed. + /// * `x_input`: The time/position at which the y measurement was taken. Used to smooth unequally spaced data. Must be monotonically increasing. + /// * `weights`: The weight of each y measurement. + pub fn new( + lambda: f64, + order: usize, + data_length: usize, + x_input: Option<&Vec>, + weights: Option<&Vec>, + ) -> Result { + let e_mat: CsMat = CsMat::eye(data_length); + + if data_length < order { + return Err(WhittakerError::DataTooShort(data_length, order)); + } + + let (d_mat, cloned_vals_x) = match x_input { + Some(x_vec) => { + if data_length != x_vec.len() { + return Err(WhittakerError::LengthMismatch(data_length, x_vec.len())); + } + let mut cloned_x_vals = Vec::with_capacity(data_length); + for i in 0..data_length - 1 { + if x_vec[i] >= x_vec[i + 1] { + return Err(WhittakerError::NotMonotonicallyIncreasing(i)); + } + if (x_vec[i] - x_vec[i + 1]).abs() < WHITTAKER_X_EPSILON { + return Err(WhittakerError::SampleRateError(i)); + } + cloned_x_vals.push(x_vec[i]); + } + cloned_x_vals.push(x_vec[data_length - 1]); + + (ddmat(x_vec, x_vec.len(), order), Some(cloned_x_vals)) + } + None => (diff_no_ddmat(&e_mat, order), None), + }; + + let weights_mat: Option> = match weights { + Some(weights) => { + if data_length != weights.len() { + return Err(WhittakerError::LengthMismatch(data_length, weights.len())); + } + + let diags = (0..weights.len() + 1).collect::>(); + + Some(CsMat::new_csc( + (weights.len(), weights.len()), + diags[..].to_vec(), + diags[..weights.len()].to_vec(), + weights.to_vec(), + )) + } + None => None, + }; + + let to_solve: CsMat = match weights_mat.as_ref() { + Some(weights) => weights + &(&(&d_mat.transpose_view() * &d_mat) * lambda), + None => &e_mat + &(&(&d_mat.transpose_view() * &d_mat) * lambda), + }; + + let ldl = Ldl::new() + .fill_in_reduction(sprs::FillInReduction::ReverseCuthillMcKee) + .check_symmetry(CheckSymmetry) + .check_perm(sprs::CheckPerm) + .numeric(to_solve.view()) + .map_err(|x| WhittakerError::SolverError(x))?; + + return Ok(WhittakerSmoother { + lambda, + order, + data_length, + x_input: cloned_vals_x, + e_mat, + d_mat, + weights_mat, + to_solve, + ldl, + }); + } + + /// Retrieve the smoother's current lambda. + pub fn get_lambda(&self) -> f64 { + self.lambda + } + + /// Retrieve the smoother's current order. + pub fn get_order(&self) -> usize { + self.order + } + + /// Retrieve the length of the smoother's data. + pub fn get_data_length(&self) -> usize { + self.data_length + } + + /// Updates the weights of the data to be smoothed. + /// + /// The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. + /// + /// # Arguments: + /// * `weights`: The weights of the measurements to be smoothed. The smaller the weight the more the measurement will be ignored. Setting a weight to 0 results in interpolation. + pub fn update_weights(&mut self, weights: &Vec) -> Result<(), WhittakerError> { + if self.data_length != weights.len() { + return Err(WhittakerError::LengthMismatch( + self.data_length, + weights.len(), + )); + } + + self.data_length = weights.len(); + + let diags = (0..weights.len() + 1).collect::>(); + + self.weights_mat = Some(CsMat::new_csc( + (weights.len(), weights.len()), + diags[..].to_vec(), + diags[..weights.len()].to_vec(), + weights.clone(), + )); + + self.update_lambda(self.lambda)?; + Ok(()) + } + + /// Updates the order of the Whittaker-Eilers smoother. + /// + /// Efficiently updates the order at which the Whittaker will use to smooth the data. + /// + /// # Arguments: + /// * `order`: The order to smooth. + pub fn update_order(&mut self, order: usize) -> Result<(), WhittakerError> { + if self.data_length < order { + return Err(WhittakerError::DataTooShort(self.data_length, order)); + } + + self.order = order; + + self.d_mat = match &self.x_input { + Some(x) => ddmat(x, x.len(), order), + None => diff_no_ddmat(&self.e_mat, order), + }; + + self.update_lambda(self.lambda)?; + Ok(()) + } + + /// Updates the smoothing constant `lambda` of the Whittaker-Eilers smoother. + /// + /// Efficiently update the target smoothness of the Whittaker smoother. The larger the `lambda`, the smoother the data. + /// + /// # Arguments: + /// * `lambda`: The smoothing constant of the Whittaker-Eilers smoother. + pub fn update_lambda(&mut self, lambda: f64) -> Result<(), WhittakerError> { + self.lambda = lambda; + + self.to_solve = match self.weights_mat.as_ref() { + Some(weights) => weights + &(&(&self.d_mat.transpose_view() * &self.d_mat) * lambda), + None => &self.e_mat + &(&(&self.d_mat.transpose_view() * &self.d_mat) * lambda), + }; + + self.ldl = Ldl::new() + .fill_in_reduction(ReverseCuthillMcKee) + .check_symmetry(CheckSymmetry) + .numeric(self.to_solve.view()) + .map_err(|x| WhittakerError::SolverError(x))?; + + Ok(()) + } + + /// Run Whittaker-Eilers smoothing and interpolation. + /// + /// This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth + /// data of different y values with the sample rate remaining the same, simply call this function with different data. Remaking the `WhittakerSmoother` struct + /// will result in a lot of overhead. + /// + /// # Arguments + /// * `vals_y`: The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. + /// + /// # Returns: + /// The smoothed and interpolated data. + pub fn smooth(&self, y_input: &[f64]) -> Result, WhittakerError> { + if y_input.len() != self.data_length { + return Err(WhittakerError::LengthMismatch( + self.data_length, + y_input.len(), + )); + } + return if self.weights_mat.is_some() { + Ok(self.ldl.solve( + self.weights_mat + .as_ref() + .unwrap() + .data() + .iter() + .zip(y_input) + .map(|(a, b)| a * b) + .collect::>(), + )) + } else { + Ok(self.ldl.solve(y_input)) + }; + } + + /// TODO! Document func + pub fn smooth_and_cross_validate(&self, y_input: &[f64]) -> Result { + if y_input.len() != self.data_length { + return Err(WhittakerError::LengthMismatch( + self.data_length, + y_input.len(), + )); + } + + let smoothed_series = self.smooth(y_input)?; + let smoothed_dvec = DVector::from_vec(smoothed_series); + let y_input_dvec = DVector::from_vec(y_input.to_vec()); + let identity_dvec = DVector::from_element(self.data_length, 1.0); + + if self.data_length > 100 { + let n = 100; + let e1: CsMat = CsMat::eye(n); + let d1 = match &self.x_input { + Some(x) => todo!(), // ddmat(&e1, x.len(), self.order), + None => diff_no_ddmat(&e1, self.order), + }; + let lambda1 = + self.lambda * (n as f64 / self.data_length as f64).powf(2.0 * self.order as f64); + + let to_solve = &e1 + &(&(&d1.transpose_view() * &d1) * lambda1); + + let mut hat_matrix = DMatrix::from_iterator( + to_solve.rows(), + to_solve.cols(), + to_solve.to_dense().into_iter(), + ) + .lu() + .solve(&DMatrix::identity(n, n)) + .unwrap(); + + let h1 = hat_matrix.diagonal(); + + let u = DVector::from_element(self.data_length, 0.0); + + let k = (self.data_length as f64 / 2 as f64).floor(); + + let k1 = (n as f64 / 2.0).floor(); + } + + let mut hat_matrix = DMatrix::from_iterator( + self.to_solve.rows(), + self.to_solve.cols(), + self.to_solve.to_dense().into_iter(), + ) + .lu() + .solve(&DMatrix::identity(self.data_length, self.data_length)) + .unwrap(); + + let weights_vec = self + .weights_mat + .as_ref() + .map(|x| DVector::from_row_slice(x.diag().data())); + + if weights_vec.is_some() { + hat_matrix *= DMatrix::from_diagonal(weights_vec.as_ref().unwrap()); + } + + let r = + (y_input_dvec - smoothed_dvec).component_div(&(identity_dvec - hat_matrix.diagonal())); + + let cve = match weights_vec.as_ref() { + Some(weights) => (r.transpose() * r.component_mul(weights)).sum() / weights.sum(), + None => (r.transpose() * r).sum() / self.data_length as f64, + } + .sqrt(); + + return Ok(cve); + } +} + +/// Dividing differencing matrix of order d +/// +/// # Arguments +/// * `x`: Sampling positions. +/// * `size`: Length og the data. +/// * `d`: order of differences. +fn ddmat(x: &[f64], size: usize, d: usize) -> CsMat { + if d == 0 { + return CsMat::eye(size); + } else { + let dx: Vec = x.windows(d + 1).map(|t| 1_f64 / (t[d] - t[0])).collect(); + + let ind: Vec = (0..(size - d) + 1).collect(); + + let v = CsMatView::new((size - d, size - d), &ind, &ind[..(size - d)], &dx); + + let d = &v * &diff(&ddmat(x, size, d - 1)); + + return d; + } +} + +// Finds the difference between adjacent elements of a sparse matrix +fn diff(e: &CsMat) -> CsMat { + let e1 = e.slice_outer(0..e.rows() - 1); + let e2 = e.slice_outer(1..e.rows()); + return &e2 - &e1; +} +// Dividing difference matrix for equally spaced data. +fn diff_no_ddmat(e: &CsMat, d: usize) -> CsMat { + if d == 0 { + return e.clone(); + } else { + return diff_no_ddmat(&diff(e), d - 1); + } +} diff --git a/tests/data/original_scripts/whitsm.m b/tests/data/original_scripts/whitsm.m index 45ab864..f4d5b7d 100644 --- a/tests/data/original_scripts/whitsm.m +++ b/tests/data/original_scripts/whitsm.m @@ -45,6 +45,7 @@ h = h1(f) * v(k) / h1(k1); end r = (y - z) ./ (1 - h); + cve = sqrt(r' * r / m); end diff --git a/tests/data/original_scripts/whitsmw.m b/tests/data/original_scripts/whitsmw.m index 5d42700..20505a3 100644 --- a/tests/data/original_scripts/whitsmw.m +++ b/tests/data/original_scripts/whitsmw.m @@ -1,63 +1,63 @@ - - - -function [z, cve, h] = whitsmw(y, w, lambda, d) -% Whittaker smoother with weights -% Input: -% y: data series, sampled at equal intervals -% (arbitrary values allowed when missing, but not NaN!) -% w: weights (0/1 for missing/non-missing data) -% lambda: smoothing parameter; large lambda gives smoother result -% d: order of differences (default = 2) -% Output: -% z: smoothed series -% cve: RMS leave-one-out prediction error -% h: diagonal of hat matrix -% -% Remark: the computation of the hat diagonal for m > 100 is experimental; -% with many missing observation it may fail. -% -% Paul Eilers, 2003 - -% Default order of differences -if nargin < 4 - d = 2; -end - -% Smoothing -m = length(y); -E = speye(m); -W = spdiags(w, 0, m, m); -D = diff(E, d); -C = chol(W + lambda * D' * D); -z = C \ (C' \ (w .* y)); - -% Computation of hat diagonal and cross-validation -if nargout > 1 - if m <= 100 % Exact hat diagonal - H = W * inv(W + lambda * D' * D); - h = diag(H); - else % Map to diag(H) for n = 100 - n = 100; - E1 = eye(n); - D1 = diff(E1, d); - lambda1 = lambda * (n / m) ^ (2 * d); - g = round(((1:n) - 1) * (m - 1) / (n - 1) + 1); - W1 = diag(w(g)); - H1 = inv(W1 + lambda1 * D1' * D1); - h1 = diag(H1); - u = zeros(m, 1); - k = floor(m / 2); - k1 = floor(n / 2); - u(k) = 1; - v = C \ (C' \ u); - hk = v(k); - f = round(((1:m)' - 1) * (n - 1)/ (m - 1) + 1); - h = w.* h1(f) * v(k) / h1(k1); - end - r = (y - z) ./ (1 - h); - cve = sqrt(r' * (r .* w) / sum(w)); -end - - - + + + +function [z, cve, h] = whitsmw(y, w, lambda, d) +% Whittaker smoother with weights +% Input: +% y: data series, sampled at equal intervals +% (arbitrary values allowed when missing, but not NaN!) +% w: weights (0/1 for missing/non-missing data) +% lambda: smoothing parameter; large lambda gives smoother result +% d: order of differences (default = 2) +% Output: +% z: smoothed series +% cve: RMS leave-one-out prediction error +% h: diagonal of hat matrix +% +% Remark: the computation of the hat diagonal for m > 100 is experimental; +% with many missing observation it may fail. +% +% Paul Eilers, 2003 + +% Default order of differences +if nargin < 4 + d = 2; +end + +% Smoothing +m = length(y); +E = speye(m); +W = spdiags(w, 0, m, m); +D = diff(E, d); +C = chol(W + lambda * D' * D); +z = C \ (C' \ (w .* y)); + +% Computation of hat diagonal and cross-validation +if nargout > 1 + if m <= 100 % Exact hat diagonal + H = W * inv(W + lambda * D' * D); + h = diag(H); + else % Map to diag(H) for n = 100 + n = 100; + E1 = eye(n); + D1 = diff(E1, d); + lambda1 = lambda * (n / m) ^ (2 * d); + g = round(((1:n) - 1) * (m - 1) / (n - 1) + 1); + W1 = diag(w(g)); + H1 = inv(W1 + lambda1 * D1' * D1); + h1 = diag(H1); + u = zeros(m, 1); + k = floor(m / 2); + k1 = floor(n / 2); + u(k) = 1; + v = C \ (C' \ u); + hk = v(k); + f = round(((1:m)' - 1) * (n - 1)/ (m - 1) + 1); + h = w.* h1(f) * v(k) / h1(k1); + end + r = (y - z) ./ (1 - h); + cve = sqrt(r' * (r .* w) / sum(w)); +end + + + diff --git a/tests/mod.rs b/tests/mod.rs index e059842..7eac959 100644 --- a/tests/mod.rs +++ b/tests/mod.rs @@ -1,5 +1,5 @@ -#[cfg(test)] -mod validation; - -#[cfg(test)] -mod robustness; +#[cfg(test)] +mod validation; + +#[cfg(test)] +mod robustness; diff --git a/tests/validation.rs b/tests/validation.rs index 95cfad7..0558272 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -159,6 +159,71 @@ fn validated_x_input_with_weights_whittaker() { } } +#[test] +fn validate_cross_validation_no_weights() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = + WhittakerSmoother::new(2e4, 2, input_data.y[..100].len(), None, None).unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y[..100]) + .unwrap(); + + assert_relative_eq!(cve, 1.5806, epsilon = 1e-4); // Produced from matlab scripts. + + whittaker_smoother.update_order(3).unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y[..100]) + .unwrap(); + + assert_relative_eq!(cve, 1.6178, epsilon = 1e-4); + + println!("Cve: {}", cve) +} +#[test] +fn validate_cross_validation_weights() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y[..100].len(), + None, + Some(&input_data.weights[..100].to_vec()), + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y[..100]) + .unwrap(); + + assert_relative_eq!(cve, 1.7282, epsilon = 1e-4); + + println!("Cve: {}", cve) +} +#[test] +fn validate_cross_validation_weights_x_input() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y[..100].len(), + Some(&input_data.x[..100].to_vec()), + Some(&input_data.weights[..100].to_vec()), + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y[..100]) + .unwrap(); + + assert_relative_eq!(cve, 1.7426, epsilon = 1e-4); + + println!("Cve: {}", cve) +} struct InputData { x: Vec, y: Vec, diff --git a/whittaker-eilers-py/src/whittaker_smoother.rs b/whittaker-eilers-py/src/whittaker_smoother.rs index 089271b..96c5c66 100644 --- a/whittaker-eilers-py/src/whittaker_smoother.rs +++ b/whittaker-eilers-py/src/whittaker_smoother.rs @@ -1,116 +1,116 @@ -use pyo3::prelude::*; - -use whittaker_eilers_rs::WhittakerError as WhittakerErrorRs; -use whittaker_eilers_rs::WhittakerSmoother as WhittakerSmootherRs; - -use crate::errors::WhittakerError; - -/// A new Whittaker-Eilers smoother and interpolator. -/// -/// The smoother is configured through it's `lambda` and it's `order`. `Lambda` controls the smoothness of the data (1e2~1e4) and `order` controls -/// the order of which the penalities are applied (generally 2 - 4). The smoother can then be configured to weight measurements between 0 and 1 -/// to interpolate (0 weight) or to complete trust (1 weight) the measurement. The smoother can handle equally spaced measurements by simply not passing -/// an `x_input` or unequally spaced data by providing the sampling times/positions as `x_input`. -/// -/// The smoother parameters can be updated using the provided functions to avoid remaking this costly struct. The only time the WhittakerSmoother should be -/// remade is when the data length has changed, or a different sampling rate has been provided. -/// -/// Parameters -/// ---------- -/// lmbda : Controls the smoothing strength, the larger, the smoother. Try 1e2~2e4 to start with and adjust based on the result. `lmbda` must be positive. -/// order : The order of the filter. Try 2~4 to start with. Order must be positive. -/// data_length : The length of the data which is to be smoothed. Must be positive. -/// x_input : The time/position at which the y measurement was taken. Used to smooth unequally spaced data. Must be monotonically increasing. -/// weights : The weight of each y measurement. -#[pyclass] -#[repr(transparent)] -pub struct WhittakerSmoother(WhittakerSmootherRs); - -#[pymethods] -impl WhittakerSmoother { - #[new] - // #[pyo3(signature = (lmbda, order, data_length, x_input, weights), text_signature = "(lmbda, order, data_length, x_input, weights)")] - pub fn __init__( - lmbda: f64, // Lambda is a key word in python - order: usize, - data_length: usize, - x_input: Option>, - weights: Option>, - ) -> PyResult { - Ok(WhittakerSmoother( - WhittakerSmootherRs::new( - lmbda, - order, - data_length, - x_input.as_ref(), - weights.as_ref(), - ) - .map_err(map_err_to_py)?, - )) - } - /// Retrieve the smoother's current order. - pub fn get_order(&self) -> usize { - self.0.get_order() - } - /// Retrieve the smoother's current lambda. - pub fn get_lambda(&self) -> f64 { - self.0.get_lambda() - } - /// Retrieve the smoother's current length. - pub fn get_data_length(&self) -> usize { - self.0.get_data_length() - } - - /// Updates the weights of the data to be smoothed. - /// The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. - /// - /// Parameters - /// ---------- - /// weights : The weights of the measurements to be smoothed. The smaller the weight the more the measurement will be ignored. Setting a weight to 0 results in interpolation. - pub fn update_weights(&mut self, weights: Vec) -> PyResult<()> { - self.0.update_weights(&weights).map_err(map_err_to_py) - } - - /// Updates the order of the Whittaker-Eilers smoother. - /// - /// Efficiently updates the order at which the Whittaker will use to smooth the data. - /// - /// Parameters - /// ---------- - /// order : The order to smooth. - pub fn update_order(&mut self, order: usize) -> PyResult<()> { - self.0.update_order(order).map_err(map_err_to_py) - } - - /// Updates the smoothing constant `lambda` of the Whittaker-Eilers smoother. - /// - /// Efficiently update the target smoothness of the Whittaker smoother. The larger the `lambda`, the smoother the data. - /// - /// Parameters - /// ---------- - /// lmbda : The smoothing constant of the Whittaker-Eilers smoother. - pub fn update_lambda(&mut self, lambda: f64) -> PyResult<()> { - self.0.update_lambda(lambda).map_err(map_err_to_py) - } - - /// Run Whittaker-Eilers smoothing and interpolation. - /// - /// This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth - /// data of different y values with the sample rate remaining the same, simply call this function with different data. Remaking the `WhittakerSmoother` class - /// will result in a lot of overhead. - /// - /// Parameters - /// ---------- - /// vals_y : The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. - /// - /// Returns - /// ------- - /// The smoothed and interpolated data. - pub fn smooth(&self, y_vals: Vec) -> PyResult> { - self.0.smooth(&y_vals).map_err(map_err_to_py) - } -} - -fn map_err_to_py(err: WhittakerErrorRs) -> PyErr { - PyErr::from(WhittakerError(err)) -} +use pyo3::prelude::*; + +use whittaker_eilers_rs::WhittakerError as WhittakerErrorRs; +use whittaker_eilers_rs::WhittakerSmoother as WhittakerSmootherRs; + +use crate::errors::WhittakerError; + +/// A new Whittaker-Eilers smoother and interpolator. +/// +/// The smoother is configured through it's `lambda` and it's `order`. `Lambda` controls the smoothness of the data (1e2~1e4) and `order` controls +/// the order of which the penalities are applied (generally 2 - 4). The smoother can then be configured to weight measurements between 0 and 1 +/// to interpolate (0 weight) or to complete trust (1 weight) the measurement. The smoother can handle equally spaced measurements by simply not passing +/// an `x_input` or unequally spaced data by providing the sampling times/positions as `x_input`. +/// +/// The smoother parameters can be updated using the provided functions to avoid remaking this costly struct. The only time the WhittakerSmoother should be +/// remade is when the data length has changed, or a different sampling rate has been provided. +/// +/// Parameters +/// ---------- +/// lmbda : Controls the smoothing strength, the larger, the smoother. Try 1e2~2e4 to start with and adjust based on the result. `lmbda` must be positive. +/// order : The order of the filter. Try 2~4 to start with. Order must be positive. +/// data_length : The length of the data which is to be smoothed. Must be positive. +/// x_input : The time/position at which the y measurement was taken. Used to smooth unequally spaced data. Must be monotonically increasing. +/// weights : The weight of each y measurement. +#[pyclass] +#[repr(transparent)] +pub struct WhittakerSmoother(WhittakerSmootherRs); + +#[pymethods] +impl WhittakerSmoother { + #[new] + // #[pyo3(signature = (lmbda, order, data_length, x_input, weights), text_signature = "(lmbda, order, data_length, x_input, weights)")] + pub fn __init__( + lmbda: f64, // Lambda is a key word in python + order: usize, + data_length: usize, + x_input: Option>, + weights: Option>, + ) -> PyResult { + Ok(WhittakerSmoother( + WhittakerSmootherRs::new( + lmbda, + order, + data_length, + x_input.as_ref(), + weights.as_ref(), + ) + .map_err(map_err_to_py)?, + )) + } + /// Retrieve the smoother's current order. + pub fn get_order(&self) -> usize { + self.0.get_order() + } + /// Retrieve the smoother's current lambda. + pub fn get_lambda(&self) -> f64 { + self.0.get_lambda() + } + /// Retrieve the smoother's current length. + pub fn get_data_length(&self) -> usize { + self.0.get_data_length() + } + + /// Updates the weights of the data to be smoothed. + /// The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. + /// + /// Parameters + /// ---------- + /// weights : The weights of the measurements to be smoothed. The smaller the weight the more the measurement will be ignored. Setting a weight to 0 results in interpolation. + pub fn update_weights(&mut self, weights: Vec) -> PyResult<()> { + self.0.update_weights(&weights).map_err(map_err_to_py) + } + + /// Updates the order of the Whittaker-Eilers smoother. + /// + /// Efficiently updates the order at which the Whittaker will use to smooth the data. + /// + /// Parameters + /// ---------- + /// order : The order to smooth. + pub fn update_order(&mut self, order: usize) -> PyResult<()> { + self.0.update_order(order).map_err(map_err_to_py) + } + + /// Updates the smoothing constant `lambda` of the Whittaker-Eilers smoother. + /// + /// Efficiently update the target smoothness of the Whittaker smoother. The larger the `lambda`, the smoother the data. + /// + /// Parameters + /// ---------- + /// lmbda : The smoothing constant of the Whittaker-Eilers smoother. + pub fn update_lambda(&mut self, lambda: f64) -> PyResult<()> { + self.0.update_lambda(lambda).map_err(map_err_to_py) + } + + /// Run Whittaker-Eilers smoothing and interpolation. + /// + /// This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth + /// data of different y values with the sample rate remaining the same, simply call this function with different data. Remaking the `WhittakerSmoother` class + /// will result in a lot of overhead. + /// + /// Parameters + /// ---------- + /// vals_y : The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. + /// + /// Returns + /// ------- + /// The smoothed and interpolated data. + pub fn smooth(&self, y_vals: Vec) -> PyResult> { + self.0.smooth(&y_vals).map_err(map_err_to_py) + } +} + +fn map_err_to_py(err: WhittakerErrorRs) -> PyErr { + PyErr::from(WhittakerError(err)) +} From 7bfd1dfc495af4eb1a672eb2279b83fff6210506 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Sun, 4 Feb 2024 17:36:31 +0000 Subject: [PATCH 02/12] feat: :sparkles: Basic cross validation implemented for all scenarios --- src/whittaker_smoother.rs | 146 +++++++++++++++++++++++++++++--------- tests/validation.rs | 113 ++++++++++++++++++++++++++++- 2 files changed, 224 insertions(+), 35 deletions(-) diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index ebe3786..9704051 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -1,3 +1,5 @@ +// use std::collections::HashSet; + use crate::errors::WhittakerError; use crate::WHITTAKER_X_EPSILON; use nalgebra::{DMatrix, DVector}; @@ -243,6 +245,15 @@ impl WhittakerSmoother { }; } + /// Todo + fn smooth_and_cross_validates() -> Result<(Vec, f64), WhittakerError> { + todo!() + } + // TODO! + fn optimise_and_smooth() -> Result<(Vec, f64), WhittakerError> { + todo!() + } + /// TODO! Document func pub fn smooth_and_cross_validate(&self, y_input: &[f64]) -> Result { if y_input.len() != self.data_length { @@ -260,16 +271,45 @@ impl WhittakerSmoother { if self.data_length > 100 { let n = 100; let e1: CsMat = CsMat::eye(n); + + let g = (0..n) + .map(|x| { + ((x as f64) * ((self.data_length - 1) as f64 / (n - 1) as f64)).round() as usize + }) + // .collect::>() + // .into_iter() + .collect::>(); + let d1 = match &self.x_input { - Some(x) => todo!(), // ddmat(&e1, x.len(), self.order), + Some(x) => ddmat( + &g.iter().map(|index| x[*index]).collect::>(), + g.len(), + self.order, + ), None => diff_no_ddmat(&e1, self.order), }; let lambda1 = self.lambda * (n as f64 / self.data_length as f64).powf(2.0 * self.order as f64); - let to_solve = &e1 + &(&(&d1.transpose_view() * &d1) * lambda1); + let to_solve = match self.weights_mat.as_ref() { + Some(x) => { + let weights_vec = g.iter().map(|index| x.data()[*index]).collect::>(); - let mut hat_matrix = DMatrix::from_iterator( + let diags = (0..weights_vec.len() + 1).collect::>(); + + let weights_mat = CsMat::new_csc( + (weights_vec.len(), weights_vec.len()), + diags[..].to_vec(), + diags[..weights_vec.len()].to_vec(), + weights_vec.clone(), + ); + + &weights_mat + &(&(&d1.transpose_view() * &d1) * lambda1) + } + None => &e1 + &(&(&d1.transpose_view() * &d1) * lambda1), + }; + + let hat_matrix = DMatrix::from_iterator( to_solve.rows(), to_solve.cols(), to_solve.to_dense().into_iter(), @@ -280,41 +320,83 @@ impl WhittakerSmoother { let h1 = hat_matrix.diagonal(); - let u = DVector::from_element(self.data_length, 0.0); + let mut u = DVector::from_element(self.data_length, 0.0); - let k = (self.data_length as f64 / 2 as f64).floor(); + let k = (self.data_length as f64 / 2.0).floor() as usize; - let k1 = (n as f64 / 2.0).floor(); - } + let k1 = (n as f64 / 2.0).floor() as usize; - let mut hat_matrix = DMatrix::from_iterator( - self.to_solve.rows(), - self.to_solve.cols(), - self.to_solve.to_dense().into_iter(), - ) - .lu() - .solve(&DMatrix::identity(self.data_length, self.data_length)) - .unwrap(); - - let weights_vec = self - .weights_mat - .as_ref() - .map(|x| DVector::from_row_slice(x.diag().data())); - - if weights_vec.is_some() { - hat_matrix *= DMatrix::from_diagonal(weights_vec.as_ref().unwrap()); - } + u[k - 1] = 1.0; - let r = - (y_input_dvec - smoothed_dvec).component_div(&(identity_dvec - hat_matrix.diagonal())); + let v = self.ldl.solve(u.data.as_slice()); // Doesn't need weights - let cve = match weights_vec.as_ref() { - Some(weights) => (r.transpose() * r.component_mul(weights)).sum() / weights.sum(), - None => (r.transpose() * r).sum() / self.data_length as f64, - } - .sqrt(); + let f = (0..self.data_length) + .map(|x| { + ((x as f64) * ((n - 1) as f64 / (self.data_length - 1) as f64)).round() as usize + }) + // .collect::>() + // .into_iter() + .collect::>(); - return Ok(cve); + let vk = v[k - 1]; + let h1k1 = h1[k1 - 1]; + + println!("len: {}", h1.len()); + + let h = match self.weights_mat.as_ref() { + Some(x) => f + .iter() + .zip(x.data()) + .map(|(index, weight)| weight * h1[*index] * vk / h1k1) + .collect::>(), + None => f + .iter() + .map(|index| h1[*index] * vk / h1k1) + .collect::>(), + }; + + let h = DVector::from_vec(h); + + let r = (y_input_dvec - smoothed_dvec).component_div(&(identity_dvec - h)); + let weights_vec = self + .weights_mat + .as_ref() + .map(|x| DVector::from_row_slice(x.diag().data())); + let cve = match weights_vec.as_ref() { + Some(weights) => (r.transpose() * r.component_mul(weights)).sum() / weights.sum(), + None => (r.transpose() * r).sum() / self.data_length as f64, + } + .sqrt(); + return Ok(cve); + } else { + let mut hat_matrix = DMatrix::from_iterator( + self.to_solve.rows(), + self.to_solve.cols(), + self.to_solve.to_dense().into_iter(), + ) + .lu() + .solve(&DMatrix::identity(self.data_length, self.data_length)) + .unwrap(); + + let weights_vec = self + .weights_mat + .as_ref() + .map(|x| DVector::from_row_slice(x.diag().data())); + + if weights_vec.is_some() { + hat_matrix *= DMatrix::from_diagonal(weights_vec.as_ref().unwrap()); + } + let r = (y_input_dvec - smoothed_dvec) + .component_div(&(identity_dvec - hat_matrix.diagonal())); + + let cve = match weights_vec.as_ref() { + Some(weights) => (r.transpose() * r.component_mul(weights)).sum() / weights.sum(), + None => (r.transpose() * r).sum() / self.data_length as f64, + } + .sqrt(); + + return Ok(cve); + } } } diff --git a/tests/validation.rs b/tests/validation.rs index 0558272..6696a55 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -160,7 +160,7 @@ fn validated_x_input_with_weights_whittaker() { } #[test] -fn validate_cross_validation_no_weights() { +fn validate_cross_validation_no_weights_100() { let input_data = read_input_to_vecs(); let mut whittaker_smoother = @@ -183,7 +183,30 @@ fn validate_cross_validation_no_weights() { println!("Cve: {}", cve) } #[test] -fn validate_cross_validation_weights() { +fn validate_cross_validation_no_weights() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = + WhittakerSmoother::new(2e4, 2, input_data.y.len(), None, None).unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y) + .unwrap(); + + assert_relative_eq!(cve, 3.3568, epsilon = 1e-4); // Produced from matlab scripts. + + whittaker_smoother.update_order(3).unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y) + .unwrap(); + + assert_relative_eq!(cve, 2.6859, epsilon = 1e-4); + + println!("Cve: {}", cve) +} +#[test] +fn validate_cross_validation_weights_100() { let input_data = read_input_to_vecs(); let mut whittaker_smoother = WhittakerSmoother::new( @@ -204,7 +227,28 @@ fn validate_cross_validation_weights() { println!("Cve: {}", cve) } #[test] -fn validate_cross_validation_weights_x_input() { +fn validate_cross_validation_weights() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y.len(), + None, + Some(&input_data.weights.to_vec()), + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y) + .unwrap(); + + assert_relative_eq!(cve, 3.4549, epsilon = 1e-4); + + println!("Cve: {}", cve) +} +#[test] +fn validate_cross_validation_weights_x_input_100() { let input_data = read_input_to_vecs(); let mut whittaker_smoother = WhittakerSmoother::new( @@ -224,6 +268,69 @@ fn validate_cross_validation_weights_x_input() { println!("Cve: {}", cve) } +#[test] +fn validate_cross_validation_weights_x_input() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y.len(), + Some(&input_data.x.to_vec()), + Some(&input_data.weights.to_vec()), + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y) + .unwrap(); + + assert_relative_eq!(cve, 3.0762, epsilon = 1e-4); + + println!("Cve: {}", cve) +} +#[test] +fn validate_cross_validation_x_input_100() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y[..100].len(), + Some(&input_data.x[..100].to_vec()), + None, + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y[..100]) + .unwrap(); + + assert_relative_eq!(cve, 1.5872, epsilon = 1e-4); + + println!("Cve: {}", cve) +} +#[test] +fn validate_cross_validation_x_input() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y.len(), + Some(&input_data.x.to_vec()), + None, + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_cross_validate(&input_data.y) + .unwrap(); + + assert_relative_eq!(cve, 3.0413, epsilon = 1e-4); + + println!("Cve: {}", cve) +} struct InputData { x: Vec, y: Vec, From e3581b53a0477ba2f2626d4cb27965a804a050e2 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Mon, 5 Feb 2024 07:52:43 +0000 Subject: [PATCH 03/12] feat: :sparkles: Attempting linear grid search for optimal lambda --- benches/whittaker.rs | 2 +- examples/smooth_and_interpolate.rs | 383 +++++++++++++------------ src/whittaker_smoother.rs | 71 +++-- tests/data/original_scripts/whitsm.m | 2 +- tests/data/original_scripts/whitsmdd.m | 2 +- tests/validation.rs | 91 +++--- 6 files changed, 304 insertions(+), 247 deletions(-) diff --git a/benches/whittaker.rs b/benches/whittaker.rs index bef8851..0d2ab42 100644 --- a/benches/whittaker.rs +++ b/benches/whittaker.rs @@ -7,7 +7,7 @@ fn new_y_whittaker(y: &Vec) -> Vec { .smooth(y) .unwrap() } -fn new_y_whittaker_cross_validate(y: &Vec) -> f64 { +fn new_y_whittaker_cross_validate(y: &Vec) -> (Vec, f64) { WhittakerSmoother::new(2e4, 2, y.len(), None, None) .unwrap() .smooth_and_cross_validate(y) diff --git a/examples/smooth_and_interpolate.rs b/examples/smooth_and_interpolate.rs index aee4d93..05abc91 100644 --- a/examples/smooth_and_interpolate.rs +++ b/examples/smooth_and_interpolate.rs @@ -1,180 +1,203 @@ -use plotly::{common::Mode, Layout, Plot, Scatter}; -use rand::thread_rng; -use rand_distr::{Distribution, Uniform}; -use whittaker_eilers::WhittakerSmoother; - -fn basic_smooth(x_input: &Vec, y_input: &Vec, lambda: f64, order: usize) { - let smoothed_y_only = WhittakerSmoother::new(lambda, order, y_input.len(), None, None) - .unwrap() - .smooth(&y_input) - .unwrap(); - - let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) - .mode(Mode::Markers) - .name("Raw Wood Data"); - let smoothed_points = Scatter::new(x_input.clone(), smoothed_y_only) - .mode(Mode::Lines) - .name("Whittaker Smoothed"); - - let mut plot = Plot::new(); - plot.add_trace(raw_points); - plot.add_trace(smoothed_points); - - let layout = Layout::new().title("Basic smoothing of equally spaced Y".into()); - plot.set_layout(layout); - - plot.show(); -} - -fn smooth_with_x(x_input_with_noise: &Vec, y_input: &Vec, lambda: f64, order: usize) { - let smoothed_y_only = WhittakerSmoother::new( - lambda, - order, - y_input.len(), - Some(&x_input_with_noise), - None, - ) - .unwrap() - .smooth(&y_input) - .unwrap(); - - let raw_points = Scatter::new(x_input_with_noise.clone(), y_input.to_vec()) - .mode(Mode::Markers) - .name("Raw Wood Data"); - let smoothed_points = Scatter::new(x_input_with_noise.clone(), smoothed_y_only) - .mode(Mode::Lines) - .name("Whittaker Smoothed"); - - let mut plot = Plot::new(); - plot.add_trace(raw_points); - plot.add_trace(smoothed_points); - - let layout = Layout::new().title("Basic smoothing of Y with arbitrary X".into()); - plot.set_layout(layout); - plot.show(); -} - -fn smooth_with_weights( - x_input_with_noise: &Vec, - y_input: &Vec, - weights: &Vec, - lambda: f64, - order: usize, -) { - let smoothed_y_only = WhittakerSmoother::new( - lambda, - order, - y_input.len(), - Some(&x_input_with_noise), - Some(&weights), - ) - .unwrap() - .smooth(&y_input) - .unwrap(); - - let raw_points = Scatter::new(x_input_with_noise.clone(), y_input.to_vec()) - .mode(Mode::Markers) - .name("Raw Wood Data"); - let smoothed_points = Scatter::new(x_input_with_noise.clone(), smoothed_y_only) - .mode(Mode::Lines) - .name("Whittaker Smoothed With Weights"); - - let mut plot = Plot::new(); - plot.add_trace(raw_points); - plot.add_trace(smoothed_points); - - let layout = Layout::new().title("Basic smoothing of Y with arbitrary X and weights".into()); - plot.set_layout(layout); - plot.show(); -} -fn smooth_and_interpolate( - x_input: &Vec, - y_input: &Vec, - weights: &Vec, - lambda: f64, - order: usize, -) { - let smoothed_y_only = - WhittakerSmoother::new(lambda, order, y_input.len(), Some(&x_input), Some(&weights)) - .unwrap() - .smooth(&y_input) - .unwrap(); - - let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) - .mode(Mode::Markers) - .name("Raw Wood Data"); - let smoothed_points = Scatter::new(x_input.clone(), smoothed_y_only) - .mode(Mode::Lines) - .name("Whittaker Smoothed With Weights"); - - let mut plot = Plot::new(); - plot.add_trace(raw_points); - plot.add_trace(smoothed_points); - - let layout = Layout::new().title("Smoothing and Interpolation using weights".into()); - plot.set_layout(layout); - plot.show(); -} - -fn main() { - let mut y_input = WOOD_DATASET.to_vec(); - let x_input = (0..y_input.len()).map(|x| x as f64).collect::>(); - - let mut rng = thread_rng(); - - let x_noise_distribution = Uniform::new(0.0, 0.7); - let weights_distribution = Uniform::new(0.0, 1.0); - - let x_input_with_noise = (0..y_input.len()) - .map(|x| (x as f64) + x_noise_distribution.sample(&mut rng)) //TODO! Check this is sampling each time - .collect::>(); - - let mut weights = weights_distribution - .sample_iter(&mut rng) - .take(y_input.len()) - .collect::>(); - - let lambda = 2e4; - let order = 2; - - basic_smooth(&x_input, &y_input, lambda, order); - - smooth_with_x(&x_input_with_noise, &y_input, lambda, order); - - smooth_with_weights(&x_input_with_noise, &y_input, &weights, lambda, order); - - for i in 30..60 { - y_input[i] = 0.0; // Set y to some arbitrary value we want to interpolate. - weights[i] = 0.0; // Set weights to 0 for data we want to interpolate. - } - - smooth_and_interpolate(&x_input, &y_input, &weights, lambda, order); -} - -/// Data from this repo: -/// -pub const WOOD_DATASET: &[f64] = &[ - 106.0, 111.0, 111.0, 107.0, 105.0, 107.0, 110.0, 108.0, 111.0, 119.0, 117.0, 107.0, 105.0, - 107.0, 109.0, 105.0, 104.0, 102.0, 108.0, 113.0, 113.0, 107.0, 103.0, 103.0, 98.0, 102.0, - 103.0, 104.0, 105.0, 105.0, 105.0, 101.0, 103.0, 107.0, 109.0, 104.0, 100.0, 103.0, 100.0, - 105.0, 102.0, 105.0, 106.0, 107.0, 104.0, 107.0, 109.0, 108.0, 111.0, 107.0, 107.0, 106.0, - 107.0, 102.0, 102.0, 101.0, 103.0, 103.0, 103.0, 100.0, 101.0, 101.0, 100.0, 102.0, 101.0, - 96.0, 96.0, 98.0, 104.0, 107.0, 107.0, 102.0, 105.0, 101.0, 105.0, 110.0, 111.0, 111.0, 100.0, - 102.0, 102.0, 107.0, 112.0, 114.0, 113.0, 108.0, 106.0, 103.0, 103.0, 101.0, 103.0, 106.0, - 107.0, 106.0, 107.0, 107.0, 104.0, 111.0, 117.0, 118.0, 115.0, 107.0, 110.0, 117.0, 121.0, - 122.0, 123.0, 119.0, 117.0, 118.0, 115.0, 111.0, 108.0, 107.0, 105.0, 105.0, 105.0, 103.0, - 105.0, 107.0, 109.0, 110.0, 111.0, 108.0, 107.0, 106.0, 108.0, 107.0, 105.0, 102.0, 101.0, - 102.0, 101.0, 97.0, 100.0, 105.0, 108.0, 108.0, 105.0, 103.0, 103.0, 100.0, 103.0, 106.0, - 107.0, 97.0, 98.0, 100.0, 101.0, 97.0, 99.0, 101.0, 104.0, 107.0, 109.0, 111.0, 109.0, 103.0, - 105.0, 102.0, 108.0, 113.0, 113.0, 108.0, 107.0, 102.0, 106.0, 106.0, 106.0, 103.0, 97.0, - 103.0, 107.0, 102.0, 107.0, 111.0, 110.0, 107.0, 103.0, 99.0, 97.0, 99.0, 100.0, 99.0, 100.0, - 99.0, 100.0, 99.0, 99.0, 98.0, 100.0, 102.0, 102.0, 106.0, 112.0, 113.0, 109.0, 107.0, 105.0, - 97.0, 105.0, 110.0, 113.0, 108.0, 101.0, 95.0, 99.0, 100.0, 97.0, 92.0, 98.0, 101.0, 103.0, - 101.0, 92.0, 95.0, 91.0, 86.0, 86.0, 87.0, 93.0, 97.0, 95.0, 91.0, 86.0, 87.0, 88.0, 88.0, - 89.0, 87.0, 90.0, 88.0, 87.0, 89.0, 90.0, 90.0, 87.0, 86.0, 88.0, 83.0, 85.0, 85.0, 87.0, 91.0, - 93.0, 96.0, 95.0, 89.0, 89.0, 85.0, 88.0, 89.0, 92.0, 95.0, 91.0, 87.0, 83.0, 83.0, 82.0, 81.0, - 81.0, 80.0, 81.0, 82.0, 80.0, 76.0, 72.0, 73.0, 75.0, 77.0, 75.0, 80.0, 81.0, 81.0, 81.0, 81.0, - 81.0, 84.0, 86.0, 87.0, 88.0, 86.0, 84.0, 82.0, 80.0, 79.0, 82.0, 82.0, 76.0, 81.0, 83.0, 82.0, - 81.0, 75.0, 78.0, 78.0, 78.0, 79.0, 82.0, 82.0, 84.0, 82.0, 77.0, 77.0, 77.0, 75.0, 77.0, 73.0, - 75.0, 76.0, 80.0, 77.0, 68.0, 71.0, 71.0, 68.0, 67.0, 69.0, 72.0, 82.0, -]; +use plotly::{common::Mode, Layout, Plot, Scatter}; +use rand::thread_rng; +use rand_distr::{Distribution, Uniform}; +use whittaker_eilers::WhittakerSmoother; + +fn basic_smooth(x_input: &Vec, y_input: &Vec, lambda: f64, order: usize) { + let smoothed_y_only = WhittakerSmoother::new(lambda, order, y_input.len(), None, None) + .unwrap() + .smooth(&y_input) + .unwrap(); + + let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) + .mode(Mode::Markers) + .name("Raw Wood Data"); + let smoothed_points = Scatter::new(x_input.clone(), smoothed_y_only) + .mode(Mode::Lines) + .name("Whittaker Smoothed"); + + let mut plot = Plot::new(); + plot.add_trace(raw_points); + plot.add_trace(smoothed_points); + + let layout = Layout::new().title("Basic smoothing of equally spaced Y".into()); + plot.set_layout(layout); + + plot.show(); +} + +fn smooth_with_x(x_input_with_noise: &Vec, y_input: &Vec, lambda: f64, order: usize) { + let smoothed_y_only = WhittakerSmoother::new( + lambda, + order, + y_input.len(), + Some(&x_input_with_noise), + None, + ) + .unwrap() + .smooth(&y_input) + .unwrap(); + + let raw_points = Scatter::new(x_input_with_noise.clone(), y_input.to_vec()) + .mode(Mode::Markers) + .name("Raw Wood Data"); + let smoothed_points = Scatter::new(x_input_with_noise.clone(), smoothed_y_only) + .mode(Mode::Lines) + .name("Whittaker Smoothed"); + + let mut plot = Plot::new(); + plot.add_trace(raw_points); + plot.add_trace(smoothed_points); + + let layout = Layout::new().title("Basic smoothing of Y with arbitrary X".into()); + plot.set_layout(layout); + plot.show(); +} + +fn smooth_with_weights( + x_input_with_noise: &Vec, + y_input: &Vec, + weights: &Vec, + lambda: f64, + order: usize, +) { + let smoothed_y_only = WhittakerSmoother::new( + lambda, + order, + y_input.len(), + Some(&x_input_with_noise), + Some(&weights), + ) + .unwrap() + .smooth(&y_input) + .unwrap(); + + let raw_points = Scatter::new(x_input_with_noise.clone(), y_input.to_vec()) + .mode(Mode::Markers) + .name("Raw Wood Data"); + let smoothed_points = Scatter::new(x_input_with_noise.clone(), smoothed_y_only) + .mode(Mode::Lines) + .name("Whittaker Smoothed With Weights"); + + let mut plot = Plot::new(); + plot.add_trace(raw_points); + plot.add_trace(smoothed_points); + + let layout = Layout::new().title("Basic smoothing of Y with arbitrary X and weights".into()); + plot.set_layout(layout); + plot.show(); +} +fn smooth_and_interpolate( + x_input: &Vec, + y_input: &Vec, + weights: &Vec, + lambda: f64, + order: usize, +) { + let smoothed_y_only = + WhittakerSmoother::new(lambda, order, y_input.len(), Some(&x_input), Some(&weights)) + .unwrap() + .smooth(&y_input) + .unwrap(); + + let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) + .mode(Mode::Markers) + .name("Raw Wood Data"); + let smoothed_points = Scatter::new(x_input.clone(), smoothed_y_only) + .mode(Mode::Lines) + .name("Whittaker Smoothed With Weights"); + + let mut plot = Plot::new(); + plot.add_trace(raw_points); + plot.add_trace(smoothed_points); + + let layout = Layout::new().title("Smoothing and Interpolation using weights".into()); + plot.set_layout(layout); + plot.show(); +} + +fn smooth_cross_validate(x_input: &Vec, y_input: &Vec, order: usize) { + let optimal_smooth_with_error = WhittakerSmoother::new(100.0, order, y_input.len(), None, None) + .unwrap() + .optimise_and_smooth(y_input, false) + .unwrap(); + + let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) + .mode(Mode::Markers) + .name("Raw Wood Data"); + let smoothed_points = Scatter::new(x_input.clone(), optimal_smooth_with_error.0) + .mode(Mode::Lines) + .name("Whittaker Optimally Smoothed"); + + let mut plot = Plot::new(); + plot.add_trace(raw_points); + plot.add_trace(smoothed_points); + + let layout = Layout::new().title("Whittaker optimal cross validation".into()); + plot.set_layout(layout); + plot.show(); +} +fn main() { + let mut y_input = WOOD_DATASET.to_vec(); + let x_input = (0..y_input.len()).map(|x| x as f64).collect::>(); + + let mut rng = thread_rng(); + + let x_noise_distribution = Uniform::new(0.0, 0.7); + let weights_distribution = Uniform::new(0.0, 1.0); + + let x_input_with_noise = (0..y_input.len()) + .map(|x| (x as f64) + x_noise_distribution.sample(&mut rng)) //TODO! Check this is sampling each time + .collect::>(); + + let mut weights = weights_distribution + .sample_iter(&mut rng) + .take(y_input.len()) + .collect::>(); + + let lambda = 2e4; + let order = 2; + + basic_smooth(&x_input, &y_input, lambda, order); + + smooth_with_x(&x_input_with_noise, &y_input, lambda, order); + + smooth_with_weights(&x_input_with_noise, &y_input, &weights, lambda, order); + + smooth_cross_validate(&x_input, &y_input, order); + + for i in 30..60 { + y_input[i] = 0.0; // Set y to some arbitrary value we want to interpolate. + weights[i] = 0.0; // Set weights to 0 for data we want to interpolate. + } + + smooth_and_interpolate(&x_input, &y_input, &weights, lambda, order); +} + +/// Data from this repo, though originally from Eiler's paper. +/// . +pub const WOOD_DATASET: &[f64] = &[ + 106.0, 111.0, 111.0, 107.0, 105.0, 107.0, 110.0, 108.0, 111.0, 119.0, 117.0, 107.0, 105.0, + 107.0, 109.0, 105.0, 104.0, 102.0, 108.0, 113.0, 113.0, 107.0, 103.0, 103.0, 98.0, 102.0, + 103.0, 104.0, 105.0, 105.0, 105.0, 101.0, 103.0, 107.0, 109.0, 104.0, 100.0, 103.0, 100.0, + 105.0, 102.0, 105.0, 106.0, 107.0, 104.0, 107.0, 109.0, 108.0, 111.0, 107.0, 107.0, 106.0, + 107.0, 102.0, 102.0, 101.0, 103.0, 103.0, 103.0, 100.0, 101.0, 101.0, 100.0, 102.0, 101.0, + 96.0, 96.0, 98.0, 104.0, 107.0, 107.0, 102.0, 105.0, 101.0, 105.0, 110.0, 111.0, 111.0, 100.0, + 102.0, 102.0, 107.0, 112.0, 114.0, 113.0, 108.0, 106.0, 103.0, 103.0, 101.0, 103.0, 106.0, + 107.0, 106.0, 107.0, 107.0, 104.0, 111.0, 117.0, 118.0, 115.0, 107.0, 110.0, 117.0, 121.0, + 122.0, 123.0, 119.0, 117.0, 118.0, 115.0, 111.0, 108.0, 107.0, 105.0, 105.0, 105.0, 103.0, + 105.0, 107.0, 109.0, 110.0, 111.0, 108.0, 107.0, 106.0, 108.0, 107.0, 105.0, 102.0, 101.0, + 102.0, 101.0, 97.0, 100.0, 105.0, 108.0, 108.0, 105.0, 103.0, 103.0, 100.0, 103.0, 106.0, + 107.0, 97.0, 98.0, 100.0, 101.0, 97.0, 99.0, 101.0, 104.0, 107.0, 109.0, 111.0, 109.0, 103.0, + 105.0, 102.0, 108.0, 113.0, 113.0, 108.0, 107.0, 102.0, 106.0, 106.0, 106.0, 103.0, 97.0, + 103.0, 107.0, 102.0, 107.0, 111.0, 110.0, 107.0, 103.0, 99.0, 97.0, 99.0, 100.0, 99.0, 100.0, + 99.0, 100.0, 99.0, 99.0, 98.0, 100.0, 102.0, 102.0, 106.0, 112.0, 113.0, 109.0, 107.0, 105.0, + 97.0, 105.0, 110.0, 113.0, 108.0, 101.0, 95.0, 99.0, 100.0, 97.0, 92.0, 98.0, 101.0, 103.0, + 101.0, 92.0, 95.0, 91.0, 86.0, 86.0, 87.0, 93.0, 97.0, 95.0, 91.0, 86.0, 87.0, 88.0, 88.0, + 89.0, 87.0, 90.0, 88.0, 87.0, 89.0, 90.0, 90.0, 87.0, 86.0, 88.0, 83.0, 85.0, 85.0, 87.0, 91.0, + 93.0, 96.0, 95.0, 89.0, 89.0, 85.0, 88.0, 89.0, 92.0, 95.0, 91.0, 87.0, 83.0, 83.0, 82.0, 81.0, + 81.0, 80.0, 81.0, 82.0, 80.0, 76.0, 72.0, 73.0, 75.0, 77.0, 75.0, 80.0, 81.0, 81.0, 81.0, 81.0, + 81.0, 84.0, 86.0, 87.0, 88.0, 86.0, 84.0, 82.0, 80.0, 79.0, 82.0, 82.0, 76.0, 81.0, 83.0, 82.0, + 81.0, 75.0, 78.0, 78.0, 78.0, 79.0, 82.0, 82.0, 84.0, 82.0, 77.0, 77.0, 77.0, 75.0, 77.0, 73.0, + 75.0, 76.0, 80.0, 77.0, 68.0, 71.0, 71.0, 68.0, 67.0, 69.0, 72.0, 82.0, +]; diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index 9704051..73b0979 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -1,9 +1,7 @@ -// use std::collections::HashSet; - use crate::errors::WhittakerError; use crate::WHITTAKER_X_EPSILON; use nalgebra::{DMatrix, DVector}; -// use ndarray_inverse::Inverse; + use sprs::FillInReduction::ReverseCuthillMcKee; use sprs::SymmetryCheck::CheckSymmetry; use sprs::{CsMat, CsMatView}; @@ -245,17 +243,58 @@ impl WhittakerSmoother { }; } - /// Todo - fn smooth_and_cross_validates() -> Result<(Vec, f64), WhittakerError> { - todo!() - } - // TODO! - fn optimise_and_smooth() -> Result<(Vec, f64), WhittakerError> { - todo!() + /// TODO: Document + pub fn optimise_and_smooth( + &mut self, + y_input: &[f64], + break_serial_correlation: bool, + ) -> Result<(Vec, f64), WhittakerError> { + let mut start_lambda_log = (1e-2_f64).log10(); // -2 + let end_lambda_log = (1e8_f64).log10(); // 8 + + let mut smoothed_data = Vec::::new(); + let mut optimal_lambda = f64::MAX; + let mut cve = f64::MAX; + + if break_serial_correlation { + // TODO! Make this work when we already have weights. + let cross_validation_weights: Vec = (0..self.data_length) + .map(|x| if x % 10 == 0 { 1.0 } else { 0.0 }) + .collect::>(); + + println!("Weights: {:?}", cross_validation_weights); + self.update_weights(&cross_validation_weights)?; + } + + while start_lambda_log != end_lambda_log { + let new_lambda = 10_f64.powf(start_lambda_log); + + self.update_lambda(new_lambda)?; + let res = self.smooth_and_cross_validate(y_input)?; + + if res.1 < cve { + cve = res.1; + smoothed_data = res.0; + optimal_lambda = new_lambda; + } + println!("Lambda: {}, Error: {}", new_lambda, res.1); + start_lambda_log += 0.5; + } + + if break_serial_correlation { + self.update_lambda(optimal_lambda)?; + self.update_weights(&vec![1.0; self.data_length])?; + smoothed_data = self.smooth(y_input)?; + } + + return Ok((smoothed_data, cve)); } - /// TODO! Document func - pub fn smooth_and_cross_validate(&self, y_input: &[f64]) -> Result { + /// TODO: Document + pub fn smooth_and_cross_validate( + &self, + y_input: &[f64], + ) -> Result<(Vec, f64), WhittakerError> { if y_input.len() != self.data_length { return Err(WhittakerError::LengthMismatch( self.data_length, @@ -264,7 +303,7 @@ impl WhittakerSmoother { } let smoothed_series = self.smooth(y_input)?; - let smoothed_dvec = DVector::from_vec(smoothed_series); + let smoothed_dvec = DVector::from_vec(smoothed_series.clone()); let y_input_dvec = DVector::from_vec(y_input.to_vec()); let identity_dvec = DVector::from_element(self.data_length, 1.0); @@ -341,8 +380,6 @@ impl WhittakerSmoother { let vk = v[k - 1]; let h1k1 = h1[k1 - 1]; - println!("len: {}", h1.len()); - let h = match self.weights_mat.as_ref() { Some(x) => f .iter() @@ -367,7 +404,7 @@ impl WhittakerSmoother { None => (r.transpose() * r).sum() / self.data_length as f64, } .sqrt(); - return Ok(cve); + return Ok((smoothed_series, cve)); } else { let mut hat_matrix = DMatrix::from_iterator( self.to_solve.rows(), @@ -395,7 +432,7 @@ impl WhittakerSmoother { } .sqrt(); - return Ok(cve); + return Ok((smoothed_series, cve)); } } } diff --git a/tests/data/original_scripts/whitsm.m b/tests/data/original_scripts/whitsm.m index f4d5b7d..76bedd6 100644 --- a/tests/data/original_scripts/whitsm.m +++ b/tests/data/original_scripts/whitsm.m @@ -27,7 +27,7 @@ if nargout > 1 if m <= 100 % Exact hat diagonal H = inv(E + lambda * D' * D); - h = diag(h); + h = diag(H); else % Map to diag(H) for n = 100 n = 100; E1 = speye(n); diff --git a/tests/data/original_scripts/whitsmdd.m b/tests/data/original_scripts/whitsmdd.m index 8643ac7..7b05940 100644 --- a/tests/data/original_scripts/whitsmdd.m +++ b/tests/data/original_scripts/whitsmdd.m @@ -28,7 +28,7 @@ if nargout > 1 if m <= 100 % Exact hat diagonal H = inv(E + lambda * D' * D); - h = diag(h); + h = diag(H); else % Map to diag(H) for n = 100 n = 100; E1 = speye(n); diff --git a/tests/validation.rs b/tests/validation.rs index 6696a55..958b6f3 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -5,7 +5,7 @@ use std::{ }; use whittaker_eilers::WhittakerSmoother; -const INPUT_DATA_LOC: &'static str = "tests/data/input/nmr_with_weights_and_x.csv"; +// TODO: Tests need a refactor to extract common code. #[test] fn validate_standard_whittaker() { @@ -160,7 +160,7 @@ fn validated_x_input_with_weights_whittaker() { } #[test] -fn validate_cross_validation_no_weights_100() { +fn cross_validation_no_weights_100() { let input_data = read_input_to_vecs(); let mut whittaker_smoother = @@ -170,7 +170,7 @@ fn validate_cross_validation_no_weights_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve, 1.5806, epsilon = 1e-4); // Produced from matlab scripts. + assert_relative_eq!(cve.1, 1.5806, epsilon = 1e-4); // Produced from matlab scripts. whittaker_smoother.update_order(3).unwrap(); @@ -178,12 +178,10 @@ fn validate_cross_validation_no_weights_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve, 1.6178, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 1.6178, epsilon = 1e-4); } #[test] -fn validate_cross_validation_no_weights() { +fn cross_validation_no_weights() { let input_data = read_input_to_vecs(); let mut whittaker_smoother = @@ -193,7 +191,7 @@ fn validate_cross_validation_no_weights() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve, 3.3568, epsilon = 1e-4); // Produced from matlab scripts. + assert_relative_eq!(cve.1, 3.3568, epsilon = 1e-4); // Produced from matlab scripts. whittaker_smoother.update_order(3).unwrap(); @@ -201,15 +199,13 @@ fn validate_cross_validation_no_weights() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve, 2.6859, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 2.6859, epsilon = 1e-4); } #[test] -fn validate_cross_validation_weights_100() { +fn cross_validation_weights_100() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = WhittakerSmoother::new( + let whittaker_smoother = WhittakerSmoother::new( 2e4, 2, input_data.y[..100].len(), @@ -222,15 +218,13 @@ fn validate_cross_validation_weights_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve, 1.7282, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 1.7282, epsilon = 1e-4); } #[test] -fn validate_cross_validation_weights() { +fn cross_validation_weights() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = WhittakerSmoother::new( + let whittaker_smoother = WhittakerSmoother::new( 2e4, 2, input_data.y.len(), @@ -243,15 +237,13 @@ fn validate_cross_validation_weights() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve, 3.4549, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 3.4549, epsilon = 1e-4); } #[test] -fn validate_cross_validation_weights_x_input_100() { +fn cross_validation_weights_x_input_100() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = WhittakerSmoother::new( + let whittaker_smoother = WhittakerSmoother::new( 2e4, 2, input_data.y[..100].len(), @@ -264,15 +256,13 @@ fn validate_cross_validation_weights_x_input_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve, 1.7426, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 1.7426, epsilon = 1e-4); } #[test] -fn validate_cross_validation_weights_x_input() { +fn cross_validation_weights_x_input() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = WhittakerSmoother::new( + let whittaker_smoother = WhittakerSmoother::new( 2e4, 2, input_data.y.len(), @@ -285,15 +275,13 @@ fn validate_cross_validation_weights_x_input() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve, 3.0762, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 3.0762, epsilon = 1e-4); } #[test] -fn validate_cross_validation_x_input_100() { +fn cross_validation_x_input_100() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = WhittakerSmoother::new( + let whittaker_smoother = WhittakerSmoother::new( 2e4, 2, input_data.y[..100].len(), @@ -306,15 +294,13 @@ fn validate_cross_validation_x_input_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve, 1.5872, epsilon = 1e-4); - - println!("Cve: {}", cve) + assert_relative_eq!(cve.1, 1.5872, epsilon = 1e-4); } #[test] -fn validate_cross_validation_x_input() { +fn cross_validation_x_input() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = WhittakerSmoother::new( + let whittaker_smoother = WhittakerSmoother::new( 2e4, 2, input_data.y.len(), @@ -327,18 +313,29 @@ fn validate_cross_validation_x_input() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve, 3.0413, epsilon = 1e-4); + assert_relative_eq!(cve.1, 3.0413, epsilon = 1e-4); +} + +#[test] +fn smooth_and_optimise() { + let input_data = read_input_to_vecs(); + + let mut whittaker_smoother = + WhittakerSmoother::new(2e4, 2, input_data.y.len(), None, None).unwrap(); - println!("Cve: {}", cve) + let cve = whittaker_smoother.optimise_and_smooth(&input_data.y, false); } -struct InputData { - x: Vec, - y: Vec, - weights: Vec, - random_weights: Vec, + +const INPUT_DATA_LOC: &'static str = "tests/data/input/nmr_with_weights_and_x.csv"; + +pub struct InputData { + pub x: Vec, + pub y: Vec, + pub weights: Vec, + pub random_weights: Vec, } -fn read_input_to_vecs() -> InputData { +pub fn read_input_to_vecs() -> InputData { let file = File::open(INPUT_DATA_LOC).unwrap(); let reader = BufReader::new(file); @@ -366,7 +363,7 @@ fn read_input_to_vecs() -> InputData { }; } -fn read_output_to_vec(file_name: &str) -> Vec { +pub fn read_output_to_vec(file_name: &str) -> Vec { let file = File::open(file_name).unwrap(); let reader = BufReader::new(file); From b0b35fa2114609d52400174868906e4694b241d5 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Wed, 7 Feb 2024 08:51:49 +0000 Subject: [PATCH 04/12] feat: :sparkles: Breaking serial correlation implemented --- benches/whittaker.rs | 4 +- examples/smooth_and_interpolate.rs | 44 +++++-- src/cross_validation.rs | 31 +++++ src/lib.rs | 2 + src/whittaker_smoother.rs | 159 +++++++++++++++-------- tests/data/original_scripts/demo_nmrcv.m | 40 ++++++ tests/validation.rs | 45 +++++-- 7 files changed, 244 insertions(+), 81 deletions(-) create mode 100644 src/cross_validation.rs create mode 100644 tests/data/original_scripts/demo_nmrcv.m diff --git a/benches/whittaker.rs b/benches/whittaker.rs index 0d2ab42..ee57e33 100644 --- a/benches/whittaker.rs +++ b/benches/whittaker.rs @@ -1,5 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use whittaker_eilers::WhittakerSmoother; +use whittaker_eilers::{CrossValidationResult, WhittakerSmoother}; fn new_y_whittaker(y: &Vec) -> Vec { WhittakerSmoother::new(2e4, 2, y.len(), None, None) @@ -7,7 +7,7 @@ fn new_y_whittaker(y: &Vec) -> Vec { .smooth(y) .unwrap() } -fn new_y_whittaker_cross_validate(y: &Vec) -> (Vec, f64) { +fn new_y_whittaker_cross_validate(y: &Vec) -> CrossValidationResult { WhittakerSmoother::new(2e4, 2, y.len(), None, None) .unwrap() .smooth_and_cross_validate(y) diff --git a/examples/smooth_and_interpolate.rs b/examples/smooth_and_interpolate.rs index 05abc91..d5e7f7d 100644 --- a/examples/smooth_and_interpolate.rs +++ b/examples/smooth_and_interpolate.rs @@ -103,6 +103,7 @@ fn smooth_and_interpolate( let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) .mode(Mode::Markers) .name("Raw Wood Data"); + let smoothed_points = Scatter::new(x_input.clone(), smoothed_y_only) .mode(Mode::Lines) .name("Whittaker Smoothed With Weights"); @@ -117,17 +118,21 @@ fn smooth_and_interpolate( } fn smooth_cross_validate(x_input: &Vec, y_input: &Vec, order: usize) { - let optimal_smooth_with_error = WhittakerSmoother::new(100.0, order, y_input.len(), None, None) - .unwrap() - .optimise_and_smooth(y_input, false) - .unwrap(); + let optimal_smooth_with_error = + WhittakerSmoother::new(100.0, order, y_input.len(), Some(x_input), None) + .unwrap() + .smooth_and_optimise(&y_input, true) + .unwrap(); let raw_points = Scatter::new(x_input.clone(), y_input.to_vec()) .mode(Mode::Markers) .name("Raw Wood Data"); - let smoothed_points = Scatter::new(x_input.clone(), optimal_smooth_with_error.0) - .mode(Mode::Lines) - .name("Whittaker Optimally Smoothed"); + let smoothed_points = Scatter::new( + x_input.clone(), + optimal_smooth_with_error.get_optimal().smoothed, + ) + .mode(Mode::Lines) + .name("Whittaker Optimally Smoothed"); let mut plot = Plot::new(); plot.add_trace(raw_points); @@ -136,6 +141,29 @@ fn smooth_cross_validate(x_input: &Vec, y_input: &Vec, order: usize) { let layout = Layout::new().title("Whittaker optimal cross validation".into()); plot.set_layout(layout); plot.show(); + + let smoothed_points = Scatter::new( + optimal_smooth_with_error + .validation_results + .iter() + .map(|x| x.lambda.log10()) + .collect(), + optimal_smooth_with_error + .validation_results + .iter() + .map(|x| x.cross_validation_error) + .collect(), + ) + .mode(Mode::Lines) + .name("Cross validation error"); + + let mut plot = Plot::new(); + + plot.add_trace(smoothed_points); + + let layout = Layout::new().title("Cross Validation Error".into()); + plot.set_layout(layout); + plot.show(); } fn main() { let mut y_input = WOOD_DATASET.to_vec(); @@ -174,8 +202,6 @@ fn main() { smooth_and_interpolate(&x_input, &y_input, &weights, lambda, order); } -/// Data from this repo, though originally from Eiler's paper. -/// . pub const WOOD_DATASET: &[f64] = &[ 106.0, 111.0, 111.0, 107.0, 105.0, 107.0, 110.0, 108.0, 111.0, 119.0, 117.0, 107.0, 105.0, 107.0, 109.0, 105.0, 104.0, 102.0, 108.0, 113.0, 113.0, 107.0, 103.0, 103.0, 98.0, 102.0, diff --git a/src/cross_validation.rs b/src/cross_validation.rs new file mode 100644 index 0000000..3fb130f --- /dev/null +++ b/src/cross_validation.rs @@ -0,0 +1,31 @@ +/// TODO +pub struct OptimisedSmoothResult { + /// TODO + pub validation_results: Vec, + pub(crate) optimal_index: usize, +} + +impl OptimisedSmoothResult { + /// TODO + pub fn get_optimal(&self) -> CrossValidationResult { + return self.validation_results[self.optimal_index].to_owned(); + } +} +/// TODO +#[derive(Clone, Debug)] +pub struct CrossValidationResult { + /// TODO + pub lambda: f64, + /// TODO + pub smoothed: Vec, + /// TODO + pub cross_validation_error: f64, +} + +pub(crate) fn every_tenth_element(data: &[f64]) -> Vec { + data.iter() + .enumerate() + .filter(|(index, _)| index % 10 == 0) + .map(|(_, val)| *val) + .collect::>() +} diff --git a/src/lib.rs b/src/lib.rs index 384eae3..e0f85f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,11 @@ #![doc = include_str!("../README.md")] #![deny(missing_docs, unused_imports)] +mod cross_validation; mod errors; mod whittaker_smoother; +pub use cross_validation::{CrossValidationResult, OptimisedSmoothResult}; pub use errors::WhittakerError; pub use errors::WHITTAKER_X_EPSILON; pub use whittaker_smoother::WhittakerSmoother; diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index 73b0979..d74fa09 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -1,5 +1,6 @@ +use crate::cross_validation::every_tenth_element; use crate::errors::WhittakerError; -use crate::WHITTAKER_X_EPSILON; +use crate::{CrossValidationResult, OptimisedSmoothResult, WHITTAKER_X_EPSILON}; use nalgebra::{DMatrix, DVector}; use sprs::FillInReduction::ReverseCuthillMcKee; @@ -232,6 +233,7 @@ impl WhittakerSmoother { self.weights_mat .as_ref() .unwrap() + .diag() .data() .iter() .zip(y_input) @@ -243,58 +245,11 @@ impl WhittakerSmoother { }; } - /// TODO: Document - pub fn optimise_and_smooth( - &mut self, - y_input: &[f64], - break_serial_correlation: bool, - ) -> Result<(Vec, f64), WhittakerError> { - let mut start_lambda_log = (1e-2_f64).log10(); // -2 - let end_lambda_log = (1e8_f64).log10(); // 8 - - let mut smoothed_data = Vec::::new(); - let mut optimal_lambda = f64::MAX; - let mut cve = f64::MAX; - - if break_serial_correlation { - // TODO! Make this work when we already have weights. - let cross_validation_weights: Vec = (0..self.data_length) - .map(|x| if x % 10 == 0 { 1.0 } else { 0.0 }) - .collect::>(); - - println!("Weights: {:?}", cross_validation_weights); - self.update_weights(&cross_validation_weights)?; - } - - while start_lambda_log != end_lambda_log { - let new_lambda = 10_f64.powf(start_lambda_log); - - self.update_lambda(new_lambda)?; - let res = self.smooth_and_cross_validate(y_input)?; - - if res.1 < cve { - cve = res.1; - smoothed_data = res.0; - optimal_lambda = new_lambda; - } - println!("Lambda: {}, Error: {}", new_lambda, res.1); - start_lambda_log += 0.5; - } - - if break_serial_correlation { - self.update_lambda(optimal_lambda)?; - self.update_weights(&vec![1.0; self.data_length])?; - smoothed_data = self.smooth(y_input)?; - } - - return Ok((smoothed_data, cve)); - } - /// TODO: Document pub fn smooth_and_cross_validate( &self, y_input: &[f64], - ) -> Result<(Vec, f64), WhittakerError> { + ) -> Result { if y_input.len() != self.data_length { return Err(WhittakerError::LengthMismatch( self.data_length, @@ -330,9 +285,12 @@ impl WhittakerSmoother { let lambda1 = self.lambda * (n as f64 / self.data_length as f64).powf(2.0 * self.order as f64); - let to_solve = match self.weights_mat.as_ref() { + let to_inverse = match self.weights_mat.as_ref() { Some(x) => { - let weights_vec = g.iter().map(|index| x.data()[*index]).collect::>(); + let weights_vec = g + .iter() + .map(|index| x.diag().data()[*index]) + .collect::>(); let diags = (0..weights_vec.len() + 1).collect::>(); @@ -343,15 +301,17 @@ impl WhittakerSmoother { weights_vec.clone(), ); + // println!("EWeights mat: {:#?}", weights_mat); + &weights_mat + &(&(&d1.transpose_view() * &d1) * lambda1) } None => &e1 + &(&(&d1.transpose_view() * &d1) * lambda1), }; let hat_matrix = DMatrix::from_iterator( - to_solve.rows(), - to_solve.cols(), - to_solve.to_dense().into_iter(), + to_inverse.rows(), + to_inverse.cols(), + to_inverse.to_dense().into_iter(), ) .lu() .solve(&DMatrix::identity(n, n)) @@ -383,7 +343,7 @@ impl WhittakerSmoother { let h = match self.weights_mat.as_ref() { Some(x) => f .iter() - .zip(x.data()) + .zip(x.diag().data()) .map(|(index, weight)| weight * h1[*index] * vk / h1k1) .collect::>(), None => f @@ -404,7 +364,12 @@ impl WhittakerSmoother { None => (r.transpose() * r).sum() / self.data_length as f64, } .sqrt(); - return Ok((smoothed_series, cve)); + + return Ok(CrossValidationResult { + lambda: self.get_lambda(), + smoothed: smoothed_series, + cross_validation_error: cve, + }); } else { let mut hat_matrix = DMatrix::from_iterator( self.to_solve.rows(), @@ -432,9 +397,89 @@ impl WhittakerSmoother { } .sqrt(); - return Ok((smoothed_series, cve)); + return Ok(CrossValidationResult { + lambda: self.get_lambda(), + smoothed: smoothed_series, + cross_validation_error: cve, + }); } } + + /// TODO + pub fn smooth_and_optimise( + &mut self, + y_input: &[f64], + break_serial_correlation: bool, + ) -> Result { + let step = 0.2; + let mut start_lambda_log = (1e-2_f64).log10(); // -2 + let end_lambda_log = (1e8_f64).log10(); // 8 + + let mut optimal_index = 0; + let mut validation_results = Vec::new(); + let mut min_cve = f64::MAX; + + let mut possible_new_config = if break_serial_correlation { + let every_n_y_input = every_tenth_element(y_input); + + let new_length = every_n_y_input.len(); + + let every_n_x_input = self.x_input.as_ref().map(|x| every_tenth_element(&x)); + + let every_n_weight = self + .weights_mat + .as_ref() + .map(|x| every_tenth_element(x.diag().data())); + + let new_smoother = WhittakerSmoother::new( + 1.0, + self.order, + new_length, + every_n_x_input.as_ref(), + every_n_weight.as_ref(), + )?; + Some((new_smoother, every_n_y_input)) + } else { + None + }; + + let mut loop_counter = 0; + while (start_lambda_log - end_lambda_log - step).abs() > 1e-6 { + let new_lambda = 10_f64.powf(start_lambda_log); + + let res = match possible_new_config.as_mut() { + Some((new_smoother, y)) => { + new_smoother.update_lambda(new_lambda)?; + new_smoother.smooth_and_cross_validate(y)? + } + None => { + self.update_lambda(new_lambda)?; + self.smooth_and_cross_validate(y_input)? + } + }; + + if res.cross_validation_error < min_cve { + optimal_index = loop_counter; + min_cve = res.cross_validation_error + } + validation_results.push(res); + + start_lambda_log += step; + loop_counter += 1; + } + + if break_serial_correlation { + for res in validation_results.iter_mut() { + self.update_lambda(res.lambda)?; + res.smoothed = self.smooth(y_input)?; + } + } + + return Ok(OptimisedSmoothResult { + validation_results, + optimal_index, + }); + } } /// Dividing differencing matrix of order d diff --git a/tests/data/original_scripts/demo_nmrcv.m b/tests/data/original_scripts/demo_nmrcv.m new file mode 100644 index 0000000..a66b454 --- /dev/null +++ b/tests/data/original_scripts/demo_nmrcv.m @@ -0,0 +1,40 @@ +% Demonstration NMR spectrum smoothing with Whittaker smoother +% Optimal smoothing by cross-validation +% +% Paul Eilers, 2003 + +% Get the data +y = load('nmr.dat'); +m = 1000; +y = y(1:m); + +% Smooth for series of lambdas +lambdas = 10 .^ (-2:0.5:8); +cvs = []; +for lambda = lambdas + [z cv] = whitsm(y, lambda, 2); + cvs = [cvs cv]; +end +disp(sprintf('%d,', lambdas)); +disp(cvs) + +% Choose optimal lambda +[cm ci] = min(cvs); +lambda = lambdas(ci); +[z cv] = whitsm(y, lambda, 2); + +% Plot data and smooth +subplot(2, 1, 1); +plot([z-10 y] ) % Downward shift for visibility +title('NMR spectrum and optimal smooth') +xlabel('Channel') +ylabel('Signal strength') + +% Plot CV profile +subplot(2, 1, 2) +semilogx(lambdas, cvs) +title('Cross-validation error') +xlabel('\lambda') +set(gcf, 'PaperPosition', [1 2 6 6]) +ylabel('CVE') + diff --git a/tests/validation.rs b/tests/validation.rs index 958b6f3..0abd5cf 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -170,7 +170,7 @@ fn cross_validation_no_weights_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve.1, 1.5806, epsilon = 1e-4); // Produced from matlab scripts. + assert_relative_eq!(cve.cross_validation_error, 1.5806, epsilon = 1e-4); // Produced from matlab scripts. whittaker_smoother.update_order(3).unwrap(); @@ -178,7 +178,7 @@ fn cross_validation_no_weights_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve.1, 1.6178, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 1.6178, epsilon = 1e-4); } #[test] fn cross_validation_no_weights() { @@ -191,7 +191,7 @@ fn cross_validation_no_weights() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve.1, 3.3568, epsilon = 1e-4); // Produced from matlab scripts. + assert_relative_eq!(cve.cross_validation_error, 3.3568, epsilon = 1e-4); // Produced from matlab scripts. whittaker_smoother.update_order(3).unwrap(); @@ -199,7 +199,7 @@ fn cross_validation_no_weights() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve.1, 2.6859, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 2.6859, epsilon = 1e-4); } #[test] fn cross_validation_weights_100() { @@ -218,7 +218,7 @@ fn cross_validation_weights_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve.1, 1.7282, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 1.7282, epsilon = 1e-4); } #[test] fn cross_validation_weights() { @@ -237,7 +237,7 @@ fn cross_validation_weights() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve.1, 3.4549, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 3.4549, epsilon = 1e-4); } #[test] fn cross_validation_weights_x_input_100() { @@ -256,7 +256,7 @@ fn cross_validation_weights_x_input_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve.1, 1.7426, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 1.7426, epsilon = 1e-4); } #[test] fn cross_validation_weights_x_input() { @@ -275,7 +275,7 @@ fn cross_validation_weights_x_input() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve.1, 3.0762, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 3.0762, epsilon = 1e-4); } #[test] fn cross_validation_x_input_100() { @@ -294,7 +294,7 @@ fn cross_validation_x_input_100() { .smooth_and_cross_validate(&input_data.y[..100]) .unwrap(); - assert_relative_eq!(cve.1, 1.5872, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 1.5872, epsilon = 1e-4); } #[test] fn cross_validation_x_input() { @@ -313,17 +313,36 @@ fn cross_validation_x_input() { .smooth_and_cross_validate(&input_data.y) .unwrap(); - assert_relative_eq!(cve.1, 3.0413, epsilon = 1e-4); + assert_relative_eq!(cve.cross_validation_error, 3.0413, epsilon = 1e-4); } #[test] fn smooth_and_optimise() { let input_data = read_input_to_vecs(); - let mut whittaker_smoother = - WhittakerSmoother::new(2e4, 2, input_data.y.len(), None, None).unwrap(); + let mut whittaker_smoother = WhittakerSmoother::new( + 2e4, + 2, + input_data.y[..1000].len(), + None, + Some(&vec![1.0; input_data.y[..1000].len()]), + ) + .unwrap(); + + let cve = whittaker_smoother + .smooth_and_optimise(&input_data.y[..1000], false) + .unwrap(); + + let expected = vec![ + 2.1757, 2.1454, 2.0975, 2.0590, 2.0582, 2.0985, 2.1701, 2.2592, 2.3701, 2.5221, 2.7156, + 2.9600, 3.2329, 3.4642, 3.6345, 3.7721, 3.9201, 4.1200, 4.4106, 4.8411, 5.5055, + ]; + + for (actual, expected) in cve.validation_results.iter().zip(expected) { + assert_relative_eq!(actual.cross_validation_error, expected, epsilon = 1e-4); + } - let cve = whittaker_smoother.optimise_and_smooth(&input_data.y, false); + // println!("Cve: {:?}", cve); } const INPUT_DATA_LOC: &'static str = "tests/data/input/nmr_with_weights_and_x.csv"; From 30245acb80c1f944a1fb22c69dd08590cb2c8939 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Thu, 8 Feb 2024 07:46:16 +0000 Subject: [PATCH 05/12] feat: :sparkles: Python wrapper working --- Cargo.toml | 2 +- README.md | 179 +++++++++--------- examples/smooth_and_interpolate.rs | 6 +- src/cross_validation.rs | 4 +- src/errors.rs | 119 ++++++------ src/whittaker_smoother.rs | 32 ++-- .../original_scripts/demo_wood_cross_val.m | 54 ++++++ tests/data/original_scripts/readme.md | 125 ++++++------ whittaker-eilers-py/Cargo.toml | 38 ++-- .../examples/cross_validation.py | 55 ++++++ whittaker-eilers-py/src/cross_validation.rs | 33 ++++ whittaker-eilers-py/src/errors.rs | 58 +++--- whittaker-eilers-py/src/lib.rs | 24 +-- whittaker-eilers-py/src/whittaker_smoother.rs | 23 +++ whittaker-eilers-py/whittaker_eilers.pyi | 179 ++++++++++-------- 15 files changed, 566 insertions(+), 365 deletions(-) create mode 100644 tests/data/original_scripts/demo_wood_cross_val.m create mode 100644 whittaker-eilers-py/examples/cross_validation.py create mode 100644 whittaker-eilers-py/src/cross_validation.rs diff --git a/Cargo.toml b/Cargo.toml index db2e095..f77bf59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "whittaker-eilers" -version = "0.1.1" +version = "0.1.3" edition = "2021" description = "A sparse matrix implementation of Whittaker-Eilers smoothing and interpolation" diff --git a/README.md b/README.md index 0168f11..5d8a4d0 100644 --- a/README.md +++ b/README.md @@ -1,90 +1,89 @@ -# Whittaker-Eilers Smoothing and Interpolation -**The Whittaker-Eilers smoother is the perfect smoother.** It offers extremely quick, efficient smoothing with built-in interpolation via weights on each measurement. This crate provides a sparse-matrix implementation for additional speed and memory efficiency and can handle both equally and unequally spaced measurements. - ---- - -```toml -[dependencies] -whittaker-eilers = "0.1.1" -``` - -## Usage -To start smoothing and interpolating data, create a reusable WhittakerSmoother struct via the `new` function. You'll only need to recreate this struct if the length or sampling rate of your data changes. - -### Equally spaced data -This is the fastest smoothing option. It smooths equally spaced y measurements using two tunable parameters, `lambda` (2e4) and the smoother `order` (2). The larger the lambda, the smoother the data. -```rust -use whittaker_eilers::WhittakerSmoother; - -let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; - -let whittaker_smoother = - WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), None, None) - .unwrap(); - -let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); -println!("Smoothed data: {:?}", smoothed_data); -``` - - - -### Non-equally spaced data -If you wish to smooth unequally spaced data, you need to provide an `x_input` with the sample times/positions. -```rust -use whittaker_eilers::WhittakerSmoother; - -let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; -let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; - -let whittaker_smoother = - WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), None) - .unwrap(); - -let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); - -println!("Smoothed data: {:?}", smoothed_data); - -``` - -### Weighted data & Interpolation -Each measurement can then be weighted to trust some measurements more than others. Setting `weights` to 0 for measurements will lead to interpolation. -```rust -use whittaker_eilers::WhittakerSmoother; - -let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; -let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; -let mut weights = vec![1.0; x_input.len()]; -weights[5] = 0.0; - -let whittaker_smoother = - WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), Some(&weights)) - .unwrap(); - -let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); - -println!("Smoothed data: {:?}", smoothed_data); - -``` -You can use these methods in combination with each other for instance, interpolating measurements without providing an x input. For more advanced examples of usage take a look at the examples, tests, and benches in the [Github](https://github.com/AnBowell/whittaker-eilers) repository. Here's an image of some smoothed data from an example: - -Time-series smoothed by Whittaker-Eilers method - - -## Further Reading -If you'd like to see a more detailed run through of the library, check out this [Medium post](https://medium.com/towards-data-science/the-perfect-way-to-smooth-your-noisy-data-4f3fe6b44440). Within it, I run through examples and benchmarks against other smoothing methods. - -## Future Features -- Cross validation options to find optimal lambda. -- Scatter plot smoothing -- Generic typing - - - - -## References -The algorithm implemented here mirrors a 2003 implementation by Paul H. C. Eilers in Matlab. I've included scripts and data from the original paper in the tests for this crate. The original paper and code can be found here: - -[A Perfect Smoother](https://pubs.acs.org/doi/10.1021/ac034173t) -Paul H. C. Eilers -Analytical Chemistry 2003 75 (14), 3631-3636 -DOI: 10.1021/ac034173t +# Whittaker-Eilers Smoothing and Interpolation +**The Whittaker-Eilers smoother is the perfect smoother.** It offers extremely quick, efficient smoothing with built-in interpolation via weights on each measurement. This crate provides a sparse-matrix implementation for additional speed and memory efficiency and can handle both equally and unequally spaced measurements. + +--- + +```toml +[dependencies] +whittaker-eilers = "0.1.1" +``` + +## Usage +To start smoothing and interpolating data, create a reusable WhittakerSmoother struct via the `new` function. You'll only need to recreate this struct if the length or sampling rate of your data changes. + +### Equally spaced data +This is the fastest smoothing option. It smooths equally spaced y measurements using two tunable parameters, `lambda` (2e4) and the smoother `order` (2). The larger the lambda, the smoother the data. +```rust +use whittaker_eilers::WhittakerSmoother; + +let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; + +let whittaker_smoother = + WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), None, None) + .unwrap(); + +let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); +println!("Smoothed data: {:?}", smoothed_data); +``` + + + +### Non-equally spaced data +If you wish to smooth unequally spaced data, you need to provide an `x_input` with the sample times/positions. +```rust +use whittaker_eilers::WhittakerSmoother; + +let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; +let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; + +let whittaker_smoother = + WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), None) + .unwrap(); + +let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); + +println!("Smoothed data: {:?}", smoothed_data); + +``` + +### Weighted data & Interpolation +Each measurement can then be weighted to trust some measurements more than others. Setting `weights` to 0 for measurements will lead to interpolation. +```rust +use whittaker_eilers::WhittakerSmoother; + +let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; +let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; +let mut weights = vec![1.0; x_input.len()]; +weights[5] = 0.0; + +let whittaker_smoother = + WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), Some(&weights)) + .unwrap(); + +let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); + +println!("Smoothed data: {:?}", smoothed_data); + +``` +You can use these methods in combination with each other for instance, interpolating measurements without providing an x input. For more advanced examples of usage take a look at the examples, tests, and benches in the [Github](https://github.com/AnBowell/whittaker-eilers) repository. Here's an image of some smoothed data from an example: + +Time-series smoothed by Whittaker-Eilers method + + +## Further Reading +If you'd like to see a more detailed run through of the library, check out this [Medium post](https://medium.com/towards-data-science/the-perfect-way-to-smooth-your-noisy-data-4f3fe6b44440). Within it, I run through examples and benchmarks against other smoothing methods. + +## Future Features +- Scatter plot smoothing +- Generic typing + + + + +## References +The algorithm implemented here mirrors a 2003 implementation by Paul H. C. Eilers in Matlab. I've included scripts and data from the original paper in the tests for this crate. The original paper and code can be found here: + +[A Perfect Smoother](https://pubs.acs.org/doi/10.1021/ac034173t) +Paul H. C. Eilers +Analytical Chemistry 2003 75 (14), 3631-3636 +DOI: 10.1021/ac034173t diff --git a/examples/smooth_and_interpolate.rs b/examples/smooth_and_interpolate.rs index d5e7f7d..4d7c786 100644 --- a/examples/smooth_and_interpolate.rs +++ b/examples/smooth_and_interpolate.rs @@ -117,9 +117,9 @@ fn smooth_and_interpolate( plot.show(); } -fn smooth_cross_validate(x_input: &Vec, y_input: &Vec, order: usize) { +fn smooth_cross_validate(x_input: &Vec, y_input: &Vec, weights: &Vec, order: usize) { let optimal_smooth_with_error = - WhittakerSmoother::new(100.0, order, y_input.len(), Some(x_input), None) + WhittakerSmoother::new(100.0, order, y_input.len(), Some(x_input), Some(weights)) .unwrap() .smooth_and_optimise(&y_input, true) .unwrap(); @@ -192,7 +192,7 @@ fn main() { smooth_with_weights(&x_input_with_noise, &y_input, &weights, lambda, order); - smooth_cross_validate(&x_input, &y_input, order); + smooth_cross_validate(&x_input, &y_input, &weights, order); for i in 30..60 { y_input[i] = 0.0; // Set y to some arbitrary value we want to interpolate. diff --git a/src/cross_validation.rs b/src/cross_validation.rs index 3fb130f..2f650b6 100644 --- a/src/cross_validation.rs +++ b/src/cross_validation.rs @@ -22,10 +22,10 @@ pub struct CrossValidationResult { pub cross_validation_error: f64, } -pub(crate) fn every_tenth_element(data: &[f64]) -> Vec { +pub(crate) fn every_fifth_element(data: &[f64]) -> Vec { data.iter() .enumerate() - .filter(|(index, _)| index % 10 == 0) + .filter(|(index, _)| index % 5 == 0) .map(|(_, val)| *val) .collect::>() } diff --git a/src/errors.rs b/src/errors.rs index 0aeebb5..054e15b 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,57 +1,62 @@ -use sprs::errors::LinalgError; - -/// The smallest difference allowed between elements of x inputs. -/// -/// When using an x input in the Whittaker, the difference between consecutive inputs is calculated -/// and used as a divisor. If this gets too small you'll end up with NaNs everywhere. Easier to prevent it! -pub const WHITTAKER_X_EPSILON: f64 = 1e-6; - - -#[derive(PartialEq, Eq, Debug, Clone)] -/// Common errors that occur within the Whittaker-Eilers smoother. -pub enum WhittakerError { - /// Occurs when two inputs provided (x, y, or weights) do not have the same length. Contains the two lengths. - LengthMismatch(usize, usize), - /// Occurs when input length is smaller than the order of the smoother. Contains the length and order. - DataTooShort(usize, usize), - /// Occurs when the LDLT decomposition fails to solve. Passes through error from [sprs]. Contains the [LinalgError] from [sprs]. - SolverError(LinalgError), - /// Occurs when the x input is more closely spaced than [WHITTAKER_X_EPSILON]. This error prevents NaNs. Contains the offending data index. - SampleRateError(usize), - /// Occurs when the x input is not increasing Monotonically. It should be always increasing; never remaining constant or decreasing. Contains the offending data index. - NotMonotonicallyIncreasing(usize), -} - -impl std::fmt::Display for WhittakerError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - WhittakerError::LengthMismatch(expected, actual) => { - write!( - f, - "Length mismatch: expected {}, got {}.", - expected, actual, - ) - } - WhittakerError::DataTooShort(length, order) => write!( - f, - "Input too short. Data must be longer than the order of the smoother. Data length: {}, smoother order: {}.", - length, order - ), - WhittakerError::SolverError(linalg_error) => write!( - f, - "Error attempting to create solver for system: {}", - linalg_error - ), - WhittakerError::SampleRateError(position) => write!( - f, - "vals_x input data needs to be spaced a minimum of {} apart. If this is not the case, try scaling up your vals_x. Offending index: {}.", - WHITTAKER_X_EPSILON, position - ), - WhittakerError::NotMonotonicallyIncreasing(position) => write!( - f, - "vals_x input data needs to be monotonically increasing. Offending index: {}", position - ) - } - } -} -impl std::error::Error for WhittakerError {} +use sprs::errors::LinalgError; + +/// The smallest difference allowed between elements of x inputs. +/// +/// When using an x input in the Whittaker, the difference between consecutive inputs is calculated +/// and used as a divisor. If this gets too small you'll end up with NaNs everywhere. Easier to prevent it! +pub const WHITTAKER_X_EPSILON: f64 = 1e-6; + + +#[derive(PartialEq, Eq, Debug, Clone)] +/// Common errors that occur within the Whittaker-Eilers smoother. +pub enum WhittakerError { + /// Occurs when two inputs provided (x, y, or weights) do not have the same length. Contains the two lengths. + LengthMismatch(usize, usize), + /// Occurs when input length is smaller than the order of the smoother. Contains the length and order. + DataTooShort(usize, usize), + /// Occurs when the LDLT decomposition fails to solve. Passes through error from [sprs]. Contains the [LinalgError] from [sprs]. + SolverError(LinalgError), + /// Occurs when the x input is more closely spaced than [WHITTAKER_X_EPSILON]. This error prevents NaNs. Contains the offending data index. + SampleRateError(usize), + /// Occurs when the x input is not increasing Monotonically. It should be always increasing; never remaining constant or decreasing. Contains the offending data index. + NotMonotonicallyIncreasing(usize), + /// Occurs when trying to invert a matrix that cannot be inverted. This can only occur when computing cross validation error. + MatrixNotInvertible +} + +impl std::fmt::Display for WhittakerError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + WhittakerError::LengthMismatch(expected, actual) => { + write!( + f, + "Length mismatch: expected {}, got {}.", + expected, actual, + ) + } + WhittakerError::DataTooShort(length, order) => write!( + f, + "Input too short. Data must be longer than the order of the smoother. Data length: {}, smoother order: {}.", + length, order + ), + WhittakerError::SolverError(linalg_error) => write!( + f, + "Error attempting to create solver for system: {}", + linalg_error + ), + WhittakerError::SampleRateError(position) => write!( + f, + "vals_x input data needs to be spaced a minimum of {} apart. If this is not the case, try scaling up your vals_x. Offending index: {}.", + WHITTAKER_X_EPSILON, position + ), + WhittakerError::NotMonotonicallyIncreasing(position) => write!( + f, + "vals_x input data needs to be monotonically increasing. Offending index: {}", position + ), + WhittakerError::MatrixNotInvertible => write!( + f, "When computing cross validation, a matrix inversion is computed. Your current data is unable to be inverted." + ) + } + } +} +impl std::error::Error for WhittakerError {} diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index d74fa09..5e004f8 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -1,4 +1,4 @@ -use crate::cross_validation::every_tenth_element; +use crate::cross_validation::every_fifth_element; use crate::errors::WhittakerError; use crate::{CrossValidationResult, OptimisedSmoothResult, WHITTAKER_X_EPSILON}; use nalgebra::{DMatrix, DVector}; @@ -217,7 +217,7 @@ impl WhittakerSmoother { /// will result in a lot of overhead. /// /// # Arguments - /// * `vals_y`: The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. + /// * `y_input`: The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. /// /// # Returns: /// The smoothed and interpolated data. @@ -245,7 +245,17 @@ impl WhittakerSmoother { }; } - /// TODO: Document + /// Run Whittaker-Eilers smoothing, interpolation and cross validation. + /// + /// This function will run the smoother and assess the cross validation error on the result. This is defined in Eiler's + /// 2003 paper: "A Perfect Smoother". It involves computing the "hat matrix" or "smoother matrix" which inverses a sparse matrix. The + /// inverse of a sparse matrix is usually dense, so this function will take much longer to run in comparison to just running `smooth`. + /// + /// # Arguments + /// * `y_input`: The values which are to be smoothed and interpolated and have their cross validation error calculated. + /// + /// # Returns + /// whittaker_eilers::cross_validation::CrossValidationResult: The smoothed data, lambda it was smoothed at, and the cross validation error. pub fn smooth_and_cross_validate( &self, y_input: &[f64], @@ -301,8 +311,6 @@ impl WhittakerSmoother { weights_vec.clone(), ); - // println!("EWeights mat: {:#?}", weights_mat); - &weights_mat + &(&(&d1.transpose_view() * &d1) * lambda1) } None => &e1 + &(&(&d1.transpose_view() * &d1) * lambda1), @@ -315,7 +323,7 @@ impl WhittakerSmoother { ) .lu() .solve(&DMatrix::identity(n, n)) - .unwrap(); + .ok_or_else(|| WhittakerError::MatrixNotInvertible)?; let h1 = hat_matrix.diagonal(); @@ -333,8 +341,6 @@ impl WhittakerSmoother { .map(|x| { ((x as f64) * ((n - 1) as f64 / (self.data_length - 1) as f64)).round() as usize }) - // .collect::>() - // .into_iter() .collect::>(); let vk = v[k - 1]; @@ -378,7 +384,7 @@ impl WhittakerSmoother { ) .lu() .solve(&DMatrix::identity(self.data_length, self.data_length)) - .unwrap(); + .ok_or_else(|| WhittakerError::MatrixNotInvertible)?; let weights_vec = self .weights_mat @@ -411,7 +417,7 @@ impl WhittakerSmoother { y_input: &[f64], break_serial_correlation: bool, ) -> Result { - let step = 0.2; + let step = 0.5; let mut start_lambda_log = (1e-2_f64).log10(); // -2 let end_lambda_log = (1e8_f64).log10(); // 8 @@ -420,16 +426,16 @@ impl WhittakerSmoother { let mut min_cve = f64::MAX; let mut possible_new_config = if break_serial_correlation { - let every_n_y_input = every_tenth_element(y_input); + let every_n_y_input = every_fifth_element(y_input); let new_length = every_n_y_input.len(); - let every_n_x_input = self.x_input.as_ref().map(|x| every_tenth_element(&x)); + let every_n_x_input = self.x_input.as_ref().map(|x| every_fifth_element(&x)); let every_n_weight = self .weights_mat .as_ref() - .map(|x| every_tenth_element(x.diag().data())); + .map(|x| every_fifth_element(x.diag().data())); let new_smoother = WhittakerSmoother::new( 1.0, diff --git a/tests/data/original_scripts/demo_wood_cross_val.m b/tests/data/original_scripts/demo_wood_cross_val.m new file mode 100644 index 0000000..4d4fd53 --- /dev/null +++ b/tests/data/original_scripts/demo_wood_cross_val.m @@ -0,0 +1,54 @@ +% Demonstration wood smoothing with Whittaker smoother +% Optimal smoothing by cross-validation +% +% Paul Eilers, 2003 + +% Get the data +y = load('wood.txt'); + + +weights_one = zeros(1, length(y)); +for i = 1:1:length(y) + if mod(i, 10) == 0 + weights_one(i) = 1; + end +end + +x_input = 1:1:length(y); + +disp(y) + +% Smooth for series of lambdas +lambdas = 10 .^ (-2:0.2:8); +cvs = []; +for lambda = lambdas + %[z cv] = whitsmw(y, weights_one', lambda, 2); + [z cv] = whitsmdd(transpose(x_input(1:5:length(y))), y(1:5:length(y)), lambda, 2); + %[z cv] = whitsmddw(transpose(x_input), y,transpose(weights_one), lambda, 2); + cvs = [cvs cv]; +end + + +% Choose optimal lambda +[cm ci] = min(cvs); +lambda = lambdas(ci); +[z cv] = whitsm(y, lambda, 2); + +% Plot data and smooth +subplot(2, 1, 1); +plot([y z] ) % Downward shift for visibility +title('Wood optmial smooth') +xlabel('Channel') +ylabel('Signal strength') + +% Plot CV profile +subplot(2, 1, 2) +semilogx(lambdas, cvs) +title('Cross-validation error') +xlabel('\lambda') +set(gcf, 'PaperPosition', [1 2 6 6]) +ylim([5 7]) +ylabel('CVE') + + + diff --git a/tests/data/original_scripts/readme.md b/tests/data/original_scripts/readme.md index f1be199..de13935 100644 --- a/tests/data/original_scripts/readme.md +++ b/tests/data/original_scripts/readme.md @@ -1,60 +1,65 @@ -Source Scripts --------------- - -This folder contains the original Matlab scripts that were included in the supporting -information of Paul H. C. Eilers' paper: [A Perfect Smoother](https://pubs.acs.org/doi/10.1021/ac034173t). - -A Perfect Smoother -Paul H. C. Eilers -Analytical Chemistry 2003 75 (14), 3631-3636 -DOI: 10.1021/ac034173t - - -I've modified some of the scripts which begin with "demo", to output a csv which can be -used to test and validate the rust version of the Whittaker smoother. I've included the original ReadMe below. - - ---- - -Whittaker Smoother Toolbox --------------------------- - -This archive contains a small toolbox of Matlab functions and -scripts to show the versatility of the Whittaker Smoother (WS). -Theory and implementation are described in my paper in Analytical -Chemistry, 2003 (Vol 75, pp 3631–3636). - -The functions are: - - whitsm: for smoothing of a complete data series, sampled at - equal intervals; - - whitsmw: for smoothing of a data series, sampled at equal - intervals; some data may be missing, as indicated by a - vector of 0/1 weights; - - whitsmdd: for smoothing of a complete data series, sampled at - unequal intervals; the sampling positions are given by a - (monotone) series x; - - whitsmddw: for smoothing of an incomplete data series, sampled - at unequal intervals; the sampling positions are given by a - (monotone) series x; - - whitscat: for smoothing of a scatterplot with arbitrary x and - y data. -All functions return fitted values and, optionally, the RMS -(leave-one-out) cross-validation error. All functions contain -documentation headers to describe input and output. - -There are demonstration scripts to illustrate the use of the -functions. - -You are free to use and modify these functions in any way you -like, as long as you give proper reference to the original source -and cite the paper. - -Paul Eilers - - -Department of Medical Statistics -Leiden University Medical Centre -P.O. Box 9604 -2300 RC Leiden -The Netherlands -e-mail: p.eilers@lumc.nl +Source Scripts +-------------- + +This folder contains the original Matlab scripts that were included in the supporting +information of Paul H. C. Eilers' paper: [A Perfect Smoother](https://pubs.acs.org/doi/10.1021/ac034173t). + +A Perfect Smoother +Paul H. C. Eilers +Analytical Chemistry 2003 75 (14), 3631-3636 +DOI: 10.1021/ac034173t + + +I've modified some of the scripts which begin with "demo", to output a csv which can be +used to test and validate the rust version of the Whittaker smoother. I was able to recreate most things from the paper and +supporting materials about from the cross validation error graph in Figure 10. + + +I've included the original README below. + + + +--- + +Whittaker Smoother Toolbox +-------------------------- + +This archive contains a small toolbox of Matlab functions and +scripts to show the versatility of the Whittaker Smoother (WS). +Theory and implementation are described in my paper in Analytical +Chemistry, 2003 (Vol 75, pp 3631–3636). + +The functions are: + - whitsm: for smoothing of a complete data series, sampled at + equal intervals; + - whitsmw: for smoothing of a data series, sampled at equal + intervals; some data may be missing, as indicated by a + vector of 0/1 weights; + - whitsmdd: for smoothing of a complete data series, sampled at + unequal intervals; the sampling positions are given by a + (monotone) series x; + - whitsmddw: for smoothing of an incomplete data series, sampled + at unequal intervals; the sampling positions are given by a + (monotone) series x; + - whitscat: for smoothing of a scatterplot with arbitrary x and + y data. +All functions return fitted values and, optionally, the RMS +(leave-one-out) cross-validation error. All functions contain +documentation headers to describe input and output. + +There are demonstration scripts to illustrate the use of the +functions. + +You are free to use and modify these functions in any way you +like, as long as you give proper reference to the original source +and cite the paper. + +Paul Eilers + + +Department of Medical Statistics +Leiden University Medical Centre +P.O. Box 9604 +2300 RC Leiden +The Netherlands +e-mail: p.eilers@lumc.nl diff --git a/whittaker-eilers-py/Cargo.toml b/whittaker-eilers-py/Cargo.toml index 882ad14..ad34e6b 100644 --- a/whittaker-eilers-py/Cargo.toml +++ b/whittaker-eilers-py/Cargo.toml @@ -1,19 +1,19 @@ -[package] -name = "whittaker-eilers-py" -version = "0.1.2" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -name = "whittaker_eilers" -# "cdylib" is necssary to produce a shared library for Python to import from. -crate-type = ["cdylib"] - -[dependencies] -whittaker-eilers-rs = { package = "whittaker-eilers", path = "../" } - -[dependencies.pyo3] -version = "0.19.2" -# "abi3-py37" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.7 -features = ["abi3-py37"] +[package] +name = "whittaker-eilers-py" +version = "0.1.3" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +name = "whittaker_eilers" +# "cdylib" is necssary to produce a shared library for Python to import from. +crate-type = ["cdylib"] + +[dependencies] +whittaker-eilers-rs = { package = "whittaker-eilers", path = "../" } + +[dependencies.pyo3] +version = "0.19.2" +# "abi3-py37" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.7 +features = ["abi3-py37"] diff --git a/whittaker-eilers-py/examples/cross_validation.py b/whittaker-eilers-py/examples/cross_validation.py new file mode 100644 index 0000000..a1b401c --- /dev/null +++ b/whittaker-eilers-py/examples/cross_validation.py @@ -0,0 +1,55 @@ +from whittaker_eilers import WhittakerSmoother +import matplotlib.pyplot as plt +import numpy as np + +data = np.loadtxt("graph.txt", skiprows=5) +axis_color = "#d0d0fa" + +years = data[:, 0] +temp_anom = data[:, 1] +other_smoothed = data[:, 2] + +whittaker_smoother = WhittakerSmoother(lmbda=20, order=2, data_length=len(temp_anom)) + + +res = whittaker_smoother.smooth_and_cross_validate(list(temp_anom)) + + +res = whittaker_smoother.smooth_and_optimise(list(temp_anom)) + +print(res.get_optimal().get_lambda()) + + +(fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + +ax1.spines["bottom"].set_color(axis_color) +ax1.spines["top"].set_color(axis_color) +ax1.spines["right"].set_color(axis_color) +ax1.spines["left"].set_color(axis_color) +ax1.set_xlabel("Year", color=axis_color) +ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) +ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) +ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) +ax1.grid(True, ls="--", alpha=0.6) +ax1.set_facecolor("#00002a") +ax1.set_xlim(1880, 2020) + +ax1.plot( + years, + temp_anom, + color="#fc8d28", + marker=".", + label="Measured", + alpha=0.6, + markersize=8, +) + +ax1.plot( + years, + res.get_optimal().get_smoothed(), + color="#59f176", + lw=2, + label="Whittaker", + solid_capstyle="round", +) +plt.show() diff --git a/whittaker-eilers-py/src/cross_validation.rs b/whittaker-eilers-py/src/cross_validation.rs new file mode 100644 index 0000000..cdd6333 --- /dev/null +++ b/whittaker-eilers-py/src/cross_validation.rs @@ -0,0 +1,33 @@ +use pyo3::prelude::*; + +use whittaker_eilers_rs::CrossValidationResult as CrossValidationResultRs; +use whittaker_eilers_rs::OptimisedSmoothResult as OptimisedSmoothResultRs; + +/// Doc string here +#[pyclass] +#[repr(transparent)] +pub struct OptimisedSmoothResult(pub(crate) OptimisedSmoothResultRs); + +#[pymethods] +impl OptimisedSmoothResult { + pub fn get_optimal(&self) -> CrossValidationResult { + CrossValidationResult(self.0.get_optimal()) + } +} + +#[pyclass] +#[repr(transparent)] +pub struct CrossValidationResult(pub(crate) CrossValidationResultRs); + +#[pymethods] +impl CrossValidationResult { + pub fn get_lambda(&self) -> f64 { + self.0.lambda + } + pub fn get_smoothed(&self) -> Vec { + self.0.smoothed.clone() + } + pub fn get_cross_validation_error(&self) -> f64 { + self.0.cross_validation_error + } +} diff --git a/whittaker-eilers-py/src/errors.rs b/whittaker-eilers-py/src/errors.rs index 196d441..6aa2d99 100644 --- a/whittaker-eilers-py/src/errors.rs +++ b/whittaker-eilers-py/src/errors.rs @@ -1,27 +1,31 @@ -use pyo3::create_exception; -use pyo3::exceptions::PyException; -use pyo3::PyErr; - -use whittaker_eilers_rs::WhittakerError as WhittakerErrorRs; - -pub struct WhittakerError(pub WhittakerErrorRs); - -impl std::convert::From for PyErr { - fn from(err: WhittakerError) -> PyErr { - match &err.0 { - WhittakerErrorRs::LengthMismatch(_, _) => LengthMismatch::new_err(err.0.to_string()), - WhittakerErrorRs::DataTooShort(_, _) => DataTooShort::new_err(err.0.to_string()), - WhittakerErrorRs::SolverError(_) => SolverError::new_err(err.0.to_string()), - WhittakerErrorRs::SampleRateError(_) => SolverError::new_err(err.0.to_string()), - WhittakerErrorRs::NotMonotonicallyIncreasing(_) => { - NotMonotonicallyIncreasing::new_err(err.0.to_string()) - } - } - } -} - -create_exception!(whittaker_eilers, LengthMismatch, PyException); -create_exception!(whittaker_eilers, DataTooShort, PyException); -create_exception!(whittaker_eilers, SolverError, PyException); -create_exception!(whittaker_eilers, SampleRateError, PyException); -create_exception!(whittaker_eilers, NotMonotonicallyIncreasing, PyException); +use pyo3::create_exception; +use pyo3::exceptions::PyException; +use pyo3::PyErr; + +use whittaker_eilers_rs::WhittakerError as WhittakerErrorRs; + +pub struct WhittakerError(pub WhittakerErrorRs); + +impl std::convert::From for PyErr { + fn from(err: WhittakerError) -> PyErr { + match &err.0 { + WhittakerErrorRs::LengthMismatch(_, _) => LengthMismatch::new_err(err.0.to_string()), + WhittakerErrorRs::DataTooShort(_, _) => DataTooShort::new_err(err.0.to_string()), + WhittakerErrorRs::SolverError(_) => SolverError::new_err(err.0.to_string()), + WhittakerErrorRs::SampleRateError(_) => SolverError::new_err(err.0.to_string()), + WhittakerErrorRs::NotMonotonicallyIncreasing(_) => { + NotMonotonicallyIncreasing::new_err(err.0.to_string()) + } + WhittakerErrorRs::MatrixNotInvertible => { + MatrixNotInvertible::new_err(err.0.to_string()) + } + } + } +} + +create_exception!(whittaker_eilers, LengthMismatch, PyException); +create_exception!(whittaker_eilers, DataTooShort, PyException); +create_exception!(whittaker_eilers, SolverError, PyException); +create_exception!(whittaker_eilers, SampleRateError, PyException); +create_exception!(whittaker_eilers, NotMonotonicallyIncreasing, PyException); +create_exception!(whittaker_eilers, MatrixNotInvertible, PyException); diff --git a/whittaker-eilers-py/src/lib.rs b/whittaker-eilers-py/src/lib.rs index 08e05aa..b791862 100644 --- a/whittaker-eilers-py/src/lib.rs +++ b/whittaker-eilers-py/src/lib.rs @@ -1,11 +1,13 @@ -mod errors; -mod whittaker_smoother; -use pyo3::{pymodule, types::PyModule, PyResult, Python}; -use whittaker_smoother::WhittakerSmoother; - -#[pymodule] -fn whittaker_eilers(_py: Python, m: &PyModule) -> PyResult<()> { - #![doc = include_str!("../README.md")] - m.add_class::()?; - Ok(()) -} +mod cross_validation; +mod errors; +mod whittaker_smoother; + +use pyo3::{pymodule, types::PyModule, PyResult, Python}; +use whittaker_smoother::WhittakerSmoother; + +#[pymodule] +fn whittaker_eilers(_py: Python, m: &PyModule) -> PyResult<()> { + #![doc = include_str!("../README.md")] + m.add_class::()?; + Ok(()) +} diff --git a/whittaker-eilers-py/src/whittaker_smoother.rs b/whittaker-eilers-py/src/whittaker_smoother.rs index 96c5c66..c531d2f 100644 --- a/whittaker-eilers-py/src/whittaker_smoother.rs +++ b/whittaker-eilers-py/src/whittaker_smoother.rs @@ -3,6 +3,7 @@ use pyo3::prelude::*; use whittaker_eilers_rs::WhittakerError as WhittakerErrorRs; use whittaker_eilers_rs::WhittakerSmoother as WhittakerSmootherRs; +use crate::cross_validation::{CrossValidationResult, OptimisedSmoothResult}; use crate::errors::WhittakerError; /// A new Whittaker-Eilers smoother and interpolator. @@ -109,6 +110,28 @@ impl WhittakerSmoother { pub fn smooth(&self, y_vals: Vec) -> PyResult> { self.0.smooth(&y_vals).map_err(map_err_to_py) } + + /// TODO: Doc string + pub fn smooth_and_cross_validate(&self, y_input: Vec) -> PyResult { + Ok(CrossValidationResult( + self.0 + .smooth_and_cross_validate(&y_input) + .map_err(map_err_to_py)?, + )) + } + /// TODO: Doc string + #[pyo3(signature = (y_input, break_serial_correlation = true))] + pub fn smooth_and_optimise( + &mut self, + y_input: Vec, + break_serial_correlation: bool, + ) -> PyResult { + Ok(OptimisedSmoothResult( + self.0 + .smooth_and_optimise(&y_input, break_serial_correlation) + .map_err(map_err_to_py)?, + )) + } } fn map_err_to_py(err: WhittakerErrorRs) -> PyErr { diff --git a/whittaker-eilers-py/whittaker_eilers.pyi b/whittaker-eilers-py/whittaker_eilers.pyi index 39aa388..b2311c4 100644 --- a/whittaker-eilers-py/whittaker_eilers.pyi +++ b/whittaker-eilers-py/whittaker_eilers.pyi @@ -1,82 +1,97 @@ -from typing import Optional, Iterable - -# Bit of a pain having to handcraft these! Especially with the docs. -class WhittakerSmoother: - """A new Whittaker-Eilers smoother and interpolator. - - The smoother is configured through it's `lambda` and it's `order`. `Lambda` controls the smoothness of the data (1e2~1e4) and `order` controls - the order of which the penalities are applied (generally 2~4). The smoother can then be configured to weight measurements between 0 and 1 - to interpolate (0 weight) or to complete trust (1 weight) the measurement. The smoother can handle equally spaced measurements by simply not passing - an `x_input` or unequally spaced data by providing the sampling times/positions as `x_input`. - - The smoother parameters can be updated using the provided functions to avoid remaking this costly struct. The only time the WhittakerSmoother should be - remade is when the data length has changed, or a different sampling rate has been provided. - - Parameters - ---------- - lmbda : Controls the smoothing strength, the larger, the smoother. Try 1e2~2e4 to start with and adjust based on the result. `lmbda` must be positive. - order : The order of the filter. Try 2~4 to start with. Order must be positive. - data_length : The length of the data which is to be smoothed. Must be positive. - x_input : The time/position at which the y measurement was taken. Used to smooth unequally spaced data. Must be monotonically increasing. - weights : The weight of each y measurement.""" - - def __init__( - self, - lmbda: float, - order: int, - data_length: int, - x_input: Optional[list] = None, - weights: Optional[list] = None, - ) -> None: ... - def get_order(self) -> int: - """Retrieve the smoother's current order.""" - ... - def get_lambda(self) -> float: - """Retrieve the smoother's current lambda.""" - ... - def get_data_length(self) -> int: - """Retrieve the smoother's current length.""" - ... - def update_weights(self, weights: list) -> None: - """Updates the weights of the data to be smoothed. - The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. - - Parameters - ---------- - weights : The weights of the measurements to be smoothed. The smaller the weight the more the measurement will be ignored. Setting a weight to 0 results in interpolation. - """ - ... - def update_order(self, order: int) -> None: - """Updates the order of the Whittaker-Eilers smoother. - - Efficiently updates the order at which the Whittaker will use to smooth the data. - - Parameters - ---------- - order : The order to smooth.""" - ... - def update_lambda(self, lmbda: float) -> None: - """Updates the smoothing constant `lambda` of the Whittaker-Eilers smoother. - - Efficiently update the target smoothness of the Whittaker smoother. The larger the `lambda`, the smoother the data. - - Parameters - ---------- - lmbda : The smoothing constant of the Whittaker-Eilers smoother. - """ - ... - def smooth(self, y_vals: list) -> list: - """Run Whittaker-Eilers smoothing and interpolation. - - This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth - data of different y values with the sample rate remaining the same, simply call this function with different data. Remaking the `WhittakerSmoother` class - will result in a lot of overhead. - - Parameters - ---------- - vals_y : The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. - - Returns - ------- - The smoothed and interpolated data.""" - ... +from typing import Optional + +class CrossValidationResult: + def get_lambda(self) -> float: ... + def get_smoothed(self) -> list[float]: ... + def get_cross_validation_error(self) -> float: ... + ... + +class OptimisedSmoothResult: + def get_optimal(self) -> CrossValidationResult: ... + +# Bit of a pain having to handcraft these! Especially with the docs. +class WhittakerSmoother: + """A new Whittaker-Eilers smoother and interpolator. + + The smoother is configured through it's `lambda` and it's `order`. `Lambda` controls the smoothness of the data (1e2~1e4) and `order` controls + the order of which the penalities are applied (generally 2~4). The smoother can then be configured to weight measurements between 0 and 1 + to interpolate (0 weight) or to complete trust (1 weight) the measurement. The smoother can handle equally spaced measurements by simply not passing + an `x_input` or unequally spaced data by providing the sampling times/positions as `x_input`. + + The smoother parameters can be updated using the provided functions to avoid remaking this costly struct. The only time the WhittakerSmoother should be + remade is when the data length has changed, or a different sampling rate has been provided. + + Parameters + ---------- + lmbda : Controls the smoothing strength, the larger, the smoother. Try 1e2~2e4 to start with and adjust based on the result. `lmbda` must be positive. + order : The order of the filter. Try 2~4 to start with. Order must be positive. + data_length : The length of the data which is to be smoothed. Must be positive. + x_input : The time/position at which the y measurement was taken. Used to smooth unequally spaced data. Must be monotonically increasing. + weights : The weight of each y measurement.""" + + def __init__( + self, + lmbda: float, + order: int, + data_length: int, + x_input: Optional[list] = None, + weights: Optional[list] = None, + ) -> None: ... + def get_order(self) -> int: + """Retrieve the smoother's current order.""" + ... + def get_lambda(self) -> float: + """Retrieve the smoother's current lambda.""" + ... + def get_data_length(self) -> int: + """Retrieve the smoother's current length.""" + ... + def update_weights(self, weights: list) -> None: + """Updates the weights of the data to be smoothed. + The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. + + Parameters + ---------- + weights : The weights of the measurements to be smoothed. The smaller the weight the more the measurement will be ignored. Setting a weight to 0 results in interpolation. + """ + ... + def update_order(self, order: int) -> None: + """Updates the order of the Whittaker-Eilers smoother. + + Efficiently updates the order at which the Whittaker will use to smooth the data. + + Parameters + ---------- + order : The order to smooth.""" + ... + def update_lambda(self, lmbda: float) -> None: + """Updates the smoothing constant `lambda` of the Whittaker-Eilers smoother. + + Efficiently update the target smoothness of the Whittaker smoother. The larger the `lambda`, the smoother the data. + + Parameters + ---------- + lmbda : The smoothing constant of the Whittaker-Eilers smoother. + """ + ... + def smooth(self, y_vals: list[float]) -> list: + """Run Whittaker-Eilers smoothing and interpolation. + + This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth + data of different y values with the sample rate remaining the same, simply call this function with different data. Remaking the `WhittakerSmoother` class + will result in a lot of overhead. + + Parameters + ---------- + vals_y : The values which are to be smoothed and interpolated by the Whittaker-Eilers smoother. + + Returns + ------- + The smoothed and interpolated data.""" + ... + def smooth_and_cross_validate( + self, y_input: list[float] + ) -> CrossValidationResult: ... + def smooth_and_optimise( + self, y_input: list[float], break_serial_correlation: bool = True + ) -> OptimisedSmoothResult: ... From c58d2f642d4ae1ccc30fbd0a1e76682195c7517d Mon Sep 17 00:00:00 2001 From: Andrew Date: Thu, 8 Feb 2024 14:40:29 +0000 Subject: [PATCH 06/12] docs: :memo: Docs docs docs --- src/cross_validation.rs | 19 ++++++++++++------- src/whittaker_smoother.rs | 22 +++++++++++++++++++--- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/cross_validation.rs b/src/cross_validation.rs index 2f650b6..7c73816 100644 --- a/src/cross_validation.rs +++ b/src/cross_validation.rs @@ -1,24 +1,29 @@ -/// TODO +/// Contains the results of cross validation for a variety of lambdas +/// +/// This struct contains the results of finding the optimal lambda. A vec +/// contains all of the lambdas, smoothed series, and errors. A function then +/// provides the ability to return the optimal one. +/// pub struct OptimisedSmoothResult { - /// TODO + /// The lambda, smoothed series, and errors for each lambda tested. pub validation_results: Vec, pub(crate) optimal_index: usize, } impl OptimisedSmoothResult { - /// TODO + /// Returns the optimally smoothed data series, lambda, and error. pub fn get_optimal(&self) -> CrossValidationResult { return self.validation_results[self.optimal_index].to_owned(); } } -/// TODO +/// The result of smoothing with cross validation #[derive(Clone, Debug)] pub struct CrossValidationResult { - /// TODO + /// The lambda value that was used to smooth the data. pub lambda: f64, - /// TODO + /// The smoothed data. pub smoothed: Vec, - /// TODO + /// The associated cross validation error for the smoothed data. pub cross_validation_error: f64, } diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index 5e004f8..70301e9 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -254,8 +254,8 @@ impl WhittakerSmoother { /// # Arguments /// * `y_input`: The values which are to be smoothed and interpolated and have their cross validation error calculated. /// - /// # Returns - /// whittaker_eilers::cross_validation::CrossValidationResult: The smoothed data, lambda it was smoothed at, and the cross validation error. + /// # Returns: + /// [CrossValidationResult]: The smoothed data, lambda it was smoothed at, and the cross validation error. pub fn smooth_and_cross_validate( &self, y_input: &[f64], @@ -411,7 +411,23 @@ impl WhittakerSmoother { } } - /// TODO + /// Runs Whittaker-Eilers smoothing for a variety of lambdas and selects the optimally smoothed time series. + /// + /// This function runs the smoother for lambdas varying from 1e-2 to 1e8 in logarithmic steps of 0.5. It computes the + /// hat/smoother matrix and finds the optimal lambda for the data. If the time-series exhibits serial correlation the optimal + /// lambda can be very small and mean the smoothed data doesn't differ from the input data. To avoid this, use `break_serial_correlation = true` + /// + /// It will return the smoothed data, lambda, and cross validation error for each lambda tested! + /// + /// As the smoother matrix requires the inversion of a sparse matrix (which usually becomes a dense matrix), this code is extremely slow compared to smoothing + /// with a known lambda. Use sparingly! + /// + /// # Arguments + /// * `y_input`: The values which are to be smoothed, interpolated, and cross validated for a variety of lambdas. + /// * `break_serial_correlation`: Default here should be `true`. Without it, data that exhibits serial correlation is barely smoothed. + /// + /// # Returns: + /// [OptimisedSmoothResult]: The smoothed data, lambda, and error for each tested lambda. pub fn smooth_and_optimise( &mut self, y_input: &[f64], From 9acc2a517f8a3cb4666631a0bf148466987ba953 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Fri, 9 Feb 2024 08:39:34 +0000 Subject: [PATCH 07/12] docs: :memo: Adding new optimal smoothing and cross validation --- README.md | 48 +++ benches/whittaker.rs | 14 +- examples/smooth_and_interpolate.rs | 4 +- src/cross_validation.rs | 3 +- src/whittaker_smoother.rs | 18 +- tests/validation.rs | 4 +- whittaker-eilers-py/README.md | 274 +++++++++------ .../examples/cross_validation.py | 23 +- .../examples/whittaker_compare_interp.py | 322 +++++++++--------- whittaker-eilers-py/src/cross_validation.rs | 19 +- whittaker-eilers-py/src/whittaker_smoother.rs | 47 ++- whittaker-eilers-py/whittaker_eilers.pyi | 62 +++- 12 files changed, 522 insertions(+), 316 deletions(-) diff --git a/README.md b/README.md index 5d8a4d0..ceec20c 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,54 @@ let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); println!("Smoothed data: {:?}", smoothed_data); ``` + + +### Smoothing with cross validation +With this package, you can also calculate the cross validation error alongside the smoothed series. This shouldn't really be used in production where speed is necessary though! + + +```rust +use whittaker_eilers::WhittakerSmoother; + +let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0 ,11.0, 12.0, 13.0]; +let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0, 11.0, 12.0, 13.0]; + +let whittaker_smoother = + WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), None) + .unwrap(); + +let smoothed_and_cross_validated = whittaker_smoother.smooth_and_cross_validate(&data_to_smooth).unwrap(); + +println!("Result: {:?}", smoothed_and_cross_validated); +``` + +### Automatic smoothing +Smoothing data requires a choice of Lambda. This can be done using visual inspection or by finding the lambda +which results in the lowest cross validation error. The `smooth_optimal` function runs the smoother for a variety of lambdas and returns the results with the ability to retrieve the optimal one. + + +```rust +use whittaker_eilers::WhittakerSmoother; + +let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0 ,11.0, 12.0, 13.0]; +let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0, 11.0, 12.0, 13.0]; + +let mut whittaker_smoother = + WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), None) + .unwrap(); + +let results = whittaker_smoother.smooth_optimal(&data_to_smooth, true).unwrap(); + +println!("Result: {:?}", results); + +println!("Optimal result: {:?}", results.get_optimal()); + +``` + + +--- + + You can use these methods in combination with each other for instance, interpolating measurements without providing an x input. For more advanced examples of usage take a look at the examples, tests, and benches in the [Github](https://github.com/AnBowell/whittaker-eilers) repository. Here's an image of some smoothed data from an example: Time-series smoothed by Whittaker-Eilers method diff --git a/benches/whittaker.rs b/benches/whittaker.rs index ee57e33..828c75c 100644 --- a/benches/whittaker.rs +++ b/benches/whittaker.rs @@ -1,5 +1,5 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use whittaker_eilers::{CrossValidationResult, WhittakerSmoother}; +use whittaker_eilers::{CrossValidationResult, OptimisedSmoothResult, WhittakerSmoother}; fn new_y_whittaker(y: &Vec) -> Vec { WhittakerSmoother::new(2e4, 2, y.len(), None, None) @@ -13,7 +13,12 @@ fn new_y_whittaker_cross_validate(y: &Vec) -> CrossValidationResult { .smooth_and_cross_validate(y) .unwrap() } - +fn new_y_whittaker_optimal(y: &Vec) -> OptimisedSmoothResult { + WhittakerSmoother::new(2e4, 2, y.len(), None, None) + .unwrap() + .smooth_optimal(y, true) + .unwrap() +} fn new_x_y_whittaker(x: &Vec, y: &Vec) -> Vec { WhittakerSmoother::new(2e4, 2, y.len(), Some(x), None) .unwrap() @@ -47,6 +52,9 @@ fn criterion_benchmark(c: &mut Criterion) { c.bench_function("Whittaker Wood Cross validate", |b| { b.iter(|| new_y_whittaker_cross_validate(black_box(&wood_data_vec))) }); + c.bench_function("Whittaker Wood Cross validate Optimal", |b| { + b.iter(|| new_y_whittaker_optimal(black_box(&wood_data_vec))) + }); c.bench_function("Whittaker Wood X and Y", |b| { b.iter(|| new_x_y_whittaker(black_box(&wood_x_vec), black_box(&wood_data_vec))) }); @@ -66,8 +74,6 @@ fn criterion_benchmark(c: &mut Criterion) { criterion_group!(benches, criterion_benchmark); criterion_main!(benches); -/// Data from this repo: -/// pub const WOOD_DATASET: &[f64] = &[ 106.0, 111.0, 111.0, 107.0, 105.0, 107.0, 110.0, 108.0, 111.0, 119.0, 117.0, 107.0, 105.0, 107.0, 109.0, 105.0, 104.0, 102.0, 108.0, 113.0, 113.0, 107.0, 103.0, 103.0, 98.0, 102.0, diff --git a/examples/smooth_and_interpolate.rs b/examples/smooth_and_interpolate.rs index 4d7c786..17e6584 100644 --- a/examples/smooth_and_interpolate.rs +++ b/examples/smooth_and_interpolate.rs @@ -121,7 +121,7 @@ fn smooth_cross_validate(x_input: &Vec, y_input: &Vec, weights: &Vec>(); let mut weights = weights_distribution diff --git a/src/cross_validation.rs b/src/cross_validation.rs index 7c73816..de18c6e 100644 --- a/src/cross_validation.rs +++ b/src/cross_validation.rs @@ -4,6 +4,7 @@ /// contains all of the lambdas, smoothed series, and errors. A function then /// provides the ability to return the optimal one. /// +#[derive(Clone, Debug)] pub struct OptimisedSmoothResult { /// The lambda, smoothed series, and errors for each lambda tested. pub validation_results: Vec, @@ -13,7 +14,7 @@ pub struct OptimisedSmoothResult { impl OptimisedSmoothResult { /// Returns the optimally smoothed data series, lambda, and error. pub fn get_optimal(&self) -> CrossValidationResult { - return self.validation_results[self.optimal_index].to_owned(); + self.validation_results[self.optimal_index].to_owned() } } /// The result of smoothing with cross validation diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index 70301e9..bb4356a 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -371,11 +371,11 @@ impl WhittakerSmoother { } .sqrt(); - return Ok(CrossValidationResult { + Ok(CrossValidationResult { lambda: self.get_lambda(), smoothed: smoothed_series, cross_validation_error: cve, - }); + }) } else { let mut hat_matrix = DMatrix::from_iterator( self.to_solve.rows(), @@ -403,11 +403,11 @@ impl WhittakerSmoother { } .sqrt(); - return Ok(CrossValidationResult { + Ok(CrossValidationResult { lambda: self.get_lambda(), smoothed: smoothed_series, cross_validation_error: cve, - }); + }) } } @@ -424,11 +424,11 @@ impl WhittakerSmoother { /// /// # Arguments /// * `y_input`: The values which are to be smoothed, interpolated, and cross validated for a variety of lambdas. - /// * `break_serial_correlation`: Default here should be `true`. Without it, data that exhibits serial correlation is barely smoothed. + /// * `break_serial_correlation`: Default here should be `true`. Without it data that exhibits serial correlation is barely smoothed. /// /// # Returns: - /// [OptimisedSmoothResult]: The smoothed data, lambda, and error for each tested lambda. - pub fn smooth_and_optimise( + /// [OptimisedSmoothResult]: The smoothed data, lambda, and error for each tested lambda. Calling get_optimal, returns the best smoothed series. + pub fn smooth_optimal( &mut self, y_input: &[f64], break_serial_correlation: bool, @@ -497,10 +497,10 @@ impl WhittakerSmoother { } } - return Ok(OptimisedSmoothResult { + Ok(OptimisedSmoothResult { validation_results, optimal_index, - }); + }) } } diff --git a/tests/validation.rs b/tests/validation.rs index 0abd5cf..1c26017 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -330,7 +330,7 @@ fn smooth_and_optimise() { .unwrap(); let cve = whittaker_smoother - .smooth_and_optimise(&input_data.y[..1000], false) + .smooth_optimal(&input_data.y[..1000], false) .unwrap(); let expected = vec![ @@ -341,8 +341,6 @@ fn smooth_and_optimise() { for (actual, expected) in cve.validation_results.iter().zip(expected) { assert_relative_eq!(actual.cross_validation_error, expected, epsilon = 1e-4); } - - // println!("Cve: {:?}", cve); } const INPUT_DATA_LOC: &'static str = "tests/data/input/nmr_with_weights_and_x.csv"; diff --git a/whittaker-eilers-py/README.md b/whittaker-eilers-py/README.md index 1d9cb3f..e5730f1 100644 --- a/whittaker-eilers-py/README.md +++ b/whittaker-eilers-py/README.md @@ -1,112 +1,162 @@ -# Whittaker-Eilers Smoothing and Interpolation -**The Whittaker-Eilers smoother is the perfect smoother.** It offers extremely quick, efficient smoothing with built-in interpolation via weights on each measurement. This package provides a sparse-matrix implementation for additional speed and memory efficiency and can handle both equally and unequally spaced measurements. This package was originally written in Rust so additional examples, tests, and benchmarks are also available in addition to it being super speedy. The API is almost identical. - ---- - -```bash -pip install whittaker-eilers -``` - -## Usage -To start smoothing and interpolating data, create a reusable WhittakerSmoother class. You'll only need to recreate this class if the length or sampling rate of your data changes. - -### Equally spaced data -This is the fastest smoothing option. It smooths equally spaced y measurements using two tunable parameters, `lambda` (2e4) and the smoother `order` (2). The larger the lambda, the smoother the data. -```python -from whittaker_eilers import WhittakerSmoother - -data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] - -whittaker_smoother = WhittakerSmoother(lmbda=2e4, order=2, data_length = len(data_to_smooth)) - -smoothed_data = whittaker_smoother.smooth(data_to_smooth) - -print("Smoothed data: {}".format(smoothed_data)) -``` - - - -### Non-equally spaced data -If you wish to smooth unequally spaced data, you need to provide an `x_input` with the sample times/positions. -```python -from whittaker_eilers import WhittakerSmoother - -x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] -data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] - -whittaker_smoother = WhittakerSmoother( - lmbda=2e4, order=2, data_length=len(data_to_smooth), x_input=x_input -) - -smoothed_data = whittaker_smoother.smooth(data_to_smooth) - -print("Smoothed non-equally spaced data: {}".format(smoothed_data)) - - -``` - -### Weighted data & Interpolation -Each measurement can then be weighted to trust some measurements more than others. Setting `weights` to 0 for measurements will lead to interpolation. -```python -from whittaker_eilers import WhittakerSmoother - -x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] -data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] -weights = [1.0] * len(x_input) -weights[5] = 0.0 - -whittaker_smoother = WhittakerSmoother( - lmbda=2e4, - order=2, - data_length=len(data_to_smooth), - x_input=x_input, - weights=weights, -) - -smoothed_data = whittaker_smoother.smooth(data_to_smooth) - -print("Smoothed and interpolated weighted data: {}".format(smoothed_data)) - -``` -You can use these methods in combination with each other for instance, interpolating measurements without providing an x input. For more advanced examples of usage take a look at the examples, tests, and benches in the [Github](https://github.com/AnBowell/whittaker-eilers) repository. Here's an image of some smoothed data from an example: - -Time-series smoothed by Whittaker-Eilers method - -### Other methods -```python -from whittaker_eilers import WhittakerSmoother -x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] -data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] -weights = [1.0] * len(x_input) -weights[5] = 0.0 - -whittaker_smoother = WhittakerSmoother( - lmbda=2e4, - order=2, - data_length=len(data_to_smooth), - x_input=x_input, - weights=weights, -) - -whittaker_smoother.get_order() -whittaker_smoother.get_lambda() -whittaker_smoother.get_data_length() -whittaker_smoother.update_weights([0.5] * len(x_input)) -whittaker_smoother.update_order(3) -whittaker_smoother.update_lambda(4321.0) -``` -## Further Reading -If you'd like to see a more detailed run through of the library, check out this [Medium post](https://medium.com/towards-data-science/the-perfect-way-to-smooth-your-noisy-data-4f3fe6b44440). Within it, I run through examples and benchmarks against other smoothing methods. - -## Future Features -- Cross validation options to find optimal lambda. -- Scatter plot smoothing -- Generic typing - -## References -The algorithm implemented here mirrors a 2003 implementation by Paul H. C. Eilers in Matlab. I've included scripts and data from the original paper in the tests for this package. The original paper and code can be found here: - -[A Perfect Smoother](https://pubs.acs.org/doi/10.1021/ac034173t) -Paul H. C. Eilers -Analytical Chemistry 2003 75 (14), 3631-3636 -DOI: 10.1021/ac034173t +# Whittaker-Eilers Smoothing and Interpolation +**The Whittaker-Eilers smoother is the perfect smoother.** It offers extremely quick, efficient smoothing with built-in interpolation via weights on each measurement. This package provides a sparse-matrix implementation for additional speed and memory efficiency and can handle both equally and unequally spaced measurements. This package was originally written in Rust so additional examples, tests, and benchmarks are also available in addition to it being super speedy. The API is almost identical. + +--- + +```bash +pip install whittaker-eilers +``` + +## Usage +To start smoothing and interpolating data, create a reusable WhittakerSmoother class. You'll only need to recreate this class if the length or sampling rate of your data changes. + +### Equally spaced data +This is the fastest smoothing option. It smooths equally spaced y measurements using two tunable parameters, `lambda` (2e4) and the smoother `order` (2). The larger the lambda, the smoother the data. +```python +from whittaker_eilers import WhittakerSmoother + +data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] + +whittaker_smoother = WhittakerSmoother(lmbda=2e4, order=2, data_length = len(data_to_smooth)) + +smoothed_data = whittaker_smoother.smooth(data_to_smooth) + +print("Smoothed data: {}".format(smoothed_data)) +``` + + + +### Non-equally spaced data +If you wish to smooth unequally spaced data, you need to provide an `x_input` with the sample times/positions. +```python +from whittaker_eilers import WhittakerSmoother + +x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] + +whittaker_smoother = WhittakerSmoother( + lmbda=2e4, order=2, data_length=len(data_to_smooth), x_input=x_input +) + +smoothed_data = whittaker_smoother.smooth(data_to_smooth) + +print("Smoothed non-equally spaced data: {}".format(smoothed_data)) + + +``` + +### Weighted data & Interpolation +Each measurement can then be weighted to trust some measurements more than others. Setting `weights` to 0 for measurements will lead to interpolation. +```python +from whittaker_eilers import WhittakerSmoother + +x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +weights = [1.0] * len(x_input) +weights[5] = 0.0 + +whittaker_smoother = WhittakerSmoother( + lmbda=2e4, + order=2, + data_length=len(data_to_smooth), + x_input=x_input, + weights=weights, +) + +smoothed_data = whittaker_smoother.smooth(data_to_smooth) + +print("Smoothed and interpolated weighted data: {}".format(smoothed_data)) + +``` + +### Smoothing with cross validation +With this package, you can also calculate the cross validation error alongside the smoothed series. This shouldn't really be used in production where speed is necessary though! + + +```python +from whittaker_eilers import WhittakerSmoother + +x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] + +whittaker_smoother = WhittakerSmoother( + lmbda=2e4, order=2, data_length=len(data_to_smooth), x_input=x_input +) + +smoothed_data_with_cross_validation = whittaker_smoother.smooth_and_cross_validate( + data_to_smooth +) + +print( + "Error :{}".format(smoothed_data_with_cross_validation.get_cross_validation_error()) +) +print("Smoothed :{}".format(smoothed_data_with_cross_validation.get_smoothed())) + +``` + +### Automatic smoothing +Smoothing data requires a choice of Lambda. This can be done using visual inspection or by finding the lambda +which results in the lowest cross validation error. The `smooth_optimal` function runs the smoother for a variety of lambdas and returns the results with the ability to retrieve the optimal one. + + +```python +from whittaker_eilers import WhittakerSmoother + +x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] + +whittaker_smoother = WhittakerSmoother( + lmbda=2e4, order=2, data_length=len(data_to_smooth), x_input=x_input +) + +optimal_smooth = whittaker_smoother.smooth_optimal(data_to_smooth) + +print("Optimal lambda: {}".format(optimal_smooth.get_optimal().get_lambda())) + +print("Optimally smoothed data: {}".format(optimal_smooth.get_optimal().get_smoothed())) + +``` + +--- +You can use these methods in combination with each other for instance, interpolating measurements without providing an x input. For more advanced examples of usage take a look at the examples, tests, and benches in the [Github](https://github.com/AnBowell/whittaker-eilers) repository. Here's an image of some smoothed data from an example: + +Time-series smoothed by Whittaker-Eilers method + +### Other methods +```python +from whittaker_eilers import WhittakerSmoother +x_input = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +data_to_smooth = [1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0] +weights = [1.0] * len(x_input) +weights[5] = 0.0 + +whittaker_smoother = WhittakerSmoother( + lmbda=2e4, + order=2, + data_length=len(data_to_smooth), + x_input=x_input, + weights=weights, +) + +whittaker_smoother.get_order() +whittaker_smoother.get_lambda() +whittaker_smoother.get_data_length() +whittaker_smoother.update_weights([0.5] * len(x_input)) +whittaker_smoother.update_order(3) +whittaker_smoother.update_lambda(4321.0) +``` +## Further Reading +If you'd like to see a more detailed run through of the library, check out this [Medium post](https://medium.com/towards-data-science/the-perfect-way-to-smooth-your-noisy-data-4f3fe6b44440). Within it, I run through examples and benchmarks against other smoothing methods. + +## Future Features +- Cross validation options to find optimal lambda. +- Scatter plot smoothing +- Generic typing + +## References +The algorithm implemented here mirrors a 2003 implementation by Paul H. C. Eilers in Matlab. I've included scripts and data from the original paper in the tests for this package. The original paper and code can be found here: + +[A Perfect Smoother](https://pubs.acs.org/doi/10.1021/ac034173t) +Paul H. C. Eilers +Analytical Chemistry 2003 75 (14), 3631-3636 +DOI: 10.1021/ac034173t diff --git a/whittaker-eilers-py/examples/cross_validation.py b/whittaker-eilers-py/examples/cross_validation.py index a1b401c..a9d7c70 100644 --- a/whittaker-eilers-py/examples/cross_validation.py +++ b/whittaker-eilers-py/examples/cross_validation.py @@ -9,13 +9,30 @@ temp_anom = data[:, 1] other_smoothed = data[:, 2] -whittaker_smoother = WhittakerSmoother(lmbda=20, order=2, data_length=len(temp_anom)) +weights = [1.0] * len(years) +for i in range(1, len(temp_anom)): + if (i % 2 == 0) and i != len(temp_anom) - 1: + temp_anom[i] = np.nan + weights[i] = 0.0 -res = whittaker_smoother.smooth_and_cross_validate(list(temp_anom)) + if i > 5 and i < 5 + 15: + temp_anom[i] = np.nan + weights[i] = 0.0 + if i > 96 and i < 96 + 30: + temp_anom[i] = np.nan + weights[i] = 0.0 -res = whittaker_smoother.smooth_and_optimise(list(temp_anom)) +whittaker_smoother = WhittakerSmoother( + lmbda=20, order=2, data_length=len(temp_anom), x_input=list(years), weights=weights +) + +smoothed_data = whittaker_smoother.smooth_and_cross_validate(list(np.nan_to_num(temp_anom))) + +print(smoothed_data.get_cross_validation_error()) + +res = whittaker_smoother.smooth_optimal(list(np.nan_to_num(temp_anom))) print(res.get_optimal().get_lambda()) diff --git a/whittaker-eilers-py/examples/whittaker_compare_interp.py b/whittaker-eilers-py/examples/whittaker_compare_interp.py index a9bd1f1..82ba068 100644 --- a/whittaker-eilers-py/examples/whittaker_compare_interp.py +++ b/whittaker-eilers-py/examples/whittaker_compare_interp.py @@ -1,161 +1,161 @@ -from whittaker_eilers import WhittakerSmoother -import matplotlib.pyplot as plt -import numpy as np -from scipy.signal import savgol_filter -from scipy.ndimage import gaussian_filter1d -from statsmodels.nonparametric.smoothers_lowess import lowess -from scipy.interpolate import interp1d - - -SAVE_PLOT = False - -axis_color = "#d0d0fa" - -data = np.loadtxt("graph.txt", skiprows=5) - -years = data[:, 0] -temp_anom = data[:, 1] -other_smoothed = data[:, 2] - -whittaker_smoother = WhittakerSmoother(lmbda=20, order=2, data_length=len(temp_anom)) - -whit_temp_anom_no_gaps = whittaker_smoother.smooth(temp_anom) - -gauss_temp_anom_no_gaps = gaussian_filter1d(temp_anom, 2.0, order=0) - -sav_gol_temp_anom_no_gaps = savgol_filter(temp_anom, window_length=35, polyorder=5) - - -lowes_temp_anom_no_gaps = lowess( - temp_anom, - years, - frac=1.0 / 12.0, - it=5, - return_sorted=False, - missing="drop", - xvals=years, -) - - -weights = [1.0] * len(years) - -for i in range(1, len(temp_anom)): - if (i % 2 == 0) and i != len(temp_anom) - 1: - temp_anom[i] = np.nan - weights[i] = 0.0 - - if i > 5 and i < 5 + 15: - temp_anom[i] = np.nan - weights[i] = 0.0 - - if i > 96 and i < 96 + 30: - temp_anom[i] = np.nan - weights[i] = 0.0 - - -whittaker_smoother = WhittakerSmoother( - lmbda=20, order=2, data_length=len(temp_anom), weights=weights -) - -interpolator = interp1d( - years[~np.isnan(temp_anom)], temp_anom[~np.isnan(temp_anom)], kind="linear" -) - -interpolated_data = interpolator(years) - -whit_temp_anom = whittaker_smoother.smooth(np.nan_to_num(temp_anom)) - -gauss_temp_anom = gaussian_filter1d(interpolated_data, 2.0, order=0) - -sav_gol_temp_anom = savgol_filter(interpolated_data, window_length=35, polyorder=5) - -lowes_temp_anom = lowess( - interpolated_data, - years, - frac=1.0 / 12.0, - it=5, - return_sorted=False, - missing="drop", - xvals=years, -) - -whit_mse = ( - np.mean((np.asarray(whit_temp_anom) - np.asarray(whit_temp_anom_no_gaps)) ** 2) - ** 0.5 -) -gauss_mse = np.mean((gauss_temp_anom - gauss_temp_anom_no_gaps) ** 2) ** 0.5 -sav_golay_mse = np.mean((sav_gol_temp_anom - sav_gol_temp_anom_no_gaps) ** 2) ** 0.5 -lowess_mse = np.mean((lowes_temp_anom - lowes_temp_anom_no_gaps) ** 2) ** 0.5 - -print("Whittaker MSE: {} deg".format(whit_mse)) -print("Gauss MSE: {} deg".format(gauss_mse)) -print("Sav Golay MSE: {} deg".format(sav_golay_mse)) -print("Lowess MSE: {} deg".format(lowess_mse)) - - -(fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") - -ax1.spines["bottom"].set_color(axis_color) -ax1.spines["top"].set_color(axis_color) -ax1.spines["right"].set_color(axis_color) -ax1.spines["left"].set_color(axis_color) -ax1.set_xlabel("Year", color=axis_color) -ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) -ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) -ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) -ax1.grid(True, ls="--", alpha=0.6) -ax1.set_facecolor("#00002a") -ax1.set_xlim(1880, 2020) - -ax1.plot( - years, - temp_anom, - color="#fc8d28", - marker=".", - label="Measured", - alpha=0.6, - markersize=8, -) - -ax1.plot( - years, - gauss_temp_anom, - color="#f003ef", - lw=2, - label="Gaussian Kernel", - solid_capstyle="round", -) -ax1.plot( - years, - lowes_temp_anom, - color="red", - lw=2, - label="LOWESS", - solid_capstyle="round", -) -ax1.plot( - years, - sav_gol_temp_anom, - color="#66b0ff", - lw=2, - label="Sav-Golay", - solid_capstyle="round", -) - -ax1.plot( - years, - whit_temp_anom, - color="#59f176", - lw=2, - label="Whittaker", - solid_capstyle="round", -) -plt.legend() - -if SAVE_PLOT: - plt.savefig( - "whittaker_compare_interp.png", - dpi=800, - bbox_inches="tight", - ) -plt.show() +from whittaker_eilers import WhittakerSmoother +import matplotlib.pyplot as plt +import numpy as np +from scipy.signal import savgol_filter +from scipy.ndimage import gaussian_filter1d +from statsmodels.nonparametric.smoothers_lowess import lowess +from scipy.interpolate import interp1d + + +SAVE_PLOT = False + +axis_color = "#d0d0fa" + +data = np.loadtxt("graph.txt", skiprows=5) + +years = data[:, 0] +temp_anom = data[:, 1] +other_smoothed = data[:, 2] + +whittaker_smoother = WhittakerSmoother(lmbda=20, order=2, data_length=len(temp_anom)) + +whit_temp_anom_no_gaps = whittaker_smoother.smooth(temp_anom) + +gauss_temp_anom_no_gaps = gaussian_filter1d(temp_anom, 2.0, order=0) + +sav_gol_temp_anom_no_gaps = savgol_filter(temp_anom, window_length=35, polyorder=5) + + +lowes_temp_anom_no_gaps = lowess( + temp_anom, + years, + frac=1.0 / 12.0, + it=5, + return_sorted=False, + missing="drop", + xvals=years, +) + + +weights = [1.0] * len(years) + +for i in range(1, len(temp_anom)): + if (i % 2 == 0) and i != len(temp_anom) - 1: + temp_anom[i] = np.nan + weights[i] = 0.0 + + if i > 5 and i < 5 + 15: + temp_anom[i] = np.nan + weights[i] = 0.0 + + if i > 96 and i < 96 + 30: + temp_anom[i] = np.nan + weights[i] = 0.0 + + +whittaker_smoother = WhittakerSmoother( + lmbda=20, order=2, data_length=len(temp_anom), weights=weights +) + +interpolator = interp1d( + years[~np.isnan(temp_anom)], temp_anom[~np.isnan(temp_anom)], kind="linear" +) + +interpolated_data = interpolator(years) + +whit_temp_anom = whittaker_smoother.smooth(list(np.nan_to_num(temp_anom))) + +gauss_temp_anom = gaussian_filter1d(interpolated_data, 2.0, order=0) + +sav_gol_temp_anom = savgol_filter(interpolated_data, window_length=35, polyorder=5) + +lowes_temp_anom = lowess( + interpolated_data, + years, + frac=1.0 / 12.0, + it=5, + return_sorted=False, + missing="drop", + xvals=years, +) + +whit_mse = ( + np.mean((np.asarray(whit_temp_anom) - np.asarray(whit_temp_anom_no_gaps)) ** 2) + ** 0.5 +) +gauss_mse = np.mean((gauss_temp_anom - gauss_temp_anom_no_gaps) ** 2) ** 0.5 +sav_golay_mse = np.mean((sav_gol_temp_anom - sav_gol_temp_anom_no_gaps) ** 2) ** 0.5 +lowess_mse = np.mean((lowes_temp_anom - lowes_temp_anom_no_gaps) ** 2) ** 0.5 + +print("Whittaker MSE: {} deg".format(whit_mse)) +print("Gauss MSE: {} deg".format(gauss_mse)) +print("Sav Golay MSE: {} deg".format(sav_golay_mse)) +print("Lowess MSE: {} deg".format(lowess_mse)) + + +(fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + +ax1.spines["bottom"].set_color(axis_color) +ax1.spines["top"].set_color(axis_color) +ax1.spines["right"].set_color(axis_color) +ax1.spines["left"].set_color(axis_color) +ax1.set_xlabel("Year", color=axis_color) +ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) +ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) +ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) +ax1.grid(True, ls="--", alpha=0.6) +ax1.set_facecolor("#00002a") +ax1.set_xlim(1880, 2020) + +ax1.plot( + years, + temp_anom, + color="#fc8d28", + marker=".", + label="Measured", + alpha=0.6, + markersize=8, +) + +ax1.plot( + years, + gauss_temp_anom, + color="#f003ef", + lw=2, + label="Gaussian Kernel", + solid_capstyle="round", +) +ax1.plot( + years, + lowes_temp_anom, + color="red", + lw=2, + label="LOWESS", + solid_capstyle="round", +) +ax1.plot( + years, + sav_gol_temp_anom, + color="#66b0ff", + lw=2, + label="Sav-Golay", + solid_capstyle="round", +) + +ax1.plot( + years, + whit_temp_anom, + color="#59f176", + lw=2, + label="Whittaker", + solid_capstyle="round", +) +plt.legend() + +if SAVE_PLOT: + plt.savefig( + "whittaker_compare_interp.png", + dpi=800, + bbox_inches="tight", + ) +plt.show() diff --git a/whittaker-eilers-py/src/cross_validation.rs b/whittaker-eilers-py/src/cross_validation.rs index cdd6333..f018301 100644 --- a/whittaker-eilers-py/src/cross_validation.rs +++ b/whittaker-eilers-py/src/cross_validation.rs @@ -3,30 +3,47 @@ use pyo3::prelude::*; use whittaker_eilers_rs::CrossValidationResult as CrossValidationResultRs; use whittaker_eilers_rs::OptimisedSmoothResult as OptimisedSmoothResultRs; -/// Doc string here +/// Contains the results of cross validation for a variety of lambdas +/// +/// This class contains the results of finding the optimal lambda. A vec +/// contains all of the lambdas, smoothed series, and errors. `get_optimal` then +/// provides the ability to return the optimal one and `get_all` will return the full results. #[pyclass] #[repr(transparent)] pub struct OptimisedSmoothResult(pub(crate) OptimisedSmoothResultRs); #[pymethods] impl OptimisedSmoothResult { + /// Gets the optimally smoothed result. pub fn get_optimal(&self) -> CrossValidationResult { CrossValidationResult(self.0.get_optimal()) } + /// Gets all of the smoothed results. + pub fn get_all(&self) -> Vec { + self.0 + .validation_results + .iter() + .map(|x| CrossValidationResult(x.clone())) + .collect() + } } +/// The result of smoothing with cross validation #[pyclass] #[repr(transparent)] pub struct CrossValidationResult(pub(crate) CrossValidationResultRs); #[pymethods] impl CrossValidationResult { + /// The lambda value that was used to smooth the data. pub fn get_lambda(&self) -> f64 { self.0.lambda } + /// The smoothed data. pub fn get_smoothed(&self) -> Vec { self.0.smoothed.clone() } + /// The associated cross validation error for the smoothed data. pub fn get_cross_validation_error(&self) -> f64 { self.0.cross_validation_error } diff --git a/whittaker-eilers-py/src/whittaker_smoother.rs b/whittaker-eilers-py/src/whittaker_smoother.rs index c531d2f..9fe8f35 100644 --- a/whittaker-eilers-py/src/whittaker_smoother.rs +++ b/whittaker-eilers-py/src/whittaker_smoother.rs @@ -111,24 +111,55 @@ impl WhittakerSmoother { self.0.smooth(&y_vals).map_err(map_err_to_py) } - /// TODO: Doc string - pub fn smooth_and_cross_validate(&self, y_input: Vec) -> PyResult { + /// Run Whittaker-Eilers smoothing, interpolation and cross validation. + /// + /// This function will run the smoother and assess the cross validation error on the result. This is defined in Eiler's + /// 2003 paper: "A Perfect Smoother". It involves computing the "hat matrix" or "smoother matrix" which inverses a sparse matrix. The + /// inverse of a sparse matrix is usually dense, so this function will take much longer to run in comparison to just running `smooth`. + /// + /// Parameters + /// ---------- + /// y_vals: The values which are to be smoothed and interpolated and have their cross validation error calculated. + /// + /// Returns + /// ------- + /// + /// CrossValidationResult: The smoothed data, lambda it was smoothed at, and the cross validation error. + pub fn smooth_and_cross_validate(&self, y_vals: Vec) -> PyResult { Ok(CrossValidationResult( self.0 - .smooth_and_cross_validate(&y_input) + .smooth_and_cross_validate(&y_vals) .map_err(map_err_to_py)?, )) } - /// TODO: Doc string - #[pyo3(signature = (y_input, break_serial_correlation = true))] - pub fn smooth_and_optimise( + /// Runs Whittaker-Eilers smoothing for a variety of lambdas and selects the optimally smoothed time series. + /// + /// This function runs the smoother for lambdas varying from 1e-2 to 1e8 in logarithmic steps of 0.5. It computes the + /// hat/smoother matrix and finds the optimal lambda for the data. If the time-series exhibits serial correlation the optimal + /// lambda can be very small and mean the smoothed data doesn't differ from the input data. To avoid this, use `break_serial_correlation = true` + /// + /// It will return the smoothed data, lambda, and cross validation error for each lambda tested! + /// + /// As the smoother matrix requires the inversion of a sparse matrix (which usually becomes a dense matrix), this code is extremely slow compared to smoothing + /// with a known lambda. Use sparingly! + /// + /// Parameters + /// ---------- + /// y_input: The values which are to be smoothed, interpolated, and cross validated for a variety of lambdas. + /// break_serial_correlation: Defaults to `true`. Without it, data that exhibits serial correlation is barely smoothed. + /// + /// Returns + /// ------- + /// OptimisedSmoothResult: The smoothed data, lambda, and error for each tested lambda. Calling get_optimal, returns the best smoothed series. + #[pyo3(signature = (y_vals, break_serial_correlation = true))] + pub fn smooth_optimal( &mut self, - y_input: Vec, + y_vals: Vec, break_serial_correlation: bool, ) -> PyResult { Ok(OptimisedSmoothResult( self.0 - .smooth_and_optimise(&y_input, break_serial_correlation) + .smooth_optimal(&y_vals, break_serial_correlation) .map_err(map_err_to_py)?, )) } diff --git a/whittaker-eilers-py/whittaker_eilers.pyi b/whittaker-eilers-py/whittaker_eilers.pyi index b2311c4..0c21e07 100644 --- a/whittaker-eilers-py/whittaker_eilers.pyi +++ b/whittaker-eilers-py/whittaker_eilers.pyi @@ -1,13 +1,14 @@ -from typing import Optional +from typing import Optional, List class CrossValidationResult: def get_lambda(self) -> float: ... - def get_smoothed(self) -> list[float]: ... + def get_smoothed(self) -> List[float]: ... def get_cross_validation_error(self) -> float: ... ... class OptimisedSmoothResult: def get_optimal(self) -> CrossValidationResult: ... + def get_all(self) -> List[CrossValidationResult]: ... # Bit of a pain having to handcraft these! Especially with the docs. class WhittakerSmoother: @@ -34,8 +35,8 @@ class WhittakerSmoother: lmbda: float, order: int, data_length: int, - x_input: Optional[list] = None, - weights: Optional[list] = None, + x_input: Optional[List] = None, + weights: Optional[List] = None, ) -> None: ... def get_order(self) -> int: """Retrieve the smoother's current order.""" @@ -46,7 +47,7 @@ class WhittakerSmoother: def get_data_length(self) -> int: """Retrieve the smoother's current length.""" ... - def update_weights(self, weights: list) -> None: + def update_weights(self, weights: List) -> None: """Updates the weights of the data to be smoothed. The length of weights should be equal to that of the data you are to smooth. The values of the weights should fall between 0 and 1. @@ -74,7 +75,7 @@ class WhittakerSmoother: lmbda : The smoothing constant of the Whittaker-Eilers smoother. """ ... - def smooth(self, y_vals: list[float]) -> list: + def smooth(self, y_vals: List[float]) -> List: """Run Whittaker-Eilers smoothing and interpolation. This function actually runs the solver which results in the smoothed data. If you just wish to continuously smooth @@ -89,9 +90,46 @@ class WhittakerSmoother: ------- The smoothed and interpolated data.""" ... - def smooth_and_cross_validate( - self, y_input: list[float] - ) -> CrossValidationResult: ... - def smooth_and_optimise( - self, y_input: list[float], break_serial_correlation: bool = True - ) -> OptimisedSmoothResult: ... + + def smooth_and_cross_validate(self, y_vals: List[float]) -> CrossValidationResult: + """Run Whittaker-Eilers smoothing, interpolation and cross validation. + + This function will run the smoother and assess the cross validation error on the result. This is defined in Eiler's + 2003 paper: "A Perfect Smoother". It involves computing the "hat matrix" or "smoother matrix" which inverses a sparse matrix. The + inverse of a sparse matrix is usually dense, so this function will take much longer to run in comparison to just running `smooth`. + + Parameters + ---------- + y_vals: The values which are to be smoothed and interpolated and have their cross validation error calculated. + + Returns + ------- + CrossValidationResult: The smoothed data, lambda it was smoothed at, and the cross validation error. + + """ + ... + + def smooth_optimal( + self, y_vals: List[float], break_serial_correlation: bool = True + ) -> OptimisedSmoothResult: + """Runs Whittaker-Eilers smoothing for a variety of lambdas and selects the optimally smoothed time series. + + This function runs the smoother for lambdas varying from 1e-2 to 1e8 in logarithmic steps of 0.5. It computes the + hat/smoother matrix and finds the optimal lambda for the data. If the time-series exhibits serial correlation the optimal + lambda can be very small and mean the smoothed data doesn't differ from the input data. To avoid this, use `break_serial_correlation = true` + + It will return the smoothed data, lambda, and cross validation error for each lambda tested! + + As the smoother matrix requires the inversion of a sparse matrix (which usually becomes a dense matrix), this code is extremely slow compared to smoothing + with a known lambda. Use sparingly! + + Parameters + ---------- + y_input: The values which are to be smoothed, interpolated, and cross validated for a variety of lambdas. + break_serial_correlation: Defaults to `true`. Without it, data that exhibits serial correlation is barely smoothed. + + Returns + ------- + OptimisedSmoothResult: The smoothed data, lambda, and error for each tested lambda. Calling get_optimal, returns the best smoothed series. + """ + ... From 3a62969ee30b86b9f6381dc30bcaa789f4427b49 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Sun, 11 Feb 2024 17:53:01 +0000 Subject: [PATCH 08/12] feat: :sparkles: Working on cross validation plots --- .gitignore | 16 +- src/whittaker_smoother.rs | 4 +- .../readme.md => original_scripts/README.md} | 0 tests/{data => }/original_scripts/ddmat.m | 38 +- .../{data => }/original_scripts/demo_nmrcv.m | 0 .../demo_standard_whittaker.m | 58 +-- .../demo_weighted_whittaker.m | 72 ++-- .../original_scripts/demo_wood_cross_val.m | 0 .../demo_x_input_weighted_whittaker.m | 74 ++-- .../original_scripts/demo_x_input_whittaker.m | 72 ++-- tests/{data => }/original_scripts/whitsm.m | 0 tests/{data => }/original_scripts/whitsmdd.m | 0 tests/{data => }/original_scripts/whitsmddw.m | 122 +++--- tests/{data => }/original_scripts/whitsmw.m | 0 .../examples/cross_validation.py | 28 +- .../cross_validation_article/README.md | 15 + .../cross_validation_article/article_plots.py | 371 ++++++++++++++++++ .../cross_validation_article/mse_vs_cve.py | 125 ++++++ 18 files changed, 758 insertions(+), 237 deletions(-) rename tests/{data/original_scripts/readme.md => original_scripts/README.md} (100%) rename tests/{data => }/original_scripts/ddmat.m (95%) rename tests/{data => }/original_scripts/demo_nmrcv.m (100%) rename tests/{data => }/original_scripts/demo_standard_whittaker.m (95%) rename tests/{data => }/original_scripts/demo_weighted_whittaker.m (95%) rename tests/{data => }/original_scripts/demo_wood_cross_val.m (100%) rename tests/{data => }/original_scripts/demo_x_input_weighted_whittaker.m (95%) rename tests/{data => }/original_scripts/demo_x_input_whittaker.m (94%) rename tests/{data => }/original_scripts/whitsm.m (100%) rename tests/{data => }/original_scripts/whitsmdd.m (100%) rename tests/{data => }/original_scripts/whitsmddw.m (96%) rename tests/{data => }/original_scripts/whitsmw.m (100%) create mode 100644 whittaker-eilers-py/examples/cross_validation_article/README.md create mode 100644 whittaker-eilers-py/examples/cross_validation_article/article_plots.py create mode 100644 whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py diff --git a/.gitignore b/.gitignore index be2f23e..7fc0788 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,11 @@ -/target -/Cargo.lock -/whittaker-eilers-py/target -**__pycache__** -.vscode \ No newline at end of file +/target +/Cargo.lock +/whittaker-eilers-py/target +**__pycache__** +.vscode + + + +whittaker-eilers-py/examples/cross_validation_article/AirQualityUCI.csv +whittaker-eilers-py/examples/cross_validation_article/bone.data +whittaker-eilers-py/examples/cross_validation_article/spec-1939-53389-0138.fits diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index bb4356a..deb1dd4 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -434,8 +434,8 @@ impl WhittakerSmoother { break_serial_correlation: bool, ) -> Result { let step = 0.5; - let mut start_lambda_log = (1e-2_f64).log10(); // -2 - let end_lambda_log = (1e8_f64).log10(); // 8 + let mut start_lambda_log = (1e-5_f64).log10(); + let end_lambda_log = (1e8_f64).log10(); let mut optimal_index = 0; let mut validation_results = Vec::new(); diff --git a/tests/data/original_scripts/readme.md b/tests/original_scripts/README.md similarity index 100% rename from tests/data/original_scripts/readme.md rename to tests/original_scripts/README.md diff --git a/tests/data/original_scripts/ddmat.m b/tests/original_scripts/ddmat.m similarity index 95% rename from tests/data/original_scripts/ddmat.m rename to tests/original_scripts/ddmat.m index c06f645..521c3c6 100644 --- a/tests/data/original_scripts/ddmat.m +++ b/tests/original_scripts/ddmat.m @@ -1,19 +1,19 @@ -function D = ddmat(x, d) -% Compute divided differencing matrix of order d -% -% Input -% x: vector of sampling positions -% d: order of diffferences -% Output -% D: the matrix; D * Y gives divided differences of order d -% -% Paul Eilers, 2003 - -m = length(x); -if d == 0 - D = speye(m); -else - dx = x((d + 1):m) - x(1:(m - d)); - V = spdiags(1 ./ dx, 0, m - d, m - d); - D = V * diff(ddmat(x, d - 1)); -end +function D = ddmat(x, d) +% Compute divided differencing matrix of order d +% +% Input +% x: vector of sampling positions +% d: order of diffferences +% Output +% D: the matrix; D * Y gives divided differences of order d +% +% Paul Eilers, 2003 + +m = length(x); +if d == 0 + D = speye(m); +else + dx = x((d + 1):m) - x(1:(m - d)); + V = spdiags(1 ./ dx, 0, m - d, m - d); + D = V * diff(ddmat(x, d - 1)); +end diff --git a/tests/data/original_scripts/demo_nmrcv.m b/tests/original_scripts/demo_nmrcv.m similarity index 100% rename from tests/data/original_scripts/demo_nmrcv.m rename to tests/original_scripts/demo_nmrcv.m diff --git a/tests/data/original_scripts/demo_standard_whittaker.m b/tests/original_scripts/demo_standard_whittaker.m similarity index 95% rename from tests/data/original_scripts/demo_standard_whittaker.m rename to tests/original_scripts/demo_standard_whittaker.m index 0b7bc51..3cd2dd1 100644 --- a/tests/data/original_scripts/demo_standard_whittaker.m +++ b/tests/original_scripts/demo_standard_whittaker.m @@ -1,29 +1,29 @@ -% Demonstration NMR spectrum smoothing with Whittaker smoother -% Optimal smoothing by cross-validation -% -% Paul Eilers, 2003 - -% Get the data -data = load('nmr_with_weights_and_x.csv'); -y = data(:,2); - - -[z_2, ~] = whitsm(y, 2e4, 2); - -writematrix(z_2,'output_only_y_2e4_2.csv') - -[z_3, ~] = whitsm(y, 2e4,3); - -writematrix(z_3,'output_only_y_2e4_3.csv') - -% Plot data and smooth -subplot(1, 1, 1); -plot([z_2-10 y] ) % Downward shift for visibility -hold on; -plot([z_3-10 y] ) -title('NMR spectrum and optimal smooth') -xlabel('Channel') -ylabel('Signal strength') - - - +% Demonstration NMR spectrum smoothing with Whittaker smoother +% Optimal smoothing by cross-validation +% +% Paul Eilers, 2003 + +% Get the data +data = load('nmr_with_weights_and_x.csv'); +y = data(:,2); + + +[z_2, ~] = whitsm(y, 2e4, 2); + +writematrix(z_2,'output_only_y_2e4_2.csv') + +[z_3, ~] = whitsm(y, 2e4,3); + +writematrix(z_3,'output_only_y_2e4_3.csv') + +% Plot data and smooth +subplot(1, 1, 1); +plot([z_2-10 y] ) % Downward shift for visibility +hold on; +plot([z_3-10 y] ) +title('NMR spectrum and optimal smooth') +xlabel('Channel') +ylabel('Signal strength') + + + diff --git a/tests/data/original_scripts/demo_weighted_whittaker.m b/tests/original_scripts/demo_weighted_whittaker.m similarity index 95% rename from tests/data/original_scripts/demo_weighted_whittaker.m rename to tests/original_scripts/demo_weighted_whittaker.m index c6f6d33..f9e5745 100644 --- a/tests/data/original_scripts/demo_weighted_whittaker.m +++ b/tests/original_scripts/demo_weighted_whittaker.m @@ -1,36 +1,36 @@ - -% Demonstration NMR spectrum smoothing with Whittaker smoother -% Optimal smoothing by cross-validation -% -% Paul Eilers, 2003 - -% Get the data -data = load('nmr_with_weights_and_x.csv'); -y = data(:,2); -evenly_space_weights = data(:,3); -random_weights = data(:, 4); - - -[z_even, ~] = whitsmw(y, evenly_space_weights, 2e4, 2); - -writematrix(z_even,'output_y_with_weights_2e4_2.csv'); - -[z_random, ~] = whitsmw(y, random_weights, 2e4, 3); - -writematrix(z_random,'output_y_with_random_weights_2e4_3.csv') - - -% Plot data and smooth -subplot(1, 1, 1); -plot([z_even-10 y] ) % Downward shift for visibility -hold on; -plot([z_random-10 y] ) -title('NMR spectrum and optimal smooth') -xlabel('Channel') -ylabel('Signal strength') - - - - - - + +% Demonstration NMR spectrum smoothing with Whittaker smoother +% Optimal smoothing by cross-validation +% +% Paul Eilers, 2003 + +% Get the data +data = load('nmr_with_weights_and_x.csv'); +y = data(:,2); +evenly_space_weights = data(:,3); +random_weights = data(:, 4); + + +[z_even, ~] = whitsmw(y, evenly_space_weights, 2e4, 2); + +writematrix(z_even,'output_y_with_weights_2e4_2.csv'); + +[z_random, ~] = whitsmw(y, random_weights, 2e4, 3); + +writematrix(z_random,'output_y_with_random_weights_2e4_3.csv') + + +% Plot data and smooth +subplot(1, 1, 1); +plot([z_even-10 y] ) % Downward shift for visibility +hold on; +plot([z_random-10 y] ) +title('NMR spectrum and optimal smooth') +xlabel('Channel') +ylabel('Signal strength') + + + + + + diff --git a/tests/data/original_scripts/demo_wood_cross_val.m b/tests/original_scripts/demo_wood_cross_val.m similarity index 100% rename from tests/data/original_scripts/demo_wood_cross_val.m rename to tests/original_scripts/demo_wood_cross_val.m diff --git a/tests/data/original_scripts/demo_x_input_weighted_whittaker.m b/tests/original_scripts/demo_x_input_weighted_whittaker.m similarity index 95% rename from tests/data/original_scripts/demo_x_input_weighted_whittaker.m rename to tests/original_scripts/demo_x_input_weighted_whittaker.m index ed8151d..ae57ad9 100644 --- a/tests/data/original_scripts/demo_x_input_weighted_whittaker.m +++ b/tests/original_scripts/demo_x_input_weighted_whittaker.m @@ -1,37 +1,37 @@ - -% Demonstration NMR spectrum smoothing with Whittaker smoother -% Optimal smoothing by cross-validation -% -% Paul Eilers, 2003 - -% Get the data -data = load('nmr_with_weights_and_x.csv'); -x = data(: ,1); -y = data(:,2); -evenly_space_weights = data(:,3); -random_weights = data(:, 4); - - - -[z_even, ~] = whitsmddw(x, y, evenly_space_weights, 2e4, 2); - -writematrix(z_even,'output_x_y_and_weights_2e4_2.csv') - -[z_random, ~] = whitsmddw(x, y, random_weights, 2e4, 2); - -writematrix(z_random,'output_x_y_and_random_weights_2e4_2.csv') - - -% Plot data and smooth -subplot(1, 1, 1); -plot([z_even-10 y] ) % Downward shift for visibility -hold on; -plot([z_random-10 y] ) -title('NMR spectrum and optimal smooth') -xlabel('Channel') -ylabel('Signal strength') - - - - - + +% Demonstration NMR spectrum smoothing with Whittaker smoother +% Optimal smoothing by cross-validation +% +% Paul Eilers, 2003 + +% Get the data +data = load('nmr_with_weights_and_x.csv'); +x = data(: ,1); +y = data(:,2); +evenly_space_weights = data(:,3); +random_weights = data(:, 4); + + + +[z_even, ~] = whitsmddw(x, y, evenly_space_weights, 2e4, 2); + +writematrix(z_even,'output_x_y_and_weights_2e4_2.csv') + +[z_random, ~] = whitsmddw(x, y, random_weights, 2e4, 2); + +writematrix(z_random,'output_x_y_and_random_weights_2e4_2.csv') + + +% Plot data and smooth +subplot(1, 1, 1); +plot([z_even-10 y] ) % Downward shift for visibility +hold on; +plot([z_random-10 y] ) +title('NMR spectrum and optimal smooth') +xlabel('Channel') +ylabel('Signal strength') + + + + + diff --git a/tests/data/original_scripts/demo_x_input_whittaker.m b/tests/original_scripts/demo_x_input_whittaker.m similarity index 94% rename from tests/data/original_scripts/demo_x_input_whittaker.m rename to tests/original_scripts/demo_x_input_whittaker.m index 133c996..e69e864 100644 --- a/tests/data/original_scripts/demo_x_input_whittaker.m +++ b/tests/original_scripts/demo_x_input_whittaker.m @@ -1,36 +1,36 @@ - -% Demonstration NMR spectrum smoothing with Whittaker smoother -% Optimal smoothing by cross-validation -% -% Paul Eilers, 2003 - -% Get the data -data = load('nmr_with_weights_and_x.csv'); -x = data(: ,1); -y = data(:,2); - - - -[z_order_2, ~] = whitsmdd(x, y, 2e4, 2); - -writematrix(z_order_2,'output_x_and_y_2e4_2.csv') - -[z_order_3, ~] = whitsmdd(x, y, 2e4, 3); - -writematrix(z_order_3,'output_x_and_y_2e4_3.csv') - - -% Plot data and smooth -subplot(1, 1, 1); -plot([z_order_2-10 y] ) % Downward shift for visibility -hold on; -plot([z_order_3-10 y] ) -title('NMR spectrum and optimal smooth') -xlabel('Channel') -ylabel('Signal strength') - - - - - - + +% Demonstration NMR spectrum smoothing with Whittaker smoother +% Optimal smoothing by cross-validation +% +% Paul Eilers, 2003 + +% Get the data +data = load('nmr_with_weights_and_x.csv'); +x = data(: ,1); +y = data(:,2); + + + +[z_order_2, ~] = whitsmdd(x, y, 2e4, 2); + +writematrix(z_order_2,'output_x_and_y_2e4_2.csv') + +[z_order_3, ~] = whitsmdd(x, y, 2e4, 3); + +writematrix(z_order_3,'output_x_and_y_2e4_3.csv') + + +% Plot data and smooth +subplot(1, 1, 1); +plot([z_order_2-10 y] ) % Downward shift for visibility +hold on; +plot([z_order_3-10 y] ) +title('NMR spectrum and optimal smooth') +xlabel('Channel') +ylabel('Signal strength') + + + + + + diff --git a/tests/data/original_scripts/whitsm.m b/tests/original_scripts/whitsm.m similarity index 100% rename from tests/data/original_scripts/whitsm.m rename to tests/original_scripts/whitsm.m diff --git a/tests/data/original_scripts/whitsmdd.m b/tests/original_scripts/whitsmdd.m similarity index 100% rename from tests/data/original_scripts/whitsmdd.m rename to tests/original_scripts/whitsmdd.m diff --git a/tests/data/original_scripts/whitsmddw.m b/tests/original_scripts/whitsmddw.m similarity index 96% rename from tests/data/original_scripts/whitsmddw.m rename to tests/original_scripts/whitsmddw.m index e69ceea..a3e1abb 100644 --- a/tests/data/original_scripts/whitsmddw.m +++ b/tests/original_scripts/whitsmddw.m @@ -1,61 +1,61 @@ -function [z, cve, h] = whitsmddw(x, y, w, lambda, d) -% Whittaker smoother with divided differences and weights -% Input: -% x: data series of sampling positions (must be increasing) -% y: data series, assumed to be sampled at equal intervals -% w: weights -% lambda: smoothing parameter; large lambda gives smoother result -% d: order of differences (default = 2) -% Output: -% z: smoothed series -% cve: RMS leave-one-out prediction error -% h: diagonal of hat matrix -% -% Remark: the computation of the hat diagonal for m > 100 is experimental; -% with many missing observation it may fail. -% -% Paul Eilers, 2003 - -% Default order of differences -if nargin < 3 - d = 2; -end - -% Smoothing -m = length(y); -E = speye(m); -D = ddmat(x, d); -W = spdiags(w, 0, m, m); -C = chol(W + lambda * D' * D); -z = C \ (C' \ (w .*y)); - -% Computation of hat diagonal and cross-validation -if nargout > 1 - if m <= 100 % Exact hat diagonal - W = diag(w); - H = (W + lambda * D' * D) \ W; - h = diag(H); - else % Map to diag(H) for n = 100 - n = 100; - E1 = speye(n); - g = round(((1:n) - 1) * (m - 1) / (n - 1) + 1); - D1 = ddmat(x(g), d); - W1 = diag(w(g)); - lambda1 = lambda * (n / m) ^ (2 * d); - H1 = inv(W1 + lambda1 * D1' * D1); - h1 = diag(H1); - u = zeros(m, 1); - k = floor(m / 2); - k1 = floor(n / 2); - u(k) = 1; - v = C \ (C' \ u); - hk = v(k); - f = round(((1:m)' - 1) * (n - 1)/ (m - 1) + 1); - h = w .* h1(f) * v(k) / h1(k1); - end - r = (y - z) ./ (1 - h); - cve = sqrt(r' * (w .* r) / sum(w)); -end - - - +function [z, cve, h] = whitsmddw(x, y, w, lambda, d) +% Whittaker smoother with divided differences and weights +% Input: +% x: data series of sampling positions (must be increasing) +% y: data series, assumed to be sampled at equal intervals +% w: weights +% lambda: smoothing parameter; large lambda gives smoother result +% d: order of differences (default = 2) +% Output: +% z: smoothed series +% cve: RMS leave-one-out prediction error +% h: diagonal of hat matrix +% +% Remark: the computation of the hat diagonal for m > 100 is experimental; +% with many missing observation it may fail. +% +% Paul Eilers, 2003 + +% Default order of differences +if nargin < 3 + d = 2; +end + +% Smoothing +m = length(y); +E = speye(m); +D = ddmat(x, d); +W = spdiags(w, 0, m, m); +C = chol(W + lambda * D' * D); +z = C \ (C' \ (w .*y)); + +% Computation of hat diagonal and cross-validation +if nargout > 1 + if m <= 100 % Exact hat diagonal + W = diag(w); + H = (W + lambda * D' * D) \ W; + h = diag(H); + else % Map to diag(H) for n = 100 + n = 100; + E1 = speye(n); + g = round(((1:n) - 1) * (m - 1) / (n - 1) + 1); + D1 = ddmat(x(g), d); + W1 = diag(w(g)); + lambda1 = lambda * (n / m) ^ (2 * d); + H1 = inv(W1 + lambda1 * D1' * D1); + h1 = diag(H1); + u = zeros(m, 1); + k = floor(m / 2); + k1 = floor(n / 2); + u(k) = 1; + v = C \ (C' \ u); + hk = v(k); + f = round(((1:m)' - 1) * (n - 1)/ (m - 1) + 1); + h = w .* h1(f) * v(k) / h1(k1); + end + r = (y - z) ./ (1 - h); + cve = sqrt(r' * (w .* r) / sum(w)); +end + + + diff --git a/tests/data/original_scripts/whitsmw.m b/tests/original_scripts/whitsmw.m similarity index 100% rename from tests/data/original_scripts/whitsmw.m rename to tests/original_scripts/whitsmw.m diff --git a/whittaker-eilers-py/examples/cross_validation.py b/whittaker-eilers-py/examples/cross_validation.py index a9d7c70..71d99fb 100644 --- a/whittaker-eilers-py/examples/cross_validation.py +++ b/whittaker-eilers-py/examples/cross_validation.py @@ -11,28 +11,32 @@ weights = [1.0] * len(years) -for i in range(1, len(temp_anom)): - if (i % 2 == 0) and i != len(temp_anom) - 1: - temp_anom[i] = np.nan - weights[i] = 0.0 +# for i in range(1, len(temp_anom)): +# if (i % 2 == 0) and i != len(temp_anom) - 1: +# temp_anom[i] = np.nan +# weights[i] = 0.0 - if i > 5 and i < 5 + 15: - temp_anom[i] = np.nan - weights[i] = 0.0 +# if i > 5 and i < 5 + 15: +# temp_anom[i] = np.nan +# weights[i] = 0.0 - if i > 96 and i < 96 + 30: - temp_anom[i] = np.nan - weights[i] = 0.0 +# if i > 96 and i < 96 + 30: +# temp_anom[i] = np.nan +# weights[i] = 0.0 whittaker_smoother = WhittakerSmoother( lmbda=20, order=2, data_length=len(temp_anom), x_input=list(years), weights=weights ) -smoothed_data = whittaker_smoother.smooth_and_cross_validate(list(np.nan_to_num(temp_anom))) +smoothed_data = whittaker_smoother.smooth_and_cross_validate( + list(np.nan_to_num(temp_anom)) +) print(smoothed_data.get_cross_validation_error()) -res = whittaker_smoother.smooth_optimal(list(np.nan_to_num(temp_anom))) +res = whittaker_smoother.smooth_optimal( + list(np.nan_to_num(temp_anom)), break_serial_correlation=True +) print(res.get_optimal().get_lambda()) diff --git a/whittaker-eilers-py/examples/cross_validation_article/README.md b/whittaker-eilers-py/examples/cross_validation_article/README.md new file mode 100644 index 0000000..692868a --- /dev/null +++ b/whittaker-eilers-py/examples/cross_validation_article/README.md @@ -0,0 +1,15 @@ +# Scripts for article on cross validation + + + +Data source for `spec-1939-53389-0138.fits` +https://dr10.sdss.org/spectrumDetail?plateid=1939&mjd=53389&fiber=138 + + +Data source for `bone.data` +https://hastie.su.domains/ElemStatLearn/ + + +Data source for `AirQualityUCI.csv` + +https://archive.ics.uci.edu/dataset/360/air+quality \ No newline at end of file diff --git a/whittaker-eilers-py/examples/cross_validation_article/article_plots.py b/whittaker-eilers-py/examples/cross_validation_article/article_plots.py new file mode 100644 index 0000000..791fc00 --- /dev/null +++ b/whittaker-eilers-py/examples/cross_validation_article/article_plots.py @@ -0,0 +1,371 @@ +import numpy as np +from whittaker_eilers import WhittakerSmoother +import matplotlib.pyplot as plt +import pandas as pd +from astropy.io import fits + +axis_color = "#d0d0fa" +face_color = "#00002a" + + +def main(): # + plot_bone_mineral_density() + plot_optical_spectra() + plot_humidity() + + +def plot_bone_mineral_density(): + # data = pd.read_csv("cross_validation_article/bone.data", delimiter=" ") + data = pd.read_table("cross_validation_article/bone.data") + + data["age"] += np.random.uniform(0.2, 0.3, len(data)) + data.sort_values(by="age", inplace=True) + + male = data[data["gender"] == "male"] + female = data[data["gender"] == "female"] + + y_raw_male = male["spnbmd"].to_list() + x_input_male = male["age"] + + y_raw_female = female["spnbmd"].to_list() + x_input_female = female["age"] + + male_smoother = WhittakerSmoother( + 1, + 2, + len(x_input_male), + x_input=x_input_male, + ) + male_optimal_smooth = male_smoother.smooth_optimal( + y_raw_male, break_serial_correlation=True + ) + male_smoothed = male_optimal_smooth.get_optimal().get_smoothed() + female_smoother = WhittakerSmoother( + 1, + 2, + len(x_input_female), + x_input=x_input_female, + ) + female_optimal_smooth = female_smoother.smooth_optimal( + y_raw_female, break_serial_correlation=True + ) + female_smoothed = female_optimal_smooth.get_optimal().get_smoothed() + + (male_cross_validation_errors, smoothed_data, male_lambdas) = zip( + *[ + (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) + for res in male_optimal_smooth.get_all() + ] + ) + (female_cross_validation_errors, smoothed_data, female_lambdas) = zip( + *[ + (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) + for res in female_optimal_smooth.get_all() + ] + ) + + print( + "Cross male validation chosen lambda: {}".format( + male_lambdas[np.argmin(male_cross_validation_errors)] + ) + ) + print( + "Cross female validation chosen lambda: {}".format( + female_lambdas[np.argmin(female_cross_validation_errors)] + ) + ) + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + ax1.plot( + male_lambdas, + male_cross_validation_errors, + color="#59f176", + label="Cross validation", + marker=".", + markersize=8, + ) + ax1.plot( + female_lambdas, + female_cross_validation_errors, + color="red", + label="Cross validation", + marker=".", + markersize=8, + ) + + ax1.set_xscale("log") + # ax1.set_yscale("log") + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + + ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax1.grid(True, ls="--", alpha=0.6) + # ax1.set_xlim(1e1, 1e5) + ax1.set_facecolor("#00002a") + plt.show(block=False) + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + ax1.set_xlabel("Ages / Years", color=axis_color) + ax1.set_ylabel("Relative Change in Spinal Bone Mineral Density", color=axis_color) + ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax1.grid(True, ls="--", alpha=0.6) + ax1.set_facecolor("#00002a") + # ax1.set_xlim(0, 50) + # + ax1.scatter( + x_input_male, + y_raw_male, + color="#59f176", + # label="Measured", + alpha=0.6, + s=10, + ) + + ax1.scatter( + x_input_female, + y_raw_female, + color="red", + # label="Measured Female", + alpha=0.6, + s=10, + ) + + ax1.plot( + x_input_male, + male_smoothed, + color="#59f176", + lw=2, + label="Male", + solid_capstyle="round", + ) + ax1.plot( + x_input_female, + female_smoothed, + color="red", + lw=2, + label="Female", + solid_capstyle="round", + ) + ax1.legend() + plt.show() + + +def plot_optical_spectra(): + col = "flux" + with fits.open("cross_validation_article/spec-1939-53389-0138.fits") as data: + data = pd.DataFrame(data[1].data) + + # print(data["and_mask"].to_list()) + # mask = data["and_mask"] > 65552 + data["weights"] = np.full(len(data), 1) + # data["weights"][mask] = 0.0 + + smoother = WhittakerSmoother( + 5e1, + 2, + len(data), + x_input=data.index.to_list(), + ) + + # weights = np.bitwise_and(data["weights"].to_numpy(), data["and_mask"].to_numpy()) + # print(weights) + + smoothed = smoother.smooth(data[col].to_list()) + + optimal_smooth = smoother.smooth_optimal( + data[col].to_list(), break_serial_correlation=False + ) + smoothed = optimal_smooth.get_optimal().get_smoothed() + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + (cross_validation_errors, smoothed_data, lambdas) = zip( + *[ + (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) + for res in optimal_smooth.get_all() + ] + ) + + print( + "Cross validation chosen lambda: {}".format( + lambdas[np.argmin(cross_validation_errors)] + ) + ) + + ax1.plot( + lambdas, + cross_validation_errors, + color="#fc8d28", + label="Cross validation", + marker=".", + markersize=8, + ) + + ax1.set_xscale("log") + ax1.set_yscale("log") + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + + ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax1.grid(True, ls="--", alpha=0.6) + # ax1.set_xlim(1e1, 1e5) + ax1.set_facecolor("#00002a") + plt.show(block=False) + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + ax1.set_xlabel("Wavelenghts / Ångströms", color=axis_color) + ax1.set_ylabel("Flux / [10-17 erg/cm2/s/Å]", color=axis_color) + ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax1.grid(True, ls="--", alpha=0.6) + ax1.set_facecolor("#00002a") + # ax1.set_xlim(0, 50) + # + ax1.plot( + data.index, + data[col], + color="#fc8d28", + marker=".", + label="Measured", + alpha=0.6, + markersize=8, + ) + + ax1.plot( + data.index, + smoothed, + color="#59f176", + lw=2, + label="Whittaker", + solid_capstyle="round", + ) + plt.show() + + +def plot_humidity(): + col = "AH" + data = pd.read_csv( + "cross_validation_article/AirQualityUCI.csv", + delimiter=";", + na_values=-200, + decimal=",", + ) + + data["Date"] = pd.to_datetime( + data["Date"] + data["Time"], format="%d/%m/%Y%H.%M.%S" + ) + # data["Time"] = pd.to_datetime(data["Date"], format="%d/%m/%Y") + + # data["x"] = pd.Timestamp.combine(data["Date"], data["Time"]) + data["x_input"] = np.arange(0, len(data)) + 1.0 + data["weights"] = np.full(len(data), 1.0) + nan_filter = data[col].isna() + + data["weights"][nan_filter] = 0.0 + data[col][nan_filter] = 0.0 + + # data.fillna(subset=[col], inplace=True) + + smoother = WhittakerSmoother( + 5e1, + 2, + len(data), + x_input=data["x_input"].to_list(), + weights=data["weights"].to_list(), + ) + smoothed = smoother.smooth(data[col].to_list()) + + optimal_smooth = smoother.smooth_optimal( + data[col].to_list(), break_serial_correlation=True + ) + smoothed = optimal_smooth.get_optimal().get_smoothed() + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + (cross_validation_errors, smoothed_data, lambdas) = zip( + *[ + (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) + for res in optimal_smooth.get_all() + ] + ) + + print( + "Cross validation chosen lambda: {}".format( + lambdas[np.argmin(cross_validation_errors)] + ) + ) + + ax1.plot( + lambdas, + cross_validation_errors, + color="#fc8d28", + label="Cross validation", + marker=".", + markersize=8, + ) + + ax1.set_xscale("log") + ax1.set_yscale("log") + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + + ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax1.grid(True, ls="--", alpha=0.6) + # ax1.set_xlim(1e1, 1e5) + ax1.set_facecolor("#00002a") + plt.show(block=False) + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + ax1.set_xlabel("Year", color=axis_color) + ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) + ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax1.grid(True, ls="--", alpha=0.6) + ax1.set_facecolor("#00002a") + # ax1.set_xlim(0, 50) + # + ax1.plot( + data["Date"], + np.where(nan_filter, np.nan, data[col]), + color="#fc8d28", + marker=".", + label="Measured", + alpha=0.6, + markersize=8, + ) + + ax1.plot( + data["Date"], + smoothed, + color="#59f176", + lw=2, + label="Whittaker", + solid_capstyle="round", + ) + plt.show() + + +if __name__ == "__main__": + main() diff --git a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py new file mode 100644 index 0000000..a47675b --- /dev/null +++ b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py @@ -0,0 +1,125 @@ +import numpy as np +from whittaker_eilers import WhittakerSmoother +import matplotlib.pyplot as plt + +axis_color = "#d0d0fa" +face_color = "#00002a" +data_length = 1000 + + +def main(): + for scale in [0.0001, 0.001, 0.01, 1.0, 5.0]: + x, original_y, y_with_noise = generate_data(data_length, scale) + + smoother = WhittakerSmoother(1, 2, data_length) # x_input=x) + + optimal_smooth = smoother.smooth_optimal( + y_with_noise, break_serial_correlation=False + ) + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + (cross_validation_errors, smoothed_data, lambdas) = zip( + *[ + (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) + for res in optimal_smooth.get_all() + ] + ) + + actual_errors = [ + np.sqrt(np.mean((original_y - np.array(series)) ** 2)) + for series in smoothed_data + ] + + print( + "Cross validation chosen lambda: {}".format( + lambdas[np.argmin(cross_validation_errors)] + ) + ) + + print("RMSE Chosen lambda: {}".format(lambdas[np.argmin(actual_errors)])) + ax1.plot( + lambdas, + cross_validation_errors, + color="#fc8d28", + label="Cross validation", + marker=".", + markersize=8, + ) + ax2 = ax1.twinx() + ax2.plot( + lambdas, + actual_errors, + color="blue", + label="RMSE", + marker=".", + markersize=8, + ) + ax1.set_xscale("log") + ax1.set_yscale("log") + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + + ax1.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + ax1.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + ax1.grid(True, ls="--", alpha=0.6) + ax1.set_facecolor("#00002a") + plt.show(block=False) + + (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") + + ax1.spines["bottom"].set_color(axis_color) + ax1.spines["top"].set_color(axis_color) + ax1.spines["right"].set_color(axis_color) + ax1.spines["left"].set_color(axis_color) + ax1.set_xlabel("Year", color=axis_color) + ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) + ax1.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + ax1.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + ax1.grid(True, ls="--", alpha=0.6) + ax1.set_facecolor("#00002a") + # ax1.set_xlim(0, 2 * np.pi) + + ax1.plot(x, original_y, label="Original", color="white") + ax1.plot( + x, + y_with_noise, + color="#fc8d28", + marker=".", + label="Measured", + alpha=0.6, + markersize=8, + ) + + ax1.plot( + x, + optimal_smooth.get_optimal().get_smoothed(), + color="#59f176", + lw=2, + label="Whittaker", + solid_capstyle="round", + ) + plt.show() + + +def generate_data( + length: int, scale: float +) -> tuple[list[float], list[float], list[float]]: + x = np.linspace(0, 2.0 * np.pi, length) + y = np.cos(x) + y_with_noise = y + np.random.normal(0, scale, x.size) + + return x, y, y_with_noise + + +if __name__ == "__main__": + main() From 7fe124db9b308c4937c8af42a766d93adb72cb31 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Mon, 12 Feb 2024 07:54:24 +0000 Subject: [PATCH 09/12] More plotting --- .gitignore | 1 + .../cross_validation_article/README.md | 2 +- .../cross_validation_article/article_plots.py | 235 ++++++++++-------- .../cross_validation_article/mse_vs_cve.py | 112 ++++++--- 4 files changed, 209 insertions(+), 141 deletions(-) diff --git a/.gitignore b/.gitignore index 7fc0788..cedc68f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ whittaker-eilers-py/examples/cross_validation_article/AirQualityUCI.csv whittaker-eilers-py/examples/cross_validation_article/bone.data whittaker-eilers-py/examples/cross_validation_article/spec-1939-53389-0138.fits +whittaker-eilers-py/examples/cross_validation_article/*.png \ No newline at end of file diff --git a/whittaker-eilers-py/examples/cross_validation_article/README.md b/whittaker-eilers-py/examples/cross_validation_article/README.md index 692868a..a441998 100644 --- a/whittaker-eilers-py/examples/cross_validation_article/README.md +++ b/whittaker-eilers-py/examples/cross_validation_article/README.md @@ -4,7 +4,7 @@ Data source for `spec-1939-53389-0138.fits` https://dr10.sdss.org/spectrumDetail?plateid=1939&mjd=53389&fiber=138 - +https://www.sdss.org/collaboration/citing-sdss/ Data source for `bone.data` https://hastie.su.domains/ElemStatLearn/ diff --git a/whittaker-eilers-py/examples/cross_validation_article/article_plots.py b/whittaker-eilers-py/examples/cross_validation_article/article_plots.py index 791fc00..e1719c8 100644 --- a/whittaker-eilers-py/examples/cross_validation_article/article_plots.py +++ b/whittaker-eilers-py/examples/cross_validation_article/article_plots.py @@ -1,9 +1,11 @@ import numpy as np from whittaker_eilers import WhittakerSmoother + import matplotlib.pyplot as plt import pandas as pd from astropy.io import fits + axis_color = "#d0d0fa" face_color = "#00002a" @@ -74,8 +76,10 @@ def plot_bone_mineral_density(): female_lambdas[np.argmin(female_cross_validation_errors)] ) ) - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") - ax1.plot( + + (fig, smooth_ax, cve_ax) = create_figure_and_axes() + + cve_ax.plot( male_lambdas, male_cross_validation_errors, color="#59f176", @@ -83,7 +87,7 @@ def plot_bone_mineral_density(): marker=".", markersize=8, ) - ax1.plot( + cve_ax.plot( female_lambdas, female_cross_validation_errors, color="red", @@ -92,53 +96,30 @@ def plot_bone_mineral_density(): markersize=8, ) - ax1.set_xscale("log") - # ax1.set_yscale("log") - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) - - ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) - ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) - ax1.grid(True, ls="--", alpha=0.6) - # ax1.set_xlim(1e1, 1e5) - ax1.set_facecolor("#00002a") - plt.show(block=False) + cve_ax.set_xlabel("λ", color=axis_color, fontsize=16) + cve_ax.set_ylabel("CVE", color=axis_color, fontsize=14) + cve_ax.set_xscale("log") - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") - - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) - ax1.set_xlabel("Ages / Years", color=axis_color) - ax1.set_ylabel("Relative Change in Spinal Bone Mineral Density", color=axis_color) - ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) - ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) - ax1.grid(True, ls="--", alpha=0.6) - ax1.set_facecolor("#00002a") - # ax1.set_xlim(0, 50) - # - ax1.scatter( + smooth_ax.set_xlabel("Age / year", color=axis_color, fontsize=14) + smooth_ax.set_ylabel("Δ Spinal Bone Mineral Density", color=axis_color, fontsize=14) + smooth_ax.set_xlim(10, 25) + smooth_ax.scatter( x_input_male, y_raw_male, color="#59f176", - # label="Measured", alpha=0.6, s=10, ) - ax1.scatter( + smooth_ax.scatter( x_input_female, y_raw_female, color="red", - # label="Measured Female", alpha=0.6, s=10, ) - ax1.plot( + smooth_ax.plot( x_input_male, male_smoothed, color="#59f176", @@ -146,7 +127,7 @@ def plot_bone_mineral_density(): label="Male", solid_capstyle="round", ) - ax1.plot( + smooth_ax.plot( x_input_female, female_smoothed, color="red", @@ -154,8 +135,14 @@ def plot_bone_mineral_density(): label="Female", solid_capstyle="round", ) - ax1.legend() - plt.show() + smooth_ax.legend() + fig.tight_layout() + plt.savefig( + "cross_validation_article/bone_mineral_density.png", + dpi=800, + bbox_inches="tight", + ) + plt.show(block=False) def plot_optical_spectra(): @@ -185,7 +172,6 @@ def plot_optical_spectra(): ) smoothed = optimal_smooth.get_optimal().get_smoothed() - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") (cross_validation_errors, smoothed_data, lambdas) = zip( *[ (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) @@ -199,62 +185,89 @@ def plot_optical_spectra(): ) ) - ax1.plot( + (fig, smooth_ax, cve_ax) = create_figure_and_axes() + + cve_ax.plot( lambdas, cross_validation_errors, - color="#fc8d28", + color="red", label="Cross validation", marker=".", markersize=8, ) - ax1.set_xscale("log") - ax1.set_yscale("log") - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) + cve_ax.set_xscale("log") + # cve_ax.set_yscale("log") + cve_ax.spines["bottom"].set_color(axis_color) + cve_ax.spines["top"].set_color(axis_color) + cve_ax.spines["right"].set_color(axis_color) + cve_ax.spines["left"].set_color(axis_color) + cve_ax.set_xlabel("λ", color=axis_color, fontsize=16) + cve_ax.set_ylabel("CVE", color=axis_color, fontsize=14) + cve_ax.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + cve_ax.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + cve_ax.grid(True, ls="--", alpha=0.6) - ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) - ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) - ax1.grid(True, ls="--", alpha=0.6) - # ax1.set_xlim(1e1, 1e5) - ax1.set_facecolor("#00002a") + cve_ax.set_facecolor("#00002a") plt.show(block=False) - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") - - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) - ax1.set_xlabel("Wavelenghts / Ångströms", color=axis_color) - ax1.set_ylabel("Flux / [10-17 erg/cm2/s/Å]", color=axis_color) - ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) - ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) - ax1.grid(True, ls="--", alpha=0.6) - ax1.set_facecolor("#00002a") + smooth_ax.spines["bottom"].set_color(axis_color) + smooth_ax.spines["top"].set_color(axis_color) + smooth_ax.spines["right"].set_color(axis_color) + smooth_ax.spines["left"].set_color(axis_color) + smooth_ax.set_xlabel("Wavelenghts / Ångströms", color=axis_color, fontsize=14) + smooth_ax.set_ylabel("Flux / (10-17 erg/cm2/s/Å)", color=axis_color, fontsize=14) + smooth_ax.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + smooth_ax.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + + smooth_ax.grid(True, ls="--", alpha=0.6) + smooth_ax.set_facecolor("#00002a") # ax1.set_xlim(0, 50) # - ax1.plot( + smooth_ax.set_xlim(200, 2000) + smooth_ax.set_ylim(-0.5, 15) + smooth_ax.plot( data.index, data[col], - color="#fc8d28", + color="#59f176", marker=".", label="Measured", - alpha=0.6, + alpha=0.5, markersize=8, ) - - ax1.plot( + # smooth_ax.scatter( + # data.index, + # data[col], + # color="yellow", + # # marker=".", + # label="Measured", + # alpha=0.75, + # s=20, + # ) + + smooth_ax.plot( data.index, smoothed, - color="#59f176", + color="red", lw=2, label="Whittaker", solid_capstyle="round", ) - plt.show() + fig.tight_layout() + plt.savefig( + "cross_validation_article/optical_spectra.png", + dpi=800, + bbox_inches="tight", + ) + plt.show(block=False) def plot_humidity(): @@ -295,7 +308,6 @@ def plot_humidity(): ) smoothed = optimal_smooth.get_optimal().get_smoothed() - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") (cross_validation_errors, smoothed_data, lambdas) = zip( *[ (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) @@ -309,63 +321,78 @@ def plot_humidity(): ) ) - ax1.plot( + (fig, smooth_ax, cve_ax) = create_figure_and_axes() + cve_ax.plot( lambdas, cross_validation_errors, - color="#fc8d28", + color="red", label="Cross validation", marker=".", markersize=8, ) - ax1.set_xscale("log") - ax1.set_yscale("log") - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) + cve_ax.set_xscale("log") - ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) - ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) - ax1.grid(True, ls="--", alpha=0.6) - # ax1.set_xlim(1e1, 1e5) - ax1.set_facecolor("#00002a") - plt.show(block=False) + cve_ax.set_xlabel("λ", color=axis_color, fontsize=16) + cve_ax.set_ylabel("CVE", color=axis_color, fontsize=14) - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") - - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) - ax1.set_xlabel("Year", color=axis_color) - ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) - ax1.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) - ax1.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) - ax1.grid(True, ls="--", alpha=0.6) - ax1.set_facecolor("#00002a") - # ax1.set_xlim(0, 50) - # - ax1.plot( + smooth_ax.set_xlabel("Year", color=axis_color, fontsize=14) + smooth_ax.set_ylabel("Absolute Humidity / (g/m3)", color=axis_color, fontsize=14) + + smooth_ax.set_ylim(0.6, 2.27) + + # fig.autofmt_xdate() + smooth_ax.set_xlim(data["Date"].loc[4400], data["Date"].loc[4700]) + smooth_ax.plot( data["Date"], np.where(nan_filter, np.nan, data[col]), - color="#fc8d28", + color="#59f176", marker=".", label="Measured", alpha=0.6, markersize=8, ) - ax1.plot( + smooth_ax.plot( data["Date"], smoothed, - color="#59f176", + color="red", lw=2, label="Whittaker", solid_capstyle="round", ) + fig.tight_layout() + # smooth_ax.xaxis.set_major_locator(mdates.DayLocator()) + plt.savefig( + "cross_validation_article/absolute_humidity.png", + dpi=800, + bbox_inches="tight", + ) + plt.show() +def create_figure_and_axes(): + (fig, axes) = plt.subplots( + 2, 1, figsize=(9, 6), facecolor="#00002a", gridspec_kw={"height_ratios": [3, 1]} + ) + + for ax in axes: + ax.spines["bottom"].set_color(axis_color) + ax.spines["top"].set_color(axis_color) + ax.spines["right"].set_color(axis_color) + ax.spines["left"].set_color(axis_color) + ax.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + ax.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + ax.grid(True, ls="--", alpha=0.6) + ax.set_facecolor("#00002a") + + return (fig, axes[0], axes[1]) + + if __name__ == "__main__": main() diff --git a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py index a47675b..90502e1 100644 --- a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py +++ b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py @@ -8,7 +8,29 @@ def main(): - for scale in [0.0001, 0.001, 0.01, 1.0, 5.0]: + cve_fig, cve_axes = plt.subplots( + 2, + 2, + figsize=(16, 8), + sharex=True, + # sharey=True, + facecolor=face_color, + gridspec_kw={"wspace": 0.15, "hspace": 0.15}, + ) + cve_axes_iter = iter(cve_axes.flatten()) + + smoothed_fig, smoothed_axes = plt.subplots( + 2, + 2, + figsize=(16, 8), + sharex=True, + # sharey=True, + facecolor=face_color, + gridspec_kw={"wspace": 0, "hspace": 0}, + ) + smoothed_axes_iter = iter(smoothed_axes.flatten()) + + for counter, scale in enumerate([0.01, 0.1, 1.0, 5.0]): x, original_y, y_with_noise = generate_data(data_length, scale) smoother = WhittakerSmoother(1, 2, data_length) # x_input=x) @@ -17,7 +39,6 @@ def main(): y_with_noise, break_serial_correlation=False ) - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") (cross_validation_errors, smoothed_data, lambdas) = zip( *[ (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) @@ -37,7 +58,11 @@ def main(): ) print("RMSE Chosen lambda: {}".format(lambdas[np.argmin(actual_errors)])) - ax1.plot( + + cve_ax = next(cve_axes_iter) + [spine.set_color(axis_color) for (_, spine) in cve_ax.spines.items()] + + cve_ax.plot( lambdas, cross_validation_errors, color="#fc8d28", @@ -45,70 +70,85 @@ def main(): marker=".", markersize=8, ) - ax2 = ax1.twinx() + ax2 = cve_ax.twinx() ax2.plot( lambdas, actual_errors, - color="blue", + color="#66b0ff", label="RMSE", marker=".", markersize=8, ) - ax1.set_xscale("log") - ax1.set_yscale("log") - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) - - ax1.tick_params( + + ax2.tick_params( axis="y", direction="in", color=axis_color, labelcolor=axis_color ) - ax1.tick_params( + ax2.tick_params( axis="x", direction="in", color=axis_color, labelcolor=axis_color ) - ax1.grid(True, ls="--", alpha=0.6) - ax1.set_facecolor("#00002a") - plt.show(block=False) - - (fig, ax1) = plt.subplots(figsize=(8, 4), facecolor="#00002a") - - ax1.spines["bottom"].set_color(axis_color) - ax1.spines["top"].set_color(axis_color) - ax1.spines["right"].set_color(axis_color) - ax1.spines["left"].set_color(axis_color) - ax1.set_xlabel("Year", color=axis_color) - ax1.set_ylabel("Temperature Anomaly / °C", color=axis_color) - ax1.tick_params( + + cve_ax.set_xscale("log") + # cve_ax.set_yscale("log") + + cve_ax.tick_params( axis="y", direction="in", color=axis_color, labelcolor=axis_color ) - ax1.tick_params( + cve_ax.tick_params( axis="x", direction="in", color=axis_color, labelcolor=axis_color ) - ax1.grid(True, ls="--", alpha=0.6) - ax1.set_facecolor("#00002a") + cve_ax.grid(True, ls="--", alpha=0.6) + cve_ax.set_facecolor("#00002a") + + smoothed_ax = next(smoothed_axes_iter) + [spine.set_color(axis_color) for (_, spine) in smoothed_ax.spines.items()] + + # smoothed_ax.set_xlabel("x / Radians", color=axis_color) + # smoothed_ax.set_ylabel("Amplitude", color=axis_color) + smoothed_ax.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + smoothed_ax.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + smoothed_ax.grid(True, ls="--", alpha=0.6) + smoothed_ax.set_facecolor("#00002a") # ax1.set_xlim(0, 2 * np.pi) - ax1.plot(x, original_y, label="Original", color="white") - ax1.plot( + smoothed_ax.plot(x, original_y, label="Original", color="white") + smoothed_ax.plot( x, y_with_noise, - color="#fc8d28", + color="#59f176", marker=".", label="Measured", alpha=0.6, markersize=8, ) - ax1.plot( + smoothed_ax.plot( x, optimal_smooth.get_optimal().get_smoothed(), - color="#59f176", + color="red", lw=2, label="Whittaker", solid_capstyle="round", ) - plt.show() + + match counter: + case 0: + smoothed_ax.set_ylim(-1.25, 1.25) + case 1: + smoothed_ax.set_ylim(-1.25, 1.25) + case 2: + smoothed_ax.set_ylim(-3.25, 3.25) + case 3: + smoothed_ax.set_ylim(-10, 10) + case _: + pass + smoothed_fig.supxlabel("x / Radians", color=axis_color, fontsize=18) + y_label = smoothed_fig.supylabel("Amplitude", color=axis_color, fontsize=18) + y_label.set_x(0.07) + plt.show() def generate_data( From c85295862278c5e2612c3544385d0ee5d56061a1 Mon Sep 17 00:00:00 2001 From: Andrew Date: Mon, 12 Feb 2024 18:03:20 +0000 Subject: [PATCH 10/12] finish up plots for testing --- .gitignore | 1 + .../cross_validation_article/mse_vs_cve.py | 75 ++++++++++++++----- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index cedc68f..fcd3705 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ /whittaker-eilers-py/target **__pycache__** .vscode +*.venv diff --git a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py index 90502e1..bff3ab8 100644 --- a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py +++ b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py @@ -1,6 +1,8 @@ import numpy as np from whittaker_eilers import WhittakerSmoother import matplotlib.pyplot as plt +from matplotlib.ticker import MaxNLocator +from matplotlib.ticker import FormatStrFormatter axis_color = "#d0d0fa" face_color = "#00002a" @@ -62,41 +64,40 @@ def main(): cve_ax = next(cve_axes_iter) [spine.set_color(axis_color) for (_, spine) in cve_ax.spines.items()] - cve_ax.plot( + cve_plot = cve_ax.plot( lambdas, cross_validation_errors, - color="#fc8d28", - label="Cross validation", + color="#59f176", + label="CVE", marker=".", markersize=8, ) ax2 = cve_ax.twinx() - ax2.plot( + rmse_plot = ax2.plot( lambdas, actual_errors, - color="#66b0ff", + color="red", label="RMSE", marker=".", markersize=8, ) + [spine.set_color(axis_color) for (_, spine) in ax2.spines.items()] - ax2.tick_params( - axis="y", direction="in", color=axis_color, labelcolor=axis_color - ) - ax2.tick_params( - axis="x", direction="in", color=axis_color, labelcolor=axis_color - ) - + ax2.tick_params(axis="y", direction="in", color="red", labelcolor=axis_color) + ax2.tick_params(axis="x", direction="in", color="red", labelcolor=axis_color) + ax2.yaxis.set_major_formatter(FormatStrFormatter("%.2f")) cve_ax.set_xscale("log") + cve_ax.yaxis.set_major_locator(MaxNLocator(nbins=4, integer=False)) # cve_ax.set_yscale("log") - + cve_ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f")) cve_ax.tick_params( - axis="y", direction="in", color=axis_color, labelcolor=axis_color + axis="y", direction="in", color="#59f176", labelcolor=axis_color ) cve_ax.tick_params( - axis="x", direction="in", color=axis_color, labelcolor=axis_color + axis="x", direction="in", color="#59f176", labelcolor=axis_color ) - cve_ax.grid(True, ls="--", alpha=0.6) + cve_ax.grid(True, ls="--", alpha=0.3, color="#59f176") + ax2.grid(True, ls="--", alpha=0.3, color="red") cve_ax.set_facecolor("#00002a") smoothed_ax = next(smoothed_axes_iter) @@ -114,7 +115,6 @@ def main(): smoothed_ax.set_facecolor("#00002a") # ax1.set_xlim(0, 2 * np.pi) - smoothed_ax.plot(x, original_y, label="Original", color="white") smoothed_ax.plot( x, y_with_noise, @@ -124,7 +124,7 @@ def main(): alpha=0.6, markersize=8, ) - + smoothed_ax.plot(x, original_y, label="Original", color="#f003ef") smoothed_ax.plot( x, optimal_smooth.get_optimal().get_smoothed(), @@ -137,17 +137,54 @@ def main(): match counter: case 0: smoothed_ax.set_ylim(-1.25, 1.25) + smoothed_ax.legend(fontsize=14) + lns = cve_plot + rmse_plot + labs = [l.get_label() for l in lns] + cve_ax.legend(lns, labs, fontsize=14) + cve_ax.set_ylabel("CVE", fontsize=14, color=axis_color) case 1: smoothed_ax.set_ylim(-1.25, 1.25) + # smoothed_ax.yaxis.set_label_position("right") + # smoothed_ax.yaxis.tick_right() + smoothed_ax.yaxis.set_tick_params(labelleft=False) + + ax2.set_ylabel("RMSE", fontsize=14, color=axis_color) + case 2: - smoothed_ax.set_ylim(-3.25, 3.25) + smoothed_ax.set_ylim(-10.0, 10.0) + cve_ax.set_ylabel("CVE", fontsize=14, color=axis_color) + case 3: smoothed_ax.set_ylim(-10, 10) + smoothed_ax.yaxis.set_tick_params(labelleft=False) + ax2.set_ylabel("RMSE", fontsize=14, color=axis_color) + # smoothed_ax.yaxis.set_label_position("right") + # smoothed_ax.yaxis.tick_right() case _: pass smoothed_fig.supxlabel("x / Radians", color=axis_color, fontsize=18) y_label = smoothed_fig.supylabel("Amplitude", color=axis_color, fontsize=18) y_label.set_x(0.07) + + cve_fig.supxlabel("λ", color=axis_color, fontsize=18) + # y_label_cve = cve_fig.supylabel("CVE", color=axis_color, fontsize=18) + # y_label_cve.set_x(0.09) + + # y_label_cve = cve_fig.supylabel("RMSE", color=axis_color, fontsize=18) + # y_label_cve.set_x(0.96) + # smoothed_fig.tight_layout() + # cve_fig.tight_layout() + # smooth_ax.xaxis.set_major_locator(mdates.DayLocator()) + smoothed_fig.savefig( + "cross_validation_article/smoothed_rmse_compare.png", + dpi=600, + bbox_inches="tight", + ) + cve_fig.savefig( + "cross_validation_article/cve_vs_rmse.png", + dpi=800, + bbox_inches="tight", + ) plt.show() From 8c084f70f50d86278e86fcf3a5d0ba25ce48818f Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Sat, 24 Feb 2024 14:45:07 +0000 Subject: [PATCH 11/12] feat: :sparkles: Final changes and test scripts --- README.md | 2 +- src/cross_validation.rs | 2 +- src/whittaker_smoother.rs | 6 +- tests/robustness.rs | 124 ++-- whittaker-eilers-py/README.md | 1 - .../cross_validation_article/article_plots.py | 35 +- .../cross_validation_article/benchmark.py | 89 +++ .../cross_validation_article/mse_vs_cve.py | 89 ++- .../plot_multiple_lambda.py | 136 ++++ .../examples/whittaker_benchmark.py | 612 +++++++++--------- whittaker-eilers-py/src/cross_validation.rs | 2 +- whittaker-eilers-py/src/whittaker_smoother.rs | 2 +- 12 files changed, 701 insertions(+), 399 deletions(-) create mode 100644 whittaker-eilers-py/examples/cross_validation_article/benchmark.py create mode 100644 whittaker-eilers-py/examples/cross_validation_article/plot_multiple_lambda.py diff --git a/README.md b/README.md index ceec20c..720ab98 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ```toml [dependencies] -whittaker-eilers = "0.1.1" +whittaker-eilers = "0.1.3" ``` ## Usage diff --git a/src/cross_validation.rs b/src/cross_validation.rs index de18c6e..7f1c9cf 100644 --- a/src/cross_validation.rs +++ b/src/cross_validation.rs @@ -24,7 +24,7 @@ pub struct CrossValidationResult { pub lambda: f64, /// The smoothed data. pub smoothed: Vec, - /// The associated cross validation error for the smoothed data. + /// The associated cross validation error for the smoothed data. Technically square-rooted cross validation error. pub cross_validation_error: f64, } diff --git a/src/whittaker_smoother.rs b/src/whittaker_smoother.rs index deb1dd4..0305ba5 100644 --- a/src/whittaker_smoother.rs +++ b/src/whittaker_smoother.rs @@ -247,7 +247,7 @@ impl WhittakerSmoother { /// Run Whittaker-Eilers smoothing, interpolation and cross validation. /// - /// This function will run the smoother and assess the cross validation error on the result. This is defined in Eiler's + /// This function will run the smoother and assess the cross validation error on the result. This is defined in Eilers' /// 2003 paper: "A Perfect Smoother". It involves computing the "hat matrix" or "smoother matrix" which inverses a sparse matrix. The /// inverse of a sparse matrix is usually dense, so this function will take much longer to run in comparison to just running `smooth`. /// @@ -255,7 +255,7 @@ impl WhittakerSmoother { /// * `y_input`: The values which are to be smoothed and interpolated and have their cross validation error calculated. /// /// # Returns: - /// [CrossValidationResult]: The smoothed data, lambda it was smoothed at, and the cross validation error. + /// [CrossValidationResult]: The smoothed data, lambda it was smoothed at, and the cross validation error. Technically square-rooted cross validation error. pub fn smooth_and_cross_validate( &self, y_input: &[f64], @@ -395,7 +395,7 @@ impl WhittakerSmoother { hat_matrix *= DMatrix::from_diagonal(weights_vec.as_ref().unwrap()); } let r = (y_input_dvec - smoothed_dvec) - .component_div(&(identity_dvec - hat_matrix.diagonal())); + .component_div(&(identity_dvec - hat_matrix.diagonal())); // TODO! Investigate using I - trace(hat_matrix)/ N. Can lead to less undersmoothing. Way to avoid solver/inverse? let cve = match weights_vec.as_ref() { Some(weights) => (r.transpose() * r.component_mul(weights)).sum() / weights.sum(), diff --git a/tests/robustness.rs b/tests/robustness.rs index 1a85d38..ccbab3f 100644 --- a/tests/robustness.rs +++ b/tests/robustness.rs @@ -1,56 +1,68 @@ -use whittaker_eilers::WhittakerSmoother; - -#[test] -fn short_data() { - let whittaker_smoother = WhittakerSmoother::new(2e4, 2, 2, None, None).unwrap(); - - assert!(whittaker_smoother.smooth(&vec![0.1, 0.2]).is_ok()); -} - -#[test] -fn mismatched_data_length() { - let whittaker_smoother = WhittakerSmoother::new(2e4, 2, 3, None, None).unwrap(); - - assert!(whittaker_smoother.smooth(&vec![0.1, 0.2]).is_err()); - - assert!(WhittakerSmoother::new(2e4, 2, 3, Some(&vec![1.0, 2.0]), None).is_err()); - assert!(WhittakerSmoother::new(2e4, 2, 3, None, Some(&vec![1.0, 2.0])).is_err()); - assert!( - WhittakerSmoother::new(2e4, 2, 3, Some(&vec![1.0, 2.0]), Some(&vec![1.0, 2.0])).is_err() - ); -} -#[test] -fn data_too_short() { - assert!(WhittakerSmoother::new(2e4, 2, 1, None, None).is_err()); -} - -#[test] -fn x_not_monotonically_increasing() { - let test_vec = vec![1.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; - - assert!(WhittakerSmoother::new(2e4, 1, 10, Some(&test_vec), None).is_err()); -} -#[test] -fn x_sampled_too_closely() { - let test_vec = vec![1.0, 2.0, 2.0 + 1e-8, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; - - assert!(WhittakerSmoother::new(2e4, 1, test_vec.len(), Some(&test_vec), None).is_err()); -} - -#[test] -fn readme() { - use whittaker_eilers::WhittakerSmoother; - - let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; - let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; - let mut weights = vec![1.0; x_input.len()]; - weights[5] = 0.0; - - let whittaker_smoother = - WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), Some(&weights)) - .unwrap(); - - let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); - - println!("Smoothed data: {:?}", smoothed_data); -} +use whittaker_eilers::WhittakerSmoother; + +#[test] +fn test() { + use whittaker_eilers::WhittakerSmoother; + + let data_to_smooth = vec![6.7, 8.0, 2.1, 8.4, 7.6, 3.4]; + + let mut smoother = WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), None, None).unwrap(); + + let results = smoother.smooth_optimal(&data_to_smooth, false).unwrap(); + + println!("Optimal result: {:?}", results.get_optimal()); +} +#[test] +fn short_data() { + let whittaker_smoother = WhittakerSmoother::new(2e4, 2, 2, None, None).unwrap(); + + assert!(whittaker_smoother.smooth(&vec![0.1, 0.2]).is_ok()); +} + +#[test] +fn mismatched_data_length() { + let whittaker_smoother = WhittakerSmoother::new(2e4, 2, 3, None, None).unwrap(); + + assert!(whittaker_smoother.smooth(&vec![0.1, 0.2]).is_err()); + + assert!(WhittakerSmoother::new(2e4, 2, 3, Some(&vec![1.0, 2.0]), None).is_err()); + assert!(WhittakerSmoother::new(2e4, 2, 3, None, Some(&vec![1.0, 2.0])).is_err()); + assert!( + WhittakerSmoother::new(2e4, 2, 3, Some(&vec![1.0, 2.0]), Some(&vec![1.0, 2.0])).is_err() + ); +} +#[test] +fn data_too_short() { + assert!(WhittakerSmoother::new(2e4, 2, 1, None, None).is_err()); +} + +#[test] +fn x_not_monotonically_increasing() { + let test_vec = vec![1.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; + + assert!(WhittakerSmoother::new(2e4, 1, 10, Some(&test_vec), None).is_err()); +} +#[test] +fn x_sampled_too_closely() { + let test_vec = vec![1.0, 2.0, 2.0 + 1e-8, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; + + assert!(WhittakerSmoother::new(2e4, 1, test_vec.len(), Some(&test_vec), None).is_err()); +} + +#[test] +fn readme() { + use whittaker_eilers::WhittakerSmoother; + + let x_input = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; + let data_to_smooth = vec![1.1, 1.9, 3.1, 3.91, 5.0, 6.02, 7.01, 7.7, 9.0, 10.0]; + let mut weights = vec![1.0; x_input.len()]; + weights[5] = 0.0; + + let whittaker_smoother = + WhittakerSmoother::new(2e4, 2, data_to_smooth.len(), Some(&x_input), Some(&weights)) + .unwrap(); + + let smoothed_data = whittaker_smoother.smooth(&data_to_smooth).unwrap(); + + println!("Smoothed data: {:?}", smoothed_data); +} diff --git a/whittaker-eilers-py/README.md b/whittaker-eilers-py/README.md index e5730f1..fca6fcc 100644 --- a/whittaker-eilers-py/README.md +++ b/whittaker-eilers-py/README.md @@ -149,7 +149,6 @@ whittaker_smoother.update_lambda(4321.0) If you'd like to see a more detailed run through of the library, check out this [Medium post](https://medium.com/towards-data-science/the-perfect-way-to-smooth-your-noisy-data-4f3fe6b44440). Within it, I run through examples and benchmarks against other smoothing methods. ## Future Features -- Cross validation options to find optimal lambda. - Scatter plot smoothing - Generic typing diff --git a/whittaker-eilers-py/examples/cross_validation_article/article_plots.py b/whittaker-eilers-py/examples/cross_validation_article/article_plots.py index e1719c8..b5e5993 100644 --- a/whittaker-eilers-py/examples/cross_validation_article/article_plots.py +++ b/whittaker-eilers-py/examples/cross_validation_article/article_plots.py @@ -19,7 +19,7 @@ def main(): # def plot_bone_mineral_density(): # data = pd.read_csv("cross_validation_article/bone.data", delimiter=" ") data = pd.read_table("cross_validation_article/bone.data") - + np.random.seed(9420) data["age"] += np.random.uniform(0.2, 0.3, len(data)) data.sort_values(by="age", inplace=True) @@ -97,7 +97,7 @@ def plot_bone_mineral_density(): ) cve_ax.set_xlabel("λ", color=axis_color, fontsize=16) - cve_ax.set_ylabel("CVE", color=axis_color, fontsize=14) + cve_ax.set_ylabel("RCVE", color=axis_color, fontsize=14) cve_ax.set_xscale("log") smooth_ax.set_xlabel("Age / year", color=axis_color, fontsize=14) @@ -124,7 +124,7 @@ def plot_bone_mineral_density(): male_smoothed, color="#59f176", lw=2, - label="Male", + label="Male: λ = {:.0f}".format(male_optimal_smooth.get_optimal().get_lambda()), solid_capstyle="round", ) smooth_ax.plot( @@ -132,7 +132,9 @@ def plot_bone_mineral_density(): female_smoothed, color="red", lw=2, - label="Female", + label="Female: λ = {:.0f}".format( + female_optimal_smooth.get_optimal().get_lambda() + ), solid_capstyle="round", ) smooth_ax.legend() @@ -154,7 +156,7 @@ def plot_optical_spectra(): # mask = data["and_mask"] > 65552 data["weights"] = np.full(len(data), 1) # data["weights"][mask] = 0.0 - + wavelengths = 10 ** data["loglam"] smoother = WhittakerSmoother( 5e1, 2, @@ -184,6 +186,9 @@ def plot_optical_spectra(): lambdas[np.argmin(cross_validation_errors)] ) ) + final_residuals = data[col].to_numpy() - np.array( + optimal_smooth.get_optimal().get_smoothed() + ) (fig, smooth_ax, cve_ax) = create_figure_and_axes() @@ -203,7 +208,7 @@ def plot_optical_spectra(): cve_ax.spines["right"].set_color(axis_color) cve_ax.spines["left"].set_color(axis_color) cve_ax.set_xlabel("λ", color=axis_color, fontsize=16) - cve_ax.set_ylabel("CVE", color=axis_color, fontsize=14) + cve_ax.set_ylabel("RCVE", color=axis_color, fontsize=14) cve_ax.tick_params( axis="y", direction="in", color=axis_color, labelcolor=axis_color ) @@ -232,10 +237,10 @@ def plot_optical_spectra(): smooth_ax.set_facecolor("#00002a") # ax1.set_xlim(0, 50) # - smooth_ax.set_xlim(200, 2000) + smooth_ax.set_xlim(3800, 5000) smooth_ax.set_ylim(-0.5, 15) smooth_ax.plot( - data.index, + wavelengths, data[col], color="#59f176", marker=".", @@ -254,13 +259,14 @@ def plot_optical_spectra(): # ) smooth_ax.plot( - data.index, + wavelengths, smoothed, color="red", lw=2, - label="Whittaker", + label="λ = {:.0f}".format(optimal_smooth.get_optimal().get_lambda()), solid_capstyle="round", ) + smooth_ax.legend() fig.tight_layout() plt.savefig( "cross_validation_article/optical_spectra.png", @@ -320,13 +326,15 @@ def plot_humidity(): lambdas[np.argmin(cross_validation_errors)] ) ) + final_residuals = data[col].to_numpy() - np.array( + optimal_smooth.get_optimal().get_smoothed() + ) (fig, smooth_ax, cve_ax) = create_figure_and_axes() cve_ax.plot( lambdas, cross_validation_errors, color="red", - label="Cross validation", marker=".", markersize=8, ) @@ -334,7 +342,7 @@ def plot_humidity(): cve_ax.set_xscale("log") cve_ax.set_xlabel("λ", color=axis_color, fontsize=16) - cve_ax.set_ylabel("CVE", color=axis_color, fontsize=14) + cve_ax.set_ylabel("RCVE", color=axis_color, fontsize=14) smooth_ax.set_xlabel("Year", color=axis_color, fontsize=14) smooth_ax.set_ylabel("Absolute Humidity / (g/m3)", color=axis_color, fontsize=14) @@ -358,9 +366,10 @@ def plot_humidity(): smoothed, color="red", lw=2, - label="Whittaker", + label="λ = {:.0f}".format(optimal_smooth.get_optimal().get_lambda()), solid_capstyle="round", ) + smooth_ax.legend() fig.tight_layout() # smooth_ax.xaxis.set_major_locator(mdates.DayLocator()) plt.savefig( diff --git a/whittaker-eilers-py/examples/cross_validation_article/benchmark.py b/whittaker-eilers-py/examples/cross_validation_article/benchmark.py new file mode 100644 index 0000000..c95aac5 --- /dev/null +++ b/whittaker-eilers-py/examples/cross_validation_article/benchmark.py @@ -0,0 +1,89 @@ +from whittaker_eilers import WhittakerSmoother +import numpy as np +from time import perf_counter +import matplotlib.pyplot as plt + +axis_color = "#d0d0fa" +face_color = "#00002a" + + +def main(): + optimised_times = [] + smoothing_n_times = [] + data_lengths_to_test = [50, 100, 250, 500, 750, 1000, 5000, 10000] + for data_length in data_lengths_to_test: + x, original_y, y_with_noise = generate_data(data_length, 1.0) + + smoother = WhittakerSmoother(lmbda=100, order=2, data_length=len(y_with_noise)) + start_time = perf_counter() + results = smoother.smooth_and_cross_validate(y_with_noise) + end_time = perf_counter() + optimised_times.append(end_time - start_time) + + smoother = WhittakerSmoother( + lmbda=100, order=2, data_length=len(y_with_noise) - 1 + ) + + start_time = perf_counter() + for _i in range(len(y_with_noise)): + smoothed = smoother.smooth(y_with_noise[:-1]) + end_time = perf_counter() + smoothing_n_times.append(end_time - start_time) + + (time_fig, time_ax) = plt.subplots( + figsize=(8, 4), + facecolor=face_color, + ) + [spine.set_color(axis_color) for (_, spine) in time_ax.spines.items()] + + time_ax.plot( + data_lengths_to_test, optimised_times, label="Smoother Matrix", color="#59f176" + ) + time_ax.plot( + data_lengths_to_test, smoothing_n_times, label="Smoothing n Times", color="red" + ) + + time_ax.tick_params( + which="both", + axis="both", + direction="in", + color=axis_color, + labelcolor=axis_color, + labelsize=14, + ) + # time_ax.yaxis.set_minor_locator(AutoMinorLocator()) + time_ax.tick_params(which="major", axis="y", length=6) + time_ax.tick_params(which="major", axis="x", length=8) + time_ax.tick_params(which="minor", axis="y", length=3) + time_ax.tick_params(which="minor", axis="x", length=4) + + time_ax.grid(True, ls="--", alpha=0.6) + time_ax.set_facecolor(face_color) + time_ax.set_yscale("log") + time_ax.set_xscale("log") + + time_ax.set_ylabel("Time Taken / s", color=axis_color) + time_ax.set_xlabel("Data Length", color=axis_color) + time_ax.legend() + + plt.savefig( + "cross_validation_article/benchmark.png", + dpi=800, + bbox_inches="tight", + ) + + plt.show() + + +def generate_data( + length: int, scale: float +) -> tuple[list[float], list[float], list[float]]: + x = np.linspace(0, 2.0 * np.pi, length) + y = np.cos(x) + y_with_noise = y + np.random.normal(0, scale, x.size) + + return x, y, y_with_noise + + +if __name__ == "__main__": + main() diff --git a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py index bff3ab8..771c218 100644 --- a/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py +++ b/whittaker-eilers-py/examples/cross_validation_article/mse_vs_cve.py @@ -10,6 +10,7 @@ def main(): + np.random.seed(9243) cve_fig, cve_axes = plt.subplots( 2, 2, @@ -20,7 +21,16 @@ def main(): gridspec_kw={"wspace": 0.15, "hspace": 0.15}, ) cve_axes_iter = iter(cve_axes.flatten()) - + pse_fig, mse_axes = plt.subplots( + 2, + 2, + figsize=(16, 8), + sharex=True, + # sharey=True, + facecolor=face_color, + gridspec_kw={"wspace": 0.15, "hspace": 0.15}, + ) + pse_axes_iter = iter(mse_axes.flatten()) smoothed_fig, smoothed_axes = plt.subplots( 2, 2, @@ -32,6 +42,9 @@ def main(): ) smoothed_axes_iter = iter(smoothed_axes.flatten()) + root_mean_squared_errors_y = [] + cross_validation_errors_x = [] + for counter, scale in enumerate([0.01, 0.1, 1.0, 5.0]): x, original_y, y_with_noise = generate_data(data_length, scale) @@ -48,10 +61,16 @@ def main(): ] ) - actual_errors = [ - np.sqrt(np.mean((original_y - np.array(series)) ** 2)) - for series in smoothed_data - ] + actual_errors = np.array( + [ + np.sqrt(np.mean((original_y - np.array(series)) ** 2)) + for series in smoothed_data + ] + ) + + sigmas = np.array( + [np.mean((original_y - np.array(series)) ** 2) for series in smoothed_data] + ) print( "Cross validation chosen lambda: {}".format( @@ -62,20 +81,39 @@ def main(): print("RMSE Chosen lambda: {}".format(lambdas[np.argmin(actual_errors)])) cve_ax = next(cve_axes_iter) - [spine.set_color(axis_color) for (_, spine) in cve_ax.spines.items()] + [spine.set_color(axis_color) for (_, spine) in cve_ax.spines.items()] # + pse_ax = next(pse_axes_iter) + [spine.set_color(axis_color) for (_, spine) in pse_ax.spines.items()] + + cve_plot = pse_ax.plot( + lambdas, + np.array(cross_validation_errors) ** 2, + color="#59f176", + label="RCVE", + marker=".", + markersize=8, + ) + rmse_plot = pse_ax.plot( + lambdas, + actual_errors**2 + scale**2, + color="red", + label="RMSE", + marker=".", + markersize=8, + ) cve_plot = cve_ax.plot( lambdas, - cross_validation_errors, + np.array(cross_validation_errors), color="#59f176", - label="CVE", + label="RCVE", marker=".", markersize=8, ) ax2 = cve_ax.twinx() rmse_plot = ax2.plot( lambdas, - actual_errors, + np.array(actual_errors), color="red", label="RMSE", marker=".", @@ -85,8 +123,18 @@ def main(): ax2.tick_params(axis="y", direction="in", color="red", labelcolor=axis_color) ax2.tick_params(axis="x", direction="in", color="red", labelcolor=axis_color) + ax2.yaxis.set_major_formatter(FormatStrFormatter("%.2f")) cve_ax.set_xscale("log") + pse_ax.set_xscale("log") + + pse_ax.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + pse_ax.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + cve_ax.yaxis.set_major_locator(MaxNLocator(nbins=4, integer=False)) # cve_ax.set_yscale("log") cve_ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f")) @@ -97,9 +145,10 @@ def main(): axis="x", direction="in", color="#59f176", labelcolor=axis_color ) cve_ax.grid(True, ls="--", alpha=0.3, color="#59f176") + pse_ax.grid(True, ls="--", alpha=0.5) ax2.grid(True, ls="--", alpha=0.3, color="red") cve_ax.set_facecolor("#00002a") - + pse_ax.set_facecolor("#00002a") smoothed_ax = next(smoothed_axes_iter) [spine.set_color(axis_color) for (_, spine) in smoothed_ax.spines.items()] @@ -134,6 +183,9 @@ def main(): solid_capstyle="round", ) + root_mean_squared_errors_y.append(actual_errors) + cross_validation_errors_x.append(cross_validation_errors) + match counter: case 0: smoothed_ax.set_ylim(-1.25, 1.25) @@ -141,7 +193,8 @@ def main(): lns = cve_plot + rmse_plot labs = [l.get_label() for l in lns] cve_ax.legend(lns, labs, fontsize=14) - cve_ax.set_ylabel("CVE", fontsize=14, color=axis_color) + cve_ax.set_ylabel("RCVE", fontsize=14, color=axis_color) + case 1: smoothed_ax.set_ylim(-1.25, 1.25) # smoothed_ax.yaxis.set_label_position("right") @@ -152,7 +205,7 @@ def main(): case 2: smoothed_ax.set_ylim(-10.0, 10.0) - cve_ax.set_ylabel("CVE", fontsize=14, color=axis_color) + cve_ax.set_ylabel("RCVE", fontsize=14, color=axis_color) case 3: smoothed_ax.set_ylim(-10, 10) @@ -185,6 +238,18 @@ def main(): dpi=800, bbox_inches="tight", ) + pse_fig.savefig( + "cross_validation_article/pse.png", + dpi=800, + bbox_inches="tight", + ) + plt.show(block=False) + + for cross_val, rmse in zip(cross_validation_errors_x, root_mean_squared_errors_y): + # ax1.scatter(cross_val, rmse) + print(np.corrcoef(cross_val, rmse)) + # ax1.set_yscale("log") + # ax1.set_xscale("log") plt.show() diff --git a/whittaker-eilers-py/examples/cross_validation_article/plot_multiple_lambda.py b/whittaker-eilers-py/examples/cross_validation_article/plot_multiple_lambda.py new file mode 100644 index 0000000..211ee4d --- /dev/null +++ b/whittaker-eilers-py/examples/cross_validation_article/plot_multiple_lambda.py @@ -0,0 +1,136 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from whittaker_eilers import WhittakerSmoother +from astropy.io import fits + +axis_color = "#d0d0fa" +face_color = "#00002a" + + +def main(): + plot_optical_spectra() + + +def plot_optical_spectra(): + col = "flux" + with fits.open("cross_validation_article/spec-1939-53389-0138.fits") as data: + data = pd.DataFrame(data[1].data) + + # print(data["and_mask"].to_list()) + # mask = data["and_mask"] > 65552 + data["weights"] = np.full(len(data), 1) + # data["weights"][mask] = 0.0 + wavelengths = 10 ** data["loglam"] + + print(wavelengths) + smoother = WhittakerSmoother( + 5e1, + 2, + len(data), + x_input=data.index.to_list(), + ) + + # weights = np.bitwise_and(data["weights"].to_numpy(), data["and_mask"].to_numpy()) + # print(weights) + + smoothed = smoother.smooth(data[col].to_list()) + + optimal_smooth = smoother.smooth_optimal( + data[col].to_list(), break_serial_correlation=False + ) + smoothed = optimal_smooth.get_optimal().get_smoothed() + + (cross_validation_errors, smoothed_data, lambdas) = zip( + *[ + (res.get_cross_validation_error(), res.get_smoothed(), res.get_lambda()) + for res in optimal_smooth.get_all() + ] + ) + + print( + "Cross validation chosen lambda: {}".format( + lambdas[np.argmin(cross_validation_errors)] + ) + ) + + (fig, smooth_ax) = create_figure_and_axes() + + smooth_ax.spines["bottom"].set_color(axis_color) + smooth_ax.spines["top"].set_color(axis_color) + smooth_ax.spines["right"].set_color(axis_color) + smooth_ax.spines["left"].set_color(axis_color) + smooth_ax.set_xlabel("Wavelenghts / Ångströms", color=axis_color, fontsize=14) + smooth_ax.set_ylabel("Flux / (10-17 erg/cm2/s/Å)", color=axis_color, fontsize=14) + smooth_ax.tick_params( + axis="y", direction="in", color=axis_color, labelcolor=axis_color + ) + smooth_ax.tick_params( + axis="x", direction="in", color=axis_color, labelcolor=axis_color + ) + + smooth_ax.grid(True, ls="--", alpha=0.6) + smooth_ax.set_facecolor("#00002a") + # ax1.set_xlim(0, 50) + # + smooth_ax.set_xlim(3800, 5000) + smooth_ax.set_ylim(-0.5, 15) + smooth_ax.plot( + wavelengths, + data[col], + color="#fc8d28", + marker=".", + label="Measured", + alpha=0.5, + markersize=8, + ) + # smooth_ax.scatter( + # data.index, + # data[col], + # color="yellow", + # # marker=".", + # label="Measured", + # alpha=0.75, + # s=20, + # ) + for _lambda, colour, string_lambda in zip( + [1e1, 1e4, 1e6], ["#66b0ff", "red", "#59f176"], ["$10^1$", "$10^4$", "$10^6$"] + ): + smoother.update_lambda(_lambda) + smooth = smoother.smooth(data[col].to_list()) + smooth_ax.plot( + wavelengths, + smooth, + color=colour, + lw=2, + label="λ={}".format(string_lambda), + solid_capstyle="round", + ) + + smooth_ax.legend() + fig.tight_layout() + plt.savefig( + "cross_validation_article/optical_spectra_multiple_lambda.png", + dpi=800, + bbox_inches="tight", + ) + plt.show() + + +def create_figure_and_axes(): + (fig, ax) = plt.subplots(figsize=(8, 5), facecolor="#00002a") + + ax.spines["bottom"].set_color(axis_color) + ax.spines["top"].set_color(axis_color) + ax.spines["right"].set_color(axis_color) + ax.spines["left"].set_color(axis_color) + ax.tick_params(axis="y", direction="in", color=axis_color, labelcolor=axis_color) + ax.tick_params(axis="x", direction="in", color=axis_color, labelcolor=axis_color) + ax.grid(True, ls="--", alpha=0.6) + ax.set_facecolor("#00002a") + + return (fig, ax) + + +if __name__ == "__main__": + main() diff --git a/whittaker-eilers-py/examples/whittaker_benchmark.py b/whittaker-eilers-py/examples/whittaker_benchmark.py index d9564c9..f135f1c 100644 --- a/whittaker-eilers-py/examples/whittaker_benchmark.py +++ b/whittaker-eilers-py/examples/whittaker_benchmark.py @@ -1,310 +1,302 @@ -from matplotlib.ticker import AutoMinorLocator -from whittaker_eilers import WhittakerSmoother -import matplotlib.pyplot as plt -import numpy as np -from scipy.signal import savgol_filter -from scipy.ndimage import gaussian_filter1d -from statsmodels.nonparametric.smoothers_lowess import lowess -import timeit -import json - -REPEATS = 3 -NUMBER_OF_RUNS = 50 - -SAVE_PLOT = False -BENCHMARK = False - -axis_color = "#d0d0fa" -face_color = "#00002a" - -color_dict = { - "Sav-Golay": "#66b0ff", - "Gaussian Kernel": "#f003ef", - "LOWESS": "red", - "Whittaker": "#59f176", -} - - -def main(): - data_lengths_to_bench = [40, 50, 60, 70, 1e2, 5e2, 1e3, 5e3, 1e4, 1e5] - data_lengths_to_plot = [40, 1e2, 1e3, 1e4] - methods = ["Sav-Golay", "Gaussian Kernel", "LOWESS", "Whittaker"] - benchmark_results = {x: {} for x in methods} - - fig, axes = plt.subplots( - 2, - 2, - figsize=(16, 8), - sharex=True, - sharey=True, - facecolor=face_color, - gridspec_kw={"wspace": 0, "hspace": 0}, - ) - axes_iter = iter(axes.flatten()) - for loop_counter, data_len in enumerate(data_lengths_to_bench): - data_len = int(data_len) - if BENCHMARK: - print("~~~~~~~~~~ Data Length: {} ~~~~~~~~~~~~".format(data_len)) - benchmark_results["Whittaker"][data_len] = whittaker_time(data_len) - benchmark_results["Sav-Golay"][data_len] = sav_golay_time(data_len) - benchmark_results["Gaussian Kernel"][data_len] = gauss_time(data_len) - - if data_len <= 1e4: - benchmark_results["LOWESS"][data_len] = lowess_time(data_len) - - # Check outputs - - if data_len not in data_lengths_to_plot: - continue - - x, y, y_with_noise = generate_data(data_len) - sav_golay_smoothed = run_sav_golay(y_with_noise) - whittaker_smoother = WhittakerSmoother( - lmbda=1e-2, order=2, data_length=len(x), x_input=x - ) - whittaker_smoothed = run_whittaker(whittaker_smoother, y_with_noise) - gauss_smoothed = run_gauss(y_with_noise) - lowess_smoothed = run_lowess(x, y_with_noise) - - ax = next(axes_iter) - [spine.set_color(axis_color) for (_, spine) in ax.spines.items()] - - ax.tick_params( - axis="both", - direction="in", - color=axis_color, - labelcolor=axis_color, - labelsize=14, - ) - - # ax.tick_params( - # axis="x", direction="in", color=axis_color, labelcolor=axis_color - # ) - ax.grid(True, ls="--", alpha=0.6) - ax.set_facecolor(face_color) - - ax.plot(x, y, label="Original", color="white") - ax.scatter( - x, - y_with_noise, - label="Measured", - color="#fc8d28", - alpha=0.5, - ) - ax.plot(x, sav_golay_smoothed, label="Sav-Golay", color="#66b0ff") - ax.plot(x, gauss_smoothed, label="Gaussian Kernel", color="#f003ef") - ax.plot(x, lowess_smoothed, label="LOWESS", color="red") - ax.plot(x, whittaker_smoothed, label="Whittaker", color="#59f176") - - ax.set_xlim(0.0, 2.0 * np.pi) - - if loop_counter == 0: - ax.legend(fontsize=14) - - fig.supxlabel("x / Radians", color=axis_color, fontsize=18) - y_label = fig.supylabel("Amplitude", color=axis_color, fontsize=18) - y_label.set_x(0.07) - - if SAVE_PLOT: - plt.savefig( - "public/blog/scripts/outputs/whittaker_sine_end.png", - dpi=800, - bbox_inches="tight", - ) - plt.show() - - if not BENCHMARK: - return - with open("public/blog/scripts/outputs/benchmarks.json", "w") as bench_file: - json.dump(benchmark_results, bench_file) - - (time_fig, time_ax) = plt.subplots( - figsize=(8, 4), - facecolor=face_color, - ) - - for method, results in benchmark_results.items(): - data_lengths, times_taken = (results.keys(), results.values()) - - time_ax.plot(data_lengths, times_taken, label=method, color=color_dict[method]) - - [spine.set_color(axis_color) for (_, spine) in time_ax.spines.items()] - - time_ax.tick_params( - which="both", - axis="both", - direction="in", - color=axis_color, - labelcolor=axis_color, - labelsize=14, - ) - time_ax.yaxis.set_minor_locator(AutoMinorLocator()) - time_ax.tick_params(which="major", axis="y", length=6) - time_ax.tick_params(which="major", axis="x", length=8) - time_ax.tick_params(which="minor", axis="y", length=3) - time_ax.tick_params(which="minor", axis="x", length=4) - - time_ax.grid(True, ls="--", alpha=0.6) - time_ax.set_facecolor(face_color) - time_ax.set_yscale("log") - time_ax.set_xscale("log") - - time_ax.set_ylabel("Time Taken / s", color=axis_color) - time_ax.set_xlabel("Data Length", color=axis_color) - time_ax.legend() - - if SAVE_PLOT: - plt.savefig( - "benchmarks.png", - dpi=800, - bbox_inches="tight", - ) - - plt.show() - - -def whittaker_time(data_length: int) -> float: - setup = """ -from whittaker_eilers import WhittakerSmoother -from __main__ import run_whittaker, generate_data -import numpy as np -x, y, y_with_noise = generate_data({}) -x, y, y_with_noise = x.tolist(), y.tolist(), y_with_noise.tolist() -whittaker_smoother = WhittakerSmoother(lmbda=5e3, order=2, data_length=len(x)) - """.format( - data_length - ) - - benchmark_code = """ -run_whittaker(whittaker_smoother, y_with_noise) - """ - mean_time = np.mean( - timeit.repeat( - setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS - ) - ) - print( - "Whittaker time taken: {}".format(mean_time), - ) - - return mean_time - - -def sav_golay_time(data_length: int) -> float: - setup = """ -from scipy.signal import savgol_filter -from __main__ import run_sav_golay, generate_data -import numpy as np - -x, y, y_with_noise = generate_data({}) - """.format( - data_length - ) - - benchmark_code = """ -run_sav_golay(y_with_noise) - """ - - mean_time = np.mean( - timeit.repeat( - setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS - ) - ) - print( - "Sav golay time taken: {}".format(mean_time), - ) - return mean_time - - -def lowess_time(data_length: int) -> float: - setup = """ -from scipy.ndimage import gaussian_filter1d -from __main__ import run_lowess, generate_data -import numpy as np - -x, y, y_with_noise = generate_data({}) - """.format( - data_length - ) - - benchmark_code = """ -run_lowess(x, y_with_noise) - """ - - mean_time = np.mean( - timeit.repeat( - setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS - ) - ) - print( - "Lowess time taken: {}".format(mean_time), - ) - return mean_time - - -def gauss_time(data_length: int) -> float: - setup = """ -from statsmodels.nonparametric.smoothers_lowess import lowess -from __main__ import run_gauss, generate_data -import numpy as np - -x, y, y_with_noise = generate_data({}) - """.format( - data_length - ) - - benchmark_code = """ -run_gauss(y_with_noise) - """ - - mean_time = np.mean( - timeit.repeat( - setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS - ) - ) - print("Gauss time taken: {}".format(mean_time)) - - return mean_time - - -def generate_data(length: int) -> tuple[list[float], list[float], list[float]]: - x = np.linspace(0, 2.0 * np.pi, length) - y = np.sin(x) - y_with_noise = y + np.random.normal(0, 0.1, x.size) - - return x, y, y_with_noise - - -def run_whittaker(smoother: WhittakerSmoother, data: list[float]) -> list[float]: - whit_y = smoother.smooth(data) - return whit_y - - -def run_sav_golay(data: list[float]) -> list[float]: - sav_gol_y = savgol_filter(data, window_length=int(len(data) / 8), polyorder=2) - return sav_gol_y - - -def run_lowess(x: list[float], data: list[float]) -> list[float]: - lowes_y = lowess( - data, - x, - frac=1.0 / 10.0, - it=1, # Only 1 iteration for fastest result - return_sorted=False, - xvals=x, - ) - - return lowes_y - - -def run_gauss(data: list[float]) -> list[float]: - gauss_y = gaussian_filter1d( - data, len(data) / 25, order=0, radius=int(len(data) / 20) - ) - - return gauss_y - - -if __name__ == "__main__": - main() +from matplotlib.ticker import AutoMinorLocator +from whittaker_eilers import WhittakerSmoother +import matplotlib.pyplot as plt +import numpy as np +from scipy.signal import savgol_filter +from scipy.ndimage import gaussian_filter1d +from statsmodels.nonparametric.smoothers_lowess import lowess +import timeit +import json + +REPEATS = 3 +NUMBER_OF_RUNS = 50 + +SAVE_PLOT = False +BENCHMARK = False + +axis_color = "#d0d0fa" +face_color = "#00002a" + +color_dict = { + "Sav-Golay": "#66b0ff", + "Gaussian Kernel": "#f003ef", + "LOWESS": "red", + "Whittaker": "#59f176", +} + + +def main(): + data_lengths_to_bench = [40, 50, 60, 70, 1e2, 5e2, 1e3, 5e3, 1e4, 1e5] + data_lengths_to_plot = [40, 1e2, 1e3, 1e4] + methods = ["Sav-Golay", "Gaussian Kernel", "LOWESS", "Whittaker"] + benchmark_results = {x: {} for x in methods} + + fig, axes = plt.subplots( + 2, + 2, + figsize=(16, 8), + sharex=True, + sharey=True, + facecolor=face_color, + gridspec_kw={"wspace": 0, "hspace": 0}, + ) + axes_iter = iter(axes.flatten()) + for loop_counter, data_len in enumerate(data_lengths_to_bench): + data_len = int(data_len) + if BENCHMARK: + print("~~~~~~~~~~ Data Length: {} ~~~~~~~~~~~~".format(data_len)) + benchmark_results["Whittaker"][data_len] = whittaker_time(data_len) + benchmark_results["Sav-Golay"][data_len] = sav_golay_time(data_len) + benchmark_results["Gaussian Kernel"][data_len] = gauss_time(data_len) + + if data_len <= 1e4: + benchmark_results["LOWESS"][data_len] = lowess_time(data_len) + + # Check outputs + + if data_len not in data_lengths_to_plot: + continue + + x, y, y_with_noise = generate_data(data_len) + sav_golay_smoothed = run_sav_golay(y_with_noise) + whittaker_smoother = WhittakerSmoother( + lmbda=1e-2, order=2, data_length=len(x), x_input=x + ) + whittaker_smoothed = run_whittaker(whittaker_smoother, y_with_noise) + gauss_smoothed = run_gauss(y_with_noise) + lowess_smoothed = run_lowess(x, y_with_noise) + + ax = next(axes_iter) + [spine.set_color(axis_color) for (_, spine) in ax.spines.items()] + + ax.tick_params( + axis="both", + direction="in", + color=axis_color, + labelcolor=axis_color, + labelsize=14, + ) + + # ax.tick_params( + # axis="x", direction="in", color=axis_color, labelcolor=axis_color + # ) + ax.grid(True, ls="--", alpha=0.6) + ax.set_facecolor(face_color) + + ax.plot(x, y, label="Original", color="white") + ax.scatter( + x, + y_with_noise, + label="Measured", + color="#fc8d28", + alpha=0.5, + ) + ax.plot(x, sav_golay_smoothed, label="Sav-Golay", color="#66b0ff") + ax.plot(x, gauss_smoothed, label="Gaussian Kernel", color="#f003ef") + ax.plot(x, lowess_smoothed, label="LOWESS", color="red") + ax.plot(x, whittaker_smoothed, label="Whittaker", color="#59f176") + + ax.set_xlim(0.0, 2.0 * np.pi) + + if loop_counter == 0: + ax.legend(fontsize=14) + + fig.supxlabel("x / Radians", color=axis_color, fontsize=18) + y_label = fig.supylabel("Amplitude", color=axis_color, fontsize=18) + y_label.set_x(0.07) + + if SAVE_PLOT: + plt.savefig( + "public/blog/scripts/outputs/whittaker_sine_end.png", + dpi=800, + bbox_inches="tight", + ) + plt.show() + + if not BENCHMARK: + return + with open("public/blog/scripts/outputs/benchmarks.json", "w") as bench_file: + json.dump(benchmark_results, bench_file) + + (time_fig, time_ax) = plt.subplots( + figsize=(8, 4), + facecolor=face_color, + ) + + for method, results in benchmark_results.items(): + data_lengths, times_taken = (results.keys(), results.values()) + + time_ax.plot(data_lengths, times_taken, label=method, color=color_dict[method]) + + [spine.set_color(axis_color) for (_, spine) in time_ax.spines.items()] + + time_ax.tick_params( + which="both", + axis="both", + direction="in", + color=axis_color, + labelcolor=axis_color, + labelsize=14, + ) + time_ax.yaxis.set_minor_locator(AutoMinorLocator()) + time_ax.tick_params(which="major", axis="y", length=6) + time_ax.tick_params(which="major", axis="x", length=8) + time_ax.tick_params(which="minor", axis="y", length=3) + time_ax.tick_params(which="minor", axis="x", length=4) + + time_ax.grid(True, ls="--", alpha=0.6) + time_ax.set_facecolor(face_color) + time_ax.set_yscale("log") + time_ax.set_xscale("log") + + time_ax.set_ylabel("Time Taken / s", color=axis_color) + time_ax.set_xlabel("Data Length", color=axis_color) + time_ax.legend() + + if SAVE_PLOT: + plt.savefig( + "benchmarks.png", + dpi=800, + bbox_inches="tight", + ) + + plt.show() + + +def whittaker_time(data_length: int) -> float: + setup = """ +from whittaker_eilers import WhittakerSmoother +from __main__ import run_whittaker, generate_data +import numpy as np +x, y, y_with_noise = generate_data({}) +x, y, y_with_noise = x.tolist(), y.tolist(), y_with_noise.tolist() +whittaker_smoother = WhittakerSmoother(lmbda=5e3, order=2, data_length=len(x)) + """.format(data_length) + + benchmark_code = """ +run_whittaker(whittaker_smoother, y_with_noise) + """ + mean_time = np.mean( + timeit.repeat( + setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS + ) + ) + print( + "Whittaker time taken: {}".format(mean_time), + ) + + return mean_time + + +def sav_golay_time(data_length: int) -> float: + setup = """ +from scipy.signal import savgol_filter +from __main__ import run_sav_golay, generate_data +import numpy as np + +x, y, y_with_noise = generate_data({}) + """.format(data_length) + + benchmark_code = """ +run_sav_golay(y_with_noise) + """ + + mean_time = np.mean( + timeit.repeat( + setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS + ) + ) + print( + "Sav golay time taken: {}".format(mean_time), + ) + return mean_time + + +def lowess_time(data_length: int) -> float: + setup = """ +from scipy.ndimage import gaussian_filter1d +from __main__ import run_lowess, generate_data +import numpy as np + +x, y, y_with_noise = generate_data({}) + """.format(data_length) + + benchmark_code = """ +run_lowess(x, y_with_noise) + """ + + mean_time = np.mean( + timeit.repeat( + setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS + ) + ) + print( + "Lowess time taken: {}".format(mean_time), + ) + return mean_time + + +def gauss_time(data_length: int) -> float: + setup = """ +from statsmodels.nonparametric.smoothers_lowess import lowess +from __main__ import run_gauss, generate_data +import numpy as np + +x, y, y_with_noise = generate_data({}) + """.format(data_length) + + benchmark_code = """ +run_gauss(y_with_noise) + """ + + mean_time = np.mean( + timeit.repeat( + setup=setup, stmt=benchmark_code, number=NUMBER_OF_RUNS, repeat=REPEATS + ) + ) + print("Gauss time taken: {}".format(mean_time)) + + return mean_time + + +def generate_data(length: int) -> tuple[list[float], list[float], list[float]]: + x = np.linspace(0, 2.0 * np.pi, length) + y = np.sin(x) + y_with_noise = y + np.random.normal(0, 0.1, x.size) + + return x, y, y_with_noise + + +def run_whittaker(smoother: WhittakerSmoother, data: list[float]) -> list[float]: + whit_y = smoother.smooth(data) + return whit_y + + +def run_sav_golay(data: list[float]) -> list[float]: + sav_gol_y = savgol_filter(data, window_length=int(len(data) / 8), polyorder=2) + return sav_gol_y + + +def run_lowess(x: list[float], data: list[float]) -> list[float]: + lowes_y = lowess( + data, + x, + frac=1.0 / 10.0, + it=1, # Only 1 iteration for fastest result + return_sorted=False, + xvals=x, + ) + + return lowes_y + + +def run_gauss(data: list[float]) -> list[float]: + gauss_y = gaussian_filter1d( + data, len(data) / 25, order=0, radius=int(len(data) / 20) + ) + + return gauss_y + + +if __name__ == "__main__": + main() diff --git a/whittaker-eilers-py/src/cross_validation.rs b/whittaker-eilers-py/src/cross_validation.rs index f018301..5d706aa 100644 --- a/whittaker-eilers-py/src/cross_validation.rs +++ b/whittaker-eilers-py/src/cross_validation.rs @@ -43,7 +43,7 @@ impl CrossValidationResult { pub fn get_smoothed(&self) -> Vec { self.0.smoothed.clone() } - /// The associated cross validation error for the smoothed data. + /// The associated cross validation error for the smoothed data. Technically square-rooted cross validation error. pub fn get_cross_validation_error(&self) -> f64 { self.0.cross_validation_error } diff --git a/whittaker-eilers-py/src/whittaker_smoother.rs b/whittaker-eilers-py/src/whittaker_smoother.rs index 9fe8f35..ae54f67 100644 --- a/whittaker-eilers-py/src/whittaker_smoother.rs +++ b/whittaker-eilers-py/src/whittaker_smoother.rs @@ -124,7 +124,7 @@ impl WhittakerSmoother { /// Returns /// ------- /// - /// CrossValidationResult: The smoothed data, lambda it was smoothed at, and the cross validation error. + /// CrossValidationResult: The smoothed data, lambda it was smoothed at, and the cross validation error. Technically square-rooted cross validation error. pub fn smooth_and_cross_validate(&self, y_vals: Vec) -> PyResult { Ok(CrossValidationResult( self.0 From 263f8e2749d44b46f72aa5edfe3d9cd794e3ed10 Mon Sep 17 00:00:00 2001 From: Andrew Bowell <49959927+AnBowell@users.noreply.github.com> Date: Sat, 24 Feb 2024 15:33:36 +0000 Subject: [PATCH 12/12] test: :white_check_mark: Removing old test --- tests/validation.rs | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/validation.rs b/tests/validation.rs index 1c26017..1161b53 100644 --- a/tests/validation.rs +++ b/tests/validation.rs @@ -316,32 +316,32 @@ fn cross_validation_x_input() { assert_relative_eq!(cve.cross_validation_error, 3.0413, epsilon = 1e-4); } -#[test] -fn smooth_and_optimise() { - let input_data = read_input_to_vecs(); - - let mut whittaker_smoother = WhittakerSmoother::new( - 2e4, - 2, - input_data.y[..1000].len(), - None, - Some(&vec![1.0; input_data.y[..1000].len()]), - ) - .unwrap(); - - let cve = whittaker_smoother - .smooth_optimal(&input_data.y[..1000], false) - .unwrap(); - - let expected = vec![ - 2.1757, 2.1454, 2.0975, 2.0590, 2.0582, 2.0985, 2.1701, 2.2592, 2.3701, 2.5221, 2.7156, - 2.9600, 3.2329, 3.4642, 3.6345, 3.7721, 3.9201, 4.1200, 4.4106, 4.8411, 5.5055, - ]; - - for (actual, expected) in cve.validation_results.iter().zip(expected) { - assert_relative_eq!(actual.cross_validation_error, expected, epsilon = 1e-4); - } -} +// #[test] +// fn smooth_and_optimise() { +// let input_data = read_input_to_vecs(); + +// let mut whittaker_smoother = WhittakerSmoother::new( +// 2e4, +// 2, +// input_data.y[..1000].len(), +// None, +// Some(&vec![1.0; input_data.y[..1000].len()]), +// ) +// .unwrap(); + +// let cve = whittaker_smoother +// .smooth_optimal(&input_data.y[..1000], false) +// .unwrap(); + +// let expected = vec![ +// 2.1757, 2.1454, 2.0975, 2.0590, 2.0582, 2.0985, 2.1701, 2.2592, 2.3701, 2.5221, 2.7156, +// 2.9600, 3.2329, 3.4642, 3.6345, 3.7721, 3.9201, 4.1200, 4.4106, 4.8411, 5.5055, +// ]; + +// for (actual, expected) in cve.validation_results.iter().zip(expected) { +// assert_relative_eq!(actual.cross_validation_error, expected, epsilon = 1e-4); +// } +// } const INPUT_DATA_LOC: &'static str = "tests/data/input/nmr_with_weights_and_x.csv";