From 9770ceb04f67706a69a45d857677470e364a3609 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 19 Mar 2024 00:12:52 -0400 Subject: [PATCH 01/56] adding gmm --- .../benchmarks/ReverseMode/gmmrs/Cargo.lock | 16 ++ .../benchmarks/ReverseMode/gmmrs/Cargo.toml | 19 +++ .../benchmarks/ReverseMode/gmmrs/src/lib.rs | 126 +++++++++++++++ .../benchmarks/ReverseMode/gmmrs/src/main.rs | 147 ++++++++++++++++++ 4 files changed, 308 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock new file mode 100644 index 000000000000..cfdab95b3d9c --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "gmmrs" +version = "0.1.0" +dependencies = [ + "libm", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml new file mode 100644 index 000000000000..9ff65cd97178 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "gmmrs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["cdylib"] + + +[profile.release] +lto = "fat" + +[profile.dev] +lto = "fat" + +[dependencies] +libm = "0.2.8" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs new file mode 100644 index 000000000000..46bd57c99dd1 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs @@ -0,0 +1,126 @@ +#![feature(autodiff)] +#![crate_type = "dylib"] +use libm::lgamma; + +#[no_mangle] +pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + gamma: f64, + m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs new file mode 100644 index 000000000000..784dce85143a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs @@ -0,0 +1,147 @@ +#![feature(autodiff)] +#![crate_type = "dylib"] +use libm::lgamma; + +fn main() { + let d = 2; + let k = 2; + let n = 2; + let alphas = vec![0.5, 0.5]; + let means = vec![0., 0., 1., 1.]; + let icf = vec![1., 0., 1.]; + let x = vec![0., 0., 1., 1.]; + let wishart = Wishart { gamma: 1., m: 1 }; + let mut err = 0.; + let mut d_alphas = vec![0.; alphas.len()]; + let mut d_means = vec![0.; means.len()]; + let mut d_icf = vec![0.; icf.len()]; + let mut d_x = vec![0.; x.len()]; + let mut d_err = 0.; + let mut err2 = &mut err; + let mut d_err2 = &mut d_err; + let wishart2 = &wishart; + // pass as raw ptr: + dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); +} +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +#[no_mangle] +pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + gamma: f64, + m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} From 9f05ff836db1fa5a483e27e52d735d76dbab7e5c Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 25 Mar 2024 18:11:00 -0400 Subject: [PATCH 02/56] working C too --- .../benchmarks/ReverseMode/gmmrs/Cargo.toml | 3 +- .../benchmarks/ReverseMode/gmmrs/src/lib.rs | 11 +- .../benchmarks/ReverseMode/gmmrs/src/main.rs | 125 +-------------- .../benchmarks/ReverseMode/gmmrs/src/main.rs2 | 147 ++++++++++++++++++ 4 files changed, 154 insertions(+), 132 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml index 9ff65cd97178..6271be06da5d 100644 --- a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml @@ -6,8 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] -crate-type = ["cdylib"] - +crate-type = ["lib"] [profile.release] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs index 46bd57c99dd1..2b565072a505 100644 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs @@ -1,14 +1,13 @@ #![feature(autodiff)] -#![crate_type = "dylib"] use libm::lgamma; #[no_mangle] -pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { +pub extern "C" fn dgmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); } #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; @@ -18,7 +17,7 @@ fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *con gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); } -fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; @@ -104,8 +103,8 @@ fn log_gamma_distrib(a: f64, p: f64) -> f64 { #[derive(Clone, Copy)] #[repr(C)] pub struct Wishart { - gamma: f64, - m: usize, + pub gamma: f64, + pub m: usize, } fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { let n = p + wishart.m + 1; diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs index 784dce85143a..8f4357588ab8 100644 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs @@ -1,6 +1,5 @@ #![feature(autodiff)] -#![crate_type = "dylib"] -use libm::lgamma; +use gmmrs::{Wishart, dgmm_objective}; fn main() { let d = 2; @@ -23,125 +22,3 @@ fn main() { // pass as raw ptr: dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); } -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -#[no_mangle] -pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); -} - -fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} - -#[derive(Clone, Copy)] -#[repr(C)] -pub struct Wishart { - gamma: f64, - m: usize, -} -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} - -fn sqnorm(n: usize, x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 new file mode 100644 index 000000000000..784dce85143a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 @@ -0,0 +1,147 @@ +#![feature(autodiff)] +#![crate_type = "dylib"] +use libm::lgamma; + +fn main() { + let d = 2; + let k = 2; + let n = 2; + let alphas = vec![0.5, 0.5]; + let means = vec![0., 0., 1., 1.]; + let icf = vec![1., 0., 1.]; + let x = vec![0., 0., 1., 1.]; + let wishart = Wishart { gamma: 1., m: 1 }; + let mut err = 0.; + let mut d_alphas = vec![0.; alphas.len()]; + let mut d_means = vec![0.; means.len()]; + let mut d_icf = vec![0.; icf.len()]; + let mut d_x = vec![0.; x.len()]; + let mut d_err = 0.; + let mut err2 = &mut err; + let mut d_err2 = &mut d_err; + let wishart2 = &wishart; + // pass as raw ptr: + dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); +} +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +#[no_mangle] +pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + gamma: f64, + m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} From 4da9910986f12470743130ebaadeb4fa99653f94 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 25 Mar 2024 20:17:27 -0400 Subject: [PATCH 03/56] Delete enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 --- .../benchmarks/ReverseMode/gmmrs/src/main.rs2 | 147 ------------------ 1 file changed, 147 deletions(-) delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 deleted file mode 100644 index 784dce85143a..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs2 +++ /dev/null @@ -1,147 +0,0 @@ -#![feature(autodiff)] -#![crate_type = "dylib"] -use libm::lgamma; - -fn main() { - let d = 2; - let k = 2; - let n = 2; - let alphas = vec![0.5, 0.5]; - let means = vec![0., 0., 1., 1.]; - let icf = vec![1., 0., 1.]; - let x = vec![0., 0., 1., 1.]; - let wishart = Wishart { gamma: 1., m: 1 }; - let mut err = 0.; - let mut d_alphas = vec![0.; alphas.len()]; - let mut d_means = vec![0.; means.len()]; - let mut d_icf = vec![0.; icf.len()]; - let mut d_x = vec![0.; x.len()]; - let mut d_err = 0.; - let mut err2 = &mut err; - let mut d_err2 = &mut d_err; - let wishart2 = &wishart; - // pass as raw ptr: - dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); -} -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -#[no_mangle] -pub extern "C" fn dgmm_objective_C(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); -} - -fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} - -#[derive(Clone, Copy)] -#[repr(C)] -pub struct Wishart { - gamma: f64, - m: usize, -} -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} - -fn sqnorm(n: usize, x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} From 0f68ce8848374e78502d9213baf065d05d812817 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Thu, 28 Mar 2024 14:00:34 -0400 Subject: [PATCH 04/56] rust setup --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 34 +++++ .../benchmarks/ReverseMode/gmm/Makefile.make | 15 ++- .../benchmarks/ReverseMode/gmmrs/Cargo.lock | 16 --- .../benchmarks/ReverseMode/gmmrs/Cargo.toml | 18 --- .../benchmarks/ReverseMode/gmmrs/src/lib.rs | 125 ------------------ .../benchmarks/ReverseMode/gmmrs/src/main.rs | 24 ---- enzyme/benchmarks/lit.site.cfg.py.in | 67 ++++++++-- 7 files changed, 101 insertions(+), 198 deletions(-) delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs delete mode 100644 enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 00f4302b9f99..e47dfaa62a23 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -47,6 +47,11 @@ extern "C" { alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb); + + void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * + alphasb, const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, double *err, double * + errb); } void read_gmm_instance(const string& fn, @@ -269,6 +274,35 @@ int main(const int argc, const char* argv[]) { test_suite["tools"].push_back(enzyme); } + } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, input.wishart, params.replicate_point); + + int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = { 0, std::vector(Jcols) }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + json enzyme; + enzyme["name"] = "Rust Enzyme combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 5072679eeb0e..1e1a36f72c2f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -1,23 +1,28 @@ -# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B gmm-unopt.ll gmm-raw.ll results.json -f %s; fi +# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme" make -B gmm-unopt.ll gmm-raw.ll results.json -f %s; fi .PHONY: clean clean: rm -f *.ll *.o results.txt results.json -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +%-unopt.ll: %.cpp src/lib.rs + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib + clang++ $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S + echo opt $^ $(LOAD) -passes="enzyme" -o $@ -S + opt $^ $(LOAD) -passes="enzyme" -o $@ -S %-opt.ll: %-raw.ll + echo opt $^ -o $@ -S opt $^ -o $@ -S #opt $^ -O2 -o $@ -S gmm.o: gmm-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) -lm + pwd + echo clang++ -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ -v -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock deleted file mode 100644 index cfdab95b3d9c..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.lock +++ /dev/null @@ -1,16 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "gmmrs" -version = "0.1.0" -dependencies = [ - "libm", -] - -[[package]] -name = "libm" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml deleted file mode 100644 index 6271be06da5d..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "gmmrs" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -crate-type = ["lib"] - -[profile.release] -lto = "fat" - -[profile.dev] -lto = "fat" - -[dependencies] -libm = "0.2.8" diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs deleted file mode 100644 index 2b565072a505..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/lib.rs +++ /dev/null @@ -1,125 +0,0 @@ -#![feature(autodiff)] -use libm::lgamma; - -#[no_mangle] -pub extern "C" fn dgmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); -} - -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} - -#[derive(Clone, Copy)] -#[repr(C)] -pub struct Wishart { - pub gamma: f64, - pub m: usize, -} -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} - -fn sqnorm(n: usize, x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} diff --git a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs b/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs deleted file mode 100644 index 8f4357588ab8..000000000000 --- a/enzyme/benchmarks/ReverseMode/gmmrs/src/main.rs +++ /dev/null @@ -1,24 +0,0 @@ -#![feature(autodiff)] -use gmmrs::{Wishart, dgmm_objective}; - -fn main() { - let d = 2; - let k = 2; - let n = 2; - let alphas = vec![0.5, 0.5]; - let means = vec![0., 0., 1., 1.]; - let icf = vec![1., 0., 1.]; - let x = vec![0., 0., 1., 1.]; - let wishart = Wishart { gamma: 1., m: 1 }; - let mut err = 0.; - let mut d_alphas = vec![0.; alphas.len()]; - let mut d_means = vec![0.; means.len()]; - let mut d_icf = vec![0.; icf.len()]; - let mut d_x = vec![0.; x.len()]; - let mut d_err = 0.; - let mut err2 = &mut err; - let mut d_err2 = &mut d_err; - let wishart2 = &wishart; - // pass as raw ptr: - dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); -} diff --git a/enzyme/benchmarks/lit.site.cfg.py.in b/enzyme/benchmarks/lit.site.cfg.py.in index 93937f9c62d3..2ef3c28b0ca9 100644 --- a/enzyme/benchmarks/lit.site.cfg.py.in +++ b/enzyme/benchmarks/lit.site.cfg.py.in @@ -49,21 +49,68 @@ config.substitutions.append(('%lli', config.llvm_tools_dir + "/lli" + (" --jit-k config.substitutions.append(('%opt', config.llvm_tools_dir + "/opt")) config.substitutions.append(('%llvmver', config.llvm_ver)) config.substitutions.append(('%FileCheck', config.llvm_tools_dir + "/FileCheck")) -config.substitutions.append(('%clang', config.llvm_tools_dir + "/clang")) -config.substitutions.append(('%loadEnzyme', '' - + (" --enable-new-pm=0" if int(config.llvm_ver) >= 13 else "") + +emopt = config.enzyme_obj_root + "/Enzyme/MLIR/enzymemlir-opt" +if len("@ENZYME_BINARY_DIR@") == 0: + emopt = os.path.dirname(os.path.abspath(__file__)) + "/../enzymemlir-opt" + +eclang = config.llvm_tools_dir + "/clang" +if len("@ENZYME_BINARY_DIR@") == 0: + eclang = os.path.dirname(os.path.abspath(__file__)) + "/../enzyme-clang" + resource = config.llvm_tools_dir + "/../clang/staging" + eclang += " -resource-dir " + resource + " " + eclang += "-I " + os.path.dirname(os.path.abspath(__file__)) + "/Integration" + +config.substitutions.append(('%eopt', emopt)) +config.substitutions.append(('%llvmver', config.llvm_ver)) +config.substitutions.append(('%FileCheck', config.llvm_tools_dir + "/FileCheck")) +config.substitutions.append(('%clang', eclang)) +config.substitutions.append(('%O0TBAA', "-O1 -Xclang -disable-llvm-passes")) + +oldPM = ((" --enable-new-pm=0" if int(config.llvm_ver) >= 13 else "") + ' -load=@ENZYME_BINARY_DIR@/Enzyme/LLVMEnzyme-' + config.llvm_ver + config.llvm_shlib_ext - + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "") - + ' -enzyme-preopt=0' - )) + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) +newPM = ((" --enable-new-pm=1" if int(config.llvm_ver) in (12,13) else "") + + ' -load-pass-plugin=@ENZYME_BINARY_DIR@/Enzyme/LLVMEnzyme-' + config.llvm_ver + config.llvm_shlib_ext + + ' -load=@ENZYME_BINARY_DIR@/Enzyme/LLVMEnzyme-' + config.llvm_ver + config.llvm_shlib_ext + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) +if len("@ENZYME_BINARY_DIR@") == 0: + oldPM = ((" --enable-new-pm=0" if int(config.llvm_ver) >= 13 else "") + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) + newPM = ((" --enable-new-pm=1" if int(config.llvm_ver) in (12,13) else "") + + (" --enzyme-attributor=0" if int(config.llvm_ver) >= 13 else "")) + +oldPMOP = oldPM +newPMOP = newPM +if int(config.llvm_ver) == 16: + newPM += " -opaque-pointers=0" + oldPM += " -opaque-pointers=0" + +config.substitutions.append(('%loadEnzyme', oldPM if int(config.llvm_ver) < 16 else newPM)) +config.substitutions.append(('%newLoadEnzyme', newPM)) +config.substitutions.append(('%OPloadEnzyme', oldPMOP if int(config.llvm_ver) < 16 else newPMOP)) +config.substitutions.append(('%OPnewLoadEnzyme', newPMOP)) +config.substitutions.append(('%enzyme', ('-enzyme' if int(config.llvm_ver) < 16 else '-passes="enzyme"'))) +config.substitutions.append(('%simplifycfg', ("simplify-cfg" if int(config.llvm_ver) < 11 else "simplifycfg"))) +config.substitutions.append(('%loopmssa', ("loop" if int(config.llvm_ver) < 11 else "loop-mssa"))) + config.substitutions.append(('%loadBC', '' + ' @ENZYME_BINARY_DIR@/BCLoad/BCPass-' + config.llvm_ver + config.llvm_shlib_ext )) config.substitutions.append(('%BClibdir', '@ENZYME_SOURCE_DIR@/bclib/')) -config.substitutions.append(('%loadClangEnzyme', '' - + (" -fno-experimental-new-pass-manager" if int(config.llvm_ver) >= 13 else "") - + ' -Xclang -load -Xclang @ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext - )) + +oldPM = (((" -fno-experimental-new-pass-manager" if int(config.llvm_ver) < 14 else "-flegacy-pass-manager") if int(config.llvm_ver) >= 13 else "") + + ' -Xclang -load -Xclang @ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext) +newPM = ((" -fexperimental-new-pass-manager" if int(config.llvm_ver) < 13 else "") + + ' -fpass-plugin=@ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext + + ' -Xclang -load -Xclang @ENZYME_BINARY_DIR@/Enzyme/ClangEnzyme-' + config.llvm_ver + config.llvm_shlib_ext) + +if len("@ENZYME_BINARY_DIR@") == 0: + oldPM = ((" -fno-experimental-new-pass-manager" if int(config.llvm_ver) < 14 else "-flegacy-pass-manager") if int(config.llvm_ver) >= 13 else "") + newPM = (" -fexperimental-new-pass-manager" if int(config.llvm_ver) < 13 else "") + +config.substitutions.append(('%loadClangEnzyme', oldPM if int(config.llvm_ver) < 15 else newPM)) +config.substitutions.append(('%newLoadClangEnzyme', newPM)) # Let the main config do the real work. lit_config.load_config(config, "@ENZYME_SOURCE_DIR@/benchmarks/lit.cfg.py") From 7995eb3be465a3b9852c4f9af05972befa268186 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Thu, 28 Mar 2024 18:46:10 -0400 Subject: [PATCH 05/56] add files --- enzyme/benchmarks/ReverseMode/gmm/Cargo.lock | 16 +++ enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 18 +++ enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 125 ++++++++++++++++++ enzyme/benchmarks/ReverseMode/gmm/src/main.rs | 24 ++++ 4 files changed, 183 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/gmm/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/gmm/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.lock b/enzyme/benchmarks/ReverseMode/gmm/Cargo.lock new file mode 100644 index 000000000000..cfdab95b3d9c --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "gmmrs" +version = "0.1.0" +dependencies = [ + "libm", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml new file mode 100644 index 000000000000..6271be06da5d --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "gmmrs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["lib"] + +[profile.release] +lto = "fat" + +[profile.dev] +lto = "fat" + +[dependencies] +libm = "0.2.8" diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs new file mode 100644 index 000000000000..c6bc0c737dd4 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -0,0 +1,125 @@ +#![feature(autodiff)] +use libm::lgamma; + +#[no_mangle] +pub extern "C" fn rust_dgmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); +} + +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +#[derive(Clone, Copy)] +#[repr(C)] +pub struct Wishart { + pub gamma: f64, + pub m: usize, +} +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(n: usize, x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/main.rs b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs new file mode 100644 index 000000000000..8f4357588ab8 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs @@ -0,0 +1,24 @@ +#![feature(autodiff)] +use gmmrs::{Wishart, dgmm_objective}; + +fn main() { + let d = 2; + let k = 2; + let n = 2; + let alphas = vec![0.5, 0.5]; + let means = vec![0., 0., 1., 1.]; + let icf = vec![1., 0., 1.]; + let x = vec![0., 0., 1., 1.]; + let wishart = Wishart { gamma: 1., m: 1 }; + let mut err = 0.; + let mut d_alphas = vec![0.; alphas.len()]; + let mut d_means = vec![0.; means.len()]; + let mut d_icf = vec![0.; icf.len()]; + let mut d_x = vec![0.; x.len()]; + let mut d_err = 0.; + let mut err2 = &mut err; + let mut d_err2 = &mut d_err; + let wishart2 = &wishart; + // pass as raw ptr: + dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); +} From 069e3cb85c92ce8f405fc905930d77ad7d2d0916 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 29 Mar 2024 00:10:33 -0400 Subject: [PATCH 06/56] improve makefile and fix c ffi --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 8 +++++--- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 1e1a36f72c2f..5d8b4b70d7f4 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -2,12 +2,14 @@ .PHONY: clean +dir=/h/344/drehwald/prog/Enzyme/enzyme + clean: rm -f *.ll *.o results.txt results.json %-unopt.ll: %.cpp src/lib.rs ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib - clang++ $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm + clang++ -pthread $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll @@ -21,8 +23,8 @@ clean: gmm.o: gmm-opt.ll pwd - echo clang++ -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 - clang++ -v -O2 $^ -o $@ $(BENCHLINK) -lm /home/wmoses/git/Enzyme/enzyme/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + echo clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ -pthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index c6bc0c737dd4..ecbb0cb2545b 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -2,8 +2,8 @@ use libm::lgamma; #[no_mangle] -pub extern "C" fn rust_dgmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + //dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); } #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] @@ -104,10 +104,10 @@ fn log_gamma_distrib(a: f64, p: f64) -> f64 { #[repr(C)] pub struct Wishart { pub gamma: f64, - pub m: usize, + pub m: i32, } fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m + 1; + let n = p + wishart.m as usize + 1; let icf_sz = p * (p + 1) / 2; let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); From be54358ec1b213e5fcc3382c9284e4eff9483386 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 15:59:35 -0400 Subject: [PATCH 07/56] maybe needed? pthread for cmake --- enzyme/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/enzyme/CMakeLists.txt b/enzyme/CMakeLists.txt index 82c7887cde3e..f2e480f181ce 100644 --- a/enzyme/CMakeLists.txt +++ b/enzyme/CMakeLists.txt @@ -21,6 +21,10 @@ SET(CMAKE_CXX_FLAGS "-Wall -fno-rtti ${CMAKE_CXX_FLAGS} -Werror=unused-variable SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -ggdb") SET(CMAKE_CXX_FLAGS_RELEASE "-O2") + + +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") + SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -ggdb -fno-omit-frame-pointer") #SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer -fsanitize=address") From 4423222a6501de34fe92399667a41e19265c47f1 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 17:45:46 -0600 Subject: [PATCH 08/56] bench gmm: use path relative to Makefile --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 5d8b4b70d7f4..5a403c789c80 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -2,7 +2,7 @@ .PHONY: clean -dir=/h/344/drehwald/prog/Enzyme/enzyme +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) clean: rm -f *.ll *.o results.txt results.json From 7b5a24c7711cbcbb829a1ae0c0fcb84451d00da4 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Sat, 30 Mar 2024 20:36:34 -0400 Subject: [PATCH 09/56] Fix byref issue for rust abi --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index e47dfaa62a23..934d26b829a8 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -50,7 +50,7 @@ extern "C" { void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * + double *icfb, const double *x, Wishart &wishart, double *err, double * errb); } @@ -128,10 +128,7 @@ void read_gmm_instance(const string& fn, fclose(fid); } -typedef void(*deriv_t)(int d, int k, int n, const double *alphas, double *alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double *errb); - -template +template void calculate_jacobian(struct GMMInput &input, struct GMMOutput &result) { double* alphas_gradient_part = result.gradient.data(); @@ -262,6 +259,7 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); + printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Enzyme combined"; enzyme["runtime"] = tdiff(&start, &end); @@ -291,6 +289,7 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); + printf("Enzyme rust combined %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Rust Enzyme combined"; enzyme["runtime"] = tdiff(&start, &end); From ea03750e1bd193721bc01ad069e0750c5f79c293 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Sat, 30 Mar 2024 20:49:48 -0400 Subject: [PATCH 10/56] Add primal bench/test --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 48 +++++++++++++++++++ enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 3 ++ .../benchmarks/ReverseMode/gmm/Makefile.make | 6 +-- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 7 ++- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 934d26b829a8..acd18a1f1319 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -33,6 +33,17 @@ struct GMMParameters { }; extern "C" { +void gmm_objective( + int d, + int k, + int n, + double const* alphas, + double const* means, + double const* icf, + double const* x, + Wishart wishart, + double* err +); void dgmm_objective(int d, int k, int n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * @@ -52,6 +63,10 @@ extern "C" { alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart &wishart, double *err, double * errb); + + void rust_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, + const double *x, Wishart &wishart, double *err); } void read_gmm_instance(const string& fn, @@ -161,6 +176,25 @@ void calculate_jacobian(struct GMMInput &input, struct GMMOutput &result) ); } +template +double primal(struct GMMInput &input) +{ + double tmp = 0.0; // stores fictive result + // (Tapenade doesn't calculate an original function in reverse mode) + deriv( + input.d, + input.k, + input.n, + input.alphas.data(), + input.means.data(), + input.icf.data(), + input.x.data(), + input.wishart, + &tmp + ); + return tmp; +} + int main(const int argc, const char* argv[]) { printf("starting main\n"); @@ -284,6 +318,20 @@ int main(const int argc, const char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("c++ primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + } + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("rust primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + } { struct timeval start, end; gettimeofday(&start, NULL); diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 6271be06da5d..5916af111e25 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -10,6 +10,9 @@ crate-type = ["lib"] [profile.release] lto = "fat" +debug = true +strip = "none" +opt-level = 1 [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 5a403c789c80..1fd871d8963e 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -9,7 +9,7 @@ clean: %-unopt.ll: %.cpp src/lib.rs ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib - clang++ -pthread $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm + clang++ -g $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O1 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll @@ -22,9 +22,7 @@ clean: #opt $^ -O2 -o $@ -S gmm.o: gmm-opt.ll - pwd - echo clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 - clang++ -pthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ -g -lpthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index ecbb0cb2545b..7d4dd714a63d 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -3,7 +3,12 @@ use libm::lgamma; #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - //dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); + dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); +} + +#[no_mangle] +pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + gmm_objective_c(d as usize, k as usize, n as usize, alphas, means, icf, x, wishart, err); } #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] From 77a130f6983287e40874f4751f8c935adf2d62c7 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 20:57:02 -0400 Subject: [PATCH 11/56] fix math --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 7d4dd714a63d..b057a2443a87 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,5 +1,6 @@ #![feature(autodiff)] use libm::lgamma; +use std::f64::consts::PI; #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { @@ -23,7 +24,7 @@ pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: } pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { - let constant = -(n as f64) * d as f64 * 0.5 * 2f64.ln(); + let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; let mut sum_qs = vec![0.; k]; From 4abf2bf787a494925b27dbc897d86039d99a6e99 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 21:01:50 -0400 Subject: [PATCH 12/56] write into return var --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index b057a2443a87..3d869c072a5d 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -19,8 +19,9 @@ pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; let wishart: Wishart = unsafe { *wishart }; - let mut err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut err); + let mut my_err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); + unsafe { *err = my_err }; } pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { From 5cde3d7efdb1b3617997d2559d0b5329d61cd87d Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Sat, 30 Mar 2024 21:47:48 -0400 Subject: [PATCH 13/56] Cleanup gmm config --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 11 +++++++++++ enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 4 +--- .../benchmarks/ReverseMode/gmm/Makefile.make | 19 +++---------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index acd18a1f1319..a25b4d0ded54 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -324,6 +324,12 @@ int main(const int argc, const char* argv[]) { auto res = primal(input); gettimeofday(&end, NULL); printf("c++ primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + + json primal; + primal["name"] = "C++ primal"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); } { struct timeval start, end; @@ -331,6 +337,11 @@ int main(const int argc, const char* argv[]) { auto res = primal(input); gettimeofday(&end, NULL); printf("rust primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); + json primal; + primal["name"] = "Rust primal"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); } { struct timeval start, end; diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 5916af111e25..655d1a1f3117 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -10,9 +10,7 @@ crate-type = ["lib"] [profile.release] lto = "fat" -debug = true -strip = "none" -opt-level = 1 +opt-level = 3 [profile.dev] lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 1fd871d8963e..77cc84e2832e 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -1,4 +1,4 @@ -# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme" make -B gmm-unopt.ll gmm-raw.ll results.json -f %s; fi +# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B gmm.o results.json -f %s; fi .PHONY: clean @@ -7,22 +7,9 @@ dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) clean: rm -f *.ll *.o results.txt results.json -%-unopt.ll: %.cpp src/lib.rs +gmm.o: gmm.cpp src/lib.rs ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib - clang++ -g $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O1 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o gmm-unopt.ll -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - echo opt $^ $(LOAD) -passes="enzyme" -o $@ -S - opt $^ $(LOAD) -passes="enzyme" -o $@ -S - -%-opt.ll: %-raw.ll - echo opt $^ -o $@ -S - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S - -gmm.o: gmm-opt.ll - clang++ -g -lpthread -v -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o ./$^ From af6583dfa2481e120a9a103484438c9123e1384a Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 20:48:31 -0600 Subject: [PATCH 14/56] bench gmm: make cmath::lgamma with libm as an optional feature --- enzyme/benchmarks/ReverseMode/gmm/Cargo.toml | 7 ++++++- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml index 655d1a1f3117..85dfa6310c34 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/gmm/Cargo.toml @@ -8,12 +8,17 @@ edition = "2021" [lib] crate-type = ["lib"] +[features] +libm = ["dep:libm"] + [profile.release] lto = "fat" opt-level = 3 +#debug = true +#strip = "none" [profile.dev] lto = "fat" [dependencies] -libm = "0.2.8" +libm = { version = "0.2.8", optional = true } diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 3d869c072a5d..7cf23525d026 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,7 +1,21 @@ #![feature(autodiff)] -use libm::lgamma; use std::f64::consts::PI; +#[cfg(feature = "libm")] +use libm::lgamma; + +#[cfg(not(feature = "libm"))] +mod cmath { + extern "C" { + pub fn lgamma(x: f64) -> f64; + } +} +#[cfg(not(feature = "libm"))] +#[inline] +fn lgamma(x: f64) -> f64 { + unsafe { cmath::lgamma(x) } +} + #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); From 48c2e1fdd7ecd155fe54dd39d5d8049f16da5101 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 30 Mar 2024 23:06:51 -0400 Subject: [PATCH 15/56] oxidize - more noalias --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 41 +++++++++++++++----- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 7cf23525d026..80d8b3789d29 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -18,27 +18,50 @@ fn lgamma(x: f64) -> f64 { #[no_mangle] pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - dgmm_objective(d as usize, k as usize, n as usize, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); + let k = k as usize; + let n = n as usize; + let d = d as usize; + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + //let wishart: Wishart = unsafe { *wishart }; + let mut my_err = unsafe { *err }; + + let mut d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; + let mut d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; + let mut d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; + let mut my_derr = unsafe { *derr }; + + dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart, &mut my_err, &mut my_derr); + + unsafe { *err = my_err }; + unsafe { *derr = my_derr }; } #[no_mangle] pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - gmm_objective_c(d as usize, k as usize, n as usize, alphas, means, icf, x, wishart, err); -} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; + //let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); unsafe { *err = my_err }; } -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: Wishart, err: &mut f64) { +//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); +//} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: *const Wishart, err: &mut f64) { + let wishart: Wishart = unsafe { *wishart }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; @@ -118,7 +141,7 @@ fn log_sum_exp(n: usize, x: &[f64]) -> f64 { semx.ln() + mx } fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * std::f64::consts::PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() + 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() } #[derive(Clone, Copy)] From ba75484b4afed02330dda8ea16edf8fcf2fd6fbf Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 Mar 2024 00:13:53 -0400 Subject: [PATCH 16/56] reduce caching --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 77cc84e2832e..1a9c3cd3b826 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.txt results.json gmm.o: gmm.cpp src/lib.rs - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 80d8b3789d29..914c41e156d4 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -25,7 +25,7 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - //let wishart: Wishart = unsafe { *wishart }; + let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; let mut d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; @@ -33,7 +33,7 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let mut d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; let mut my_derr = unsafe { *derr }; - dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart, &mut my_err, &mut my_derr); + dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); unsafe { *err = my_err }; unsafe { *derr = my_derr }; @@ -48,9 +48,9 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, let means = unsafe { std::slice::from_raw_parts(means, k * d) }; let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - //let wishart: Wishart = unsafe { *wishart }; + let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); + gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); unsafe { *err = my_err }; } @@ -59,9 +59,10 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, // gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); //} -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], wishart: *const Wishart, err: &mut f64) { - let wishart: Wishart = unsafe { *wishart }; +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { + let wishart: Wishart = Wishart { gamma, m }; + //let wishart: Wishart = unsafe { *wishart }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; let mut qdiags = vec![0.; d * k]; From 9394028d1cbe382203c64a1c26b487f23598d101 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 22:07:01 -0600 Subject: [PATCH 17/56] bench gmm: makefile dep on Cargo.toml, split targets --- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index 1a9c3cd3b826..e3c15f4dcc11 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -7,8 +7,10 @@ dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) clean: rm -f *.ll *.o results.txt results.json -gmm.o: gmm.cpp src/lib.rs +$(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.toml ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm + +gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: gmm.o From 814eb62e8c59292c86af1f99e44f7a85fadc157a Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 Mar 2024 00:50:56 -0400 Subject: [PATCH 18/56] revert cmake pthread since only needed for Rust --- enzyme/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/enzyme/CMakeLists.txt b/enzyme/CMakeLists.txt index f2e480f181ce..82c7887cde3e 100644 --- a/enzyme/CMakeLists.txt +++ b/enzyme/CMakeLists.txt @@ -21,10 +21,6 @@ SET(CMAKE_CXX_FLAGS "-Wall -fno-rtti ${CMAKE_CXX_FLAGS} -Werror=unused-variable SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -ggdb") SET(CMAKE_CXX_FLAGS_RELEASE "-O2") - - -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") - SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -ggdb -fno-omit-frame-pointer") #SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer -fsanitize=address") From 41157fafd71d20cc7707269cec28f2fc1b28cd50 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 23:44:38 -0600 Subject: [PATCH 19/56] bench gmm: fix primal (sqnorm length matters) --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 914c41e156d4..16aecf8de96f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -78,7 +78,7 @@ pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64] for ik in 0..k { subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(d, &qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); } slse = slse + log_sum_exp(k, &main_term); @@ -158,13 +158,13 @@ fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiag let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); let out = (0..k).map(|ik| { - let frobenius = sqnorm(p, &qdiags[ik * p as usize..]) + sqnorm(icf_sz - p, &icf[ik * icf_sz as usize + p as usize..]); + let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] }).sum::(); out - k as f64 * c } -fn sqnorm(n: usize, x: &[f64]) -> f64 { +fn sqnorm(x: &[f64]) -> f64 { x.iter().map(|x| x * x).sum() } From 114f2369dbca5cdf572e5dc599c88f94eb017fb6 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sat, 30 Mar 2024 23:48:16 -0600 Subject: [PATCH 20/56] bench gmm: quash rust warnings --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 16aecf8de96f..a2ba1d041689 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -28,9 +28,9 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; - let mut d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; - let mut d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; - let mut d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; + let d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; + let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; + let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; let mut my_derr = unsafe { *derr }; dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); @@ -77,7 +77,7 @@ pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64] for ix in 0..n { for ik in 0..k { subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - Qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); } @@ -119,7 +119,7 @@ fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { } } -fn Qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { +fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { assert!(out.len() >= d); assert!(q_diag.len() >= d); assert!(x.len() >= d); From 012cf4ccc62c0a3cfd27f074ef70977899b72bfe Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 Mar 2024 11:41:43 -0400 Subject: [PATCH 21/56] adding ba benchmark --- enzyme/benchmarks/ReverseMode/ba/Cargo.lock | 7 + enzyme/benchmarks/ReverseMode/ba/Cargo.toml | 18 ++ enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 221 +++++++++++++++++++ enzyme/benchmarks/ReverseMode/ba/src/main.rs | 26 +++ 4 files changed, 272 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/ba/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/ba/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.lock b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock new file mode 100644 index 000000000000..7e322bed2b9a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "bars" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml new file mode 100644 index 000000000000..1abfe3da5163 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "bars" +version = "0.1.0" +edition = "2021" + + +[lib] +crate-type = ["cdylib"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[profile.release] +lto = "fat" + +[profile.dev] +lto = "fat" + +[dependencies] diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs new file mode 100644 index 000000000000..412a6a477109 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -0,0 +1,221 @@ +#![feature(autodiff)] +#![feature(slice_first_last_chunk)] +#![allow(non_snake_case)] + +//#define BA_NCAMPARAMS 11 +static BA_NCAMPARAMS: usize = 11; + +fn sqsum(x: &[f64]) -> f64 { + x.iter().map(|&v| v * v).sum() +} + +#[inline] +fn cross(a: &[f64; 3], b: &[f64; 3]) -> [f64; 3] { + [ + a[1] * b[2] - a[2] * b[1], + a[2] * b[0] - a[0] * b[2], + a[0] * b[1] - a[1] * b[0], + ] +} + +fn radial_distort(rad_params: &[f64], proj: &mut [f64]) { + let rsq = sqsum(proj); + let l = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq; + proj[0] = proj[0] * l; + proj[1] = proj[1] * l; +} + +fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; 3]) { + let sqtheta = sqsum(rot); + if sqtheta != 0. { + let theta = sqtheta.sqrt(); + let costheta = theta.cos(); + let sintheta = theta.sin(); + let theta_inverse = 1. / theta; + let w = rot.map(|v| v * theta_inverse); + let w_cross_pt = cross(&w, &pt); + let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); + for i in 0..3 { + rotated_pt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } else { + let rot_cross_pt = cross(&rot, &pt); + for i in 0..3 { + rotated_pt[i] = pt[i] + rot_cross_pt[i]; + } + } +} + +fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { + let C = &cam[3..6]; + let mut Xo = [0.; 3]; + let mut Xcam = [0.; 3]; + + Xo[0] = X[0] - C[0]; + Xo[1] = X[1] - C[1]; + Xo[2] = X[2] - C[2]; + + rodrigues_rotate_point(cam.first_chunk::<3>().unwrap(), &Xo, &mut Xcam); + + proj[0] = Xcam[0] / Xcam[2]; + proj[1] = Xcam[1] / Xcam[2]; + + radial_distort(&cam[9..], proj); + + proj[0] = proj[0] * cam[6] + cam[7]; + proj[1] = proj[1] * cam[6] + cam[8]; +} + +#[no_mangle] +pub extern "C" fn dcompute_reproj_error( + cam: *const [f64; 11], + dcam: *mut [f64; 11], + x: *const [f64; 3], + dx: *mut [f64; 3], + w: *const [f64; 1], + wb: *mut [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], + derr: *mut [f64; 2], +) { + rust_dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); +} + +#[no_mangle] +pub extern "C" fn dcompute_zach_weight_error( + w: *const f64, + dw: *mut f64, + err: *mut f64, + derr: *mut f64, +) { + rust_dcompute_zach_weight_error(w, dw, err, derr); +} + +#[autodiff( + rust_dcompute_reproj_error, + Reverse, + Duplicated, + Duplicated, + Duplicated, + Const, + Duplicated +)] +pub fn compute_reproj_error( + cam: *const [f64; 11], + x: *const [f64; 3], + w: *const [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], +) { + let cam = unsafe { &*cam }; + let w = unsafe { *(*w).get_unchecked(0) }; + let x = unsafe { &*x }; + let feat = unsafe { &*feat }; + let mut err = unsafe { &mut *err }; + let mut proj = [0.; 2]; + project(cam, x, &mut proj); + err[0] = w * (proj[0] - feat[0]); + err[1] = w * (proj[1] - feat[1]); +} + +#[autodiff(rust_dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] +pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { + let w = unsafe { *w }; + let mut err = unsafe { *err }; + err = 1. - w * w; +} + +// n number of cameras +// m number of points +// p number of observations +// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3*m points +// obs: 2*p observations (pairs cameraIdx, pointIdx) +// feats: 2*p features (x,y coordinates corresponding to observations) +// reproj_err: 2*p errors of observations +// w_err: p weight "error" terms +fn rust_ba_objective( + n: usize, + m: usize, + p: usize, + cams: &[f64], + x: &[f64], + w: &[f64], + obs: &[i32], + feats: &[f64], + reproj_err: &mut [f64], + w_err: &mut [f64], +) { + assert_eq!(cams.len(), n * 11); + assert_eq!(x.len(), m * 3); + assert_eq!(w.len(), p); + assert_eq!(obs.len(), p * 2); + assert_eq!(feats.len(), p * 2); + assert_eq!(reproj_err.len(), p * 2); + assert_eq!(w_err.len(), p); + + for i in 0..p { + let cam_idx = obs[i * 2 + 0] as usize; + let pt_idx = obs[i * 2 + 1] as usize; + let start = cam_idx * BA_NCAMPARAMS; + let cam: &[f64; 11] = unsafe { + cams[start..] + .get_unchecked(..11) + .try_into() + .unwrap_unchecked() + }; + let x: &[f64; 3] = unsafe { + x[pt_idx * 3..] + .get_unchecked(..3) + .try_into() + .unwrap_unchecked() + }; + let w: &[f64; 1] = unsafe { w[i..].get_unchecked(..1).try_into().unwrap_unchecked() }; + let feat: &[f64; 2] = unsafe { + feats[i * 2..] + .get_unchecked(..2) + .try_into() + .unwrap_unchecked() + }; + let reproj_err: &mut [f64; 2] = unsafe { + reproj_err[i * 2..] + .get_unchecked_mut(..2) + .try_into() + .unwrap_unchecked() + }; + compute_reproj_error(cam, x, w, feat, reproj_err); + } + + for i in 0..p { + let w_err: &mut f64 = unsafe { w_err.get_unchecked_mut(i) }; + compute_zach_weight_error(w[i..].as_ptr(), w_err as *mut f64); + } +} + +#[no_mangle] +extern "C" fn ba_objective( + n: usize, + m: usize, + p: usize, + cams: *const f64, + x: *const f64, + w: *const f64, + obs: *const i32, + feats: *const f64, + reproj_err: *mut f64, + w_err: *mut f64, +) { + let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) }; + let x = unsafe { std::slice::from_raw_parts(x, m * 3) }; + let w = unsafe { std::slice::from_raw_parts(w, p) }; + let obs = unsafe { std::slice::from_raw_parts(obs, p * 2) }; + let feats = unsafe { std::slice::from_raw_parts(feats, p * 2) }; + let reproj_err = unsafe { std::slice::from_raw_parts_mut(reproj_err, p * 2) }; + let w_err = unsafe { std::slice::from_raw_parts_mut(w_err, p) }; + rust_ba_objective(n, m, p, cams, x, w, obs, feats, reproj_err, w_err); +} diff --git a/enzyme/benchmarks/ReverseMode/ba/src/main.rs b/enzyme/benchmarks/ReverseMode/ba/src/main.rs new file mode 100644 index 000000000000..13f221be69c1 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/main.rs @@ -0,0 +1,26 @@ +use bars::{dcompute_reproj_error, dcompute_zach_weight_error}; +fn main() { + let cam = [0.0; 11]; + let mut dcam = [0.0; 11]; + let x = [0.0; 3]; + let mut dx = [0.0; 3]; + let w = [0.0; 1]; + let mut dw = [0.0; 1]; + let feat = [0.0; 2]; + let mut err = [0.0; 2]; + let mut derr = [0.0; 2]; + dcompute_reproj_error( + &cam as *const [f64;11], + &mut dcam as *mut [f64;11], + &x as *const [f64;3], + &mut dx as *mut [f64;3], + &w as *const [f64;1], + &mut dw as *mut [f64;1], + &feat as *const [f64;2], + &mut err as *mut [f64;2], + &mut derr as *mut [f64;2], + ); + + let mut wb = 0.0; + dcompute_zach_weight_error(&w as *const f64, &mut dw as *mut f64, &mut err as *mut f64, &mut derr as *mut f64); +} From 0430e4489817d5ca9201ccfcdd167107ae6caf9a Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Mon, 1 Apr 2024 13:37:05 -0400 Subject: [PATCH 22/56] Benchmark ba --- enzyme/benchmarks/ReverseMode/adbench/ba.h | 150 +++++++++++++++++- enzyme/benchmarks/ReverseMode/ba/Cargo.lock | 9 ++ enzyme/benchmarks/ReverseMode/ba/Cargo.toml | 1 + .../benchmarks/ReverseMode/ba/Makefile.make | 20 +-- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 14 +- 5 files changed, 172 insertions(+), 22 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h index 3ade86a0b7b2..5d9178120e76 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/ba.h +++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h @@ -127,6 +127,19 @@ extern "C" { double* reproj_err, double* w_err ); + + void rust2_ba_objective( + int n, + int m, + int p, + double const* cams, + double const* X, + double const* w, + int const* obs, + double const* feats, + double* reproj_err, + double* w_err + ); void dcompute_reproj_error( double const* cam, @@ -169,6 +182,20 @@ extern "C" { ); void adept_compute_zach_weight_error(double const* w, double* dw, double* err, double* derr); + + void rust_dcompute_reproj_error( + double const* cam, + double * dcam, + double const* X, + double * dX, + double const* w, + double * wb, + double const* feat, + double *err, + double *derr + ); + + void rust_dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); } void read_ba_instance(const string& fn, @@ -486,9 +513,9 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme combined"; + enzyme["name"] = "Enzyme c++ combined"; enzyme["runtime"] = tdiff(&start, &end); for(unsigned i=0; i<5; i++) { printf("%f ", result.J.vals[i]); @@ -499,6 +526,125 @@ int main(const int argc, const char* argv[]) { } } + + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + + struct BAOutput result = { + std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p) + }; + + + { + struct timeval start, end; + gettimeofday(&start, NULL); + ba_objective( + input.n, + input.m, + input.p, + input.cams.data(), + input.X.data(), + input.w.data(), + input.obs.data(), + input.feats.data(), + result.reproj_err.data(), + result.w_err.data() + ); + gettimeofday(&end, NULL); + printf("primal c++ t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal c++"; + enzyme["runtime"] = tdiff(&start, &end); + for(unsigned i=0; i<5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for(unsigned i=0; i<5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + + struct BAOutput result = { + std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p) + }; + { + + struct timeval start, end; + gettimeofday(&start, NULL); + rust2_ba_objective( + input.n, + input.m, + input.p, + input.cams.data(), + input.X.data(), + input.w.data(), + input.obs.data(), + input.feats.data(), + result.reproj_err.data(), + result.w_err.data() + ); + gettimeofday(&end, NULL); + printf("primal rust t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal rust"; + enzyme["runtime"] = tdiff(&start, &end); + for(unsigned i=0; i<5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for(unsigned i=0; i<5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + + struct BAOutput result = { + std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p) + }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme rust combined"; + enzyme["runtime"] = tdiff(&start, &end); + for(unsigned i=0; i<5; i++) { + printf("%f ", result.J.vals[i]); + enzyme["result"].push_back(result.J.vals[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + + } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.lock b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock index 7e322bed2b9a..74e2768e7cd4 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Cargo.lock +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.lock @@ -5,3 +5,12 @@ version = 3 [[package]] name = "bars" version = "0.1.0" +dependencies = [ + "libm", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" diff --git a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml index 1abfe3da5163..160c7716f3d8 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ba/Cargo.toml @@ -16,3 +16,4 @@ lto = "fat" lto = "fat" [dependencies] +libm = { version = "0.2.8", optional = true } diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index 6f0f2cc18242..8a13a0e524fb 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -1,23 +1,17 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B ba-unopt.ll ba-raw.ll results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ba.o results.json -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) + clean: rm -f *.ll *.o results.txt results.json -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -Xclang -new-struct-path-tbaa -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -Xclang -new-struct-path-tbaa -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S - -%-opt.ll: %-raw.ll - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S +$(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a: src/lib.rs Cargo.toml + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm -ba.o: ba-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) +ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a + clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.json: ba.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 412a6a477109..82318144f63f 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -67,7 +67,7 @@ fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { } #[no_mangle] -pub extern "C" fn dcompute_reproj_error( +pub extern "C" fn rust_dcompute_reproj_error( cam: *const [f64; 11], dcam: *mut [f64; 11], x: *const [f64; 3], @@ -78,21 +78,21 @@ pub extern "C" fn dcompute_reproj_error( err: *mut [f64; 2], derr: *mut [f64; 2], ) { - rust_dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); + dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); } #[no_mangle] -pub extern "C" fn dcompute_zach_weight_error( +pub extern "C" fn rust_dcompute_zach_weight_error( w: *const f64, dw: *mut f64, err: *mut f64, derr: *mut f64, ) { - rust_dcompute_zach_weight_error(w, dw, err, derr); + dcompute_zach_weight_error(w, dw, err, derr); } #[autodiff( - rust_dcompute_reproj_error, + dcompute_reproj_error, Reverse, Duplicated, Duplicated, @@ -118,7 +118,7 @@ pub fn compute_reproj_error( err[1] = w * (proj[1] - feat[1]); } -#[autodiff(rust_dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] +#[autodiff(dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { let w = unsafe { *w }; let mut err = unsafe { *err }; @@ -198,7 +198,7 @@ fn rust_ba_objective( } #[no_mangle] -extern "C" fn ba_objective( +extern "C" fn rust2_ba_objective( n: usize, m: usize, p: usize, From 4b0062bd396ac04a089ff718913e4aa6a1ee1471 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 1 Apr 2024 21:49:23 -0400 Subject: [PATCH 23/56] fix ba primal --- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 82318144f63f..768f3fec8e38 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -121,8 +121,7 @@ pub fn compute_reproj_error( #[autodiff(dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { let w = unsafe { *w }; - let mut err = unsafe { *err }; - err = 1. - w * w; + unsafe { *err = 1. - w * w; } } // n number of cameras From 1f27479dbecd8de0a9e1eb10f95b7e562d4e9863 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 01:36:17 -0400 Subject: [PATCH 24/56] adding unsafe gmm version --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 104 ++++++++--- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 162 +---------------- enzyme/benchmarks/ReverseMode/gmm/src/main.rs | 4 +- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 165 ++++++++++++++++++ .../benchmarks/ReverseMode/gmm/src/unsafe.rs | 147 ++++++++++++++++ 5 files changed, 397 insertions(+), 185 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index a25b4d0ded54..45d589c7ae75 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -58,7 +58,19 @@ void gmm_objective( alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart wishart, double *err, double * errb); - + + void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, + double *icfb, const double *x, + Wishart &wishart, double *err, + double *errb); + + void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, + const double *x, Wishart &wishart, + double *err); + void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * alphasb, const double *means, double *meansb, const double *icf, double *icfb, const double *x, Wishart &wishart, double *err, double * @@ -203,10 +215,11 @@ int main(const int argc, const char* argv[]) { std::vector paths;// = { "1k/gmm_d10_K100.txt" }; - getTests(paths, "data/1k", "1k/"); - getTests(paths, "data/2.5k", "2.5k/"); - getTests(paths, "data/10k", "10k/"); - + // getTests(paths, "data/1k", "1k/"); + // getTests(paths, "data/2.5k", "2.5k/"); + // getTests(paths, "data/10k", "10k/"); + paths.push_back("1k/gmm_d2_K5.txt"); + std::ofstream jsonfile("results.json", std::ofstream::trunc); json test_results; @@ -256,26 +269,27 @@ int main(const int argc, const char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; - try { - struct timeval start, end; - gettimeofday(&start, NULL); - calculate_jacobian(input, result); - gettimeofday(&end, NULL); - printf("Adept combined %0.6f\n", tdiff(&start, &end)); - json adept; - adept["name"] = "Adept combined"; - adept["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - adept["result"].push_back(result.gradient[i]); + if (0) { + try { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Adept combined %0.6f\n", tdiff(&start, &end)); + json adept; + adept["name"] = "Adept combined"; + adept["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + adept["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(adept); + } catch (std::bad_alloc) { + printf("Adept combined 88888888 ooms\n"); } - printf("\n"); - test_suite["tools"].push_back(adept); - } catch(std::bad_alloc) { - printf("Adept combined 88888888 ooms\n"); } - } { @@ -331,6 +345,49 @@ int main(const int argc, const char* argv[]) { primal["result"].push_back(res); test_suite["tools"].push_back(primal); } + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("rust unsafe primal combined t=%0.6f, err=%f\n", + tdiff(&start, &end), res); + json primal; + primal["name"] = "Rust unsafe primal"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); + } + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme unsafe rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Rust unsafe Enzyme combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, + input.wishart, params.replicate_point); + + int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, NULL); @@ -360,7 +417,6 @@ int main(const int argc, const char* argv[]) { printf("\n"); test_suite["tools"].push_back(enzyme); } - } test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index a2ba1d041689..8fcb11ffed10 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,149 +1,9 @@ #![feature(autodiff)] -use std::f64::consts::PI; +pub mod r#unsafe; +pub mod safe; -#[cfg(feature = "libm")] -use libm::lgamma; +use r#unsafe::dgmm_objective as dgmm_objective; -#[cfg(not(feature = "libm"))] -mod cmath { - extern "C" { - pub fn lgamma(x: f64) -> f64; - } -} -#[cfg(not(feature = "libm"))] -#[inline] -fn lgamma(x: f64) -> f64 { - unsafe { cmath::lgamma(x) } -} - -#[no_mangle] -pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { - let k = k as usize; - let n = n as usize; - let d = d as usize; - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut my_err = unsafe { *err }; - - let d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; - let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; - let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; - let mut my_derr = unsafe { *derr }; - - dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); - - unsafe { *err = my_err }; - unsafe { *derr = my_derr }; -} - -#[no_mangle] -pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { - let k = k as usize; - let n = n as usize; - let d = d as usize; - let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; - let means = unsafe { std::slice::from_raw_parts(means, k * d) }; - let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; - let x = unsafe { std::slice::from_raw_parts(x, n * d) }; - let wishart: Wishart = unsafe { *wishart }; - let mut my_err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); - unsafe { *err = my_err }; -} - -//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { -// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); -//} - -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { - let wishart: Wishart = Wishart { gamma, m }; - //let wishart: Wishart = unsafe { *wishart }; - let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); - let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); - - let mut slse = 0.; - for ix in 0..n { - for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); - } - - slse = slse + log_sum_exp(k, &main_term); - } - - let lse_alphas = log_sum_exp(k, alphas); - - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); -} - -fn arr_max(n: usize, x: &[f64]) -> f64 { - let mut max = f64::NEG_INFINITY; - for i in 0..n { - if max < x[i] { - max = x[i]; - } - } - max -} - -fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { - let icf_sz = d * (d + 1) / 2; - for ik in 0..k { - sum_qs[ik as usize] = 0.; - for id in 0..d { - let q = icf[ik as usize * icf_sz as usize + id as usize]; - sum_qs[ik as usize] = sum_qs[ik as usize] + q; - qdiags[ik as usize * d as usize + id as usize] = q.exp(); - } - } -} -fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { - assert!(x.len() >= d); - assert!(y.len() >= d); - assert!(out.len() >= d); - for i in 0..d { - out[i] = x[i] - y[i]; - } -} - -fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { - assert!(out.len() >= d); - assert!(q_diag.len() >= d); - assert!(x.len() >= d); - for i in 0..d { - out[i] = q_diag[i] * x[i]; - } - - for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; - for j in i + 1..d { - out[j] = out[j] + ltri[lparamsidx] * x[i]; - lparamsidx += 1; - } - } -} - -fn log_sum_exp(n: usize, x: &[f64]) -> f64 { - let mx = arr_max(n, x); - let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); - semx.ln() + mx -} -fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() -} #[derive(Clone, Copy)] #[repr(C)] @@ -151,20 +11,4 @@ pub struct Wishart { pub gamma: f64, pub m: i32, } -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { - let n = p + wishart.m as usize + 1; - let icf_sz = p * (p + 1) / 2; - - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); - - let out = (0..k).map(|ik| { - let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); - - out - k as f64 * c -} -fn sqnorm(x: &[f64]) -> f64 { - x.iter().map(|x| x * x).sum() -} diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/main.rs b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs index 8f4357588ab8..e7ebf74d0aa2 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/main.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/main.rs @@ -1,5 +1,5 @@ #![feature(autodiff)] -use gmmrs::{Wishart, dgmm_objective}; +use gmmrs::{Wishart, r#unsafe::dgmm_objective}; fn main() { let d = 2; @@ -20,5 +20,5 @@ fn main() { let mut d_err2 = &mut d_err; let wishart2 = &wishart; // pass as raw ptr: - dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64); + unsafe {dgmm_objective(d, k, n, alphas.as_ptr(), d_alphas.as_mut_ptr(), means.as_ptr(), d_means.as_mut_ptr(), icf.as_ptr(), d_icf.as_mut_ptr(), x.as_ptr(), wishart2 as *const Wishart, err2 as *mut f64, d_err2 as *mut f64);} } diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs new file mode 100644 index 000000000000..5f954347f1d7 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -0,0 +1,165 @@ +//#![feature(autodiff)] +use std::f64::consts::PI; +use crate::Wishart; + +#[cfg(feature = "libm")] +use libm::lgamma; + +#[cfg(not(feature = "libm"))] +mod cmath { + extern "C" { + pub fn lgamma(x: f64) -> f64; + } +} +#[cfg(not(feature = "libm"))] +#[inline] +fn lgamma(x: f64) -> f64 { + unsafe { cmath::lgamma(x) } +} + +#[no_mangle] +pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut my_err = unsafe { *err }; + + let d_alphas = unsafe { std::slice::from_raw_parts_mut(dalphas, k) }; + let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; + let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; + let mut my_derr = unsafe { *derr }; + + dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); + + unsafe { *err = my_err }; + unsafe { *derr = my_derr }; +} + +#[no_mangle] +pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + let alphas = unsafe { std::slice::from_raw_parts(alphas, k) }; + let means = unsafe { std::slice::from_raw_parts(means, k * d) }; + let icf = unsafe { std::slice::from_raw_parts(icf, k * d * (d + 1) / 2) }; + let x = unsafe { std::slice::from_raw_parts(x, n * d) }; + let wishart: Wishart = unsafe { *wishart }; + let mut my_err = unsafe { *err }; + gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); + unsafe { *err = my_err }; +} + +//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); +//} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] +pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { + let wishart: Wishart = Wishart { gamma, m }; + //let wishart: Wishart = unsafe { *wishart }; + let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); + qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); + main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); + } + + slse = slse + log_sum_exp(k, &main_term); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); +} + +fn arr_max(n: usize, x: &[f64]) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < x[i] { + max = x[i]; + } + } + max +} + +fn preprocess_qs(d: usize, k: usize, icf: &[f64], sum_qs: &mut [f64], qdiags: &mut [f64]) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + sum_qs[ik as usize] = 0.; + for id in 0..d { + let q = icf[ik as usize * icf_sz as usize + id as usize]; + sum_qs[ik as usize] = sum_qs[ik as usize] + q; + qdiags[ik as usize * d as usize + id as usize] = q.exp(); + } + } +} +fn subtract(d: usize, x: &[f64], y: &[f64], out: &mut [f64]) { + assert!(x.len() >= d); + assert!(y.len() >= d); + assert!(out.len() >= d); + for i in 0..d { + out[i] = x[i] - y[i]; + } +} + +fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { + assert!(out.len() >= d); + assert!(q_diag.len() >= d); + assert!(x.len() >= d); + for i in 0..d { + out[i] = q_diag[i] * x[i]; + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + out[j] = out[j] + ltri[lparamsidx] * x[i]; + lparamsidx += 1; + } + } +} + +fn log_sum_exp(n: usize, x: &[f64]) -> f64 { + let mx = arr_max(n, x); + let semx: f64 = x.iter().map(|x| (x - mx).exp()).sum(); + semx.ln() + mx +} +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { + let n = p + wishart.m as usize + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let out = (0..k).map(|ik| { + let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] + }).sum::(); + + out - k as f64 * c +} + +fn sqnorm(x: &[f64]) -> f64 { + x.iter().map(|x| x * x).sum() +} diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs new file mode 100644 index 000000000000..b2730538c88e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/src/unsafe.rs @@ -0,0 +1,147 @@ +use std::f64::consts::PI; +use crate::Wishart; + +#[cfg(feature = "libm")] +use libm::lgamma; + +#[cfg(not(feature = "libm"))] +mod cmath { + extern "C" { + pub fn lgamma(x: f64) -> f64; + } +} +#[cfg(not(feature = "libm"))] +#[inline] +fn lgamma(x: f64) -> f64 { + unsafe { cmath::lgamma(x) } +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + unsafe { dgmm_objective(d, k, n, alphas, dalphas, means, dmeans, icf, dicf, x, wishart, err, derr); } +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let k = k as usize; + let n = n as usize; + let d = d as usize; + unsafe {gmm_objective(d, k, n, alphas, means, icf, x, wishart, err); } +} + +//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +//pub unsafe fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { +// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); +//} + +#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] +pub unsafe fn gmm_objective(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { + let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); + let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; + let mut sum_qs = vec![0.; k]; + let mut xcentered = vec![0.; d]; + let mut qxcentered = vec![0.; d]; + let mut main_term = vec![0.; k]; + + preprocess_qs(d, k, icf, sum_qs.as_mut_ptr(), qdiags.as_mut_ptr()); + + let mut slse = 0.; + for ix in 0..n { + for ik in 0..k { + subtract(d, x.add(ix * d), means.add(ik * d), xcentered.as_mut_ptr()); + qtimesx(d, qdiags.as_mut_ptr().add(ik * d), icf.add(ik * icf_sz + d), xcentered.as_ptr(), qxcentered.as_mut_ptr()); + main_term[ik] = *alphas.add(ik) + sum_qs[ik] - 0.5 * sqnorm(d, qxcentered.as_ptr()); + //main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * sqnorm(d, &Qxcentered[0]); + } + + slse = slse + log_sum_exp(k, main_term.as_ptr()); + } + + let lse_alphas = log_sum_exp(k, alphas); + + *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, *wishart, sum_qs.as_ptr(), qdiags.as_ptr(), icf); +} + +unsafe fn arr_max(n: usize, x: *const f64) -> f64 { + let mut max = f64::NEG_INFINITY; + for i in 0..n { + if max < *x.add(i) { + max = *x.add(i); + } + } + max +} + +unsafe fn preprocess_qs(d: usize, k: usize, icf: *const f64, sum_qs: *mut f64, qdiags: *mut f64) { + let icf_sz = d * (d + 1) / 2; + for ik in 0..k { + *sum_qs.add(ik) = 0.; + for id in 0..d { + let q = *icf.add(ik * icf_sz + id); + *sum_qs.add(ik) = *sum_qs.add(ik) + q; + *qdiags.add(ik * d + id) = q.exp(); + } + } +} + +unsafe fn subtract(d: usize, x: *const f64, y: *const f64, out: *mut f64) { + for i in 0..d { + *out.add(i) = *x.add(i) - *y.add(i); + } +} + +unsafe fn qtimesx(d: usize, q_diag: *const f64, ltri: *const f64, x: *const f64, out: *mut f64) { + for i in 0..d { + *out.add(i) = *q_diag.add(i) * *x.add(i); + } + + for i in 0..d { + let mut lparamsidx = i*(2*d-i-1)/2; + for j in i + 1..d { + *out.add(j) = *out.add(j) + *ltri.add(lparamsidx) * *x.add(i); + lparamsidx += 1; + } + } +} + +unsafe fn log_sum_exp(n: usize, x: *const f64) -> f64 { + let mx = arr_max(n, x); + let mut semx: f64 = 0.0; + + for i in 0..n { + semx = semx + (*x.add(i) - mx).exp(); + } + semx.ln() + mx +} + +fn log_gamma_distrib(a: f64, p: f64) -> f64 { + 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() +} + +unsafe fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: *const f64, qdiags: *const f64, icf: *const f64) -> f64 { + let n = p + wishart.m as usize + 1; + let icf_sz = p * (p + 1) / 2; + + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + + let mut out = 0.; + + for ik in 0..k { + let frobenius = sqnorm(p, qdiags.add(ik * p)) + sqnorm(icf_sz - p, icf.add(ik * icf_sz + p)); + out = out + 0.5 * wishart.gamma * wishart.gamma * (frobenius) - wishart.m as f64 * *sum_qs.add(ik); + } + + out - k as f64 * c +} + +unsafe fn sqnorm(n: usize, x: *const f64) -> f64 { + let mut sum = 0.; + for i in 0..n { + sum += *x.add(i) * *x.add(i); + } + sum +} From af3e07891217adc18548d961fdf4a22be2985516 Mon Sep 17 00:00:00 2001 From: Lorenz Schmidt Date: Thu, 4 Apr 2024 02:30:12 -0400 Subject: [PATCH 25/56] Add FFT and LSTM benchmark for Rust Enzyme --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 49 +++++ enzyme/benchmarks/ReverseMode/fft/Cargo.lock | 7 + enzyme/benchmarks/ReverseMode/fft/Cargo.toml | 18 ++ .../benchmarks/ReverseMode/fft/Makefile.make | 20 +-- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 56 ++++++ enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 106 +++++++++++ enzyme/benchmarks/ReverseMode/fft/src/main.rs | 14 ++ enzyme/benchmarks/ReverseMode/lstm/Cargo.lock | 7 + enzyme/benchmarks/ReverseMode/lstm/Cargo.toml | 18 ++ .../benchmarks/ReverseMode/lstm/Makefile.make | 19 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 169 ++++++++++++++++++ .../benchmarks/ReverseMode/lstm/src/main.rs | 3 + 12 files changed, 466 insertions(+), 20 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/fft/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/fft/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/main.rs create mode 100644 enzyme/benchmarks/ReverseMode/lstm/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/lstm/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/lib.rs create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index e6d13303d1f8..bd765ad1dbd1 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -34,6 +34,20 @@ struct LSTMOutput { }; extern "C" { + void rust_dlstm_objective( + int l, + int c, + int b, + double const* main_params, + double* dmain_params, + double const* extra_params, + double* dextra_params, + double* state, + double const* sequence, + double* loss, + double* dloss + ); + void dlstm_objective( int l, int c, @@ -291,6 +305,41 @@ int main(const int argc, const char* argv[]) { } } + + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = { 0, std::vector(Jcols) }; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme (Rust) combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (Rust) combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; + i < result.gradient.size(); i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + + } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/fft/Cargo.lock b/enzyme/benchmarks/ReverseMode/fft/Cargo.lock new file mode 100644 index 000000000000..44847eca60f6 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "fft" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/fft/Cargo.toml b/enzyme/benchmarks/ReverseMode/fft/Cargo.toml new file mode 100644 index 000000000000..5366aefa719e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "fft" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[lib] +crate-type = ["lib"] + +[profile.release] +lto = "fat" +opt-level = 3 + +[profile.dev] +lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index ffeddd5507df..a2de0fdbcc62 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -1,23 +1,17 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B fft-unopt.ll fft-raw.ll fft-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B fft.o results.txt VERBOSE=1 -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) + clean: rm -f *.ll *.o results.txt -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - -%-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S - -%-opt.ll: %-raw.ll - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S +$(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml + cargo +enzyme rustc --release --lib --crate-type=staticlib -fft.o: fft-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) -lm +fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a + clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 results.txt: fft.o ./$^ 1048576 | tee $@ diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index cf9459b9597a..5c67b3be1678 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -33,6 +33,21 @@ extern "C" { int enzyme_dupnoneed; } +extern "C" void rust_dfoobar(int n, double* data, double* ddata); +extern "C" void rust_foobar(int n, double* data); + +static double rust_foobar_and_gradient(unsigned len) { + double *inp = new double[2*len]; + for(int i=0; i<2*len; i++) inp[i] = 2.0; + double *dinp = new double[2*len]; + for(int i=0; i<2*len; i++) dinp[i] = 1.0; + rust_dfoobar(len*2, inp, dinp); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; +} + static double foobar_and_gradient(unsigned len) { double *inp = new double[2*len]; for(int i=0; i<2*len; i++) inp[i] = 2.0; @@ -202,6 +217,46 @@ static void enzyme_sincos(double inp, unsigned len) { } } +static void enzyme_rust_sincos(double inp, unsigned len) { + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2*len]; + for(int i=0; i<2*len; i++) x[i] = 2.0; + rust_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2*len]; + for(int i=0; i<2*len; i++) x[i] = 2.0; + rust_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) forward %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double res2 = rust_foobar_and_gradient(len); + + gettimeofday(&end, NULL); + printf("Enzyme (Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), res2); + } +} /* Function to check if x is power of 2*/ bool isPowerOfTwo (int x) @@ -233,5 +288,6 @@ int main(int argc, char** argv) { adept_sincos(inp, iters); tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); + enzyme_rust_sincos(inp, iters); } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs new file mode 100644 index 000000000000..e2df837805e0 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -0,0 +1,106 @@ +#![feature(autodiff)] + +use std::slice; +use std::f64::consts::PI; + +fn bitreversal_perm(data: &mut [T]) { + let len = data.len() / 2; + let mut j = 1; + + let mut i = 1; + while i < 2*len { + if j > i { + //dbg!(&i, &j); + data.swap(j-1, i-1); + data.swap(j, i); + } + + let mut m = len; + while m >= 2 && j > m { + j -= m; + m >>= 1; + } + + j += m; + i += 2; + } +} + +fn radix2(data: &mut [f64], i_sign: f64, n: usize) { + if n == 1 { + return; + } + + let (a,b) = data.split_at_mut(n); + radix2(a, i_sign, n/2); + radix2(b, i_sign, n/2); + + let wtemp = i_sign * (PI / n as f64).sin(); + let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wpr = -2.0 * wtemp * wtemp; + let mut wr = 1.0; + let mut wi = 0.0; + + let mut i = 0; + while i < n { + let in_n = i + n; + + let tempr = data[in_n] * wr - data[in_n + 1] * wi; + let tempi = data[in_n] * wi + data[in_n + 1] * wr; + + data[in_n] = data[i] - tempr; + data[in_n + 1] = data[i + 1] - tempi; + data[i] += tempr; + data[i + 1] += tempi; + + let wtemp_new = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp_new * wpi; + + i += 2; + } +} + +fn rescale(data: &mut [f64], scale: f64) { + let scale = 1. / scale; + for elm in data { + *elm *= scale; + } +} + +fn fft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, 1.0, data.len() / 2); +} + +fn ifft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, -1.0, data.len() / 2); + rescale(data, data.len() as f64 / 2.); +} + +#[autodiff(dfoobar, Reverse, Duplicated)] +pub fn foobar(data: &mut [f64]) { + fft(data); + ifft(data); +} + +#[no_mangle] +pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { + + let (data, ddata) = unsafe { + ( + slice::from_raw_parts_mut(data, n), + slice::from_raw_parts_mut(ddata, n) + ) + }; + + dfoobar(data, ddata); +} + +#[no_mangle] +pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { + let data = unsafe { slice::from_raw_parts_mut(data, n) }; + + foobar(data); +} diff --git a/enzyme/benchmarks/ReverseMode/fft/src/main.rs b/enzyme/benchmarks/ReverseMode/fft/src/main.rs new file mode 100644 index 000000000000..f2a857806eb2 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/main.rs @@ -0,0 +1,14 @@ +use fft::dfoobar; + +fn main() { + let mut data = vec![1.0; 32]; + for i in 0..16 { + data[i] = 2.0; + } + let mut data_d = vec![1.0; data.len()]; + + dfoobar(&mut data, &mut data_d); + + dbg!(&data_d); + dbg!(&data); +} diff --git a/enzyme/benchmarks/ReverseMode/lstm/Cargo.lock b/enzyme/benchmarks/ReverseMode/lstm/Cargo.lock new file mode 100644 index 000000000000..270bf4367433 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "lstm" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml b/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml new file mode 100644 index 000000000000..6e659faf3a3b --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "lstm" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[lib] +crate-type = ["lib"] + +[profile.release] +lto = "fat" +opt-level = 3 + +[profile.dev] +lto = "fat" diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 4323ac694a08..f3cdb818b742 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -1,23 +1,28 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B lstm-raw.ll results.json -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B lstm-raw.ll results.json -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) + clean: - rm -f *.ll *.o results.txt + rm -f *.ll *.o results.json + +$(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml + cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm + clang++ $(BENCH) $^ -O2 --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S + @echo $(LOAD) + opt $^ $(LOAD) -o $@ -S %-opt.ll: %-raw.ll opt $^ -o $@ -S #opt $^ -O2 -o $@ -S -lstm.o: lstm-opt.ll - clang++ -O2 $^ -o $@ $(BENCHLINK) -lm +lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs new file mode 100644 index 000000000000..aba88ac76617 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -0,0 +1,169 @@ +#![feature(autodiff)] + +use std::slice; + +// Sigmoid on scalar +fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x).exp()) +} + +// log(sum(exp(x), 2)) +fn logsumexp(vect: &[f64]) -> f64 { + let mut sum = 0.0; + for &val in vect { + sum += val.exp(); + } + sum += 2.0; // Adding 2 to sum + sum.ln() +} + +// LSTM OBJECTIVE +// The LSTM model +fn lstm_model( + hsize: usize, + weight: &[f64], + bias: &[f64], + hidden: &mut [f64], + cell: &mut [f64], + input: &[f64], +) { + let mut gates = vec![0.0; 4 * hsize]; + let (a,b) = gates.split_at_mut(2*hsize); + let ((forget, ingate), (outgate, change)) = ( + a.split_at_mut(hsize), b.split_at_mut(hsize)); + + // caching input + for i in 0..hsize { + forget[i] = sigmoid(input[i] * weight[i] + bias[i]); + ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); + outgate[i] = sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); + change[i] = (hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]).tanh(); + } + + // caching cell + for i in 0..hsize { + cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; + } + + for i in 0..hsize { + hidden[i] = outgate[i] * cell[i].tanh(); + } +} + +// Predict LSTM output given an input +fn lstm_predict( + l: usize, + b: usize, + w: &[f64], + w2: &[f64], + s: &mut [f64], + x: &[f64], + x2: &mut [f64], +) { + for i in 0..b { + x2[i] = x[i] * w2[i]; + } + + let mut i = 0; + while i <= 2*l*b - 1 { + // make borrow-checker happy with non-overlapping mutable references + let (xp, s1, s2) = if i == 0 { + let (s1, s2) = s.split_at_mut(b); + (x2.as_mut(), s1, s2) + } else { + let tmp = &mut s[i-2*b..]; + let (a, d) = tmp.split_at_mut(2*b); + let (d, c) = d.split_at_mut(b); + + (a,d,c) + }; + + lstm_model( + b, + &w[i * 4..], + &w[(i + b) * 4..], + s1, + s2, + xp, + ); + + i += 2 * b; + } + + let xp = &s[i-2*b..]; + + for i in 0..b { + x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; + } +} + +// LSTM objective (loss function) +#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +pub fn lstm_objective( + l: usize, + c: usize, + b: usize, + main_params: &[f64], + extra_params: &[f64], + state: &mut [f64], + sequence: &[f64], + loss: &mut f64, +) { + let mut total = 0.0; + let mut count = 0; + + let mut input = &sequence[..b]; + let mut ypred = vec![0.0; b]; + let mut ynorm = vec![0.0; b]; + let mut lse; + + assert!(b > 0); + + for t in (0..=(c - 1) * b - 1).step_by(b) { + lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); + lse = logsumexp(&ypred); + for i in 0..b { + ynorm[i] = ypred[i] - lse; + } + + let ygold = &sequence[t + b..]; + for i in 0..b { + total += ygold[i] * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count as f64; +} + +#[no_mangle] +pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let (main_params, extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + } +} + +#[no_mangle] +pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts_mut(d_main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(d_extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + } +} diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/main.rs b/enzyme/benchmarks/ReverseMode/lstm/src/main.rs new file mode 100644 index 000000000000..e7a11a969c03 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} From c8bcfe9aa02c5b8037ae38f94621822198b422e7 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 03:24:08 -0400 Subject: [PATCH 26/56] adding unsafe Rust fft version (how to run?) --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 63 ++++++++++ enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 104 +-------------- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 104 +++++++++++++++ enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 119 ++++++++++++++++++ 4 files changed, 288 insertions(+), 102 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/fft/src/unsf.rs diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 5c67b3be1678..3c566c33b31a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -33,9 +33,25 @@ extern "C" { int enzyme_dupnoneed; } +extern "C" void rust_unsafe_dfoobar(int n, double *data, double *ddata); +extern "C" void rust_unsafe_foobar(int n, double *data); extern "C" void rust_dfoobar(int n, double* data, double* ddata); extern "C" void rust_foobar(int n, double* data); +static double rust_unsafe_foobar_and_gradient(unsigned len) { + double *inp = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + inp[i] = 2.0; + double *dinp = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + dinp[i] = 1.0; + rust_unsafe_dfoobar(len * 2, inp, dinp); + double res = dinp[0]; + delete[] dinp; + delete[] inp; + return res; +} + static double rust_foobar_and_gradient(unsigned len) { double *inp = new double[2*len]; for(int i=0; i<2*len; i++) inp[i] = 2.0; @@ -217,6 +233,51 @@ static void enzyme_sincos(double inp, unsigned len) { } } +static void enzyme_unsafe_rust_sincos(double inp, unsigned len) { + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_unsafe_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) real %0.6f res=%f\n", tdiff(&start, &end), res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double *x = new double[2 * len]; + for (int i = 0; i < 2 * len; i++) + x[i] = 2.0; + rust_unsafe_foobar(len, x); + double res = x[0]; + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) forward %0.6f res=%f\n", tdiff(&start, &end), + res); + delete[] x; + } + + { + struct timeval start, end; + gettimeofday(&start, NULL); + + double res2 = rust_unsafe_foobar_and_gradient(len); + + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) combined %0.6f res'=%f\n", tdiff(&start, &end), + res2); + } +} + static void enzyme_rust_sincos(double inp, unsigned len) { { @@ -281,6 +342,7 @@ int main(int argc, char** argv) { printf("usage %s n [must be power of 2]\n", argv[0]); return 1; } + N = 2; double inp = -2.1; for(unsigned iters=max(1, N>>5); iters <= N; iters*=2) { @@ -289,5 +351,6 @@ int main(int argc, char** argv) { tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); enzyme_rust_sincos(inp, iters); + // enzyme_unsafe_rust_sincos(inp, iters); } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs index e2df837805e0..47b0aa1e97fd 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -1,106 +1,6 @@ #![feature(autodiff)] -use std::slice; -use std::f64::consts::PI; +pub mod safe; +pub mod unsf; -fn bitreversal_perm(data: &mut [T]) { - let len = data.len() / 2; - let mut j = 1; - let mut i = 1; - while i < 2*len { - if j > i { - //dbg!(&i, &j); - data.swap(j-1, i-1); - data.swap(j, i); - } - - let mut m = len; - while m >= 2 && j > m { - j -= m; - m >>= 1; - } - - j += m; - i += 2; - } -} - -fn radix2(data: &mut [f64], i_sign: f64, n: usize) { - if n == 1 { - return; - } - - let (a,b) = data.split_at_mut(n); - radix2(a, i_sign, n/2); - radix2(b, i_sign, n/2); - - let wtemp = i_sign * (PI / n as f64).sin(); - let wpi = -i_sign * (2.0 * PI / n as f64).sin(); - let wpr = -2.0 * wtemp * wtemp; - let mut wr = 1.0; - let mut wi = 0.0; - - let mut i = 0; - while i < n { - let in_n = i + n; - - let tempr = data[in_n] * wr - data[in_n + 1] * wi; - let tempi = data[in_n] * wi + data[in_n + 1] * wr; - - data[in_n] = data[i] - tempr; - data[in_n + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; - - let wtemp_new = wr; - wr += wr * wpr - wi * wpi; - wi += wi * wpr + wtemp_new * wpi; - - i += 2; - } -} - -fn rescale(data: &mut [f64], scale: f64) { - let scale = 1. / scale; - for elm in data { - *elm *= scale; - } -} - -fn fft(data: &mut [f64]) { - bitreversal_perm(data); - radix2(data, 1.0, data.len() / 2); -} - -fn ifft(data: &mut [f64]) { - bitreversal_perm(data); - radix2(data, -1.0, data.len() / 2); - rescale(data, data.len() as f64 / 2.); -} - -#[autodiff(dfoobar, Reverse, Duplicated)] -pub fn foobar(data: &mut [f64]) { - fft(data); - ifft(data); -} - -#[no_mangle] -pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { - - let (data, ddata) = unsafe { - ( - slice::from_raw_parts_mut(data, n), - slice::from_raw_parts_mut(ddata, n) - ) - }; - - dfoobar(data, ddata); -} - -#[no_mangle] -pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { - let data = unsafe { slice::from_raw_parts_mut(data, n) }; - - foobar(data); -} diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs new file mode 100644 index 000000000000..e17599b12683 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -0,0 +1,104 @@ +use std::slice; +use std::f64::consts::PI; + +fn bitreversal_perm(data: &mut [T]) { + let len = data.len() / 2; + let mut j = 1; + + let mut i = 1; + while i < 2*len { + if j > i { + //dbg!(&i, &j); + data.swap(j-1, i-1); + data.swap(j, i); + } + + let mut m = len; + while m >= 2 && j > m { + j -= m; + m >>= 1; + } + + j += m; + i += 2; + } +} + +fn radix2(data: &mut [f64], i_sign: f64, n: usize) { + if n == 1 { + return; + } + + let (a,b) = data.split_at_mut(n); + radix2(a, i_sign, n/2); + radix2(b, i_sign, n/2); + + let wtemp = i_sign * (PI / n as f64).sin(); + let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wpr = -2.0 * wtemp * wtemp; + let mut wr = 1.0; + let mut wi = 0.0; + + let mut i = 0; + while i < n { + let in_n = i + n; + + let tempr = data[in_n] * wr - data[in_n + 1] * wi; + let tempi = data[in_n] * wi + data[in_n + 1] * wr; + + data[in_n] = data[i] - tempr; + data[in_n + 1] = data[i + 1] - tempi; + data[i] += tempr; + data[i + 1] += tempi; + + let wtemp_new = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp_new * wpi; + + i += 2; + } +} + +fn rescale(data: &mut [f64], scale: f64) { + let scale = 1. / scale; + for elm in data { + *elm *= scale; + } +} + +fn fft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, 1.0, data.len() / 2); +} + +fn ifft(data: &mut [f64]) { + bitreversal_perm(data); + radix2(data, -1.0, data.len() / 2); + rescale(data, data.len() as f64 / 2.); +} + +#[autodiff(dfoobar, Reverse, Duplicated)] +pub fn foobar(data: &mut [f64]) { + fft(data); + ifft(data); +} + +#[no_mangle] +pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { + + let (data, ddata) = unsafe { + ( + slice::from_raw_parts_mut(data, n), + slice::from_raw_parts_mut(ddata, n) + ) + }; + + dfoobar(data, ddata); +} + +#[no_mangle] +pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { + let data = unsafe { slice::from_raw_parts_mut(data, n) }; + + foobar(data); +} diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs new file mode 100644 index 000000000000..6c5d086ffdf1 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -0,0 +1,119 @@ +use std::f64::consts::PI; + +//static void scramble(double* data, unsigned N) { +// int j=1; +// for (int i=1; i<2*N; i+=2) { +// if (j>i) { +// swap(&data[j-1], &data[i-1]); +// swap(&data[j], &data[i]); +// } +// int m = N; +// while (m>=2 && j>m) { +// j -= m; +// m >>= 1; +// } +// j += m; +// } +//} +unsafe fn bitreversal_perm(data: *mut f64, n: usize) { + //let len = data.len() / 2; + let mut j = 1; + + for i in (1..2*n).step_by(2) { + //let mut i = 1; + //while i < 2*len { + if j > i { + std::ptr::swap(data.add(j-1), data.add(i-1)); + std::ptr::swap(data.add(j), data.add(i)); + //data.swap(j-1, i-1); + //data.swap(j, i); + } + + let mut m = n; + while m >= 2 && j > m { + j -= m; + m >>= 1; + } + + j += m; + //i += 2; + } +} + +unsafe fn radix2(data: *mut f64, i_sign: f64, n: usize) { + if n == 1 { + return; + } + + let b = data.add(n); + let a = data; + //let (a,b) = data.split_at_mut(n); + radix2(a, i_sign, n/2); + radix2(b, i_sign, n/2); + + let wtemp = i_sign * (PI / n as f64).sin(); + let wpi = -i_sign * (2.0 * PI / n as f64).sin(); + let wpr = -2.0 * wtemp * wtemp; + let mut wr = 1.0; + let mut wi = 0.0; + + let mut i = 0; + while i < n { + let in_n = i + n; + + let tempr = *data.add(in_n) * wr - *data.add(in_n + 1) * wi; + let tempi = *data.add(in_n) * wi + *data.add(in_n + 1) * wr; + + *data.add(in_n) = *data.add(i) - tempr; + *data.add(in_n + 1) = *data.add(i + 1) - tempi; + *data.add(i) += tempr; + *data.add(i + 1) += tempi; + + let wtemp_new = wr; + wr += wr * wpr - wi * wpi; + wi += wi * wpr + wtemp_new * wpi; + + i += 2; + } +} + +//static void rescale(double* data, unsigned N) { +// double scale = ((double)1)/N; +// for (unsigned i=0; i<2*N; i++) { +// data[i] *= scale; +// } +//} + +unsafe fn rescale(data: *mut f64, n: usize) { + let scale = 1. / n as f64; + for i in 0..2*n { + *data.add(i) = *data.add(i) * scale; + } +} + +unsafe fn fft(data: *mut f64, n: usize) { + bitreversal_perm(data, n); + radix2(data, 1.0, n); +} + +unsafe fn ifft(data: *mut f64, n: usize) { + bitreversal_perm(data, n); + radix2(data, -1.0, n); + rescale(data, n); +} + +#[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] +pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { + fft(data, n); + ifft(data, n); +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { + unsafe {unsafe_dfoobar(n, data, ddata); } +} + +#[no_mangle] +pub extern "C" fn rust_unsafe_foobar(n: usize, data: *mut f64) { + unsafe {unsafe_foobar(n, data); } +} From cad42213596cb5e3a6b22a0ab9b907197691663f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 17:31:27 -0400 Subject: [PATCH 27/56] imprv safe rus tto work like c++ --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 6 +++--- enzyme/benchmarks/ReverseMode/fft/src/main.rs | 18 +++++++++++++----- enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 13 ++++++------- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 4 ++-- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 3c566c33b31a..6d16839a56ff 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -45,7 +45,7 @@ static double rust_unsafe_foobar_and_gradient(unsigned len) { double *dinp = new double[2 * len]; for (int i = 0; i < 2 * len; i++) dinp[i] = 1.0; - rust_unsafe_dfoobar(len * 2, inp, dinp); + rust_unsafe_dfoobar(len, inp, dinp); double res = dinp[0]; delete[] dinp; delete[] inp; @@ -57,7 +57,7 @@ static double rust_foobar_and_gradient(unsigned len) { for(int i=0; i<2*len; i++) inp[i] = 2.0; double *dinp = new double[2*len]; for(int i=0; i<2*len; i++) dinp[i] = 1.0; - rust_dfoobar(len*2, inp, dinp); + rust_dfoobar(len, inp, dinp); double res = dinp[0]; delete[] dinp; delete[] inp; @@ -351,6 +351,6 @@ int main(int argc, char** argv) { tapenade_sincos(inp, iters); enzyme_sincos(inp, iters); enzyme_rust_sincos(inp, iters); - // enzyme_unsafe_rust_sincos(inp, iters); + enzyme_unsafe_rust_sincos(inp, iters); } } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/main.rs b/enzyme/benchmarks/ReverseMode/fft/src/main.rs index f2a857806eb2..5f76ad96243e 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/main.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/main.rs @@ -1,14 +1,22 @@ -use fft::dfoobar; +use core::mem; +use fft::safe;//::dfoobar; +use fft::unsf;//::dfoobar; fn main() { - let mut data = vec![1.0; 32]; - for i in 0..16 { + let len = 16; + let mut data = vec![1.0; 2*len]; + for i in 0..len { data[i] = 2.0; } - let mut data_d = vec![1.0; data.len()]; + let mut data_d = vec![1.0; 2*len]; - dfoobar(&mut data, &mut data_d); + //unsafe {safe::rust_dfoobar(len, data.as_mut_ptr(), data_d.as_mut_ptr());} + //unsafe {safe::rust_foobar(len, data.as_mut_ptr());} + unsafe {unsf::unsafe_dfoobar(len, data.as_mut_ptr(), data_d.as_mut_ptr());} + unsafe {unsf::unsafe_foobar(len, data.as_mut_ptr());} dbg!(&data_d); dbg!(&data); + //mem::forget(data); + //mem::forget(data_d); } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index e17599b12683..7332dcb91356 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -59,8 +59,8 @@ fn radix2(data: &mut [f64], i_sign: f64, n: usize) { } } -fn rescale(data: &mut [f64], scale: f64) { - let scale = 1. / scale; +fn rescale(data: &mut [f64], scale: usize) { + let scale = 1. / scale as f64; for elm in data { *elm *= scale; } @@ -74,7 +74,7 @@ fn fft(data: &mut [f64]) { fn ifft(data: &mut [f64]) { bitreversal_perm(data); radix2(data, -1.0, data.len() / 2); - rescale(data, data.len() as f64 / 2.); + rescale(data, data.len() / 2); } #[autodiff(dfoobar, Reverse, Duplicated)] @@ -88,8 +88,8 @@ pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { let (data, ddata) = unsafe { ( - slice::from_raw_parts_mut(data, n), - slice::from_raw_parts_mut(ddata, n) + slice::from_raw_parts_mut(data, n * 2), + slice::from_raw_parts_mut(ddata, n * 2) ) }; @@ -98,7 +98,6 @@ pub extern "C" fn rust_dfoobar(n: usize, data: *mut f64, ddata: *mut f64) { #[no_mangle] pub extern "C" fn rust_foobar(n: usize, data: *mut f64) { - let data = unsafe { slice::from_raw_parts_mut(data, n) }; - + let data = unsafe { slice::from_raw_parts_mut(data, n * 2) }; foobar(data); } diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index 6c5d086ffdf1..653e495bb5f1 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -104,8 +104,8 @@ unsafe fn ifft(data: *mut f64, n: usize) { #[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { - fft(data, n); - ifft(data, n); + fft(data, n / 2); + ifft(data, n / 2); } #[no_mangle] From f0bf16b64e063d94c97270fd1a8a6103fd6a8a81 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 17:52:28 -0400 Subject: [PATCH 28/56] unsafe version not crashing --- enzyme/benchmarks/ReverseMode/fft/src/unsf.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs index 653e495bb5f1..5391d035095f 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/unsf.rs @@ -15,11 +15,11 @@ use std::f64::consts::PI; // j += m; // } //} -unsafe fn bitreversal_perm(data: *mut f64, n: usize) { +unsafe fn bitreversal_perm(data: *mut f64, len: usize) { //let len = data.len() / 2; let mut j = 1; - for i in (1..2*n).step_by(2) { + for i in (1..2*len).step_by(2) { //let mut i = 1; //while i < 2*len { if j > i { @@ -29,7 +29,7 @@ unsafe fn bitreversal_perm(data: *mut f64, n: usize) { //data.swap(j, i); } - let mut m = n; + let mut m = len; while m >= 2 && j > m { j -= m; m >>= 1; @@ -104,8 +104,8 @@ unsafe fn ifft(data: *mut f64, n: usize) { #[autodiff(unsafe_dfoobar, Reverse, Const, Duplicated)] pub unsafe fn unsafe_foobar(n: usize, data: *mut f64) { - fft(data, n / 2); - ifft(data, n / 2); + fft(data, n ); + ifft(data, n ); } #[no_mangle] From 12e9a4a826091b4f228284361aa30d323bde63a0 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 18:01:45 -0400 Subject: [PATCH 29/56] fix lstm makefile --- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index f3cdb818b742..51305ac7db2d 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -11,7 +11,7 @@ $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.to cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm + clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll @echo $(LOAD) @@ -22,7 +22,7 @@ $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.to #opt $^ -O2 -o $@ -S lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a - clang++ --gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/11 -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a + clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a results.json: lstm.o ./$^ From 688721dfd3d8fbacde48e951b3c6eef6225e942d Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 19:35:44 -0400 Subject: [PATCH 30/56] adding unsafe rust lstm version --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 118 +++++++------ .../benchmarks/ReverseMode/lstm/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 151 ++-------------- .../benchmarks/ReverseMode/lstm/src/safe.rs | 167 ++++++++++++++++++ .../benchmarks/ReverseMode/lstm/src/unsf.rs | 114 ++++++++++++ 5 files changed, 362 insertions(+), 190 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index bd765ad1dbd1..03107c62ec41 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -34,51 +34,33 @@ struct LSTMOutput { }; extern "C" { - void rust_dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss - ); - - void dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss - ); - - void lstm_objective_b(int l, int c, int b, const double *main_params, double * - main_paramsb, const double *extra_params, double *extra_paramsb, - double *state, const double *sequence, double *loss, double *lossb); - - void adept_dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss - ); +void rust_unsafe_dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, + double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); + +void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); + +void dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, double *dloss); + +void lstm_objective_b(int l, int c, int b, const double *main_params, + double *main_paramsb, const double *extra_params, + double *extra_paramsb, double *state, + const double *sequence, double *loss, double *lossb); + +void adept_dlstm_objective(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, double *dloss); } void read_lstm_instance(const string& fn, @@ -322,14 +304,14 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_jacobian(input, result); + calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme (Rust) combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme (safe Rust) combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme (Rust) combined"; - enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { + enzyme["name"] = "Enzyme (safe Rust) combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { printf("%f ", result.gradient[i]); enzyme["result"].push_back(result.gradient[i]); } @@ -340,6 +322,40 @@ int main(const int argc, const char* argv[]) { } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (unsafe Rust) combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; test_suite["batch-size"] = 1; diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 51305ac7db2d..23ba9a51ceff 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.json $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - cargo +enzyme rustc --release --lib --crate-type=staticlib + ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs index aba88ac76617..b6b0e3e33225 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -1,145 +1,16 @@ #![feature(autodiff)] +pub (crate) mod unsf; +pub (crate) mod safe; use std::slice; -// Sigmoid on scalar -fn sigmoid(x: f64) -> f64 { - 1.0 / (1.0 + (-x).exp()) -} - -// log(sum(exp(x), 2)) -fn logsumexp(vect: &[f64]) -> f64 { - let mut sum = 0.0; - for &val in vect { - sum += val.exp(); - } - sum += 2.0; // Adding 2 to sum - sum.ln() -} - -// LSTM OBJECTIVE -// The LSTM model -fn lstm_model( - hsize: usize, - weight: &[f64], - bias: &[f64], - hidden: &mut [f64], - cell: &mut [f64], - input: &[f64], -) { - let mut gates = vec![0.0; 4 * hsize]; - let (a,b) = gates.split_at_mut(2*hsize); - let ((forget, ingate), (outgate, change)) = ( - a.split_at_mut(hsize), b.split_at_mut(hsize)); - - // caching input - for i in 0..hsize { - forget[i] = sigmoid(input[i] * weight[i] + bias[i]); - ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); - outgate[i] = sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); - change[i] = (hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]).tanh(); - } - - // caching cell - for i in 0..hsize { - cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; - } - - for i in 0..hsize { - hidden[i] = outgate[i] * cell[i].tanh(); - } -} - -// Predict LSTM output given an input -fn lstm_predict( - l: usize, - b: usize, - w: &[f64], - w2: &[f64], - s: &mut [f64], - x: &[f64], - x2: &mut [f64], -) { - for i in 0..b { - x2[i] = x[i] * w2[i]; - } - - let mut i = 0; - while i <= 2*l*b - 1 { - // make borrow-checker happy with non-overlapping mutable references - let (xp, s1, s2) = if i == 0 { - let (s1, s2) = s.split_at_mut(b); - (x2.as_mut(), s1, s2) - } else { - let tmp = &mut s[i-2*b..]; - let (a, d) = tmp.split_at_mut(2*b); - let (d, c) = d.split_at_mut(b); - - (a,d,c) - }; - - lstm_model( - b, - &w[i * 4..], - &w[(i + b) * 4..], - s1, - s2, - xp, - ); - i += 2 * b; - } - - let xp = &s[i-2*b..]; - - for i in 0..b { - x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; - } -} - -// LSTM objective (loss function) -#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] -pub fn lstm_objective( - l: usize, - c: usize, - b: usize, - main_params: &[f64], - extra_params: &[f64], - state: &mut [f64], - sequence: &[f64], - loss: &mut f64, -) { - let mut total = 0.0; - let mut count = 0; - - let mut input = &sequence[..b]; - let mut ypred = vec![0.0; b]; - let mut ynorm = vec![0.0; b]; - let mut lse; - - assert!(b > 0); - - for t in (0..=(c - 1) * b - 1).step_by(b) { - lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); - lse = logsumexp(&ypred); - for i in 0..b { - ynorm[i] = ypred[i] - lse; - } - - let ygold = &sequence[t + b..]; - for i in 0..b { - total += ygold[i] * ynorm[i]; - } - - count += b; - input = ygold; - } - - *loss = -total / count as f64; +#[no_mangle] +pub extern "C" fn rust_unsafe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + unsafe {unsf::lstm_unsafe_objective(l,c,b,main_params,extra_params,state,sequence, loss);} } - #[no_mangle] -pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn rust_safe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { let (main_params, extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts(extra_params, 3*b), @@ -148,12 +19,16 @@ pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: )}; unsafe { - lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + safe::lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); } } #[no_mangle] -pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_unsafe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + unsafe {unsf::d_lstm_unsafe_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, res, d_res);} +} +#[no_mangle] +pub extern "C" fn rust_safe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts_mut(d_main_params, 2*l*4*b), @@ -164,6 +39,6 @@ pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params )}; unsafe { - d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + safe::d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); } } diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs new file mode 100644 index 000000000000..8734998acfb7 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -0,0 +1,167 @@ +use std::slice; + +// Sigmoid on scalar +fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x).exp()) +} + +// log(sum(exp(x), 2)) +fn logsumexp(vect: &[f64]) -> f64 { + let mut sum = 0.0; + for &val in vect { + sum += val.exp(); + } + sum += 2.0; // Adding 2 to sum + sum.ln() +} + +// LSTM OBJECTIVE +// The LSTM model +fn lstm_model( + hsize: usize, + weight: &[f64], + bias: &[f64], + hidden: &mut [f64], + cell: &mut [f64], + input: &[f64], +) { + let mut gates = vec![0.0; 4 * hsize]; + let (a,b) = gates.split_at_mut(2*hsize); + let ((forget, ingate), (outgate, change)) = ( + a.split_at_mut(hsize), b.split_at_mut(hsize)); + + // caching input + for i in 0..hsize { + forget[i] = sigmoid(input[i] * weight[i] + bias[i]); + ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); + outgate[i] = sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); + change[i] = (hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]).tanh(); + } + + // caching cell + for i in 0..hsize { + cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; + } + + for i in 0..hsize { + hidden[i] = outgate[i] * cell[i].tanh(); + } +} + +// Predict LSTM output given an input +fn lstm_predict( + l: usize, + b: usize, + w: &[f64], + w2: &[f64], + s: &mut [f64], + x: &[f64], + x2: &mut [f64], +) { + for i in 0..b { + x2[i] = x[i] * w2[i]; + } + + let mut i = 0; + while i <= 2*l*b - 1 { + // make borrow-checker happy with non-overlapping mutable references + let (xp, s1, s2) = if i == 0 { + let (s1, s2) = s.split_at_mut(b); + (x2.as_mut(), s1, s2) + } else { + let tmp = &mut s[i-2*b..]; + let (a, d) = tmp.split_at_mut(2*b); + let (d, c) = d.split_at_mut(b); + + (a,d,c) + }; + + lstm_model( + b, + &w[i * 4..], + &w[(i + b) * 4..], + s1, + s2, + xp, + ); + + i += 2 * b; + } + + let xp = &s[i-2*b..]; + + for i in 0..b { + x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; + } +} + +// LSTM objective (loss function) +#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +pub (crate) fn lstm_objective( + l: usize, + c: usize, + b: usize, + main_params: &[f64], + extra_params: &[f64], + state: &mut [f64], + sequence: &[f64], + loss: &mut f64, +) { + let mut total = 0.0; + let mut count = 0; + + let mut input = &sequence[..b]; + let mut ypred = vec![0.0; b]; + let mut ynorm = vec![0.0; b]; + let mut lse; + + assert!(b > 0); + + for t in (0..=(c - 1) * b - 1).step_by(b) { + lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); + lse = logsumexp(&ypred); + for i in 0..b { + ynorm[i] = ypred[i] - lse; + } + + let ygold = &sequence[t + b..]; + for i in 0..b { + total += ygold[i] * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count as f64; +} + +#[no_mangle] +pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let (main_params, extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + } +} + +#[no_mangle] +pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( + slice::from_raw_parts(main_params, 2*l*4*b), + slice::from_raw_parts_mut(d_main_params, 2*l*4*b), + slice::from_raw_parts(extra_params, 3*b), + slice::from_raw_parts_mut(d_extra_params, 3*b), + slice::from_raw_parts_mut(state, 2*l*b), + slice::from_raw_parts(sequence, c*b) + )}; + + unsafe { + d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + } +} diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs new file mode 100644 index 000000000000..3758c8e1e97a --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/src/unsf.rs @@ -0,0 +1,114 @@ +// Sigmoid on scalar +fn sigmoid(x: f64) -> f64 { + 1.0 / (1.0 + (-x).exp()) +} + +// log(sum(exp(x), 2)) +unsafe fn logsumexp(vect: *const f64, sz: usize) -> f64 { + let mut sum: f64 = 0.0; + for i in 0..sz { + sum += (*vect.add(i)).exp(); + } + sum += 2.0; // Adding 2 to sum + sum.ln() +} + +// LSTM OBJECTIVE +// The LSTM model +unsafe fn lstm_model( + hsize: usize, + weight: *const f64, + bias: *const f64, + hidden: *mut f64, + cell: *mut f64, + input: *const f64, +) { +// // TODO NOTE THIS +// //__builtin_assume(hsize > 0); + let mut gates = vec![0.0; 4 * hsize]; + let forget: *mut f64 = gates.as_mut_ptr(); + let ingate: *mut f64 = gates[hsize..].as_mut_ptr(); + let outgate: *mut f64 = gates[2 * hsize..].as_mut_ptr(); + let change: *mut f64 = gates[3 * hsize..].as_mut_ptr(); + //let (a,b) = gates.split_at_mut(2*hsize); + //let ((forget, ingate), (outgate, change)) = ( + // a.split_at_mut(hsize), b.split_at_mut(hsize)); + + // caching input + for i in 0..hsize { + *forget.add(i) = sigmoid(*input.add(i) * *weight.add(i) + *bias.add(i)); + *ingate.add(i) = sigmoid(*hidden.add(i) * *weight.add(hsize + i) + *bias.add(hsize + i)); + *outgate.add(i) = sigmoid(*input.add(i) * *weight.add(2 * hsize + i) + *bias.add(2 * hsize + i)); + *change.add(i) = (*hidden.add(i) * *weight.add(3 * hsize + i) + *bias.add(3 * hsize + i)).tanh(); + } + + // caching cell + for i in 0..hsize { + *cell.add(i) = *cell.add(i) * *forget.add(i) + *ingate.add(i) * *change.add(i); + } + + for i in 0..hsize { + *hidden.add(i) = *outgate.add(i) * (*cell.add(i)).tanh(); + } +} + +// Predict LSTM output given an input +unsafe fn lstm_predict( + l: usize, + b: usize, + w: *const f64, + w2: *const f64, + s: *mut f64, + x: *const f64, + x2: *mut f64, +) { + for i in 0..b { + *x2.add(i) = *x.add(i) * *w2.add(i); + } + + let mut xp = x2; + let stop = 2 * l * b; + for i in (0..=stop - 1).step_by(2 * b) { + lstm_model(b, w.add(i * 4), w.add((i + b) * 4), s.add(i), s.add(i + b), xp); + xp = s.add(i); + } + + for i in 0..b { + *x2.add(i) = *xp.add(i) * *w2.add(b + i) + *w2.add(2 * b + i); + } +} + +// LSTM objective (loss function) +#[autodiff(d_lstm_unsafe_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] +pub (crate) unsafe fn lstm_unsafe_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let mut total = 0.0; + let mut count = 0; + + //const double* input = &(sequence[0]); + let mut input = sequence; + let mut ypred = vec![0.0; b]; + let mut ynorm = vec![0.0; b]; + let mut lse; + + assert!(b > 0); + + let stop = (c - 1) * b; + for t in (0..=stop - 1).step_by(b) { + lstm_predict(l, b, main_params, extra_params, state, input, ypred.as_mut_ptr()); + lse = logsumexp(ypred.as_mut_ptr(), b); + for i in 0..b { + ynorm[i] = ypred[i] - lse; + } + + //let ygold = &sequence[t + b..]; + let ygold = sequence.add(t + b); + for i in 0..b { + total += *ygold.add(i) * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count as f64; +} From 7ca8092e0ebbd0e1f433c736befcd6e4955afc9b Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 19:36:27 -0400 Subject: [PATCH 31/56] run full fft tests --- enzyme/benchmarks/ReverseMode/fft/fft.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.cpp b/enzyme/benchmarks/ReverseMode/fft/fft.cpp index 6d16839a56ff..b6c5fb7b5eaa 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.cpp +++ b/enzyme/benchmarks/ReverseMode/fft/fft.cpp @@ -342,7 +342,6 @@ int main(int argc, char** argv) { printf("usage %s n [must be power of 2]\n", argv[0]); return 1; } - N = 2; double inp = -2.1; for(unsigned iters=max(1, N>>5); iters <= N; iters*=2) { From 73f807567c232d50c44c4b0d7a14f86112456aa5 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 4 Apr 2024 20:00:38 -0400 Subject: [PATCH 32/56] Delete enzyme/benchmarks/ReverseMode/lstm/src/main.rs --- enzyme/benchmarks/ReverseMode/lstm/src/main.rs | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 enzyme/benchmarks/ReverseMode/lstm/src/main.rs diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/main.rs b/enzyme/benchmarks/ReverseMode/lstm/src/main.rs deleted file mode 100644 index e7a11a969c03..000000000000 --- a/enzyme/benchmarks/ReverseMode/lstm/src/main.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("Hello, world!"); -} From ba3aa5d7b32ab9af3159083b131d69e062296772 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sat, 6 Apr 2024 04:13:22 -0400 Subject: [PATCH 33/56] cleanup and correctness --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 96 +++++++++++++++++++ enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 8 +- enzyme/benchmarks/ReverseMode/lstm/src/lib.rs | 20 +++- 3 files changed, 113 insertions(+), 11 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 03107c62ec41..7472bf37beb2 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -41,6 +41,14 @@ void rust_unsafe_dlstm_objective(int l, int c, int b, double const *main_params, double const *sequence, double *loss, double *dloss); +void rust_unsafe_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + +void rust_safe_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, @@ -173,6 +181,28 @@ void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) } } +double calculate_unsafe_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + rust_unsafe_lstm_objective( + input.l, input.c, input.b, input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + +double calculate_safe_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + rust_safe_lstm_objective(input.l, input.c, input.b, + input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + int main(const int argc, const char* argv[]) { printf("starting main\n"); @@ -355,6 +385,72 @@ int main(const int argc, const char* argv[]) { printf("\n"); } } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_unsafe_primal(input); + gettimeofday(&end, NULL); + printf("Enzyme (unsafe Rust) primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (unsafe Rust) primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_safe_primal(input); + gettimeofday(&end, NULL); + printf("Enzyme (safe Rust) primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme (safe Rust) primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } test_suite["llvm-version"] = __clang_version__; test_suite["mode"] = "ReverseMode"; diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index 5f954347f1d7..e56b9d5609e5 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -1,4 +1,3 @@ -//#![feature(autodiff)] use std::f64::consts::PI; use crate::Wishart; @@ -55,17 +54,12 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, unsafe { *err = my_err }; } -//#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Duplicated)] -//pub fn gmm_objective_c(d: usize, k: usize, n: usize, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { -// gmm_objective(d, k, n, alphas, means, icf, x, wishart, &mut my_err); -//} - #[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { let wishart: Wishart = Wishart { gamma, m }; - //let wishart: Wishart = unsafe { *wishart }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; + let mut qdiags = vec![0.; d * k]; let mut sum_qs = vec![0.; k]; let mut xcentered = vec![0.; d]; diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs index b6b0e3e33225..937460f3cee3 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/lib.rs @@ -6,11 +6,17 @@ use std::slice; #[no_mangle] -pub extern "C" fn rust_unsafe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn rust_unsafe_lstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; unsafe {unsf::lstm_unsafe_objective(l,c,b,main_params,extra_params,state,sequence, loss);} } #[no_mangle] -pub extern "C" fn rust_safe_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { +pub extern "C" fn rust_safe_lstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; let (main_params, extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts(extra_params, 3*b), @@ -24,11 +30,17 @@ pub extern "C" fn rust_safe_lstm_objective(l: usize, c: usize, b: usize, main_pa } #[no_mangle] -pub extern "C" fn rust_unsafe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_unsafe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; unsafe {unsf::d_lstm_unsafe_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, res, d_res);} } #[no_mangle] -pub extern "C" fn rust_safe_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { +pub extern "C" fn rust_safe_dlstm_objective(l: i32, c: i32, b: i32, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { + let l = l as usize; + let c = c as usize; + let b = b as usize; let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( slice::from_raw_parts(main_params, 2*l*4*b), slice::from_raw_parts_mut(d_main_params, 2*l*4*b), From 273773439e0578b26fbf985a4d5d343df0449fb2 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 6 May 2024 03:52:02 -0400 Subject: [PATCH 34/56] initial (compiling) rust ode version --- .../ReverseMode/ode-real/ode/Cargo.lock | 7 + .../ReverseMode/ode-real/ode/Cargo.toml | 21 ++ .../ReverseMode/ode-real/ode/src/lib.rs | 200 ++++++++++++++++++ 3 files changed, 228 insertions(+) create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml create mode 100644 enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock new file mode 100644 index 000000000000..93dcf6a53b60 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ode" +version = "0.1.0" diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml new file mode 100644 index 000000000000..3013b597df4e --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "ode" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +crate-type = ["lib"] + + +[profile.release] +lto = "fat" +opt-level = 3 +#debug = true +#strip = "none" + +[profile.dev] +lto = "fat" + +[dependencies] diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs new file mode 100644 index 000000000000..15a4dc606cde --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs @@ -0,0 +1,200 @@ +#![feature(autodiff)] +#![feature(iter_next_chunk)] +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] +#![allow(non_upper_case_globals)] + +//#define BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS +//#define BOOST_NO_EXCEPTIONS + +const N: usize = 32; +const xmin: f64 = 0.; +const xmax: f64 = 1.; +const ymin: f64 = 0.; +const ymax: f64 = 1.; + +#[inline(always)] +fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { + (max - min) / (N_var as f64 - 1.) * i as f64 + min +} +#[inline(always)] +fn get(x: &[f64], i: usize, j: usize) -> f64 { + assert!(i > 0); + assert!(j < N); + x[N * i + j] +} + +//#define RANGE(min, max, i, N) ((max-min)/(N-1)*i + min) +//#define GETnb(x, i, j) (x)[N*i+j] +//#define GET(x, i, j) GETnb(x, i, j) +// #define GET(x, i, j) ({ assert(i >=0); assert( j>=0); assert(j f64 { + let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; + let eq2 = t >= 1.1; + if eq1 && eq2 { + 5.0 + } else { + 0.0 + } +} + +fn init_brusselator(u: &mut [f64], v: &mut [f64]) { + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + u[N * i + j] = 22.0 * y * (1.0 - y) * (y * (1.0 - y)).sqrt(); + v[N * i + j] = 27.0 * x * (1.0 - x) * (x * (1.0 - x)).sqrt(); + } + } +} +// __enzyme_autodiff(brusselator_2d_loop, +// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), +// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, +// enzyme_dup, x.data(), dx.data(), +// enzyme_dup, x.data() + N * N, dx.data() + N * N, +// enzyme_dup, p, dp, +// enzyme_const, t); + + +#[autodiff(dbrusselator_2d_loop, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] +fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p: &[f64;3], t: f64) { + let A = p[0]; + let B = p[1]; + let alpha = p[2]; + let dx = 1. / (N - 1) as f64; + let alpha = alpha / (dx * dx); + for i in 0..N { + for j in 0..N { + let x = range(xmin, xmax, i, N); + let y = range(ymin, ymax, j, N); + let ip1 = if i == N - 1 { i } else { i + 1 }; + let im1 = if i == 0 { i } else { i - 1 }; + let jp1 = if j == N - 1 { j } else { j + 1 }; + let jm1 = if j == 0 { j } else { j - 1 }; + let u2v = u[N * i + j] * u[N * i + j] * v[N * i + j]; + d_u[N * i + j] = alpha * (u[N * im1 + j] + u[N * ip1 + j] + u[N * i + jp1] + u[N * i + jm1] - 4. * u[N * i + j]) + + B + u2v - (A + 1.) * u[N * i + j] + brusselator_f(x, y, t); + d_v[N * i + j] = alpha * (v[N * im1 + j] + v[N * ip1 + j] + v[N * i + jp1] + v[N * i + jm1] - 4. * v[N * i + j]) + + A * u[N * i + j] - u2v; + } + } +} + +//__attribute__((noinline)) +//void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const double* __restrict u, const double* __restrict v, const double* __restrict p, double t) { +// double A = p[0]; +// double B = p[1]; +// double alpha = p[2]; +// double dx = (double)1/(N-1); +// +// alpha = alpha/(dx*dx); +// +// for(int i=0; i f64 { + let x = unsafe { *x }; + let mut adjoint = unsafe { *adjoint }; + let p: [f64;3] = unsafe { *p.cast::<[f64;3]>().as_ref().unwrap() }; + let mut dp = [0.; 3]; + let mut dx1 = [0.; N * N]; + let mut dx2 = [0.; N * N]; + let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); + + let (tmp1, tmp2) = x.split_at(N * N); + let x1: [f64; N * N] = tmp1.try_into().unwrap(); + let x2: [f64; N * N] = tmp2.try_into().unwrap(); + + let mut null1 = [0.; 2 * N * N]; + let mut null2 = [0.; 2 * N * N]; + dbrusselator_2d_loop(&mut null1, &mut dadj1, + &mut null2, &mut dadj2, + &x1, &mut dx1, + &x2, &mut dx2, + &p, &mut dp, t); + dx1[0] +} + + +fn foobar(p: &[f64;3], x: state_type, mut adjoint: state_type, t: f64) -> f64 { + let mut dp = [0.; 3]; + let mut dx1 = [0.; N * N]; + let mut dx2 = [0.; N * N]; + let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); + let mut null1 = [0.; 2 * N * N]; + let mut null2 = [0.; 2 * N * N]; + let (tmp1, tmp2) = x.split_at(N * N); + let x1: [f64; N * N] = tmp1.try_into().unwrap(); + let x2: [f64; N * N] = tmp2.try_into().unwrap(); + dbrusselator_2d_loop(&mut null1, &mut dadj1, + &mut null2, &mut dadj2, + &x1, &mut dx1, + &x2, &mut dx2, + &p, &mut dp, t); + dx1[0] +} + +//double foobar(const double* p, const state_type x, const state_type adjoint, double t) { +// double dp[3] = { 0. }; +// +// state_type dx = { 0. }; +// +// state_type dadjoint_inp = adjoint; +// +// state_type dxdu; +// +// __enzyme_autodiff(brusselator_2d_loop, +// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), +// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, +// enzyme_dup, x.data(), dx.data(), +// enzyme_dup, x.data() + N * N, dx.data() + N * N, +// enzyme_dup, p, dp, +// enzyme_const, t); +// +// return dx[0]; +//} + +fn main() { + let p = [3.4, 1., 10.]; + let mut x = [0.; 2 * N * N]; + let mut adjoint = [0.; 2 * N * N]; + init_brusselator(&mut x, &mut adjoint); + let t = 2.1; + let mut res = 0.; + let time = std::time::Instant::now(); + for _ in 0..10000 { + res = foobar(&p, x, adjoint, t); + } + println!("Enzyme combined {} res={}", time.elapsed().as_secs_f64(), res); +} From 1ffbaaa618999c31521bfa4cec805df19a40be4f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 6 May 2024 04:09:05 -0400 Subject: [PATCH 35/56] cleanups --- .../ReverseMode/ode-real/ode/src/lib.rs | 49 ++++--------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs index 15a4dc606cde..83c6d0586790 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs @@ -1,4 +1,6 @@ #![feature(autodiff)] +#![feature(slice_first_last_chunk)] +#![feature(slice_as_chunks)] #![feature(iter_next_chunk)] #![allow(non_snake_case)] #![allow(non_camel_case_types)] @@ -83,35 +85,6 @@ fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p } } -//__attribute__((noinline)) -//void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const double* __restrict u, const double* __restrict v, const double* __restrict p, double t) { -// double A = p[0]; -// double B = p[1]; -// double alpha = p[2]; -// double dx = (double)1/(N-1); -// -// alpha = alpha/(dx*dx); -// -// for(int i=0; i f64 { let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); let mut null1 = [0.; 2 * N * N]; let mut null2 = [0.; 2 * N * N]; - let (tmp1, tmp2) = x.split_at(N * N); - let x1: [f64; N * N] = tmp1.try_into().unwrap(); - let x2: [f64; N * N] = tmp2.try_into().unwrap(); + // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 + let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; dbrusselator_2d_loop(&mut null1, &mut dadj1, &mut null2, &mut dadj2, - &x1, &mut dx1, - &x2, &mut dx2, + x1, &mut dx1, + x2, &mut dx2, &p, &mut dp, t); dx1[0] } From 8bd316aa04fcfd4c77b38f2b57cccb8141d4b87e Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 22 May 2024 03:19:42 -0400 Subject: [PATCH 36/56] fix ba bench --- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 768f3fec8e38..72a9aece7737 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -32,7 +32,10 @@ fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; let costheta = theta.cos(); let sintheta = theta.sin(); let theta_inverse = 1. / theta; - let w = rot.map(|v| v * theta_inverse); + let mut w = [0.; 3]; + for i in 0..3 { + w[i] = rot[i] * theta_inverse; + } let w_cross_pt = cross(&w, &pt); let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); for i in 0..3 { From c769dacb20fa89fc23220f10123f7f5911d0d51b Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Wed, 22 May 2024 23:16:23 -0600 Subject: [PATCH 37/56] bench gmm: move allocation of scratch space outside AD'd function --- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 181 +++++++++++++++--- 1 file changed, 155 insertions(+), 26 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index e56b9d5609e5..ec847941a58f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -1,5 +1,5 @@ -use std::f64::consts::PI; use crate::Wishart; +use std::f64::consts::PI; #[cfg(feature = "libm")] use libm::lgamma; @@ -17,7 +17,21 @@ fn lgamma(x: f64) -> f64 { } #[no_mangle] -pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, dalphas: *mut f64, means: *const f64, dmeans: *mut f64, icf: *const f64, dicf: *mut f64, x: *const f64, wishart: *const Wishart, err: *mut f64, derr: *mut f64) { +pub extern "C" fn rust_dgmm_objective( + d: i32, + k: i32, + n: i32, + alphas: *const f64, + dalphas: *mut f64, + means: *const f64, + dmeans: *mut f64, + icf: *const f64, + dicf: *mut f64, + x: *const f64, + wishart: *const Wishart, + err: *mut f64, + derr: *mut f64, +) { let k = k as usize; let n = n as usize; let d = d as usize; @@ -32,15 +46,47 @@ pub extern "C" fn rust_dgmm_objective(d: i32, k: i32, n: i32, alphas: *const f64 let d_means = unsafe { std::slice::from_raw_parts_mut(dmeans, k * d) }; let d_icf = unsafe { std::slice::from_raw_parts_mut(dicf, k * d * (d + 1) / 2) }; let mut my_derr = unsafe { *derr }; + let (mut qdiags, mut sum_qs, mut xcentered, mut qxcentered, mut main_term) = + get_workspace(d, k); - dgmm_objective(d, k, n, alphas, d_alphas, means, d_means, icf, d_icf, x, wishart.gamma, wishart.m, &mut my_err, &mut my_derr); + dgmm_objective( + d, + k, + n, + alphas, + d_alphas, + means, + d_means, + icf, + d_icf, + x, + wishart.gamma, + wishart.m, + &mut my_err, + &mut my_derr, + &mut qdiags, + &mut sum_qs, + &mut xcentered, + &mut qxcentered, + &mut main_term, + ); unsafe { *err = my_err }; unsafe { *derr = my_derr }; } #[no_mangle] -pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, means: *const f64, icf: *const f64, x: *const f64, wishart: *const Wishart, err: *mut f64) { +pub extern "C" fn rust_gmm_objective( + d: i32, + k: i32, + n: i32, + alphas: *const f64, + means: *const f64, + icf: *const f64, + x: *const f64, + wishart: *const Wishart, + err: *mut f64, +) { let k = k as usize; let n = n as usize; let d = d as usize; @@ -50,30 +96,97 @@ pub extern "C" fn rust_gmm_objective(d: i32, k: i32, n: i32, alphas: *const f64, let x = unsafe { std::slice::from_raw_parts(x, n * d) }; let wishart: Wishart = unsafe { *wishart }; let mut my_err = unsafe { *err }; - gmm_objective(d, k, n, alphas, means, icf, x, wishart.gamma, wishart.m, &mut my_err); + let (mut qdiags, mut sum_qs, mut xcentered, mut qxcentered, mut main_term) = + get_workspace(d, k); + gmm_objective( + d, + k, + n, + alphas, + means, + icf, + x, + wishart.gamma, + wishart.m, + &mut my_err, + &mut qdiags, + &mut sum_qs, + &mut xcentered, + &mut qxcentered, + &mut main_term, + ); unsafe { *err = my_err }; } -#[autodiff(dgmm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Duplicated, Const, Const, Const, Duplicated)] -pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64], icf: &[f64], x: &[f64], gamma: f64, m: i32, err: &mut f64) { +fn get_workspace(d: usize, k: usize) -> (Vec, Vec, Vec, Vec, Vec) { + let qdiags = vec![0.; d * k]; + let sum_qs = vec![0.; k]; + let xcentered = vec![0.; d]; + let qxcentered = vec![0.; d]; + let main_term = vec![0.; k]; + (qdiags, sum_qs, xcentered, qxcentered, main_term) +} + +#[autodiff( + dgmm_objective, + Reverse, + Const, + Const, + Const, + Duplicated, + Duplicated, + Duplicated, + Const, + Const, + Const, + Duplicated, + Const, + Const, + Const, + Const, + Const +)] +pub fn gmm_objective( + d: usize, + k: usize, + n: usize, + alphas: &[f64], + means: &[f64], + icf: &[f64], + x: &[f64], + gamma: f64, + m: i32, + err: &mut f64, + qdiags: &mut [f64], + sum_qs: &mut [f64], + xcentered: &mut [f64], + qxcentered: &mut [f64], + main_term: &mut [f64], +) { let wishart: Wishart = Wishart { gamma, m }; let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; - let mut qdiags = vec![0.; d * k]; - let mut sum_qs = vec![0.; k]; - let mut xcentered = vec![0.; d]; - let mut qxcentered = vec![0.; d]; - let mut main_term = vec![0.; k]; - - preprocess_qs(d, k, icf, &mut sum_qs, &mut qdiags); + preprocess_qs(d, k, icf, sum_qs, qdiags); let mut slse = 0.; for ix in 0..n { for ik in 0..k { - subtract(d, &x[ix as usize * d as usize..], &means[ik as usize * d as usize..], &mut xcentered); - qtimesx(d, &qdiags[ik as usize * d as usize..], &icf[ik as usize * icf_sz as usize + d as usize..], &xcentered, &mut qxcentered); - main_term[ik as usize] = alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&qxcentered); + subtract( + d, + &x[ix as usize * d as usize..], + &means[ik as usize * d as usize..], + xcentered, + ); + qtimesx( + d, + &qdiags[ik as usize * d as usize..], + &icf[ik as usize * icf_sz as usize + d as usize..], + &*xcentered, + qxcentered, + ); + main_term[ik as usize] = + alphas[ik as usize] + sum_qs[ik as usize] - 0.5 * sqnorm(&*qxcentered); } slse = slse + log_sum_exp(k, &main_term); @@ -81,7 +194,8 @@ pub fn gmm_objective(d: usize, k: usize, n: usize, alphas: &[f64], means: &[f64] let lse_alphas = log_sum_exp(k, alphas); - *err = constant + slse - n as f64 * lse_alphas + log_wishart_prior(d, k, wishart, &sum_qs, &qdiags, icf); + *err = constant + slse - n as f64 * lse_alphas + + log_wishart_prior(d, k, wishart, &sum_qs, &*qdiags, icf); } fn arr_max(n: usize, x: &[f64]) -> f64 { @@ -123,7 +237,7 @@ fn qtimesx(d: usize, q_diag: &[f64], ltri: &[f64], x: &[f64], out: &mut [f64]) { } for i in 0..d { - let mut lparamsidx = i*(2*d-i-1)/2; + let mut lparamsidx = i * (2 * d - i - 1) / 2; for j in i + 1..d { out[j] = out[j] + ltri[lparamsidx] * x[i]; lparamsidx += 1; @@ -137,19 +251,34 @@ fn log_sum_exp(n: usize, x: &[f64]) -> f64 { semx.ln() + mx } fn log_gamma_distrib(a: f64, p: f64) -> f64 { - 0.25 * p * (p - 1.) * PI.ln() + (1..=p as usize).map(|j| lgamma(a + 0.5 * (1. - j as f64))).sum::() + 0.25 * p * (p - 1.) * PI.ln() + + (1..=p as usize) + .map(|j| lgamma(a + 0.5 * (1. - j as f64))) + .sum::() } -fn log_wishart_prior(p: usize, k: usize, wishart: Wishart, sum_qs: &[f64], qdiags: &[f64], icf: &[f64]) -> f64 { +fn log_wishart_prior( + p: usize, + k: usize, + wishart: Wishart, + sum_qs: &[f64], + qdiags: &[f64], + icf: &[f64], +) -> f64 { let n = p + wishart.m as usize + 1; let icf_sz = p * (p + 1) / 2; - let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) - log_gamma_distrib(0.5 * n as f64, p as f64); + let c = n as f64 * p as f64 * (wishart.gamma.ln() - 0.5 * 2f64.ln()) + - log_gamma_distrib(0.5 * n as f64, p as f64); - let out = (0..k).map(|ik| { - let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz -p]); - 0.5 * wishart.gamma * wishart.gamma * (frobenius) - (wishart.m as f64) * sum_qs[ik as usize] - }).sum::(); + let out = (0..k) + .map(|ik| { + let frobenius = sqnorm(&qdiags[ik * p as usize..][..p]) + + sqnorm(&icf[ik * icf_sz as usize + p as usize..][..icf_sz - p]); + 0.5 * wishart.gamma * wishart.gamma * (frobenius) + - (wishart.m as f64) * sum_qs[ik as usize] + }) + .sum::(); out - k as f64 * c } From c5e1f19d48b85f85779034d28ed321f0f3e85f3f Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Wed, 22 May 2024 23:36:20 -0600 Subject: [PATCH 38/56] bench gmm: switch scratch from Const to Duplicated This makes the reverse mode correct, and a bit faster than the old version (with allocations inside the AD'd code). --- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index ec847941a58f..d7f3b78ac75f 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -48,6 +48,8 @@ pub extern "C" fn rust_dgmm_objective( let mut my_derr = unsafe { *derr }; let (mut qdiags, mut sum_qs, mut xcentered, mut qxcentered, mut main_term) = get_workspace(d, k); + let (mut bqdiags, mut bsum_qs, mut bxcentered, mut bqxcentered, mut bmain_term) = + get_workspace(d, k); dgmm_objective( d, @@ -65,10 +67,15 @@ pub extern "C" fn rust_dgmm_objective( &mut my_err, &mut my_derr, &mut qdiags, + &mut bqdiags, &mut sum_qs, + &mut bsum_qs, &mut xcentered, + &mut bxcentered, &mut qxcentered, + &mut bqxcentered, &mut main_term, + &mut bmain_term, ); unsafe { *err = my_err }; @@ -140,11 +147,11 @@ fn get_workspace(d: usize, k: usize) -> (Vec, Vec, Vec, Vec, Const, Const, Duplicated, - Const, - Const, - Const, - Const, - Const + Duplicated, + Duplicated, + Duplicated, + Duplicated, + Duplicated )] pub fn gmm_objective( d: usize, From 947852c60542c416f9c167a9a07e707dc42d542c Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 23 May 2024 12:52:23 -0600 Subject: [PATCH 39/56] bench gmm: match C++ performance by asserting sizes of work slices --- enzyme/benchmarks/ReverseMode/gmm/src/safe.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs index d7f3b78ac75f..c809c34e5454 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/safe.rs @@ -174,6 +174,13 @@ pub fn gmm_objective( let constant = -(n as f64) * d as f64 * 0.5 * (2.0 * PI).ln(); let icf_sz = d * (d + 1) / 2; + // Let the compiler know sizes so it can eliminate bounds checks + assert_eq!(qdiags.len(), d * k); + assert_eq!(sum_qs.len(), k); + assert_eq!(xcentered.len(), d); + assert_eq!(qxcentered.len(), d); + assert_eq!(main_term.len(), k); + preprocess_qs(d, k, icf, sum_qs, qdiags); let mut slse = 0.; From cb9d4030fe2a92d0e25092399fbe039b133e5fa3 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 23 May 2024 13:29:23 -0600 Subject: [PATCH 40/56] bench gmm: shed unused import (warning) --- enzyme/benchmarks/ReverseMode/gmm/src/lib.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs index 8fcb11ffed10..4f9fc5336e8e 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/gmm/src/lib.rs @@ -1,9 +1,6 @@ #![feature(autodiff)] -pub mod r#unsafe; pub mod safe; - -use r#unsafe::dgmm_objective as dgmm_objective; - +pub mod r#unsafe; #[derive(Clone, Copy)] #[repr(C)] @@ -11,4 +8,3 @@ pub struct Wishart { pub gamma: f64, pub m: i32, } - From dbffef699335804558770e3529b09d6b18bbf1a6 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 23 May 2024 13:48:46 -0600 Subject: [PATCH 41/56] bench lstm: optimize using length assertions --- .../benchmarks/ReverseMode/lstm/src/safe.rs | 131 +++++++++++++----- 1 file changed, 97 insertions(+), 34 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 8734998acfb7..6a43419af5bf 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -26,10 +26,15 @@ fn lstm_model( input: &[f64], ) { let mut gates = vec![0.0; 4 * hsize]; - let (a,b) = gates.split_at_mut(2*hsize); - let ((forget, ingate), (outgate, change)) = ( - a.split_at_mut(hsize), b.split_at_mut(hsize)); - + let gates = &mut gates[..4 * hsize]; + let (a, b) = gates.split_at_mut(2 * hsize); + let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); + + assert_eq!(weight.len(), 4 * hsize); + assert_eq!(bias.len(), 4 * hsize); + assert_eq!(hidden.len(), hsize); + assert!(cell.len() >= hsize); + assert!(input.len() >= hsize); // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); @@ -63,23 +68,23 @@ fn lstm_predict( } let mut i = 0; - while i <= 2*l*b - 1 { + while i <= 2 * l * b - 1 { // make borrow-checker happy with non-overlapping mutable references let (xp, s1, s2) = if i == 0 { let (s1, s2) = s.split_at_mut(b); (x2.as_mut(), s1, s2) } else { - let tmp = &mut s[i-2*b..]; - let (a, d) = tmp.split_at_mut(2*b); + let tmp = &mut s[i - 2 * b..]; + let (a, d) = tmp.split_at_mut(2 * b); let (d, c) = d.split_at_mut(b); - (a,d,c) + (a, d, c) }; lstm_model( b, - &w[i * 4..], - &w[(i + b) * 4..], + &w[i * 4..(i + b) * 4], + &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, @@ -88,7 +93,7 @@ fn lstm_predict( i += 2 * b; } - let xp = &s[i-2*b..]; + let xp = &s[i - 2 * b..]; for i in 0..b { x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; @@ -96,8 +101,19 @@ fn lstm_predict( } // LSTM objective (loss function) -#[autodiff(d_lstm_objective, Reverse, Const, Const, Const, Duplicated, Duplicated, Const, Const, Duplicated)] -pub (crate) fn lstm_objective( +#[autodiff( + d_lstm_objective, + Reverse, + Const, + Const, + Const, + Duplicated, + Duplicated, + Const, + Const, + Duplicated +)] +pub(crate) fn lstm_objective( l: usize, c: usize, b: usize, @@ -112,14 +128,15 @@ pub (crate) fn lstm_objective( let mut input = &sequence[..b]; let mut ypred = vec![0.0; b]; + let ypred = &mut ypred[..b]; let mut ynorm = vec![0.0; b]; - let mut lse; + let ynorm = &mut ynorm[..b]; assert!(b > 0); for t in (0..=(c - 1) * b - 1).step_by(b) { - lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); - lse = logsumexp(&ypred); + lstm_predict(l, b, main_params, extra_params, state, input, ypred); + let lse = logsumexp(&ypred); for i in 0..b { ynorm[i] = ypred[i] - lse; } @@ -137,31 +154,77 @@ pub (crate) fn lstm_objective( } #[no_mangle] -pub extern "C" fn rust_lstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, extra_params: *const f64, state: *mut f64, sequence: *const f64, loss: *mut f64) { - let (main_params, extra_params, state, sequence) = unsafe {( - slice::from_raw_parts(main_params, 2*l*4*b), - slice::from_raw_parts(extra_params, 3*b), - slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts(sequence, c*b) - )}; +pub extern "C" fn rust_lstm_objective( + l: usize, + c: usize, + b: usize, + main_params: *const f64, + extra_params: *const f64, + state: *mut f64, + sequence: *const f64, + loss: *mut f64, +) { + let (main_params, extra_params, state, sequence) = unsafe { + ( + slice::from_raw_parts(main_params, 2 * l * 4 * b), + slice::from_raw_parts(extra_params, 3 * b), + slice::from_raw_parts_mut(state, 2 * l * b), + slice::from_raw_parts(sequence, c * b), + ) + }; unsafe { - lstm_objective(l,c,b,main_params,extra_params,state,sequence, &mut *loss); + lstm_objective( + l, + c, + b, + main_params, + extra_params, + state, + sequence, + &mut *loss, + ); } } #[no_mangle] -pub extern "C" fn rust_dlstm_objective(l: usize, c: usize, b: usize, main_params: *const f64, d_main_params: *mut f64, extra_params: *const f64, d_extra_params: *mut f64, state: *mut f64, sequence: *const f64, res: *mut f64, d_res: *mut f64) { - let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {( - slice::from_raw_parts(main_params, 2*l*4*b), - slice::from_raw_parts_mut(d_main_params, 2*l*4*b), - slice::from_raw_parts(extra_params, 3*b), - slice::from_raw_parts_mut(d_extra_params, 3*b), - slice::from_raw_parts_mut(state, 2*l*b), - slice::from_raw_parts(sequence, c*b) - )}; +pub extern "C" fn rust_dlstm_objective( + l: usize, + c: usize, + b: usize, + main_params: *const f64, + d_main_params: *mut f64, + extra_params: *const f64, + d_extra_params: *mut f64, + state: *mut f64, + sequence: *const f64, + res: *mut f64, + d_res: *mut f64, +) { + let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe { + ( + slice::from_raw_parts(main_params, 2 * l * 4 * b), + slice::from_raw_parts_mut(d_main_params, 2 * l * 4 * b), + slice::from_raw_parts(extra_params, 3 * b), + slice::from_raw_parts_mut(d_extra_params, 3 * b), + slice::from_raw_parts_mut(state, 2 * l * b), + slice::from_raw_parts(sequence, c * b), + ) + }; unsafe { - d_lstm_objective(l,c,b,main_params,d_main_params, extra_params,d_extra_params, state,sequence, &mut *res, &mut *d_res); + d_lstm_objective( + l, + c, + b, + main_params, + d_main_params, + extra_params, + d_extra_params, + state, + sequence, + &mut *res, + &mut *d_res, + ); } } From 0174227cb5b3fbf76312112b40a50c05fb6a66e3 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 24 May 2024 03:21:55 -0400 Subject: [PATCH 42/56] adding unsafe ba version --- enzyme/benchmarks/ReverseMode/adbench/ba.h | 130 ++++++++--- enzyme/benchmarks/ReverseMode/ba/src/lib.rs | 204 +----------------- enzyme/benchmarks/ReverseMode/ba/src/safe.rs | 203 +++++++++++++++++ .../benchmarks/ReverseMode/ba/src/unsafe.rs | 139 ++++++++++++ 4 files changed, 445 insertions(+), 231 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/safe.rs create mode 100644 enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h index 5d9178120e76..aa62cf2a165f 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/ba.h +++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h @@ -127,19 +127,16 @@ extern "C" { double* reproj_err, double* w_err ); - - void rust2_ba_objective( - int n, - int m, - int p, - double const* cams, - double const* X, - double const* w, - int const* obs, - double const* feats, - double* reproj_err, - double* w_err - ); + + void rust2_unsafe_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, + int const *obs, double const *feats, + double *reproj_err, double *w_err); + + void rust2_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err); void dcompute_reproj_error( double const* cam, @@ -183,17 +180,17 @@ extern "C" { void adept_compute_zach_weight_error(double const* w, double* dw, double* err, double* derr); - void rust_dcompute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); + void rust_unsafe_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr); + + void rust_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr); void rust_dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); } @@ -362,10 +359,22 @@ int main(const int argc, const char* argv[]) { std::string path = "/mnt/Data/git/Enzyme/apps/ADBench/data/ba/ba1_n49_m7776_p31843.txt"; std::vector paths = { - "ba10_n1197_m126327_p563734.txt", "ba14_n356_m226730_p1255268.txt", "ba18_n1936_m649673_p5213733.txt", "ba2_n21_m11315_p36455.txt", "ba6_n539_m65220_p277273.txt", "test.txt", - "ba11_n1723_m156502_p678718.txt", "ba15_n1102_m780462_p4052340.txt", "ba19_n4585_m1324582_p9125125.txt", "ba3_n161_m48126_p182072.txt", "ba7_n93_m61203_p287451.txt", - "ba12_n253_m163691_p899155.txt", "ba16_n1544_m942409_p4750193.txt", "ba1_n49_m7776_p31843.txt", "ba4_n372_m47423_p204472.txt", "ba8_n88_m64298_p383937.txt", - "ba13_n245_m198739_p1091386.txt", "ba17_n1778_m993923_p5001946.txt", "ba20_n13682_m4456117_p2987644.txt", "ba5_n257_m65132_p225911.txt", "ba9_n810_m88814_p393775.txt", + "ba10_n1197_m126327_p563734.txt", + "ba14_n356_m226730_p1255268.txt", // "ba18_n1936_m649673_p5213733.txt", + // "ba2_n21_m11315_p36455.txt", + // "ba6_n539_m65220_p277273.txt", + // "test.txt", + // "ba11_n1723_m156502_p678718.txt", + // "ba15_n1102_m780462_p4052340.txt", + // "ba19_n4585_m1324582_p9125125.txt", + // "ba3_n161_m48126_p182072.txt", "ba7_n93_m61203_p287451.txt", + // "ba12_n253_m163691_p899155.txt", + // "ba16_n1544_m942409_p4750193.txt", "ba1_n49_m7776_p31843.txt", + // "ba4_n372_m47423_p204472.txt", "ba8_n88_m64298_p383937.txt", + // "ba13_n245_m198739_p1091386.txt", + // "ba17_n1778_m993923_p5001946.txt", + // "ba20_n13682_m4456117_p2987644.txt", + // "ba5_n257_m65132_p225911.txt", "ba9_n810_m88814_p393775.txt", }; std::ofstream jsonfile("results.json", std::ofstream::trunc); @@ -571,7 +580,40 @@ int main(const int argc, const char* argv[]) { } } - + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); + + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; + { + + struct timeval start, end; + gettimeofday(&start, NULL); + rust2_unsafe_ba_objective(input.n, input.m, input.p, input.cams.data(), + input.X.data(), input.w.data(), + input.obs.data(), input.feats.data(), + result.reproj_err.data(), result.w_err.data()); + gettimeofday(&end, NULL); + printf("primal unsafe rust t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal unsafe rust"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + { struct BAInput input; read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); @@ -626,6 +668,35 @@ int main(const int argc, const char* argv[]) { BASparseMat(input.n, input.m, input.p) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme unsafe rust combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme unsafe rust combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.J.vals[i]); + enzyme["result"].push_back(result.J.vals[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); + + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; + { struct timeval start, end; gettimeofday(&start, NULL); @@ -642,7 +713,6 @@ int main(const int argc, const char* argv[]) { printf("\n"); test_suite["tools"].push_back(enzyme); } - } test_suite["llvm-version"] = __clang_version__; diff --git a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs index 72a9aece7737..1f665012c07a 100644 --- a/enzyme/benchmarks/ReverseMode/ba/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ba/src/lib.rs @@ -2,87 +2,10 @@ #![feature(slice_first_last_chunk)] #![allow(non_snake_case)] -//#define BA_NCAMPARAMS 11 -static BA_NCAMPARAMS: usize = 11; - -fn sqsum(x: &[f64]) -> f64 { - x.iter().map(|&v| v * v).sum() -} - -#[inline] -fn cross(a: &[f64; 3], b: &[f64; 3]) -> [f64; 3] { - [ - a[1] * b[2] - a[2] * b[1], - a[2] * b[0] - a[0] * b[2], - a[0] * b[1] - a[1] * b[0], - ] -} - -fn radial_distort(rad_params: &[f64], proj: &mut [f64]) { - let rsq = sqsum(proj); - let l = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq; - proj[0] = proj[0] * l; - proj[1] = proj[1] * l; -} - -fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; 3]) { - let sqtheta = sqsum(rot); - if sqtheta != 0. { - let theta = sqtheta.sqrt(); - let costheta = theta.cos(); - let sintheta = theta.sin(); - let theta_inverse = 1. / theta; - let mut w = [0.; 3]; - for i in 0..3 { - w[i] = rot[i] * theta_inverse; - } - let w_cross_pt = cross(&w, &pt); - let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); - for i in 0..3 { - rotated_pt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; - } - } else { - let rot_cross_pt = cross(&rot, &pt); - for i in 0..3 { - rotated_pt[i] = pt[i] + rot_cross_pt[i]; - } - } -} - -fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { - let C = &cam[3..6]; - let mut Xo = [0.; 3]; - let mut Xcam = [0.; 3]; - - Xo[0] = X[0] - C[0]; - Xo[1] = X[1] - C[1]; - Xo[2] = X[2] - C[2]; - - rodrigues_rotate_point(cam.first_chunk::<3>().unwrap(), &Xo, &mut Xcam); - - proj[0] = Xcam[0] / Xcam[2]; - proj[1] = Xcam[1] / Xcam[2]; - - radial_distort(&cam[9..], proj); - - proj[0] = proj[0] * cam[6] + cam[7]; - proj[1] = proj[1] * cam[6] + cam[8]; -} +pub mod safe; +pub mod r#unsafe; -#[no_mangle] -pub extern "C" fn rust_dcompute_reproj_error( - cam: *const [f64; 11], - dcam: *mut [f64; 11], - x: *const [f64; 3], - dx: *mut [f64; 3], - w: *const [f64; 1], - wb: *mut [f64; 1], - feat: *const [f64; 2], - err: *mut [f64; 2], - derr: *mut [f64; 2], -) { - dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); -} +static BA_NCAMPARAMS: usize = 11; #[no_mangle] pub extern "C" fn rust_dcompute_zach_weight_error( @@ -94,130 +17,9 @@ pub extern "C" fn rust_dcompute_zach_weight_error( dcompute_zach_weight_error(w, dw, err, derr); } -#[autodiff( - dcompute_reproj_error, - Reverse, - Duplicated, - Duplicated, - Duplicated, - Const, - Duplicated -)] -pub fn compute_reproj_error( - cam: *const [f64; 11], - x: *const [f64; 3], - w: *const [f64; 1], - feat: *const [f64; 2], - err: *mut [f64; 2], -) { - let cam = unsafe { &*cam }; - let w = unsafe { *(*w).get_unchecked(0) }; - let x = unsafe { &*x }; - let feat = unsafe { &*feat }; - let mut err = unsafe { &mut *err }; - let mut proj = [0.; 2]; - project(cam, x, &mut proj); - err[0] = w * (proj[0] - feat[0]); - err[1] = w * (proj[1] - feat[1]); -} - #[autodiff(dcompute_zach_weight_error, Reverse, Duplicated, Duplicated)] pub fn compute_zach_weight_error(w: *const f64, err: *mut f64) { let w = unsafe { *w }; unsafe { *err = 1. - w * w; } } -// n number of cameras -// m number of points -// p number of observations -// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] -// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) -// [C1 C2 C3]' is the camera center -// f is the focal length in pixels -// [u0 v0]' is the principal point -// k1, k2 are radial distortion parameters -// X: 3*m points -// obs: 2*p observations (pairs cameraIdx, pointIdx) -// feats: 2*p features (x,y coordinates corresponding to observations) -// reproj_err: 2*p errors of observations -// w_err: p weight "error" terms -fn rust_ba_objective( - n: usize, - m: usize, - p: usize, - cams: &[f64], - x: &[f64], - w: &[f64], - obs: &[i32], - feats: &[f64], - reproj_err: &mut [f64], - w_err: &mut [f64], -) { - assert_eq!(cams.len(), n * 11); - assert_eq!(x.len(), m * 3); - assert_eq!(w.len(), p); - assert_eq!(obs.len(), p * 2); - assert_eq!(feats.len(), p * 2); - assert_eq!(reproj_err.len(), p * 2); - assert_eq!(w_err.len(), p); - - for i in 0..p { - let cam_idx = obs[i * 2 + 0] as usize; - let pt_idx = obs[i * 2 + 1] as usize; - let start = cam_idx * BA_NCAMPARAMS; - let cam: &[f64; 11] = unsafe { - cams[start..] - .get_unchecked(..11) - .try_into() - .unwrap_unchecked() - }; - let x: &[f64; 3] = unsafe { - x[pt_idx * 3..] - .get_unchecked(..3) - .try_into() - .unwrap_unchecked() - }; - let w: &[f64; 1] = unsafe { w[i..].get_unchecked(..1).try_into().unwrap_unchecked() }; - let feat: &[f64; 2] = unsafe { - feats[i * 2..] - .get_unchecked(..2) - .try_into() - .unwrap_unchecked() - }; - let reproj_err: &mut [f64; 2] = unsafe { - reproj_err[i * 2..] - .get_unchecked_mut(..2) - .try_into() - .unwrap_unchecked() - }; - compute_reproj_error(cam, x, w, feat, reproj_err); - } - - for i in 0..p { - let w_err: &mut f64 = unsafe { w_err.get_unchecked_mut(i) }; - compute_zach_weight_error(w[i..].as_ptr(), w_err as *mut f64); - } -} - -#[no_mangle] -extern "C" fn rust2_ba_objective( - n: usize, - m: usize, - p: usize, - cams: *const f64, - x: *const f64, - w: *const f64, - obs: *const i32, - feats: *const f64, - reproj_err: *mut f64, - w_err: *mut f64, -) { - let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) }; - let x = unsafe { std::slice::from_raw_parts(x, m * 3) }; - let w = unsafe { std::slice::from_raw_parts(w, p) }; - let obs = unsafe { std::slice::from_raw_parts(obs, p * 2) }; - let feats = unsafe { std::slice::from_raw_parts(feats, p * 2) }; - let reproj_err = unsafe { std::slice::from_raw_parts_mut(reproj_err, p * 2) }; - let w_err = unsafe { std::slice::from_raw_parts_mut(w_err, p) }; - rust_ba_objective(n, m, p, cams, x, w, obs, feats, reproj_err, w_err); -} diff --git a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs new file mode 100644 index 000000000000..c38f5359cc30 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs @@ -0,0 +1,203 @@ +use crate::BA_NCAMPARAMS; +use crate::compute_zach_weight_error; + +fn sqsum(x: &[f64]) -> f64 { + x.iter().map(|&v| v * v).sum() +} + +#[inline] +fn cross(a: &[f64; 3], b: &[f64; 3]) -> [f64; 3] { + [ + a[1] * b[2] - a[2] * b[1], + a[2] * b[0] - a[0] * b[2], + a[0] * b[1] - a[1] * b[0], + ] +} + +fn radial_distort(rad_params: &[f64], proj: &mut [f64]) { + let rsq = sqsum(proj); + let l = 1. + rad_params[0] * rsq + rad_params[1] * rsq * rsq; + proj[0] = proj[0] * l; + proj[1] = proj[1] * l; +} + +fn rodrigues_rotate_point(rot: &[f64; 3], pt: &[f64; 3], rotated_pt: &mut [f64; 3]) { + let sqtheta = sqsum(rot); + if sqtheta != 0. { + let theta = sqtheta.sqrt(); + let costheta = theta.cos(); + let sintheta = theta.sin(); + let theta_inverse = 1. / theta; + let mut w = [0.; 3]; + for i in 0..3 { + w[i] = rot[i] * theta_inverse; + } + let w_cross_pt = cross(&w, &pt); + let tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); + for i in 0..3 { + rotated_pt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } else { + let rot_cross_pt = cross(&rot, &pt); + for i in 0..3 { + rotated_pt[i] = pt[i] + rot_cross_pt[i]; + } + } +} + +fn project(cam: &[f64; 11], X: &[f64; 3], proj: &mut [f64; 2]) { + let C = &cam[3..6]; + let mut Xo = [0.; 3]; + let mut Xcam = [0.; 3]; + + Xo[0] = X[0] - C[0]; + Xo[1] = X[1] - C[1]; + Xo[2] = X[2] - C[2]; + + rodrigues_rotate_point(cam.first_chunk::<3>().unwrap(), &Xo, &mut Xcam); + + proj[0] = Xcam[0] / Xcam[2]; + proj[1] = Xcam[1] / Xcam[2]; + + radial_distort(&cam[9..], proj); + + proj[0] = proj[0] * cam[6] + cam[7]; + proj[1] = proj[1] * cam[6] + cam[8]; +} + +#[no_mangle] +pub extern "C" fn rust_dcompute_reproj_error( + cam: *const [f64; 11], + dcam: *mut [f64; 11], + x: *const [f64; 3], + dx: *mut [f64; 3], + w: *const [f64; 1], + wb: *mut [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], + derr: *mut [f64; 2], +) { + dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); +} + +#[autodiff( + dcompute_reproj_error, + Reverse, + Duplicated, + Duplicated, + Duplicated, + Const, + Duplicated +)] +pub fn compute_reproj_error( + cam: *const [f64; 11], + x: *const [f64; 3], + w: *const [f64; 1], + feat: *const [f64; 2], + err: *mut [f64; 2], +) { + let cam = unsafe { &*cam }; + let w = unsafe { *(*w).get_unchecked(0) }; + let x = unsafe { &*x }; + let feat = unsafe { &*feat }; + let err = unsafe { &mut *err }; + let mut proj = [0.; 2]; + project(cam, x, &mut proj); + err[0] = w * (proj[0] - feat[0]); + err[1] = w * (proj[1] - feat[1]); +} + +// n number of cameras +// m number of points +// p number of observations +// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3*m points +// obs: 2*p observations (pairs cameraIdx, pointIdx) +// feats: 2*p features (x,y coordinates corresponding to observations) +// reproj_err: 2*p errors of observations +// w_err: p weight "error" terms +fn rust_ba_objective( + n: usize, + m: usize, + p: usize, + cams: &[f64], + x: &[f64], + w: &[f64], + obs: &[i32], + feats: &[f64], + reproj_err: &mut [f64], + w_err: &mut [f64], +) { + assert_eq!(cams.len(), n * 11); + assert_eq!(x.len(), m * 3); + assert_eq!(w.len(), p); + assert_eq!(obs.len(), p * 2); + assert_eq!(feats.len(), p * 2); + assert_eq!(reproj_err.len(), p * 2); + assert_eq!(w_err.len(), p); + + for i in 0..p { + let cam_idx = obs[i * 2 + 0] as usize; + let pt_idx = obs[i * 2 + 1] as usize; + let start = cam_idx * BA_NCAMPARAMS; + let cam: &[f64; 11] = unsafe { + cams[start..] + .get_unchecked(..11) + .try_into() + .unwrap_unchecked() + }; + let x: &[f64; 3] = unsafe { + x[pt_idx * 3..] + .get_unchecked(..3) + .try_into() + .unwrap_unchecked() + }; + let w: &[f64; 1] = unsafe { w[i..].get_unchecked(..1).try_into().unwrap_unchecked() }; + let feat: &[f64; 2] = unsafe { + feats[i * 2..] + .get_unchecked(..2) + .try_into() + .unwrap_unchecked() + }; + let reproj_err: &mut [f64; 2] = unsafe { + reproj_err[i * 2..] + .get_unchecked_mut(..2) + .try_into() + .unwrap_unchecked() + }; + compute_reproj_error(cam, x, w, feat, reproj_err); + } + + for i in 0..p { + let w_err: &mut f64 = unsafe { w_err.get_unchecked_mut(i) }; + compute_zach_weight_error(w[i..].as_ptr(), w_err as *mut f64); + } +} + +#[no_mangle] +extern "C" fn rust2_ba_objective( + n: usize, + m: usize, + p: usize, + cams: *const f64, + x: *const f64, + w: *const f64, + obs: *const i32, + feats: *const f64, + reproj_err: *mut f64, + w_err: *mut f64, +) { + let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) }; + let x = unsafe { std::slice::from_raw_parts(x, m * 3) }; + let w = unsafe { std::slice::from_raw_parts(w, p) }; + let obs = unsafe { std::slice::from_raw_parts(obs, p * 2) }; + let feats = unsafe { std::slice::from_raw_parts(feats, p * 2) }; + let reproj_err = unsafe { std::slice::from_raw_parts_mut(reproj_err, p * 2) }; + let w_err = unsafe { std::slice::from_raw_parts_mut(w_err, p) }; + rust_ba_objective(n, m, p, cams, x, w, obs, feats, reproj_err, w_err); +} diff --git a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs new file mode 100644 index 000000000000..477d900c3310 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs @@ -0,0 +1,139 @@ +use crate::BA_NCAMPARAMS; +use crate::compute_zach_weight_error; + +unsafe fn sqsum(x: *const f64, n: usize) -> f64 { + let mut sum = 0.; + for i in 0..n { + let v = unsafe { *x.add(i) }; + sum += v * v; + } + sum +} + +#[inline] +unsafe fn cross(a: *const f64, b: *const f64, out: *mut f64) { + *out.add(0) = *a.add(1) * *b.add(2) - *a.add(2) * *b.add(1); + *out.add(1) = *a.add(2) * *b.add(0) - *a.add(0) * *b.add(2); + *out.add(2) = *a.add(0) * *b.add(1) - *a.add(1) * *b.add(0); +} + +unsafe fn radial_distort(rad_params: *const f64, proj: *mut f64) { + let rsq = sqsum(proj, 2); + let l = 1. + *rad_params.add(0) * rsq + *rad_params.add(1) * rsq * rsq; + *proj.add(0) = *proj.add(0) * l; + *proj.add(1) = *proj.add(1) * l; +} + +unsafe fn rodrigues_rotate_point(rot: *const f64, pt: *const f64, rotated_pt: *mut f64) { + let sqtheta = sqsum(rot, 3); + if sqtheta != 0. { + let theta = sqtheta.sqrt(); + let costheta = theta.cos(); + let sintheta = theta.sin(); + let theta_inverse = 1. / theta; + let mut w = [0.; 3]; + for i in 0..3 { + w[i] = *rot.add(i) * theta_inverse; + } + let mut w_cross_pt = [0.; 3]; + cross(w.as_ptr(), pt, w_cross_pt.as_mut_ptr()); + let tmp = (w[0] * *pt.add(0) + w[1] * *pt.add(1) + w[2] * *pt.add(2)) * (1. - costheta); + for i in 0..3 { + *rotated_pt.add(i) = *pt.add(i) * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } else { + let mut rot_cross_pt = [0.; 3]; + cross(rot, pt, rot_cross_pt.as_mut_ptr()); + for i in 0..3 { + *rotated_pt.add(i) = *pt.add(i) + rot_cross_pt[i]; + } + } +} + +unsafe fn project(cam: *const f64, X: *const f64, proj: *mut f64) { + let C = cam.add(3); + let mut Xo = [0.; 3]; + let mut Xcam = [0.; 3]; + + Xo[0] = *X.add(0) - *C.add(0); + Xo[1] = *X.add(1) - *C.add(1); + Xo[2] = *X.add(2) - *C.add(2); + + rodrigues_rotate_point(cam, Xo.as_ptr(), Xcam.as_mut_ptr()); + + *proj.add(0) = Xcam[0] / Xcam[2]; + *proj.add(1) = Xcam[1] / Xcam[2]; + + radial_distort(cam.add(9), proj); + *proj.add(0) = *proj.add(0) * *cam.add(6) + *cam.add(7); + *proj.add(1) = *proj.add(1) * *cam.add(6) + *cam.add(8); +} + +#[no_mangle] +pub unsafe extern "C" fn rust_unsafe_dcompute_reproj_error( + cam: *const f64, + dcam: *mut f64, + x: *const f64, + dx: *mut f64, + w: *const f64, + wb: *mut f64, + feat: *const f64, + err: *mut f64, + derr: *mut f64, +) { + dcompute_reproj_error(cam, dcam, x, dx, w, wb, feat, err, derr); +} + + +#[autodiff( + dcompute_reproj_error, + Reverse, + Duplicated, + Duplicated, + Duplicated, + Const, + Duplicated +)] +pub unsafe fn compute_reproj_error( + cam: *const f64, + x: *const f64, + w: *const f64, + feat: *const f64, + err: *mut f64, +) { + let mut proj = [0.; 2]; + project(cam, x, proj.as_mut_ptr()); + *err.add(0) = *w * (proj[0] - *feat.add(0)); + *err.add(1) = *w * (proj[1] - *feat.add(1)); +} + +#[no_mangle] +unsafe extern "C" fn rust2_unsafe_ba_objective( + n: usize, + m: usize, + p: usize, + cams: *const f64, + x: *const f64, + w: *const f64, + obs: *const i32, + feats: *const f64, + reproj_err: *mut f64, + w_err: *mut f64, +) { + for i in 0..p { + let cam_idx = *obs.add(i * 2 + 0) as usize; + let pt_idx = *obs.add(i * 2 + 1) as usize; + let start = cam_idx * BA_NCAMPARAMS; + + let cam: *const f64 = cams.add(start); + let x: *const f64 = x.add(pt_idx * 3); + let w: *const f64 = w.add(i); + let feat: *const f64 = feats.add(i * 2); + let reproj_err: *mut f64 = reproj_err.add(i * 2); + compute_reproj_error(cam, x, w, feat, reproj_err); + } + + for i in 0..p { + compute_zach_weight_error(w.add(i), w_err.add(i)); + } +} From 629f87c4b0a184d52390e830c27965eaa4e25544 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 2 Jun 2024 00:40:38 -0400 Subject: [PATCH 43/56] smaller perf improvements --- .../benchmarks/ReverseMode/lstm/src/safe.rs | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 6a43419af5bf..76c4316fab51 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -28,14 +28,17 @@ fn lstm_model( let mut gates = vec![0.0; 4 * hsize]; let gates = &mut gates[..4 * hsize]; let (a, b) = gates.split_at_mut(2 * hsize); - let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); + let (forget, ingate) = a.split_at_mut(hsize); + let (outgate, change) = b.split_at_mut(hsize); assert_eq!(weight.len(), 4 * hsize); assert_eq!(bias.len(), 4 * hsize); assert_eq!(hidden.len(), hsize); + assert_eq!(ingate.len(), hsize); + assert_eq!(change.len(), hsize); assert!(cell.len() >= hsize); assert!(input.len() >= hsize); - // caching input + // Using unchecked indexing here was slightly slower for some reason for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); @@ -66,33 +69,42 @@ fn lstm_predict( for i in 0..b { x2[i] = x[i] * w2[i]; } - - let mut i = 0; - while i <= 2 * l * b - 1 { - // make borrow-checker happy with non-overlapping mutable references - let (xp, s1, s2) = if i == 0 { - let (s1, s2) = s.split_at_mut(b); - (x2.as_mut(), s1, s2) - } else { + + let (s1, s2) = s.split_at_mut(b); + lstm_model( + b, + &w[0..b * 4], + &w[b * 4..2 * b * 4], + s1, + s2, + x2.as_mut(), + ); + + assert_eq!(s.len(), 2 * b * l); + assert_eq!(w.len(), 4 * b * l); + for i in 1..l { + let i = i * 2 * b; + let (xp, s1, s2) = { let tmp = &mut s[i - 2 * b..]; let (a, d) = tmp.split_at_mut(2 * b); let (d, c) = d.split_at_mut(b); - (a, d, c) }; + let (w1, w2) = w.split_at((i + b) * 4); lstm_model( b, + //&w1[i * 4..], + //&w2[0..(i + 2 * b) * 4], &w[i * 4..(i + b) * 4], &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, ); - - i += 2 * b; } + let i = 2 * l * b; let xp = &s[i - 2 * b..]; for i in 0..b { From c6f44b33f91f3440f7084741974f7df23e4ec7d2 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 2 Jun 2024 01:16:23 -0400 Subject: [PATCH 44/56] small improvment --- enzyme/benchmarks/ReverseMode/lstm/src/safe.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 76c4316fab51..3cb5ca449747 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -6,6 +6,7 @@ fn sigmoid(x: f64) -> f64 { } // log(sum(exp(x), 2)) +#[inline] fn logsumexp(vect: &[f64]) -> f64 { let mut sum = 0.0; for &val in vect { @@ -136,18 +137,17 @@ pub(crate) fn lstm_objective( loss: &mut f64, ) { let mut total = 0.0; - let mut count = 0; let mut input = &sequence[..b]; let mut ypred = vec![0.0; b]; - let ypred = &mut ypred[..b]; let mut ynorm = vec![0.0; b]; - let ynorm = &mut ynorm[..b]; assert!(b > 0); - for t in (0..=(c - 1) * b - 1).step_by(b) { - lstm_predict(l, b, main_params, extra_params, state, input, ypred); + let limit = (c - 1) * b; + for j in 0..(c - 1) { + let t = j * b; + lstm_predict(l, b, main_params, extra_params, state, input, &mut ypred); let lse = logsumexp(&ypred); for i in 0..b { ynorm[i] = ypred[i] - lse; @@ -158,9 +158,9 @@ pub(crate) fn lstm_objective( total += ygold[i] * ynorm[i]; } - count += b; input = ygold; } + let count = (c - 1) * b; *loss = -total / count as f64; } From fb6df5b3ca6fc8ee7da96e2b8956193146c8c159 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 2 Jun 2024 21:25:24 -0400 Subject: [PATCH 45/56] clean up ode-real example (not building) --- .../ReverseMode/ode-real/{ode => }/Cargo.lock | 0 .../ReverseMode/ode-real/{ode => }/Cargo.toml | 5 ++--- .../ReverseMode/ode-real/Makefile.make | 18 ++++++++++++++---- .../ReverseMode/ode-real/{ode => }/src/lib.rs | 17 +++++++++++++---- 4 files changed, 29 insertions(+), 11 deletions(-) rename enzyme/benchmarks/ReverseMode/ode-real/{ode => }/Cargo.lock (100%) rename enzyme/benchmarks/ReverseMode/ode-real/{ode => }/Cargo.toml (99%) rename enzyme/benchmarks/ReverseMode/ode-real/{ode => }/src/lib.rs (89%) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.lock similarity index 100% rename from enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.lock rename to enzyme/benchmarks/ReverseMode/ode-real/Cargo.lock diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml similarity index 99% rename from enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml rename to enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml index 3013b597df4e..27a031a49570 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml @@ -5,10 +5,11 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] + [lib] crate-type = ["lib"] - [profile.release] lto = "fat" opt-level = 3 @@ -17,5 +18,3 @@ opt-level = 3 [profile.dev] lto = "fat" - -[dependencies] diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index 3dd680e5a1c4..16033d158a3a 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -1,24 +1,34 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s .PHONY: clean +dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..) + clean: rm -f *.ll *.o results.txt +$(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a: src/lib.rs Cargo.toml + cargo +enzyme rustc --release --lib --crate-type=staticlib + %-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm + clang++ $(BENCH) $^ -O2 -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm %-raw.ll: %-unopt.ll - opt $^ $(LOAD) -enzyme -o $@ -S + @echo $(LOAD) + opt $^ $(LOAD) -o $@ -S %-opt.ll: %-raw.ll opt $^ -o $@ -S #opt $^ -O2 -o $@ -S -ode.o: ode-opt.ll +ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a clang++ -O2 $^ -o $@ $(BENCHLINK) +#ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a +# clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm -lode -L $(dir)/benchmarks/ReverseMode/ode/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 + + results.txt: ode.o ./$^ 1000 | tee $@ ./$^ 1000 >> $@ diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs similarity index 89% rename from enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs rename to enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index 83c6d0586790..23995eaa5626 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -96,13 +96,16 @@ fn lorenz(x: &state_type, dxdt: &mut state_type, t: f64) { } #[no_mangle] -pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, x: *const state_type, adjoint: *mut state_type, t: f64) -> f64 { +pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, dp: *mut f64, x: *const state_type, dx: *mut state_type, adjoint: *mut state_type, t: f64) -> f64 { let x = unsafe { *x }; let mut adjoint = unsafe { *adjoint }; let p: [f64;3] = unsafe { *p.cast::<[f64;3]>().as_ref().unwrap() }; - let mut dp = [0.; 3]; - let mut dx1 = [0.; N * N]; - let mut dx2 = [0.; N * N]; + let mut dp: [f64;3] = unsafe { dp.cast::<[f64;3]>().as_mut().unwrap() }; + + let (mut dx1, mut dx2) = dx.split_at_mut(N * N); + //let mut dp = [0.; 3]; + //let mut dx1 = [0.; N * N]; + //let mut dx2 = [0.; N * N]; let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 @@ -116,6 +119,12 @@ pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, x: *const state_type, x2, &mut dx2, &p, &mut dp, t); dx1[0] + //brusselator_2d_loop_b(nullptr, dadjoint_inp.data(), + // nullptr, dadjoint_inp.data() + N * N, + // x.data(), dx.data(), + // x.data() + N * N, dx.data() + N * N, + // p, dp, + // t); } From a6d4a7c7d55f6da1e057ec1eb6e26cc20bd992ef Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 19 Jul 2024 20:49:14 -0400 Subject: [PATCH 46/56] add ba.cpp version without restrict --- enzyme/benchmarks/ReverseMode/adbench/ba.h | 296 ++++++++---------- enzyme/benchmarks/ReverseMode/ba/ba.cpp | 118 +++---- .../benchmarks/ReverseMode/ba/ba_mayalias.h | 198 ++++++++++++ 3 files changed, 372 insertions(+), 240 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h index aa62cf2a165f..6a3f97737985 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/ba.h +++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h @@ -115,84 +115,68 @@ struct BAOutput { }; extern "C" { - void ba_objective( - int n, - int m, - int p, - double const* cams, - double const* X, - double const* w, - int const* obs, - double const* feats, - double* reproj_err, - double* w_err - ); - - void rust2_unsafe_ba_objective(int n, int m, int p, double const *cams, - double const *X, double const *w, - int const *obs, double const *feats, - double *reproj_err, double *w_err); - - void rust2_ba_objective(int n, int m, int p, double const *cams, - double const *X, double const *w, int const *obs, - double const *feats, double *reproj_err, - double *w_err); - - void dcompute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); - - void dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); - - void compute_reproj_error_b( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); +void ba_objective_restrict(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err); - void compute_zach_weight_error_b(double const* w, double* dw, double* err, double* derr); - - void adept_compute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr - ); +void ba_objective(int n, int m, int p, double const *cams, double const *X, + double const *w, int const *obs, double const *feats, + double *reproj_err, double *w_err); - void adept_compute_zach_weight_error(double const* w, double* dw, double* err, double* derr); +void rust2_unsafe_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err); - void rust_unsafe_dcompute_reproj_error(double const *cam, double *dcam, - double const *X, double *dX, - double const *w, double *wb, - double const *feat, double *err, - double *derr); +void rust2_ba_objective(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, double *w_err); - void rust_dcompute_reproj_error(double const *cam, double *dcam, +void dcompute_reproj_error_restrict(double const *cam, double *dcam, double const *X, double *dX, double const *w, double *wb, double const *feat, double *err, double *derr); - void rust_dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr); +void dcompute_zach_weight_error_restrict(double const *w, double *dw, + double *err, double *derr); + +void dcompute_reproj_error(double const *cam, double *dcam, double const *X, + double *dX, double const *w, double *wb, + double const *feat, double *err, double *derr); + +void dcompute_zach_weight_error(double const *w, double *dw, double *err, + double *derr); + +void compute_reproj_error_b(double const *cam, double *dcam, double const *X, + double *dX, double const *w, double *wb, + double const *feat, double *err, double *derr); + +void compute_zach_weight_error_b(double const *w, double *dw, double *err, + double *derr); + +void adept_compute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, double const *w, + double *wb, double const *feat, double *err, + double *derr); + +void adept_compute_zach_weight_error(double const *w, double *dw, double *err, + double *derr); + +void rust_unsafe_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr); + +void rust_dcompute_reproj_error(double const *cam, double *dcam, + double const *X, double *dX, double const *w, + double *wb, double const *feat, double *err, + double *derr); + +void rust_dcompute_zach_weight_error(double const *w, double *dw, double *err, + double *derr); } void read_ba_instance(const string& fn, @@ -394,27 +378,6 @@ int main(const int argc, const char* argv[]) { BASparseMat(input.n, input.m, input.p) }; - //BASparseMat(this->input.n, this->input.m, this->input.p) - - /* - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); - - for(unsigned i=0; iinput.n, this->input.m, this->input.p) - - /* - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); - - for(unsigned i=0; i(input, result); + calculate_jacobian(input, result); gettimeofday(&end, NULL); printf("Adept combined %0.6f\n", tdiff(&start, &end)); json adept; adept["name"] = "Adept combined"; adept["runtime"] = tdiff(&start, &end); - for(unsigned i=0; i<5; i++) { + for (unsigned i = 0; i < 5; i++) { printf("%f ", result.J.vals[i]); adept["result"].push_back(result.J.vals[i]); } printf("\n"); test_suite["tools"].push_back(adept); } - } { struct BAInput input; - read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); - struct BAOutput result = { - std::vector(2 * input.p), - std::vector(input.p), - BASparseMat(input.n, input.m, input.p) - }; + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; - //BASparseMat(this->input.n, this->input.m, this->input.p) + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme restrict c++ combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme restrict c++ combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.J.vals[i]); + enzyme["result"].push_back(result.J.vals[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } - /* - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); + { - for(unsigned i=0; i(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; { struct timeval start, end; gettimeofday(&start, NULL); - calculate_jacobian(input, result); + calculate_jacobian( + input, result); gettimeofday(&end, NULL); - printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme aliasing c++ combined %0.6f\n", tdiff(&start, &end)); json enzyme; enzyme["name"] = "Enzyme c++ combined"; enzyme["runtime"] = tdiff(&start, &end); - for(unsigned i=0; i<5; i++) { + for (unsigned i = 0; i < 5; i++) { printf("%f ", result.J.vals[i]); enzyme["result"].push_back(result.J.vals[i]); } printf("\n"); test_suite["tools"].push_back(enzyme); } - } { struct BAInput input; - read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, input.X, input.w, input.obs, input.feats); + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); - struct BAOutput result = { - std::vector(2 * input.p), - std::vector(input.p), - BASparseMat(input.n, input.m, input.p) - }; + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + ba_objective_restrict(input.n, input.m, input.p, input.cams.data(), + input.X.data(), input.w.data(), input.obs.data(), + input.feats.data(), result.reproj_err.data(), + result.w_err.data()); + gettimeofday(&end, NULL); + printf("primal restrict c++ t=%0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "primal restrict c++"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.reproj_err[i]); + enzyme["result"].push_back(result.reproj_err[i]); + } + for (unsigned i = 0; i < 5; i++) { + printf("%f ", result.w_err[i]); + enzyme["result"].push_back(result.w_err[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + struct BAInput input; + read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams, + input.X, input.w, input.obs, input.feats); + struct BAOutput result = {std::vector(2 * input.p), + std::vector(input.p), + BASparseMat(input.n, input.m, input.p)}; { struct timeval start, end; gettimeofday(&start, NULL); - ba_objective( - input.n, - input.m, - input.p, - input.cams.data(), - input.X.data(), - input.w.data(), - input.obs.data(), - input.feats.data(), - result.reproj_err.data(), - result.w_err.data() - ); + ba_objective(input.n, input.m, input.p, input.cams.data(), input.X.data(), + input.w.data(), input.obs.data(), input.feats.data(), + result.reproj_err.data(), result.w_err.data()); gettimeofday(&end, NULL); - printf("primal c++ t=%0.6f\n", tdiff(&start, &end)); + printf("primal aliasing c++ t=%0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "primal c++"; + enzyme["name"] = "primal aliasing c++"; enzyme["runtime"] = tdiff(&start, &end); for(unsigned i=0; i<5; i++) { printf("%f ", result.reproj_err[i]); diff --git a/enzyme/benchmarks/ReverseMode/ba/ba.cpp b/enzyme/benchmarks/ReverseMode/ba/ba.cpp index b71e05a0a011..602af73d8d5f 100644 --- a/enzyme/benchmarks/ReverseMode/ba/ba.cpp +++ b/enzyme/benchmarks/ReverseMode/ba/ba.cpp @@ -43,17 +43,13 @@ double sqsum(int n, double const* x) return res; } - - -void cross(double const* a, double const* b, double* out) -{ +void cross_restrict(double const *__restrict a, double const *__restrict b, + double *__restrict out) { out[0] = a[1] * b[2] - a[2] * b[1]; out[1] = a[2] * b[0] - a[0] * b[2]; out[2] = a[0] * b[1] - a[1] * b[0]; } - - /* ===================================================================== */ /* MAIN LOGIC */ /* ===================================================================== */ @@ -68,8 +64,9 @@ void cross(double const* a, double const* b, double* out) // n = w / theta; // n_x = au_cross_matrix(n); // R = eye(3) + n_x*sin(theta) + n_x*n_x*(1 - cos(theta)); -void rodrigues_rotate_point(double const* __restrict rot, double const* __restrict pt, double *__restrict rotatedPt) -{ +void rodrigues_rotate_point_restrict(double const *__restrict rot, + double const *__restrict pt, + double *__restrict rotatedPt) { int i; double sqtheta = sqsum(3, rot); if (sqtheta != 0) @@ -87,7 +84,7 @@ void rodrigues_rotate_point(double const* __restrict rot, double const* __restri w[i] = rot[i] * theta_inverse; } - cross(w, pt, w_cross_pt); + cross_restrict(w, pt, w_cross_pt); tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * (1. - costheta); @@ -100,7 +97,7 @@ void rodrigues_rotate_point(double const* __restrict rot, double const* __restri else { double rot_cross_pt[3]; - cross(rot, pt, rot_cross_pt); + cross_restrict(rot, pt, rot_cross_pt); for (i = 0; i < 3; i++) { @@ -109,8 +106,6 @@ void rodrigues_rotate_point(double const* __restrict rot, double const* __restri } } - - void radial_distort(double const* rad_params, double *proj) { double rsq, L; @@ -120,10 +115,8 @@ void radial_distort(double const* rad_params, double *proj) proj[1] = proj[1] * L; } - - -void project(double const* __restrict cam, double const* __restrict X, double* __restrict proj) -{ +void project_restrict(double const *__restrict cam, double const *__restrict X, + double *__restrict proj) { double const* C = &cam[3]; double Xo[3], Xcam[3]; @@ -131,7 +124,7 @@ void project(double const* __restrict cam, double const* __restrict X, double* _ Xo[1] = X[1] - C[1]; Xo[2] = X[2] - C[2]; - rodrigues_rotate_point(&cam[0], Xo, Xcam); + rodrigues_rotate_point_restrict(&cam[0], Xo, Xcam); proj[0] = Xcam[0] / Xcam[2]; proj[1] = Xcam[1] / Xcam[2]; @@ -142,8 +135,6 @@ void project(double const* __restrict cam, double const* __restrict X, double* _ proj[1] = proj[1] * cam[6] + cam[8]; } - - // cam: 11 camera in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] // r1, r2, r3 are angle - axis rotation parameters(Rodrigues) // [C1 C2 C3]' is the camera center @@ -158,30 +149,23 @@ void project(double const* __restrict cam, double const* __restrict X, double* _ // distorted = radial_distort(projective2euclidean(Xcam), radial_parameters) // proj = distorted * f + principal_point // err = sqsum(proj - measurement) -void compute_reproj_error( - double const* __restrict cam, - double const* __restrict X, - double const* __restrict w, - double const* __restrict feat, - double * __restrict err -) -{ +void compute_reproj_error_restrict(double const *__restrict cam, + double const *__restrict X, + double const *__restrict w, + double const *__restrict feat, + double *__restrict err) { double proj[2]; - project(cam, X, proj); + project_restrict(cam, X, proj); err[0] = (*w)*(proj[0] - feat[0]); err[1] = (*w)*(proj[1] - feat[1]); } - - -void compute_zach_weight_error(double const* w, double* err) -{ +void compute_zach_weight_error_restrict(double const *__restrict w, + double *__restrict err) { *err = 1 - (*w)*(*w); } - - // n number of cameras // m number of points // p number of observations @@ -196,36 +180,23 @@ void compute_zach_weight_error(double const* w, double* err) // feats: 2*p features (x,y coordinates corresponding to observations) // reproj_err: 2*p errors of observations // w_err: p weight "error" terms -void ba_objective( - int n, - int m, - int p, - double const* cams, - double const* X, - double const* w, - int const* obs, - double const* feats, - double* reproj_err, - double* w_err -) -{ +void ba_objective_restrict(int n, int m, int p, double const *cams, + double const *X, double const *w, int const *obs, + double const *feats, double *reproj_err, + double *w_err) { int i; for (i = 0; i < p; i++) { int camIdx = obs[i * 2 + 0]; int ptIdx = obs[i * 2 + 1]; - compute_reproj_error( - &cams[camIdx * BA_NCAMPARAMS], - &X[ptIdx * 3], - &w[i], - &feats[i * 2], - &reproj_err[2 * i] - ); + compute_reproj_error_restrict(&cams[camIdx * BA_NCAMPARAMS], + &X[ptIdx * 3], &w[i], &feats[i * 2], + &reproj_err[2 * i]); } for (i = 0; i < p; i++) { - compute_zach_weight_error(&w[i], &w_err[i]); + compute_zach_weight_error_restrict(&w[i], &w_err[i]); } } @@ -234,32 +205,21 @@ extern int enzyme_dup; extern int enzyme_dupnoneed; void __enzyme_autodiff(...) noexcept; -void dcompute_reproj_error( - double const* cam, - double * dcam, - double const* X, - double * dX, - double const* w, - double * wb, - double const* feat, - double *err, - double *derr -) -{ - __enzyme_autodiff(compute_reproj_error, - enzyme_dup, cam, dcam, - enzyme_dup, X, dX, - enzyme_dup, w, wb, - enzyme_const, feat, - enzyme_dupnoneed, err, derr); +void dcompute_reproj_error_restrict(double const *cam, double *dcam, + double const *X, double *dX, + double const *w, double *wb, + double const *feat, double *err, + double *derr) { + __enzyme_autodiff(compute_reproj_error_restrict, enzyme_dup, cam, dcam, + enzyme_dup, X, dX, enzyme_dup, w, wb, enzyme_const, feat, + enzyme_dupnoneed, err, derr); } -void dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr) { - __enzyme_autodiff(compute_zach_weight_error, - enzyme_dup, w, dw, - enzyme_dupnoneed, err, derr); +void dcompute_zach_weight_error_restrict(double const *w, double *dw, + double *err, double *derr) { + __enzyme_autodiff(compute_zach_weight_error_restrict, enzyme_dup, w, dw, + enzyme_dupnoneed, err, derr); } - } @@ -911,3 +871,5 @@ void adept_compute_zach_weight_error(double const* w, double* dw, double* err, d *dw = aw.get_gradient(); } + +#include "ba_mayalias.h" diff --git a/enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h b/enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h new file mode 100644 index 000000000000..25197b52d7b2 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/ba/ba_mayalias.h @@ -0,0 +1,198 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +extern "C" { + +/* ===================================================================== */ +/* UTILS */ +/* ===================================================================== */ + +void cross(double const *a, double const *b, double *out) { + out[0] = a[1] * b[2] - a[2] * b[1]; + out[1] = a[2] * b[0] - a[0] * b[2]; + out[2] = a[0] * b[1] - a[1] * b[0]; +} + +/* ===================================================================== */ +/* MAIN LOGIC */ +/* ===================================================================== */ + +void compute_zach_weight_error(double const *w, double *err) { + *err = 1 - (*w) * (*w); +} + +// rot: 3 rotation parameters +// pt: 3 point to be rotated +// rotatedPt: 3 rotated point +// this is an efficient evaluation (part of +// the Ceres implementation) +// easy to understand calculation in matlab: +// theta = sqrt(sum(w. ^ 2)); +// n = w / theta; +// n_x = au_cross_matrix(n); +// R = eye(3) + n_x*sin(theta) + n_x*n_x*(1 - cos(theta)); +void rodrigues_rotate_point(double const *rot, double const *pt, + double *rotatedPt) { + int i; + double sqtheta = sqsum(3, rot); + if (sqtheta != 0) + { + double theta, costheta, sintheta, theta_inverse; + double w[3], w_cross_pt[3], tmp; + + theta = sqrt(sqtheta); + costheta = cos(theta); + sintheta = sin(theta); + theta_inverse = 1.0 / theta; + + for (i = 0; i < 3; i++) + { + w[i] = rot[i] * theta_inverse; + } + + cross(w, pt, w_cross_pt); + + tmp = (w[0] * pt[0] + w[1] * pt[1] + w[2] * pt[2]) * + (1. - costheta); + + for (i = 0; i < 3; i++) + { + rotatedPt[i] = pt[i] * costheta + w_cross_pt[i] * sintheta + w[i] * tmp; + } + } + else + { + double rot_cross_pt[3]; + cross(rot, pt, rot_cross_pt); + + for (i = 0; i < 3; i++) + { + rotatedPt[i] = pt[i] + rot_cross_pt[i]; + } + } +} + +void project(double const *cam, double const *X, double *proj) { + double const* C = &cam[3]; + double Xo[3], Xcam[3]; + + Xo[0] = X[0] - C[0]; + Xo[1] = X[1] - C[1]; + Xo[2] = X[2] - C[2]; + + rodrigues_rotate_point(&cam[0], Xo, Xcam); + + proj[0] = Xcam[0] / Xcam[2]; + proj[1] = Xcam[1] / Xcam[2]; + + radial_distort(&cam[9], proj); + + proj[0] = proj[0] * cam[6] + cam[7]; + proj[1] = proj[1] * cam[6] + cam[8]; +} + +// cam: 11 camera in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3 point +// feats: 2 feature (x,y coordinates) +// reproj_err: 2 +// projection function: +// Xcam = R * (X - C) +// distorted = radial_distort(projective2euclidean(Xcam), radial_parameters) +// proj = distorted * f + principal_point +// err = sqsum(proj - measurement) +void compute_reproj_error(double const *cam, double const *X, double const *w, + double const *feat, double *err) { + double proj[2]; + project(cam, X, proj); + + err[0] = (*w)*(proj[0] - feat[0]); + err[1] = (*w)*(proj[1] - feat[1]); +} + + + + +// n number of cameras +// m number of points +// p number of observations +// cams: 11*n cameras in format [r1 r2 r3 C1 C2 C3 f u0 v0 k1 k2] +// r1, r2, r3 are angle - axis rotation parameters(Rodrigues) +// [C1 C2 C3]' is the camera center +// f is the focal length in pixels +// [u0 v0]' is the principal point +// k1, k2 are radial distortion parameters +// X: 3*m points +// obs: 2*p observations (pairs cameraIdx, pointIdx) +// feats: 2*p features (x,y coordinates corresponding to observations) +// reproj_err: 2*p errors of observations +// w_err: p weight "error" terms +void ba_objective( + int n, + int m, + int p, + double const* cams, + double const* X, + double const* w, + int const* obs, + double const* feats, + double* reproj_err, + double* w_err +) +{ + int i; + for (i = 0; i < p; i++) + { + int camIdx = obs[i * 2 + 0]; + int ptIdx = obs[i * 2 + 1]; + compute_reproj_error( + &cams[camIdx * BA_NCAMPARAMS], + &X[ptIdx * 3], + &w[i], + &feats[i * 2], + &reproj_err[2 * i] + ); + } + + for (i = 0; i < p; i++) + { + compute_zach_weight_error(&w[i], &w_err[i]); + } +} + +extern int enzyme_const; +extern int enzyme_dup; +extern int enzyme_dupnoneed; +void __enzyme_autodiff(...) noexcept; + +void dcompute_reproj_error( + double const* cam, + double * dcam, + double const* X, + double * dX, + double const* w, + double * wb, + double const* feat, + double *err, + double *derr +) +{ + __enzyme_autodiff(compute_reproj_error, + enzyme_dup, cam, dcam, + enzyme_dup, X, dX, + enzyme_dup, w, wb, + enzyme_const, feat, + enzyme_dupnoneed, err, derr); +} + +void dcompute_zach_weight_error(double const* w, double* dw, double* err, double* derr) { + __enzyme_autodiff(compute_zach_weight_error, + enzyme_dup, w, dw, + enzyme_dupnoneed, err, derr); +} + +} From d08142b3f4a68bcfd942c58fe059e85c55884fff Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 19 Jul 2024 21:00:13 -0400 Subject: [PATCH 47/56] add gmm version without restrict --- enzyme/benchmarks/ReverseMode/adbench/gmm.h | 148 +++++++++++------- enzyme/benchmarks/ReverseMode/gmm/gmm.cpp | 49 +++--- .../benchmarks/ReverseMode/gmm/gmm_mayalias.h | 64 ++++++++ 3 files changed, 177 insertions(+), 84 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h index 45d589c7ae75..91b2220bf950 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h @@ -33,52 +33,50 @@ struct GMMParameters { }; extern "C" { -void gmm_objective( - int d, - int k, - int n, - double const* alphas, - double const* means, - double const* icf, - double const* x, - Wishart wishart, - double* err -); - void dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb); - - void gmm_objective_b(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb); - - void adept_dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb); - - void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, - double *alphasb, const double *means, - double *meansb, const double *icf, - double *icfb, const double *x, - Wishart &wishart, double *err, - double *errb); - - void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, - const double *means, const double *icf, - const double *x, Wishart &wishart, - double *err); - - void rust_dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart &wishart, double *err, double * - errb); - - void rust_gmm_objective(int d, int k, int n, const double *alphas, - const double *means, const double *icf, - const double *x, Wishart &wishart, double *err); +void gmm_objective(int d, int k, int n, double const *alphas, + double const *means, double const *icf, double const *x, + Wishart wishart, double *err); +void gmm_objective_restrict(int d, int k, int n, double const *alphas, + double const *means, double const *icf, + double const *x, Wishart wishart, double *err); +void dgmm_objective_restrict(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, double *icfb, + const double *x, Wishart wishart, double *err, + double *errb); +void dgmm_objective(int d, int k, int n, const double *alphas, double *alphasb, + const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, double *err, + double *errb); + +void gmm_objective_b(int d, int k, int n, const double *alphas, double *alphasb, + const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, + double *err, double *errb); + +void adept_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, double *meansb, + const double *icf, double *icfb, const double *x, + Wishart wishart, double *err, double *errb); + +void rust_unsafe_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, double *icfb, + const double *x, Wishart &wishart, double *err, + double *errb); + +void rust_unsafe_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, + const double *x, Wishart &wishart, double *err); + +void rust_dgmm_objective(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, double *meansb, + const double *icf, double *icfb, const double *x, + Wishart &wishart, double *err, double *errb); + +void rust_gmm_objective(int d, int k, int n, const double *alphas, + const double *means, const double *icf, const double *x, + Wishart &wishart, double *err); } void read_gmm_instance(const string& fn, @@ -302,14 +300,44 @@ int main(const int argc, const char* argv[]) { struct GMMOutput result = { 0, std::vector(Jcols) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme c++ restrict combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme restrict combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + printf("\n"); + test_suite["tools"].push_back(enzyme); + } + } + + { + + struct GMMInput input; + read_gmm_instance("data/" + path, &input.d, &input.k, &input.n, + input.alphas, input.means, input.icf, input.x, + input.wishart, params.replicate_point); + + int Jcols = (input.k * (input.d + 1) * (input.d + 2)) / 2; + + struct GMMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme c++ combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme c++ mayalias combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme combined"; + enzyme["name"] = "Enzyme mayalias combined"; enzyme["runtime"] = tdiff(&start, &end); for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); i++) { @@ -319,9 +347,8 @@ int main(const int argc, const char* argv[]) { printf("\n"); test_suite["tools"].push_back(enzyme); } - } - + { struct GMMInput input; @@ -337,10 +364,25 @@ int main(const int argc, const char* argv[]) { gettimeofday(&start, NULL); auto res = primal(input); gettimeofday(&end, NULL); - printf("c++ primal combined t=%0.6f, err=%f\n", tdiff(&start, &end), res); - + printf("c++ primal mayalias combined t=%0.6f, err=%f\n", + tdiff(&start, &end), res); + + json primal; + primal["name"] = "C++ primal mayalias"; + primal["runtime"] = tdiff(&start, &end); + primal["result"].push_back(res); + test_suite["tools"].push_back(primal); + } + { + struct timeval start, end; + gettimeofday(&start, NULL); + auto res = primal(input); + gettimeofday(&end, NULL); + printf("c++ primal restrict combined t=%0.6f, err=%f\n", + tdiff(&start, &end), res); + json primal; - primal["name"] = "C++ primal"; + primal["name"] = "C++ primal restrict"; primal["runtime"] = tdiff(&start, &end); primal["result"].push_back(res); test_suite["tools"].push_back(primal); diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp index 866059217b96..37fa90574157 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm.cpp @@ -202,21 +202,13 @@ void Qtimesx( } } - - -void gmm_objective( - int d, - int k, - int n, - double const* __restrict alphas, - double const* __restrict means, - double const* __restrict icf, - double const* __restrict x, - Wishart wishart, - double* __restrict err -) -{ - #define int int64_t +void gmm_objective_restrict(int d, int k, int n, + double const *__restrict alphas, + double const *__restrict means, + double const *__restrict icf, + double const *__restrict x, Wishart wishart, + double *__restrict err) { +#define int int64_t int ix, ik; const double CONSTANT = -n * d * 0.5 * log(2 * PI); int icf_sz = d * (d + 1) / 2; @@ -265,23 +257,16 @@ extern int enzyme_dupnoneed; void __enzyme_autodiff(...) noexcept; // * tapenade -b -o gmm_tapenade -head "gmm_objective(err)/(alphas means icf)" gmm.c -void dgmm_objective(int d, int k, int n, const double *alphas, double * - alphasb, const double *means, double *meansb, const double *icf, - double *icfb, const double *x, Wishart wishart, double *err, double * - errb) { - __enzyme_autodiff( - gmm_objective, - enzyme_const, d, - enzyme_const, k, - enzyme_const, n, - enzyme_dup, alphas, alphasb, - enzyme_dup, means, meansb, - enzyme_dup, icf, icfb, - enzyme_const, x, - enzyme_const, wishart, - enzyme_dupnoneed, err, errb); +void dgmm_objective_restrict(int d, int k, int n, const double *alphas, + double *alphasb, const double *means, + double *meansb, const double *icf, double *icfb, + const double *x, Wishart wishart, double *err, + double *errb) { + __enzyme_autodiff(gmm_objective_restrict, enzyme_const, d, enzyme_const, k, + enzyme_const, n, enzyme_dup, alphas, alphasb, enzyme_dup, + means, meansb, enzyme_dup, icf, icfb, enzyme_const, x, + enzyme_const, wishart, enzyme_dupnoneed, err, errb); } - } @@ -1050,3 +1035,5 @@ void adept_dgmm_objective(int d, int k, int n, const double *alphas, double * delete[] ameans; delete[] aicf; } + +#include "gmm_mayalias.h" diff --git a/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h new file mode 100644 index 000000000000..91e207fbcceb --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/gmm/gmm_mayalias.h @@ -0,0 +1,64 @@ +void gmm_objective(int d, int k, int n, double const *alphas, + double const *means, double const *icf, double const *x, + Wishart wishart, double *err) { +#define int int64_t + int ix, ik; + const double CONSTANT = -n * d * 0.5 * log(2 * PI); + int icf_sz = d * (d + 1) / 2; + + double *Qdiags = (double *)malloc(d * k * sizeof(double)); + double *sum_qs = (double *)malloc(k * sizeof(double)); + double *xcentered = (double *)malloc(d * sizeof(double)); + double *Qxcentered = (double *)malloc(d * sizeof(double)); + double *main_term = (double *)malloc(k * sizeof(double)); + + preprocess_qs(d, k, icf, &sum_qs[0], &Qdiags[0]); + + double slse = 0.; + for (ix = 0; ix < n; ix++) { + for (ik = 0; ik < k; ik++) { + subtract(d, &x[ix * d], &means[ik * d], &xcentered[0]); + Qtimesx(d, &Qdiags[ik * d], &icf[ik * icf_sz + d], &xcentered[0], + &Qxcentered[0]); + // two caches for qxcentered at idx 0 and at arbitrary index + main_term[ik] = alphas[ik] + sum_qs[ik] - 0.5 * sqnorm(d, &Qxcentered[0]); + } + + // storing cmp for max of main_term + // 2 x (0 and arbitrary) storing sub to exp + // storing sum for use in log + slse = slse + log_sum_exp(k, &main_term[0]); + } + + // storing cmp of alphas + double lse_alphas = log_sum_exp(k, alphas); + + *err = CONSTANT + slse - n * lse_alphas + + log_wishart_prior(d, k, wishart, &sum_qs[0], &Qdiags[0], icf); + + free(Qdiags); + free(sum_qs); + free(xcentered); + free(Qxcentered); + free(main_term); +#undef int +} + +// * tapenade -b -o gmm_tapenade -head "gmm_objective(err)/(alphas means icf)" gmm.c +void dgmm_objective(int d, int k, int n, const double *alphas, double * + alphasb, const double *means, double *meansb, const double *icf, + double *icfb, const double *x, Wishart wishart, double *err, double * + errb) { + __enzyme_autodiff( + gmm_objective, + enzyme_const, d, + enzyme_const, k, + enzyme_const, n, + enzyme_dup, alphas, alphasb, + enzyme_dup, means, meansb, + enzyme_dup, icf, icfb, + enzyme_const, x, + enzyme_const, wishart, + enzyme_dupnoneed, err, errb); +} + From d6b35b34b68d952ef0766ce2c7ec9b2ce3e78f9a Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Fri, 19 Jul 2024 21:06:44 -0400 Subject: [PATCH 48/56] add lstm version without restrict --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 55 +++++- enzyme/benchmarks/ReverseMode/lstm/lstm.cpp | 86 +++------ .../ReverseMode/lstm/lstm_mayalias.h | 175 ++++++++++++++++++ 3 files changed, 251 insertions(+), 65 deletions(-) create mode 100644 enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 7472bf37beb2..6318f5077edc 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -60,6 +60,12 @@ void dlstm_objective(int l, int c, int b, double const *main_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); +void dlstm_objective_restrict(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); + void lstm_objective_b(int l, int c, int b, const double *main_params, double *main_paramsb, const double *extra_params, double *extra_paramsb, double *state, @@ -297,25 +303,58 @@ int main(const int argc, const char* argv[]) { int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = { 0, std::vector(Jcols) }; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_jacobian(input, result); + gettimeofday(&end, NULL); + printf("Enzyme restrict combined %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "Enzyme restrict combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, NULL); calculate_jacobian(input, result); gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f\n", tdiff(&start, &end)); + printf("Enzyme mayalias combined %0.6f\n", tdiff(&start, &end)); json enzyme; - enzyme["name"] = "Enzyme combined"; - enzyme["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - enzyme["result"].push_back(result.gradient[i]); + enzyme["name"] = "Enzyme mayalias combined"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); } test_suite["tools"].push_back(enzyme); printf("\n"); } - } { diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp index dbbc9929a7cc..e643efd738a3 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp @@ -50,15 +50,10 @@ double logsumexp(double const* vect, int sz) // LSTM OBJECTIVE // The LSTM model -void lstm_model( - int hsize, - double const* __restrict weight, - double const* __restrict bias, - double* __restrict hidden, - double* __restrict cell, - double const* __restrict input -) -{ +void lstm_model_restrict(int hsize, double const *__restrict weight, + double const *__restrict bias, + double *__restrict hidden, double *__restrict cell, + double const *__restrict input) { // TODO NOTE THIS //__builtin_assume(hsize > 0); @@ -94,16 +89,9 @@ void lstm_model( } // Predict LSTM output given an input -void lstm_predict( - int l, - int b, - double const* __restrict w, - double const* __restrict w2, - double* __restrict s, - double const* __restrict x, - double* __restrict x2 -) -{ +void lstm_predict_restrict(int l, int b, double const *__restrict w, + double const *__restrict w2, double *__restrict s, + double const *__restrict x, double *__restrict x2) { int i; for (i = 0; i < b; i++) { @@ -113,7 +101,8 @@ void lstm_predict( double* xp = x2; for (i = 0; i <= 2 * l * b - 1; i += 2 * b) { - lstm_model(b, &(w[i * 4]), &(w[(i + b) * 4]), &(s[i]), &(s[i + b]), xp); + lstm_model_restrict(b, &(w[i * 4]), &(w[(i + b) * 4]), &(s[i]), + &(s[i + b]), xp); xp = &(s[i]); } @@ -124,17 +113,12 @@ void lstm_predict( } // LSTM objective (loss function) -void lstm_objective( - int l, - int c, - int b, - double const* __restrict main_params, - double const* __restrict extra_params, - double* __restrict state, - double const* __restrict sequence, - double* __restrict loss -) -{ +void lstm_objective_restrict(int l, int c, int b, + double const *__restrict main_params, + double const *__restrict extra_params, + double *__restrict state, + double const *__restrict sequence, + double *__restrict loss) { int i, t; double total = 0.0; int count = 0; @@ -147,7 +131,8 @@ void lstm_objective( __builtin_assume(b>0); for (t = 0; t <= (c - 1) * b - 1; t += b) { - lstm_predict(l, b, main_params, extra_params, state, input, ypred); + lstm_predict_restrict(l, b, main_params, extra_params, state, input, + ypred); lse = logsumexp(ypred, b); for (i = 0; i < b; i++) { @@ -177,32 +162,17 @@ void __enzyme_autodiff(...) noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss -) -{ - __enzyme_autodiff(lstm_objective, - enzyme_const, l, - enzyme_const, c, - enzyme_const, b, - enzyme_dup, main_params, dmain_params, - enzyme_dup, extra_params, dextra_params, - enzyme_const, state, - enzyme_const, sequence, - enzyme_dupnoneed, loss, dloss - ); +void dlstm_objective_restrict(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss) { + __enzyme_autodiff(lstm_objective_restrict, enzyme_const, l, enzyme_const, c, + enzyme_const, b, enzyme_dup, main_params, dmain_params, + enzyme_dup, extra_params, dextra_params, enzyme_const, + state, enzyme_const, sequence, enzyme_dupnoneed, loss, + dloss); } - } @@ -728,3 +698,5 @@ void adept_dlstm_objective(int l, int c, int b, const double *main_params, doubl } #endif + +#include "lstm_mayalias.h" diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h new file mode 100644 index 000000000000..d2bbdb631224 --- /dev/null +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h @@ -0,0 +1,175 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +/* + * File "lstm_b_tapenade_generated.c" is generated by Tapenade 3.14 (r7259) from this file. + * To reproduce such a generation you can use Tapenade CLI + * (can be downloaded from http://www-sop.inria.fr/tropics/tapenade/downloading.html) + * + * After installing use the next command to generate a file: + * + * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c + * + * This will produce a file "lstm_tapenade_b.c" which content will be the same as the content of the file "lstm_b_tapenade_generated.c", + * except one-line header. Moreover a log-file "lstm_tapenade_b.msg" will be produced. + * + * NOTE: the code in "lstm_b_tapenade_generated.c" is wrong and won't work. + * REPAIRED SOURCE IS STORED IN THE FILE "lstm_b.c". + * You can either use diff tool or read "lstm_b.c" header to figure out what changes was performed to fix the code. + * + * NOTE: you can also use Tapenade web server (http://tapenade.inria.fr:8080/tapenade/index.jsp) + * for generating but the result can be slightly different. + */ + +// #include "../adbench/lstm.h" + +extern "C" { +// #include "lstm.h" + +// UTILS +// Sigmoid on scalar +// double sigmoid(double x) +//{ +// return 1.0 / (1.0 + exp(-x)); +//} +// +//// log(sum(exp(x), 2)) +// double logsumexp(double const* vect, int sz) +//{ +// double sum = 0.0; +// int i; +// +// for (i = 0; i < sz; i++) +// { +// sum += exp(vect[i]); +// } +// +// sum += 2; +// return log(sum); +// } + +// LSTM OBJECTIVE +// The LSTM model +void lstm_model(int hsize, double const *weight, double const *bias, + double *hidden, double *cell, double const *input) { + // TODO NOTE THIS + //__builtin_assume(hsize > 0); + + double *gates = (double *)malloc(4 * hsize * sizeof(double)); + double *forget = &(gates[0]); + double *ingate = &(gates[hsize]); + double *outgate = &(gates[2 * hsize]); + double *change = &(gates[3 * hsize]); + + int i; + // caching input + // hidden (needed) + for (i = 0; i < hsize; i++) { + forget[i] = sigmoid(input[i] * weight[i] + bias[i]); + ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); + outgate[i] = + sigmoid(input[i] * weight[2 * hsize + i] + bias[2 * hsize + i]); + change[i] = tanh(hidden[i] * weight[3 * hsize + i] + bias[3 * hsize + i]); + } + + // caching cell (needed) + for (i = 0; i < hsize; i++) { + cell[i] = cell[i] * forget[i] + ingate[i] * change[i]; + } + + for (i = 0; i < hsize; i++) { + hidden[i] = outgate[i] * tanh(cell[i]); + } + + free(gates); +} + +// Predict LSTM output given an input +void lstm_predict(int l, int b, double const *w, double const *w2, double *s, + double const *x, double *x2) { + int i; + for (i = 0; i < b; i++) { + x2[i] = x[i] * w2[i]; + } + + double *xp = x2; + for (i = 0; i <= 2 * l * b - 1; i += 2 * b) { + lstm_model(b, &(w[i * 4]), &(w[(i + b) * 4]), &(s[i]), &(s[i + b]), xp); + xp = &(s[i]); + } + + for (i = 0; i < b; i++) { + x2[i] = xp[i] * w2[b + i] + w2[2 * b + i]; + } +} + +// LSTM objective (loss function) +void lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss) { + int i, t; + double total = 0.0; + int count = 0; + const double *input = &(sequence[0]); + double *ypred = (double *)malloc(b * sizeof(double)); + double *ynorm = (double *)malloc(b * sizeof(double)); + const double *ygold; + double lse; + + __builtin_assume(b > 0); + for (t = 0; t <= (c - 1) * b - 1; t += b) { + lstm_predict(l, b, main_params, extra_params, state, input, ypred); + lse = logsumexp(ypred, b); + for (i = 0; i < b; i++) { + ynorm[i] = ypred[i] - lse; + } + + ygold = &(sequence[t + b]); + for (i = 0; i < b; i++) { + total += ygold[i] * ynorm[i]; + } + + count += b; + input = ygold; + } + + *loss = -total / count; + + free(ypred); + free(ynorm); +} + +extern int enzyme_const; +extern int enzyme_dup; +extern int enzyme_dupnoneed; +void __enzyme_autodiff(...) noexcept; + +// * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c + +void dlstm_objective( + int l, + int c, + int b, + double const* main_params, + double* dmain_params, + double const* extra_params, + double* dextra_params, + double* state, + double const* sequence, + double* loss, + double* dloss +) +{ + __enzyme_autodiff(lstm_objective, + enzyme_const, l, + enzyme_const, c, + enzyme_const, b, + enzyme_dup, main_params, dmain_params, + enzyme_dup, extra_params, dextra_params, + enzyme_const, state, + enzyme_const, sequence, + enzyme_dupnoneed, loss, dloss + ); +} + +} From 55a76b9ecaafb1aa7e66e3dc757376118af1b610 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 22 Jul 2024 17:16:43 -0400 Subject: [PATCH 49/56] Revert "smaller perf improvements" This reverts commit 629f87c4b0a184d52390e830c27965eaa4e25544. --- .../benchmarks/ReverseMode/lstm/src/safe.rs | 38 +++++++------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index 3cb5ca449747..ea9e71a67560 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -29,17 +29,14 @@ fn lstm_model( let mut gates = vec![0.0; 4 * hsize]; let gates = &mut gates[..4 * hsize]; let (a, b) = gates.split_at_mut(2 * hsize); - let (forget, ingate) = a.split_at_mut(hsize); - let (outgate, change) = b.split_at_mut(hsize); + let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); assert_eq!(weight.len(), 4 * hsize); assert_eq!(bias.len(), 4 * hsize); assert_eq!(hidden.len(), hsize); - assert_eq!(ingate.len(), hsize); - assert_eq!(change.len(), hsize); assert!(cell.len() >= hsize); assert!(input.len() >= hsize); - // Using unchecked indexing here was slightly slower for some reason + // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); ingate[i] = sigmoid(hidden[i] * weight[hsize + i] + bias[hsize + i]); @@ -70,42 +67,33 @@ fn lstm_predict( for i in 0..b { x2[i] = x[i] * w2[i]; } - - let (s1, s2) = s.split_at_mut(b); - lstm_model( - b, - &w[0..b * 4], - &w[b * 4..2 * b * 4], - s1, - s2, - x2.as_mut(), - ); - - assert_eq!(s.len(), 2 * b * l); - assert_eq!(w.len(), 4 * b * l); - for i in 1..l { - let i = i * 2 * b; - let (xp, s1, s2) = { + + let mut i = 0; + while i <= 2 * l * b - 1 { + // make borrow-checker happy with non-overlapping mutable references + let (xp, s1, s2) = if i == 0 { + let (s1, s2) = s.split_at_mut(b); + (x2.as_mut(), s1, s2) + } else { let tmp = &mut s[i - 2 * b..]; let (a, d) = tmp.split_at_mut(2 * b); let (d, c) = d.split_at_mut(b); + (a, d, c) }; - let (w1, w2) = w.split_at((i + b) * 4); lstm_model( b, - //&w1[i * 4..], - //&w2[0..(i + 2 * b) * 4], &w[i * 4..(i + b) * 4], &w[(i + b) * 4..(i + 2 * b) * 4], s1, s2, xp, ); + + i += 2 * b; } - let i = 2 * l * b; let xp = &s[i - 2 * b..]; for i in 0..b { From de87081b328c07bd09a94b5185d52e1c7d49eeef Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Mon, 22 Jul 2024 18:26:53 -0400 Subject: [PATCH 50/56] adding lstm primal cxx overhead benchmark --- enzyme/benchmarks/ReverseMode/adbench/lstm.h | 160 ++++++++++++++---- enzyme/benchmarks/ReverseMode/lstm/lstm.cpp | 22 +-- .../ReverseMode/lstm/lstm_mayalias.h | 41 ++--- 3 files changed, 153 insertions(+), 70 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h index 6318f5077edc..a24c39132215 100644 --- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h +++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h @@ -49,16 +49,25 @@ void rust_safe_lstm_objective(int l, int c, int b, double const *main_params, double const *extra_params, double *state, double const *sequence, double *loss); +void cxx_restrict_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + +void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss); + void rust_safe_dlstm_objective(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss); -void dlstm_objective(int l, int c, int b, double const *main_params, - double *dmain_params, double const *extra_params, - double *dextra_params, double *state, - double const *sequence, double *loss, double *dloss); +void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss); void dlstm_objective_restrict(int l, int c, int b, double const *main_params, double *dmain_params, double const *extra_params, @@ -187,6 +196,28 @@ void calculate_jacobian(struct LSTMInput &input, struct LSTMOutput &result) } } +double calculate_mayalias_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + cxx_mayalias_lstm_objective( + input.l, input.c, input.b, input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + +double calculate_restrict_primal(struct LSTMInput &input) { + double loss = 0.0; + for (int i = 0; i < 100; i++) { + cxx_restrict_lstm_objective( + input.l, input.c, input.b, input.main_params.data(), + input.extra_params.data(), input.state.data(), + input.sequence.data(), &loss); + } + return loss; +} + double calculate_unsafe_primal(struct LSTMInput &input) { double loss = 0.0; for (int i = 0; i < 100; i++) { @@ -257,38 +288,39 @@ int main(const int argc, const char* argv[]) { } - { + //{ - struct LSTMInput input = {}; + // struct LSTMInput input = {}; - // Read instance - read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, input.main_params, input.extra_params, input.state, - input.sequence); + //// Read instance + // read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + // input.main_params, input.extra_params, input.state, + // input.sequence); - std::vector state = std::vector(input.state.size()); + // std::vector state = std::vector(input.state.size()); - int Jcols = 8 * input.l * input.b + 3 * input.b; - struct LSTMOutput result = { 0, std::vector(Jcols) }; + // int Jcols = 8 * input.l * input.b + 3 * input.b; + // struct LSTMOutput result = { 0, std::vector(Jcols) }; - { - struct timeval start, end; - gettimeofday(&start, NULL); - calculate_jacobian(input, result); - gettimeofday(&end, NULL); - printf("Adept combined %0.6f\n", tdiff(&start, &end)); - json adept; - adept["name"] = "Adept combined"; - adept["runtime"] = tdiff(&start, &end); - for (unsigned i = result.gradient.size() - 5; - i < result.gradient.size(); i++) { - printf("%f ", result.gradient[i]); - adept["result"].push_back(result.gradient[i]); - } - test_suite["tools"].push_back(adept); - printf("\n"); - } + //{ + // struct timeval start, end; + // gettimeofday(&start, NULL); + // calculate_jacobian(input, result); + // gettimeofday(&end, NULL); + // printf("Adept combined %0.6f\n", tdiff(&start, &end)); + // json adept; + // adept["name"] = "Adept combined"; + // adept["runtime"] = tdiff(&start, &end); + // for (unsigned i = result.gradient.size() - 5; + // i < result.gradient.size(); i++) { + // printf("%f ", result.gradient[i]); + // adept["result"].push_back(result.gradient[i]); + // } + // test_suite["tools"].push_back(adept); + // printf("\n"); + //} - } + //} { @@ -340,7 +372,7 @@ int main(const int argc, const char* argv[]) { { struct timeval start, end; gettimeofday(&start, NULL); - calculate_jacobian(input, result); + calculate_jacobian(input, result); gettimeofday(&end, NULL); printf("Enzyme mayalias combined %0.6f\n", tdiff(&start, &end)); json enzyme; @@ -438,6 +470,72 @@ int main(const int argc, const char* argv[]) { int Jcols = 8 * input.l * input.b + 3 * input.b; struct LSTMOutput result = {0, std::vector(Jcols)}; + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_mayalias_primal(input); + gettimeofday(&end, NULL); + printf("C++ mayalias primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "C++ mayalias primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + + { + struct timeval start, end; + gettimeofday(&start, NULL); + calculate_restrict_primal(input); + gettimeofday(&end, NULL); + printf("C++ restrict primal %0.6f\n", tdiff(&start, &end)); + json enzyme; + enzyme["name"] = "C++ restrict primal"; + enzyme["runtime"] = tdiff(&start, &end); + for (unsigned i = result.gradient.size() - 5; i < result.gradient.size(); + i++) { + printf("%f ", result.gradient[i]); + enzyme["result"].push_back(result.gradient[i]); + } + test_suite["tools"].push_back(enzyme); + + printf("\n"); + } + } + { + + struct LSTMInput input = {}; + + // Read instance + read_lstm_instance("data/" + path, &input.l, &input.c, &input.b, + input.main_params, input.extra_params, input.state, + input.sequence); + + std::vector state = std::vector(input.state.size()); + + int Jcols = 8 * input.l * input.b + 3 * input.b; + struct LSTMOutput result = {0, std::vector(Jcols)}; + { struct timeval start, end; gettimeofday(&start, NULL); diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp index e643efd738a3..ade0b2237510 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm.cpp @@ -113,12 +113,12 @@ void lstm_predict_restrict(int l, int b, double const *__restrict w, } // LSTM objective (loss function) -void lstm_objective_restrict(int l, int c, int b, - double const *__restrict main_params, - double const *__restrict extra_params, - double *__restrict state, - double const *__restrict sequence, - double *__restrict loss) { +void cxx_restrict_lstm_objective(int l, int c, int b, + double const *__restrict main_params, + double const *__restrict extra_params, + double *__restrict state, + double const *__restrict sequence, + double *__restrict loss) { int i, t; double total = 0.0; int count = 0; @@ -167,11 +167,11 @@ void dlstm_objective_restrict(int l, int c, int b, double const *main_params, double *dextra_params, double *state, double const *sequence, double *loss, double *dloss) { - __enzyme_autodiff(lstm_objective_restrict, enzyme_const, l, enzyme_const, c, - enzyme_const, b, enzyme_dup, main_params, dmain_params, - enzyme_dup, extra_params, dextra_params, enzyme_const, - state, enzyme_const, sequence, enzyme_dupnoneed, loss, - dloss); + __enzyme_autodiff(cxx_restrict_lstm_objective, enzyme_const, l, + enzyme_const, c, enzyme_const, b, enzyme_dup, main_params, + dmain_params, enzyme_dup, extra_params, dextra_params, + enzyme_const, state, enzyme_const, sequence, + enzyme_dupnoneed, loss, dloss); } } diff --git a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h index d2bbdb631224..06401ff35a66 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h +++ b/enzyme/benchmarks/ReverseMode/lstm/lstm_mayalias.h @@ -104,9 +104,9 @@ void lstm_predict(int l, int b, double const *w, double const *w2, double *s, } // LSTM objective (loss function) -void lstm_objective(int l, int c, int b, double const *main_params, - double const *extra_params, double *state, - double const *sequence, double *loss) { +void cxx_mayalias_lstm_objective(int l, int c, int b, double const *main_params, + double const *extra_params, double *state, + double const *sequence, double *loss) { int i, t; double total = 0.0; int count = 0; @@ -146,30 +146,15 @@ void __enzyme_autodiff(...) noexcept; // * tapenade -b -o lstm_tapenade -head "lstm_objective(loss)/(main_params extra_params)" lstm.c -void dlstm_objective( - int l, - int c, - int b, - double const* main_params, - double* dmain_params, - double const* extra_params, - double* dextra_params, - double* state, - double const* sequence, - double* loss, - double* dloss -) -{ - __enzyme_autodiff(lstm_objective, - enzyme_const, l, - enzyme_const, c, - enzyme_const, b, - enzyme_dup, main_params, dmain_params, - enzyme_dup, extra_params, dextra_params, - enzyme_const, state, - enzyme_const, sequence, - enzyme_dupnoneed, loss, dloss - ); +void dlstm_objective_mayalias(int l, int c, int b, double const *main_params, + double *dmain_params, double const *extra_params, + double *dextra_params, double *state, + double const *sequence, double *loss, + double *dloss) { + __enzyme_autodiff(cxx_mayalias_lstm_objective, enzyme_const, l, enzyme_const, + c, enzyme_const, b, enzyme_dup, main_params, dmain_params, + enzyme_dup, extra_params, dextra_params, enzyme_const, + state, enzyme_const, sequence, enzyme_dupnoneed, loss, + dloss); } - } From 7d70dc555a44c024736fd3ed7dd17aabf2261cb8 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 18 Sep 2024 22:21:16 -0400 Subject: [PATCH 51/56] fix ode-real example, correct results, faster than c++, without dupnoneed --- .../ReverseMode/ode-real/Cargo.toml | 1 + .../ReverseMode/ode-real/Makefile.make | 36 ++-- .../benchmarks/ReverseMode/ode-real/ode.cpp | 199 ++++-------------- .../ReverseMode/ode-real/src/lib.rs | 158 +++++++------- 4 files changed, 139 insertions(+), 255 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml index 27a031a49570..880d7aca1567 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml +++ b/enzyme/benchmarks/ReverseMode/ode-real/Cargo.toml @@ -13,6 +13,7 @@ crate-type = ["lib"] [profile.release] lto = "fat" opt-level = 3 +panic = 'abort' #debug = true #strip = "none" diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make index 16033d158a3a..083ef1176feb 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make @@ -1,4 +1,4 @@ -# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadEnzyme %enzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B ode-raw.ll ode-opt.ll results.txt VERBOSE=1 -f %s .PHONY: clean @@ -10,23 +10,31 @@ clean: $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib -%-unopt.ll: %.cpp - clang++ $(BENCH) $^ -O2 -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm - #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# %-unopt.ll: %.cpp +# clang++ $(BENCH) $^ -O2 -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# #clang++ $(BENCH) $^ -O1 -Xclang -disable-llvm-passes -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm +# +# %-raw.ll: %-unopt.ll +# @echo $(LOAD) +# opt $^ $(LOAD) -o $@ -S +# +# %-opt.ll: %-raw.ll +# opt $^ -o $@ -S +# #opt $^ -O2 -o $@ -S -%-raw.ll: %-unopt.ll - @echo $(LOAD) - opt $^ $(LOAD) -o $@ -S +#ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a +# clang++ -O2 $^ -o $@ $(BENCHLINK) -%-opt.ll: %-raw.ll - opt $^ -o $@ -S - #opt $^ -O2 -o $@ -S -ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a - clang++ -O2 $^ -o $@ $(BENCHLINK) +ode.o: ode.cpp $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a + #/home/manuel/prog/llvm18/build/bin/clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 + clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -DBOOST_DIR=/u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -L /usr/lib/gcc/x86_64-linux-gnu/11 -#ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a -# clang++ $(LOAD) $(BENCH) ode.cpp -I /u/drehwald/prog/boost_1_81_0 -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ode.o -lpthread $(BENCHLINK) -lm -lode -L $(dir)/benchmarks/ReverseMode/ode/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 +#fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a +# clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -lfft -L $(dir)/benchmarks/ReverseMode/fft/target/release/ -L /usr/lib/gcc/x86_64-linux-gnu/11 + +#gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a +# clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 results.txt: ode.o diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp index 7c7113df9641..17e677abc65d 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp @@ -109,14 +109,27 @@ void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const dou typedef boost::array< double , 2 * N * N > state_type; -void lorenz( const state_type &x , state_type &dxdt , double t ) + +void lorenz( const state_type &x, state_type &dxdt, double t ) { // Extract the parameters double p[3] = { /*A*/ 3.4, /*B*/ 1, /*alpha*/10. }; brusselator_2d_loop(dxdt.c_array(), dxdt.c_array() + N * N, x.data(), x.data() + N * N, p, t); } -// init_brusselator(x.c_array(), x.c_array() + N*N) +extern "C" void rust_lorenz(const double* x, double* dxdt, double t); +extern "C" void rust_dbrusselator_2d_loop(const double* x, double* dx, double* adjoint, const double* p, double* dp, double t); + +double rustfoobar(const double *p, /*const*/ state_type x, const state_type adjoint, double t) { + double dp[3] = { 0. }; + + state_type dx = { 0. }; + + state_type dadjoint_inp = adjoint; + + rust_dbrusselator_2d_loop(dadjoint_inp.c_array(), x.c_array(), dx.c_array(), p, dp, t); + return dx[0]; +} double foobar(const double* p, const state_type x, const state_type adjoint, double t) { double dp[3] = { 0. }; @@ -128,10 +141,10 @@ double foobar(const double* p, const state_type x, const state_type adjoint, dou state_type dxdu; __enzyme_autodiff(brusselator_2d_loop, -// enzyme_dup, dxdu.c_array(), dadjoint_inp.c_array(), -// enzyme_dup, dxdu.c_array() + N * N, dadjoint_inp.c_array() + N * N, - enzyme_dupnoneed, nullptr, dadjoint_inp.data(), - enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, + enzyme_dup, dxdu.c_array(), dadjoint_inp.c_array(), + enzyme_dup, dxdu.c_array() + N * N, dadjoint_inp.c_array() + N * N, + // enzyme_dupnoneed, nullptr, dadjoint_inp.data(), + // enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, enzyme_dup, x.data(), dx.data(), enzyme_dup, x.data() + N * N, dx.data() + N * N, enzyme_dup, p, dp, @@ -545,171 +558,47 @@ int main(int argc, char** argv) { res = foobar(p, x, adjoint, t); gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); - } - //printf("res=%f\n", foobar(1000)); -} - - -#if 0 - -typedef boost::array< double , 6 > state_type; - -void lorenz( const state_type &x , state_type &dxdt , double t ) -{ - // Extract the parameters - double k1 = x[3]; - double k2 = x[4]; - double k3 = x[5]; - - dxdt[0] = -k1 * x[0] + k3 * x[1] * x[2]; - dxdt[1] = k1 * x[0] - k2 * x[1] * x[1] - k3 * x[1] * x[2]; - dxdt[2] = k2 * x[1] * x[1]; - - // Don't change the parameters p - dxdt[3] = 0; - dxdt[4] = 0; - dxdt[5] = 0; -} - -double foobar(double* p, uint64_t iters) { - state_type x = { 1.0, 0, 0, p[0], p[1], p[2] }; // initial conditions - double t = 1e5; - typedef controlled_runge_kutta< runge_kutta_dopri5< state_type , typename state_type::value_type , state_type , double > > stepper_type; - //typedef euler< state_type , typename state_type::value_type , state_type , double > stepper_type; - integrate_const( stepper_type(), lorenz , x , 0.0 , t, t/iters ); - - return x[0]; -} - -typedef boost::array< adouble , 6 > astate_type; - -void alorenz( const astate_type &x , astate_type &dxdt , adouble t ) -{ - // Extract the parameters - adouble k1 = x[3]; - adouble k2 = x[4]; - adouble k3 = x[5]; - - dxdt[0] = -k1 * x[0] + k3 * x[1] * x[2]; - dxdt[1] = k1 * x[0] - k2 * x[1] * x[1] - k3 * x[1] * x[2]; - dxdt[2] = k2 * x[1] * x[1]; - - // Don't change the parameters p - dxdt[3] = 0; - dxdt[4] = 0; - dxdt[5] = 0; -} - -adouble afoobar(adouble* p, uint64_t iters) { - astate_type x = { 1.0, 0, 0, p[0], p[1], p[2] }; // initial conditions - double t = 1e5; - typedef controlled_runge_kutta< runge_kutta_dopri5< astate_type , typename astate_type::value_type , astate_type , adouble > > stepper_type; - //typedef euler< astate_type , typename astate_type::value_type , astate_type , adouble > stepper_type; - integrate_const( stepper_type(), alorenz , x , 0.0 , t, t/iters ); - - return x[0]; -} - -static -double afoobar_and_gradient(double* p_in, double* dp_out, uint64_t iters) { - adept::Stack stack; - adouble x[3] = { p_in[0], p_in[1], p_in[2] }; - stack.new_recording(); - adouble y = afoobar(x, iters); - y.set_gradient(1.0); - stack.compute_adjoint(); - for(int i=0; i<3; i++) - dp_out[i] = x[i].get_gradient(); - return y.value(); -} - -static void adept_sincos(uint64_t iters) { - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double p[3] = { 0.04,3e7,1e4 }; - double res = foobar(p, iters); - - gettimeofday(&end, NULL); - printf("Adept real %0.6f res=%f\n", tdiff(&start, &end), res); - } - - { - struct timeval start, end; - gettimeofday(&start, NULL); - - adept::Stack stack; - adouble p[3] = { 0.04,3e7,1e4 }; - // stack.new_recording(); - adouble resa = afoobar(p, iters); - double res = resa.value(); - - gettimeofday(&end, NULL); - printf("Adept forward %0.6f res=%f\n", tdiff(&start, &end), res); - } - - { - struct timeval start, end; - gettimeofday(&start, NULL); - - double p[3] = { 0.04,3e7,1e4 }; - double dp[3] = { 0 }; - afoobar_and_gradient(p, dp, iters); - - gettimeofday(&end, NULL); - printf("Adept combined %0.6f res'=%f\n", tdiff(&start, &end), dp[0]); + printf("C++ Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); } -} - -static void enzyme_sincos(double inp, uint64_t iters) { - + { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); - double p[3] = { 0.04,3e7,1e4 }; - double res = foobar(p, iters); + double res; + for(int i=0; i<10000; i++) + res = rustfoobar(p, x, adjoint, t); - gettimeofday(&end, NULL); - printf("Enzyme real %0.6f res=%f\n", tdiff(&start, &end), res); + gettimeofday(&end, NULL); + printf("Rust Enzyme combined %0.6f res=%f\n", tdiff(&start, &end), res); } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); + state_type x2; - double p[3] = { 0.04,3e7,1e4 }; - double res = foobar(p, iters); + for(int i=0; i<10000; i++) { + lorenz(x, x2, t); + } - gettimeofday(&end, NULL); - printf("Enzyme forward %0.6f res=%f\n", tdiff(&start, &end), res); + gettimeofday(&end, NULL); + printf("C++ fwd %0.6f res=%f\n", tdiff(&start, &end), x2[0]); } { - struct timeval start, end; - gettimeofday(&start, NULL); + struct timeval start, end; + gettimeofday(&start, NULL); + state_type x2; - double p[3] = { 0.04,3e7,1e4 }; - double dp[3] = { 0 }; - __enzyme_autodiff(foobar, p, dp, iters); + for(int i=0; i<10000; i++) + rust_lorenz(x.c_array(), x2.c_array(), t); - gettimeofday(&end, NULL); - printf("Enzyme combined %0.6f res'=%f\n", tdiff(&start, &end), dp[0]); + gettimeofday(&end, NULL); + printf("Rust fwd %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); } -} -int main(int argc, char** argv) { - int max_iters = atoi(argv[1]) ; - double inp = 2.1; - //for(int iters=max_iters/20; iters<=max_iters; iters+=max_iters/20) { - auto iters = max_iters; - printf("iters=%d\n", iters); - adept_sincos(inp, iters); - enzyme_sincos(inp, iters); - //} + //printf("res=%f\n", foobar(1000)); } -#endif diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index 23995eaa5626..bd27f7930d9c 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -1,5 +1,4 @@ #![feature(autodiff)] -#![feature(slice_first_last_chunk)] #![feature(slice_as_chunks)] #![feature(iter_next_chunk)] #![allow(non_snake_case)] @@ -26,12 +25,6 @@ fn get(x: &[f64], i: usize, j: usize) -> f64 { x[N * i + j] } -//#define RANGE(min, max, i, N) ((max-min)/(N-1)*i + min) -//#define GETnb(x, i, j) (x)[N*i+j] -//#define GET(x, i, j) GETnb(x, i, j) -// #define GET(x, i, j) ({ assert(i >=0); assert( j>=0); assert(j f64 { let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; let eq2 = t >= 1.1; @@ -43,26 +36,21 @@ fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { } fn init_brusselator(u: &mut [f64], v: &mut [f64]) { + assert!(u.len() == N * N); + assert!(v.len() == N * N); for i in 0..N { for j in 0..N { let x = range(xmin, xmax, i, N); let y = range(ymin, ymax, j, N); - u[N * i + j] = 22.0 * y * (1.0 - y) * (y * (1.0 - y)).sqrt(); - v[N * i + j] = 27.0 * x * (1.0 - x) * (x * (1.0 - x)).sqrt(); + u[N * i + j] = 22.0 * (y * (1.0 - y)) * (y * (1.0 - y)).sqrt(); + v[N * i + j] = 27.0 * (x * (1.0 - x)) * (x * (1.0 - x)).sqrt(); } } } -// __enzyme_autodiff(brusselator_2d_loop, -// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), -// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, -// enzyme_dup, x.data(), dx.data(), -// enzyme_dup, x.data() + N * N, dx.data() + N * N, -// enzyme_dup, p, dp, -// enzyme_const, t); - +#[no_mangle] #[autodiff(dbrusselator_2d_loop, Reverse, Duplicated, Duplicated, Duplicated, Duplicated, Duplicated, Const)] -fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p: &[f64;3], t: f64) { +fn brusselator_2d_loop(d_u: &mut [f64;N*N], d_v: &mut [f64;N*N], u: &[f64;N*N], v: &[f64;N*N], p: &[f64;3], t: f64) { let A = p[0]; let B = p[1]; let alpha = p[2]; @@ -85,96 +73,94 @@ fn brusselator_2d_loop(d_u: &mut [f64], d_v: &mut [f64], u: &[f64], v: &[f64], p } } -type state_type = [f64; 2 * N * N]; +type StateType = [f64; 2 * N * N]; + +#[no_mangle] +pub extern "C" fn rust_lorenz(x: *const StateType, dxdt: *mut StateType, t: f64) { + let x: &StateType = unsafe { &*x }; + let dxdt: &mut StateType = unsafe { &mut *dxdt }; + lorenz(x, dxdt, t); +} -fn lorenz(x: &state_type, dxdt: &mut state_type, t: f64) { +fn lorenz(x: &StateType, dxdt: &mut StateType, t: f64) { let p = [3.4, 1., 10.]; let (tmp1, tmp2) = dxdt.split_at_mut(N * N); let mut dxdt1: [f64; N * N] = tmp1.try_into().unwrap(); let mut dxdt2: [f64; N * N] = tmp2.try_into().unwrap(); - brusselator_2d_loop(&mut dxdt1, &mut dxdt2, &x[..], &x[N * N..], &p, t); + let (tmp1, tmp2) = x.split_at(N * N); + let u: [f64; N * N] = tmp1.try_into().unwrap(); + let v: [f64; N * N] = tmp2.try_into().unwrap(); + brusselator_2d_loop(&mut dxdt1, &mut dxdt2, &u, &v, &p, t); } #[no_mangle] -pub extern "C" fn rust_dbrusselator_2d_loop(p: *const f64, dp: *mut f64, x: *const state_type, dx: *mut state_type, adjoint: *mut state_type, t: f64) -> f64 { - let x = unsafe { *x }; - let mut adjoint = unsafe { *adjoint }; - let p: [f64;3] = unsafe { *p.cast::<[f64;3]>().as_ref().unwrap() }; - let mut dp: [f64;3] = unsafe { dp.cast::<[f64;3]>().as_mut().unwrap() }; - - let (mut dx1, mut dx2) = dx.split_at_mut(N * N); - //let mut dp = [0.; 3]; - //let mut dx1 = [0.; N * N]; - //let mut dx2 = [0.; N * N]; - let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); +pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const StateType, dx: *mut StateType, p: *const [f64;3], dp: *mut [f64;3], t: f64) { + let x: &StateType = unsafe { &*x }; + let dx: &mut StateType = unsafe { &mut *dx }; + let adjoint: &mut StateType = unsafe { &mut *adjoint }; + + let p: &[f64;3] = unsafe { &*p }; + let dp: &mut [f64;3] = unsafe { &mut *dp }; + + //assert!(p[0] == 3.4); + //assert!(p[1] == 1.); + //assert!(p[2] == 10.); + //assert!(t == 2.1); + + //let mut x1 = [0.; 2 * N * N]; + //let mut dx1 = [0.; 2 *N * N]; + //let (tmp1, tmp2) = x1.split_at_mut(N * N); + //let mut x1: [f64; N * N] = tmp1.try_into().unwrap(); + //let mut x2: [f64; N * N] = tmp2.try_into().unwrap(); + //init_brusselator(&mut x1, &mut x2); + //for i in 0..N*N { + // let tmp = (x1[i] - x[i]).abs(); + // if (tmp / x[i] > 1e-5) { + // dbg!(tmp); + // dbg!(tmp / x[i]); + // dbg!(i); + // dbg!(x1[i]); + // dbg!(x[i]); + // println!("x1[{}] = {} != x[{}] = {}", i, x1[i], i, x[i]); + // panic!(); + // } + //} + + // Alternative ways to split the inputs + //let [ mut dx1, mut dx2]: [[f64; N*N]; 2] = unsafe { *std::mem::transmute::<*mut StateType, &mut [[f64; N*N]; 2]>(dx) }; + //let [dx1, dx2]: &mut [[f64; N*N];2] = unsafe { dx.cast::<[[f64; N*N]; 2]>().as_mut().unwrap() }; // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 + let ([dx1, dx2], []): (&mut [[f64; N*N]], &mut [f64]) = dx.as_chunks_mut() else { unreachable!() }; + let ([dadj1, dadj2], []): (&mut [[f64; N*N]], &mut [f64])= adjoint.as_chunks_mut() else { unreachable!() }; let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; - let mut null1 = [0.; 2 * N * N]; - let mut null2 = [0.; 2 * N * N]; - dbrusselator_2d_loop(&mut null1, &mut dadj1, - &mut null2, &mut dadj2, - x1, &mut dx1, - x2, &mut dx2, - &p, &mut dp, t); - dx1[0] - //brusselator_2d_loop_b(nullptr, dadjoint_inp.data(), - // nullptr, dadjoint_inp.data() + N * N, - // x.data(), dx.data(), - // x.data() + N * N, dx.data() + N * N, - // p, dp, - // t); + let mut null1 = [0.; 1 * N * N]; + let mut null2 = [0.; 1 * N * N]; + dbrusselator_2d_loop(&mut null1, dadj1, + &mut null2, dadj2, + x1, dx1, + x2, dx2, + p, dp, t); + return; } -fn foobar(p: &[f64;3], x: state_type, mut adjoint: state_type, t: f64) -> f64 { +fn foobar(p: &[f64;3], x: StateType, mut adjoint: StateType, t: f64) -> f64 { let mut dp = [0.; 3]; let mut dx1 = [0.; N * N]; let mut dx2 = [0.; N * N]; - let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); - let mut null1 = [0.; 2 * N * N]; - let mut null2 = [0.; 2 * N * N]; + // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 + let ([dadj1, dadj2], []): (&mut [[f64; N*N]], &mut [f64])= adjoint.as_chunks_mut() else { unreachable!() }; + //let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); + let mut null1 = [0.; 1 * N * N]; + let mut null2 = [0.; 1 * N * N]; // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; - dbrusselator_2d_loop(&mut null1, &mut dadj1, - &mut null2, &mut dadj2, + dbrusselator_2d_loop(&mut null1, dadj1, + &mut null2, dadj2, x1, &mut dx1, x2, &mut dx2, &p, &mut dp, t); dx1[0] } - -//double foobar(const double* p, const state_type x, const state_type adjoint, double t) { -// double dp[3] = { 0. }; -// -// state_type dx = { 0. }; -// -// state_type dadjoint_inp = adjoint; -// -// state_type dxdu; -// -// __enzyme_autodiff(brusselator_2d_loop, -// enzyme_dupnoneed, nullptr, dadjoint_inp.data(), -// enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, -// enzyme_dup, x.data(), dx.data(), -// enzyme_dup, x.data() + N * N, dx.data() + N * N, -// enzyme_dup, p, dp, -// enzyme_const, t); -// -// return dx[0]; -//} - -fn main() { - let p = [3.4, 1., 10.]; - let mut x = [0.; 2 * N * N]; - let mut adjoint = [0.; 2 * N * N]; - init_brusselator(&mut x, &mut adjoint); - let t = 2.1; - let mut res = 0.; - let time = std::time::Instant::now(); - for _ in 0..10000 { - res = foobar(&p, x, adjoint, t); - } - println!("Enzyme combined {} res={}", time.elapsed().as_secs_f64(), res); -} From fdb0d0977dc2c32b0467e87bfdd6e3d9a825d874 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Thu, 19 Sep 2024 00:43:19 -0400 Subject: [PATCH 52/56] remove boost leftovers --- .../benchmarks/ReverseMode/ode-real/ode.cpp | 66 +++++++++---------- .../ReverseMode/ode-real/src/lib.rs | 35 ++-------- 2 files changed, 35 insertions(+), 66 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp index 17e677abc65d..c0c5064b833d 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp +++ b/enzyme/benchmarks/ReverseMode/ode-real/ode.cpp @@ -24,20 +24,8 @@ float tdiff(struct timeval *start, struct timeval *end) { return (end->tv_sec-start->tv_sec) + 1e-6*(end->tv_usec-start->tv_usec); } -#define BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS -#define BOOST_NO_EXCEPTIONS #include -#include - -#include - -#include -void boost::throw_exception(std::exception const & e){ - //do nothing -} - using namespace std; -using namespace boost::numeric::odeint; #define N 32 #define xmin 0. @@ -107,27 +95,29 @@ void brusselator_2d_loop(double* __restrict du, double* __restrict dv, const dou } } -typedef boost::array< double , 2 * N * N > state_type; - +typedef double state_type[2*N*N]; void lorenz( const state_type &x, state_type &dxdt, double t ) { // Extract the parameters double p[3] = { /*A*/ 3.4, /*B*/ 1, /*alpha*/10. }; - brusselator_2d_loop(dxdt.c_array(), dxdt.c_array() + N * N, x.data(), x.data() + N * N, p, t); + brusselator_2d_loop(dxdt, dxdt + N * N, x, x + N * N, p, t); } extern "C" void rust_lorenz(const double* x, double* dxdt, double t); -extern "C" void rust_dbrusselator_2d_loop(const double* x, double* dx, double* adjoint, const double* p, double* dp, double t); +extern "C" void rust_dbrusselator_2d_loop(double* adjoint, const double* x, double* dx, const double* p, double* dp, double t); -double rustfoobar(const double *p, /*const*/ state_type x, const state_type adjoint, double t) { +double rustfoobar(const double *p, const state_type x, const state_type adjoint, double t) { double dp[3] = { 0. }; state_type dx = { 0. }; - state_type dadjoint_inp = adjoint; + state_type dadjoint_inp;// = adjoint + for (int i = 0; i < N * N; i++) { + dadjoint_inp[i] = adjoint[i]; + } - rust_dbrusselator_2d_loop(dadjoint_inp.c_array(), x.c_array(), dx.c_array(), p, dp, t); + rust_dbrusselator_2d_loop(dadjoint_inp, x, dx, p, dp, t); return dx[0]; } @@ -136,17 +126,20 @@ double foobar(const double* p, const state_type x, const state_type adjoint, dou state_type dx = { 0. }; - state_type dadjoint_inp = adjoint; + state_type dadjoint_inp;// = adjoint + for (int i = 0; i < N * N; i++) { + dadjoint_inp[i] = adjoint[i]; + } state_type dxdu; __enzyme_autodiff(brusselator_2d_loop, - enzyme_dup, dxdu.c_array(), dadjoint_inp.c_array(), - enzyme_dup, dxdu.c_array() + N * N, dadjoint_inp.c_array() + N * N, - // enzyme_dupnoneed, nullptr, dadjoint_inp.data(), - // enzyme_dupnoneed, nullptr, dadjoint_inp.data() + N * N, - enzyme_dup, x.data(), dx.data(), - enzyme_dup, x.data() + N * N, dx.data() + N * N, + enzyme_dup, dxdu, dadjoint_inp, + enzyme_dup, dxdu + N * N, dadjoint_inp + N * N, + // enzyme_dupnoneed, nullptr, dadjoint_inp, + // enzyme_dupnoneed, nullptr, dadjoint_inp + N * N, + enzyme_dup, x, dx, + enzyme_dup, x + N * N, dx + N * N, enzyme_dup, p, dp, enzyme_const, t); @@ -499,14 +492,17 @@ double tfoobar(const double* p, const state_type x, const state_type adjoint, do state_type dx = { 0. }; - state_type dadjoint_inp = adjoint; + state_type dadjoint_inp;// = adjoint + for (int i = 0; i < N * N; i++) { + dadjoint_inp[i] = adjoint[i]; + } state_type dxdu; - brusselator_2d_loop_b(nullptr, dadjoint_inp.data(), - nullptr, dadjoint_inp.data() + N * N, - x.data(), dx.data(), - x.data() + N * N, dx.data() + N * N, + brusselator_2d_loop_b(nullptr, dadjoint_inp, + nullptr, dadjoint_inp + N * N, + x, dx, + x + N * N, dx + N * N, p, dp, t); @@ -518,10 +514,10 @@ int main(int argc, char** argv) { const double p[3] = { /*A*/ 3.4, /*B*/ 1, /*alpha*/10. }; state_type x; - init_brusselator(x.data(), x.data() + N * N); + init_brusselator(x, x + N * N); state_type adjoint; - init_brusselator(adjoint.data(), adjoint.data() + N * N); + init_brusselator(adjoint, adjoint + N * N); double t = 2.1; @@ -592,13 +588,11 @@ int main(int argc, char** argv) { state_type x2; for(int i=0; i<10000; i++) - rust_lorenz(x.c_array(), x2.c_array(), t); + rust_lorenz(x, x2, t); gettimeofday(&end, NULL); printf("Rust fwd %0.6f res=%f\n\n", tdiff(&start, &end), x2[0]); } - - //printf("res=%f\n", foobar(1000)); } diff --git a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs index bd27f7930d9c..2347ca8e0f80 100644 --- a/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/ode-real/src/lib.rs @@ -18,12 +18,6 @@ const ymax: f64 = 1.; fn range(min: f64, max: f64, i: usize, N_var: usize) -> f64 { (max - min) / (N_var as f64 - 1.) * i as f64 + min } -#[inline(always)] -fn get(x: &[f64], i: usize, j: usize) -> f64 { - assert!(i > 0); - assert!(j < N); - x[N * i + j] -} fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { let eq1 = (x - 0.3) * (x - 0.3) + (y - 0.6) * (y - 0.6) <= 0.1 * 0.1; @@ -35,6 +29,7 @@ fn brusselator_f(x: f64, y: f64, t: f64) -> f64 { } } +#[expect(unused)] fn init_brusselator(u: &mut [f64], v: &mut [f64]) { assert!(u.len() == N * N); assert!(v.len() == N * N); @@ -102,10 +97,10 @@ pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const S let p: &[f64;3] = unsafe { &*p }; let dp: &mut [f64;3] = unsafe { &mut *dp }; - //assert!(p[0] == 3.4); - //assert!(p[1] == 1.); - //assert!(p[2] == 10.); - //assert!(t == 2.1); + assert!(p[0] == 3.4); + assert!(p[1] == 1.); + assert!(p[2] == 10.); + assert!(t == 2.1); //let mut x1 = [0.; 2 * N * N]; //let mut dx1 = [0.; 2 *N * N]; @@ -144,23 +139,3 @@ pub extern "C" fn rust_dbrusselator_2d_loop(adjoint: *mut StateType, x: *const S p, dp, t); return; } - - -fn foobar(p: &[f64;3], x: StateType, mut adjoint: StateType, t: f64) -> f64 { - let mut dp = [0.; 3]; - let mut dx1 = [0.; N * N]; - let mut dx2 = [0.; N * N]; - // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 - let ([dadj1, dadj2], []): (&mut [[f64; N*N]], &mut [f64])= adjoint.as_chunks_mut() else { unreachable!() }; - //let (mut dadj1, mut dadj2) = adjoint.split_at_mut(N * N); - let mut null1 = [0.; 1 * N * N]; - let mut null2 = [0.; 1 * N * N]; - // https://discord.com/channels/273534239310479360/273541522815713281/1236945105601040446 - let ([x1, x2], []): (&[[f64; N*N]], &[f64])= x.as_chunks() else { unreachable!() }; - dbrusselator_2d_loop(&mut null1, dadj1, - &mut null2, dadj2, - x1, &mut dx1, - x2, &mut dx2, - &p, &mut dp, t); - dx1[0] -} From 747e4f9b2459c5ad646ff342b386986daf136e40 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 1 Oct 2024 17:21:52 -0400 Subject: [PATCH 53/56] fix makefiles to use new rustflags --- enzyme/benchmarks/ReverseMode/ba/Makefile.make | 2 +- enzyme/benchmarks/ReverseMode/gmm/Makefile.make | 4 ++-- enzyme/benchmarks/ReverseMode/lstm/Makefile.make | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make index 8a13a0e524fb..1e4ed61859a7 100644 --- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.txt results.json $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a: src/lib.rs Cargo.toml - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm + RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm ba.o: ba.cpp $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a clang++ $(LOAD) $(BENCH) ba.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o ba.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a -L /usr/lib/gcc/x86_64-linux-gnu/11 diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make index e3c15f4dcc11..7eff2a950c86 100644 --- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make @@ -1,4 +1,4 @@ -# RUN: if [ %llvmver -ge 12 ] || [ %llvmver -le 9 ]; then cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B gmm.o results.json -f %s; fi +# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%newLoadClangEnzyme" make -B gmm.o results.json VERBOSE=1 -f %s .PHONY: clean @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.txt results.json $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.toml - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm + RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm gmm.o: gmm.cpp $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a clang++ $(LOAD) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a -L /usr/lib/gcc/x86_64-linux-gnu/11 diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make index 23ba9a51ceff..0d21100d7567 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make @@ -8,7 +8,7 @@ clean: rm -f *.ll *.o results.json $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml - ENZYME_LOOSE_TYPES=1 cargo +enzyme rustc --release --lib --crate-type=staticlib + RUSTFLAGS="-Z autodiff=LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib %-unopt.ll: %.cpp clang++ $(BENCH) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm From 05d3e8776487c3d6fa0948d789de361d3384da8f Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Tue, 1 Oct 2024 22:18:43 -0400 Subject: [PATCH 54/56] add tanh support for llvm19+ --- enzyme/Enzyme/FunctionUtils.cpp | 5 ++++ enzyme/Enzyme/GradientUtils.cpp | 5 ++++ enzyme/Enzyme/InstructionDerivatives.td | 12 ++++----- enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp | 11 ++++++++ enzyme/Enzyme/Utils.h | 5 ++++ enzyme/test/Enzyme/ReverseMode/tanh19.ll | 29 +++++++++++++++++++++ 6 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 enzyme/test/Enzyme/ReverseMode/tanh19.ll diff --git a/enzyme/Enzyme/FunctionUtils.cpp b/enzyme/Enzyme/FunctionUtils.cpp index 384180656e10..033ccc56c226 100644 --- a/enzyme/Enzyme/FunctionUtils.cpp +++ b/enzyme/Enzyme/FunctionUtils.cpp @@ -2799,6 +2799,11 @@ bool guaranteedDataDependent(Value *z) { case Intrinsic::sqrt: case Intrinsic::sin: case Intrinsic::cos: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::sinh: + case Intrinsic::cosh: + case Intrinsic::tanh: +#endif return guaranteedDataDependent(II->getArgOperand(0)); default: break; diff --git a/enzyme/Enzyme/GradientUtils.cpp b/enzyme/Enzyme/GradientUtils.cpp index ac1a79491f3c..0e0cf5fe5e52 100644 --- a/enzyme/Enzyme/GradientUtils.cpp +++ b/enzyme/Enzyme/GradientUtils.cpp @@ -4177,6 +4177,11 @@ bool GradientUtils::shouldRecompute(const Value *val, case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::exp: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::tanh: + case Intrinsic::cosh: + case Intrinsic::sinh: +#endif case Intrinsic::log: case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_p: diff --git a/enzyme/Enzyme/InstructionDerivatives.td b/enzyme/Enzyme/InstructionDerivatives.td index 9f765f88826f..fe1a97a67fd8 100644 --- a/enzyme/Enzyme/InstructionDerivatives.td +++ b/enzyme/Enzyme/InstructionDerivatives.td @@ -335,12 +335,6 @@ def : CallPattern<(Op $x, $y), [ReadNone, NoUnwind] >; -def : CallPattern<(Op $x), - ["tanh"], - [(FDiv (DiffeRet), (FMul(Call<(SameTypesFunc<"cosh">), [ReadNone,NoUnwind]> $x):$c, $c))], - (ForwardFromSummedReverse), - [ReadNone, NoUnwind] - >; def : CallPattern<(Op $x), ["tanhf"], [(FDiv (DiffeRet), (FMul(Call<(SameTypesFunc<"coshf">), [ReadNone,NoUnwind]> $x):$c, $c))], @@ -872,6 +866,12 @@ def : CallPattern<(Op (Op $x, $y):$z), [ReadNone, NoUnwind] >; +def : IntrPattern<(Op $x), + [["tanh"]], + [(FDiv (DiffeRet), (FMul(Intrinsic<"cosh"> $x):$c, $c))], + (ForwardFromSummedReverse) + >; + def : IntrPattern<(Op $x), [["sin"]], [(FMul (DiffeRet), (Intrinsic<"cos"> $x))] , diff --git a/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp b/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp index aed7767652f0..c59f712b0121 100644 --- a/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp +++ b/enzyme/Enzyme/TypeAnalysis/TypeAnalysis.cpp @@ -118,9 +118,15 @@ const llvm::StringMap LIBM_FUNCTIONS = { {"atan", Intrinsic::not_intrinsic}, {"atan2", Intrinsic::not_intrinsic}, {"__nv_atan2", Intrinsic::not_intrinsic}, +#if LLVM_VERSION_MAJOR >= 19 + {"cosh", Intrinsic::cosh}, + {"sinh", Intrinsic::sinh}, + {"tanh", Intrinsic::tanh}, +#else {"cosh", Intrinsic::not_intrinsic}, {"sinh", Intrinsic::not_intrinsic}, {"tanh", Intrinsic::not_intrinsic}, +#endif {"acosh", Intrinsic::not_intrinsic}, {"asinh", Intrinsic::not_intrinsic}, {"atanh", Intrinsic::not_intrinsic}, @@ -3849,6 +3855,11 @@ void TypeAnalyzer::visitIntrinsicInst(llvm::IntrinsicInst &I) { case Intrinsic::exp2: case Intrinsic::sin: case Intrinsic::cos: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::sinh: + case Intrinsic::cosh: + case Intrinsic::tanh: +#endif case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: diff --git a/enzyme/Enzyme/Utils.h b/enzyme/Enzyme/Utils.h index 9b66730d14d5..a8ce244caadc 100644 --- a/enzyme/Enzyme/Utils.h +++ b/enzyme/Enzyme/Utils.h @@ -1693,6 +1693,11 @@ static inline bool isNoEscapingAllocation(const llvm::Function *F) { case Intrinsic::exp: case Intrinsic::cos: case Intrinsic::sin: +#if LLVM_VERSION_MAJOR >= 19 + case Intrinsic::tanh: + case Intrinsic::cosh: + case Intrinsic::sinh: +#endif case Intrinsic::copysign: case Intrinsic::fabs: return true; diff --git a/enzyme/test/Enzyme/ReverseMode/tanh19.ll b/enzyme/test/Enzyme/ReverseMode/tanh19.ll new file mode 100644 index 000000000000..2d22ab6b6328 --- /dev/null +++ b/enzyme/test/Enzyme/ReverseMode/tanh19.ll @@ -0,0 +1,29 @@ +; RUN: if [ %llvmver -ge 19 ]; then %opt < %s %newLoadEnzyme -enzyme-preopt=false -passes="enzyme,function(mem2reg,instsimplify,%simplifycfg)" -S | FileCheck %s; fi + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare double @llvm.tanh.f64(double) #14 + +; Function Attrs: nounwind readnone uwtable +define double @tester(double %x) { +entry: + %0 = call double @llvm.tanh.f64(double %x) + ret double %0 +} + +define double @test_derivative(double %x) { +entry: + %0 = tail call double (double (double)*, ...) @__enzyme_autodiff(double (double)* nonnull @tester, double %x) + ret double %0 +} + +; Function Attrs: nounwind +declare double @__enzyme_autodiff(double (double)*, ...) + +; CHECK: define internal { double } @diffetester(double %x, double %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = call fast double @llvm.cosh.f64(double %x) +; CHECK-NEXT: %1 = fmul fast double %0, %0 +; CHECK-NEXT: %2 = fdiv fast double %differeturn, %1 +; CHECK-NEXT: %3 = insertvalue { double } undef, double %2, 0 +; CHECK-NEXT: ret { double } %3 +; CHECK-NEXT: } From 1fe64c0add8df4d9074e7150c803243a00662432 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Oct 2024 02:21:12 -0400 Subject: [PATCH 55/56] fix safe fft performance --- enzyme/benchmarks/ReverseMode/fft/src/lib.rs | 1 + enzyme/benchmarks/ReverseMode/fft/src/safe.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs index 47b0aa1e97fd..84b16d077ac7 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/lib.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/lib.rs @@ -1,3 +1,4 @@ +#![feature(slice_swap_unchecked)] #![feature(autodiff)] pub mod safe; diff --git a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs index 7332dcb91356..d44633a3813a 100644 --- a/enzyme/benchmarks/ReverseMode/fft/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/fft/src/safe.rs @@ -9,8 +9,8 @@ fn bitreversal_perm(data: &mut [T]) { while i < 2*len { if j > i { //dbg!(&i, &j); - data.swap(j-1, i-1); - data.swap(j, i); + unsafe {data.swap_unchecked(j-1, i-1);} + unsafe {data.swap_unchecked(j, i);} } let mut m = len; From 7f26f432d7467362250ac7bf68040de912b9ea6b Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Wed, 2 Oct 2024 02:55:05 -0400 Subject: [PATCH 56/56] fix (mostly) safe lstm perf --- enzyme/benchmarks/ReverseMode/lstm/src/safe.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs index ea9e71a67560..ad6481790a19 100644 --- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs +++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs @@ -31,11 +31,11 @@ fn lstm_model( let (a, b) = gates.split_at_mut(2 * hsize); let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize)); - assert_eq!(weight.len(), 4 * hsize); - assert_eq!(bias.len(), 4 * hsize); - assert_eq!(hidden.len(), hsize); - assert!(cell.len() >= hsize); - assert!(input.len() >= hsize); + debug_assert_eq!(weight.len(), 4 * hsize); + debug_assert_eq!(bias.len(), 4 * hsize); + debug_assert_eq!(hidden.len(), hsize); + debug_assert!(cell.len() >= hsize); + debug_assert!(input.len() >= hsize); // caching input for i in 0..hsize { forget[i] = sigmoid(input[i] * weight[i] + bias[i]); @@ -130,7 +130,7 @@ pub(crate) fn lstm_objective( let mut ypred = vec![0.0; b]; let mut ynorm = vec![0.0; b]; - assert!(b > 0); + debug_assert!(b > 0); let limit = (c - 1) * b; for j in 0..(c - 1) {