From b09d79d5c797f05188d9a9ac70e396c8e475ddd8 Mon Sep 17 00:00:00 2001 From: jirigav Date: Tue, 25 Jun 2024 15:47:49 +0200 Subject: [PATCH 1/7] move results handling to a new file --- src/common.rs | 1 + src/main.rs | 87 +++----------------------------------------------- src/results.rs | 82 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 82 deletions(-) create mode 100644 src/results.rs diff --git a/src/common.rs b/src/common.rs index 4954479..b9c131f 100644 --- a/src/common.rs +++ b/src/common.rs @@ -53,6 +53,7 @@ pub(crate) enum SubCommand { #[arg(short, long)] dis_path: String, }, + Autotest {} } pub(crate) fn bits_block_eval(bits: &[usize], block: &[u8]) -> usize { let mut result = 0; diff --git a/src/main.rs b/src/main.rs index 2a7c535..1f60796 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,93 +1,15 @@ mod bottomup; mod common; +mod results; use crate::bottomup::bottomup; -use crate::common::{p_value, z_score, Args}; +use crate::common::Args; use bottomup::Histogram; use clap::Parser; use common::{prepare_data, SubCommand}; -use serde_json::json; -use std::fs::{self, File}; -use std::io::Write; +use std::fs; use std::time::Instant; - -fn print_results(p_value: f64, z_score: f64, alpha: f64, hist: &Histogram, bins: Vec) { - println!("----------------------------------------------------------------------"); - println!("RESULTS:\n"); - - println!("Histogram(the discovered Boolean function returns 1 for values before the separator and 0 for values after the separator.):\n"); - let m = bins.iter().max().unwrap(); - let unit = (m / 50).max(1); - for (i, ind) in hist.sorted_indices.iter().enumerate() { - for x in &hist.bits { - print!("x{} ", x); - } - let mut j = *ind; - print!("| ["); - for _ in 0..hist.bits.len() { - print!("{}", j % 2); - j /= 2; - } - print!("] | "); - for _ in 0..bins[*ind] / unit { - print!("∎"); - } - println!(); - if i == (hist.best_division - 1) { - for _ in 0..80 { - print!("—"); - } - println!(); - } - } - println!(); - println!("Z-score: {z_score}"); - println!("P-value: {p_value:.0e}"); - if p_value >= alpha { - println!( - "As the p-value >= alpha {alpha:.0e}, the randomness hypothesis cannot be rejected." - ); - println!("= CoolTest could not find statistically significant non-randomness."); - } else { - println!("As the p-value < alpha {alpha:.0e}, the randomness hypothesis is REJECTED."); - println!("= Data is not random."); - } -} - -fn results(hist: Histogram, testing_data: &[Vec], args: Args) { - let (count, bins) = hist.evaluate(testing_data); - let prob = 2.0_f64.powf(-(hist.bits.len() as f64)); - let z = z_score( - testing_data.len(), - count, - prob * (hist.best_division as f64), - ); - let p_val = p_value( - count, - testing_data.len(), - prob * (hist.best_division as f64), - ); - print_results(p_val, z, args.alpha, &hist, bins); - - if let Some(path) = args.json.clone() { - let mut file = - File::create(&path).unwrap_or_else(|_| panic!("File {} couldn't be created", path)); - - let output = json!({ - "args": args, - "dis": hist, - "result": if p_val < args.alpha {"random"} else {"non-random"}, - "p-value": p_val - }); - - file.write_all( - serde_json::to_string_pretty(&output) - .expect("Failed to produce json!") - .as_bytes(), - ) - .unwrap(); - } -} +use results::results; fn run_bottomup(args: Args) { let (training_data, testing_data) = prepare_data(&args.data_source, args.block, true); @@ -120,6 +42,7 @@ fn main() { let (testing_data, _) = prepare_data(&args.data_source, args.block, false); results(hist, &testing_data, args) } + Some(SubCommand::Autotest {}) => todo!(), None => run_bottomup(args), } } diff --git a/src/results.rs b/src/results.rs new file mode 100644 index 0000000..dd903f5 --- /dev/null +++ b/src/results.rs @@ -0,0 +1,82 @@ +use std::io::Write; +use serde_json::json; +use std::fs::File; +use crate::{bottomup::Histogram, common::{p_value, z_score, Args}}; + +pub(crate) fn results(hist: Histogram, testing_data: &[Vec], args: Args) { + let (count, bins) = hist.evaluate(testing_data); + let prob = 2.0_f64.powf(-(hist.bits.len() as f64)); + let z = z_score( + testing_data.len(), + count, + prob * (hist.best_division as f64), + ); + let p_val = p_value( + count, + testing_data.len(), + prob * (hist.best_division as f64), + ); + print_results(p_val, z, args.alpha, &hist, bins); + + if let Some(path) = args.json.clone() { + let mut file = + File::create(&path).unwrap_or_else(|_| panic!("File {} couldn't be created", path)); + + let output = json!({ + "args": args, + "dis": hist, + "result": if p_val < args.alpha {"random"} else {"non-random"}, + "p-value": p_val + }); + + file.write_all( + serde_json::to_string_pretty(&output) + .expect("Failed to produce json!") + .as_bytes(), + ) + .unwrap(); + } +} + +fn print_results(p_value: f64, z_score: f64, alpha: f64, hist: &Histogram, bins: Vec) { + println!("----------------------------------------------------------------------"); + println!("RESULTS:\n"); + + println!("Histogram(the discovered Boolean function returns 1 for values before the separator and 0 for values after the separator.):\n"); + let m = bins.iter().max().unwrap(); + let unit = (m / 50).max(1); + for (i, ind) in hist.sorted_indices.iter().enumerate() { + for x in &hist.bits { + print!("x{} ", x); + } + let mut j = *ind; + print!("| ["); + for _ in 0..hist.bits.len() { + print!("{}", j % 2); + j /= 2; + } + print!("] | "); + for _ in 0..bins[*ind] / unit { + print!("∎"); + } + println!(); + if i == (hist.best_division - 1) { + for _ in 0..80 { + print!("—"); + } + println!(); + } + } + println!(); + println!("Z-score: {z_score}"); + println!("P-value: {p_value:.0e}"); + if p_value >= alpha { + println!( + "As the p-value >= alpha {alpha:.0e}, the randomness hypothesis cannot be rejected." + ); + println!("= CoolTest could not find statistically significant non-randomness."); + } else { + println!("As the p-value < alpha {alpha:.0e}, the randomness hypothesis is REJECTED."); + println!("= Data is not random."); + } +} From 2483ad7ee5252ed35a4c1c3a58a7e5eadaa02b35 Mon Sep 17 00:00:00 2001 From: jirigav Date: Wed, 26 Jun 2024 10:23:23 +0200 Subject: [PATCH 2/7] add first version of autotest function --- src/autotest.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ src/common.rs | 4 ++-- src/main.rs | 6 ++++-- src/results.rs | 7 +++++-- 4 files changed, 59 insertions(+), 6 deletions(-) create mode 100644 src/autotest.rs diff --git a/src/autotest.rs b/src/autotest.rs new file mode 100644 index 0000000..d69d217 --- /dev/null +++ b/src/autotest.rs @@ -0,0 +1,48 @@ +use crate::bottomup::bottomup; +use crate::common::{prepare_data, Args}; +use crate::results::results; +use std::time::Instant; + +// TODO +fn choose_k(_block_size: usize, _data_size: usize) -> usize { + 3 +} + +pub(crate) fn autotest(args: Args) { + let (training_data, testing_data) = prepare_data(&args.data_source, args.block, true); + let mut testing_data = testing_data.unwrap(); + let start = Instant::now(); + let data_size = training_data.len(); + + let mut k = choose_k(args.block, data_size); + + let mut hist = bottomup( + &training_data, + args.block, + k, + args.top, + args.max_bits, + args.threads, + ); + let testing_data2; + if args.block <= 256 { + let (training_data, testing_data_opt2) = prepare_data(&args.data_source, 2*args.block, true); + testing_data2 = testing_data_opt2.unwrap(); + k = choose_k(2 * args.block, data_size); + let hist2 = bottomup( + &training_data, + args.block * 2, + k, + args.top, + args.max_bits, + args.threads, + ); + if hist2.z_score.abs() > hist.z_score.abs() { + hist = hist2; + testing_data = testing_data2; + } + } + println!("training finished in {:?}", start.elapsed()); + + results(hist, &testing_data, args) +} diff --git a/src/common.rs b/src/common.rs index b9c131f..0743878 100644 --- a/src/common.rs +++ b/src/common.rs @@ -14,7 +14,7 @@ pub(crate) struct Args { pub(crate) data_source: String, /// Length of block of data. - #[arg(short, long, default_value_t = 128)] + #[arg(short, long, default_value_t = 128)] // Changing the default value changes autotest pub(crate) block: usize, /// Number of bits in histograms in brute-force search. @@ -53,7 +53,7 @@ pub(crate) enum SubCommand { #[arg(short, long)] dis_path: String, }, - Autotest {} + Autotest {}, } pub(crate) fn bits_block_eval(bits: &[usize], block: &[u8]) -> usize { let mut result = 0; diff --git a/src/main.rs b/src/main.rs index 1f60796..af5cedb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,17 @@ +mod autotest; mod bottomup; mod common; mod results; use crate::bottomup::bottomup; use crate::common::Args; +use autotest::autotest; use bottomup::Histogram; use clap::Parser; use common::{prepare_data, SubCommand}; +use results::results; use std::fs; use std::time::Instant; -use results::results; fn run_bottomup(args: Args) { let (training_data, testing_data) = prepare_data(&args.data_source, args.block, true); @@ -42,7 +44,7 @@ fn main() { let (testing_data, _) = prepare_data(&args.data_source, args.block, false); results(hist, &testing_data, args) } - Some(SubCommand::Autotest {}) => todo!(), + Some(SubCommand::Autotest {}) => autotest(args), None => run_bottomup(args), } } diff --git a/src/results.rs b/src/results.rs index dd903f5..5c9a9ea 100644 --- a/src/results.rs +++ b/src/results.rs @@ -1,7 +1,10 @@ -use std::io::Write; +use crate::{ + bottomup::Histogram, + common::{p_value, z_score, Args}, +}; use serde_json::json; use std::fs::File; -use crate::{bottomup::Histogram, common::{p_value, z_score, Args}}; +use std::io::Write; pub(crate) fn results(hist: Histogram, testing_data: &[Vec], args: Args) { let (count, bins) = hist.evaluate(testing_data); From ea51762c873f36e4e6e30a609d1119021fdf778c Mon Sep 17 00:00:00 2001 From: jirigav Date: Wed, 26 Jun 2024 14:50:55 +0200 Subject: [PATCH 3/7] Add block size info to distinguisher --- src/autotest.rs | 3 ++- src/bottomup.rs | 18 +++++++++++------- src/main.rs | 6 ++++-- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/autotest.rs b/src/autotest.rs index d69d217..c800892 100644 --- a/src/autotest.rs +++ b/src/autotest.rs @@ -26,7 +26,8 @@ pub(crate) fn autotest(args: Args) { ); let testing_data2; if args.block <= 256 { - let (training_data, testing_data_opt2) = prepare_data(&args.data_source, 2*args.block, true); + let (training_data, testing_data_opt2) = + prepare_data(&args.data_source, 2 * args.block, true); testing_data2 = testing_data_opt2.unwrap(); k = choose_k(2 * args.block, data_size); let hist2 = bottomup( diff --git a/src/bottomup.rs b/src/bottomup.rs index e36b3ca..f137e97 100644 --- a/src/bottomup.rs +++ b/src/bottomup.rs @@ -9,7 +9,9 @@ pub(crate) struct Histogram { pub(crate) bits: Vec, pub(crate) sorted_indices: Vec, pub(crate) best_division: usize, + #[serde(skip_serializing, default)] pub(crate) z_score: f64, + pub(crate) block_size: usize, } impl Histogram { @@ -42,10 +44,11 @@ impl Histogram { sorted_indices: indices, best_division: best_i, z_score: max_z, + block_size: data[0].len(), } } - pub(crate) fn from_bins(bits: Vec, bins: &[usize]) -> Histogram { + pub(crate) fn from_bins(bits: Vec, bins: &[usize], block_size: usize) -> Histogram { let mut indices = (0..2_usize.pow(bits.len() as u32)).collect_vec(); indices.sort_by(|a, b| bins[*b].cmp(&bins[*a])); @@ -69,6 +72,7 @@ impl Histogram { sorted_indices: indices, best_division: best_i, z_score: max_z, + block_size, } } @@ -179,11 +183,11 @@ fn brute_force(data: &Data, block_size: usize, k: usize, top: usize) -> Vec 1 { - let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1]); top]; + let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1], block_size); top]; let mut bins = vec![0; 2_usize.pow(k as u32)]; for bits in (0..block_size).combinations(k) { compute_bins(&bits, data, k, &hists, &mut bins, block_size); - let hist = Histogram::from_bins(bits, &bins); + let hist = Histogram::from_bins(bits, &bins, block_size); best_hists.push(hist); best_hists.sort_by(|a, b| b.z_score.abs().partial_cmp(&a.z_score.abs()).unwrap()); best_hists.pop(); @@ -194,7 +198,7 @@ fn brute_force(data: &Data, block_size: usize, k: usize, top: usize) -> Vec = hists .into_iter() .enumerate() - .map(|(i, bins)| Histogram::from_bins(bits[i].clone(), &bins)) + .map(|(i, bins)| Histogram::from_bins(bits[i].clone(), &bins, block_size)) .collect(); best.sort_by(|a, b| b.z_score.partial_cmp(&a.z_score).unwrap()); @@ -203,7 +207,7 @@ fn brute_force(data: &Data, block_size: usize, k: usize, top: usize) -> Vec]) -> Histogram { - let mut best_hist = Histogram::from_bins(vec![0], &[1, 1]); + let mut best_hist = Histogram::from_bins(vec![0], &[1, 1], data[0].len()); for comb in hists.iter().combinations(n) { let mut bits = comb.iter().flat_map(|x| x.bits.clone()).collect_vec(); bits.sort(); @@ -352,14 +356,14 @@ fn brute_force_threads( .map(|i| { let combs = (0..block_size).combinations(k).skip(i); - let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1]); top]; + let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1], block_size); top]; for bits in combs.step_by(threads) { let mut bins = vec![0; 2_usize.pow(k as u32)]; for (i, bin) in bins.iter_mut().enumerate() { *bin = multi_eval_neg(&bits, data, &neg_data, i); } - let new_hist = Histogram::from_bins(bits, &bins); + let new_hist = Histogram::from_bins(bits, &bins, block_size); best_hists.push(new_hist); best_hists.sort_by(|a, b| b.z_score.abs().partial_cmp(&a.z_score.abs()).unwrap()); best_hists.pop(); diff --git a/src/main.rs b/src/main.rs index af5cedb..7b2a601 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,7 +32,7 @@ fn run_bottomup(args: Args) { } fn main() { - let args = Args::parse(); + let mut args = Args::parse(); println!("\n{args:?}\n"); match args.subcommand.clone() { @@ -41,7 +41,9 @@ fn main() { .unwrap_or_else(|_| panic!("Failed to read contents of {}", &dis_path)); let hist: Histogram = serde_json::from_str(&contents).expect("Invalid distinguisher json!"); - let (testing_data, _) = prepare_data(&args.data_source, args.block, false); + args.block = hist.block_size; + args.k = hist.bits.len(); + let (testing_data, _) = prepare_data(&args.data_source, hist.block_size, false); results(hist, &testing_data, args) } Some(SubCommand::Autotest {}) => autotest(args), From dd8178238ffbdeb8e4b97b89ca17f8472f60a4fe Mon Sep 17 00:00:00 2001 From: jirigav Date: Wed, 26 Jun 2024 14:58:26 +0200 Subject: [PATCH 4/7] adjust alpha when multiple tests are run --- src/autotest.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/autotest.rs b/src/autotest.rs index c800892..9bb4fe1 100644 --- a/src/autotest.rs +++ b/src/autotest.rs @@ -8,14 +8,16 @@ fn choose_k(_block_size: usize, _data_size: usize) -> usize { 3 } -pub(crate) fn autotest(args: Args) { +pub(crate) fn autotest(mut args: Args) { let (training_data, testing_data) = prepare_data(&args.data_source, args.block, true); let mut testing_data = testing_data.unwrap(); + let mut tested_cases = 0; let start = Instant::now(); let data_size = training_data.len(); let mut k = choose_k(args.block, data_size); + tested_cases += 1; let mut hist = bottomup( &training_data, args.block, @@ -26,6 +28,7 @@ pub(crate) fn autotest(args: Args) { ); let testing_data2; if args.block <= 256 { + tested_cases += 1; let (training_data, testing_data_opt2) = prepare_data(&args.data_source, 2 * args.block, true); testing_data2 = testing_data_opt2.unwrap(); @@ -45,5 +48,11 @@ pub(crate) fn autotest(args: Args) { } println!("training finished in {:?}", start.elapsed()); + if tested_cases > 1 { + let new_alpha = args.alpha/(tested_cases as f64); + println!("Adjusting significance level based on the number of tests from {} to {}", args.alpha, new_alpha); + args.alpha = new_alpha; + } + results(hist, &testing_data, args) } From bb7007b3554299ea4e1f4f748aa5f074301fef8b Mon Sep 17 00:00:00 2001 From: jirigav Date: Wed, 26 Jun 2024 15:06:36 +0200 Subject: [PATCH 5/7] Add warning about too large block size --- src/autotest.rs | 7 +++++-- src/main.rs | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/autotest.rs b/src/autotest.rs index 9bb4fe1..d0295ee 100644 --- a/src/autotest.rs +++ b/src/autotest.rs @@ -49,8 +49,11 @@ pub(crate) fn autotest(mut args: Args) { println!("training finished in {:?}", start.elapsed()); if tested_cases > 1 { - let new_alpha = args.alpha/(tested_cases as f64); - println!("Adjusting significance level based on the number of tests from {} to {}", args.alpha, new_alpha); + let new_alpha = args.alpha / (tested_cases as f64); + println!( + "Adjusting significance level based on the number of tests from {} to {}", + args.alpha, new_alpha + ); args.alpha = new_alpha; } diff --git a/src/main.rs b/src/main.rs index 7b2a601..5d625ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -35,6 +35,10 @@ fn main() { let mut args = Args::parse(); println!("\n{args:?}\n"); + if args.block > 600 { + println!("With block size {}, the computation can take long time, consider using smaller block size.", args.block); + } + match args.subcommand.clone() { Some(SubCommand::Evaluate { dis_path }) => { let contents = fs::read_to_string(&dis_path) From 523a95124bb199c676328d3731f5e9cd1079b78c Mon Sep 17 00:00:00 2001 From: jirigav Date: Wed, 26 Jun 2024 15:22:57 +0200 Subject: [PATCH 6/7] add simple rule for choosing `k` in autotest --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/autotest.rs | 15 ++++++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5410956..a3a44db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,7 +133,7 @@ checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "cooltest" -version = "0.1.1" +version = "0.1.2" dependencies = [ "clap", "itertools", diff --git a/Cargo.toml b/Cargo.toml index 199837c..ac04a04 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cooltest" -version = "0.1.1" +version = "0.1.2" edition = "2021" diff --git a/src/autotest.rs b/src/autotest.rs index d0295ee..c91790c 100644 --- a/src/autotest.rs +++ b/src/autotest.rs @@ -3,9 +3,18 @@ use crate::common::{prepare_data, Args}; use crate::results::results; use std::time::Instant; -// TODO -fn choose_k(_block_size: usize, _data_size: usize) -> usize { - 3 +const GB: usize = 1000000000; +const MB: usize = 1000000; + + +fn choose_k(block_size: usize, data_size: usize) -> usize { + if data_size <= 10*MB && block_size < 128{ + 4 + } else if data_size < 2*GB && block_size < 256{ + 3 + } else { + 2 + } } pub(crate) fn autotest(mut args: Args) { From 9294e72bb8cfad8b4855347d1ed140801e1ba9ec Mon Sep 17 00:00:00 2001 From: jirigav Date: Wed, 26 Jun 2024 15:25:22 +0200 Subject: [PATCH 7/7] fmt --- src/autotest.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/autotest.rs b/src/autotest.rs index c91790c..e548fd4 100644 --- a/src/autotest.rs +++ b/src/autotest.rs @@ -6,11 +6,10 @@ use std::time::Instant; const GB: usize = 1000000000; const MB: usize = 1000000; - fn choose_k(block_size: usize, data_size: usize) -> usize { - if data_size <= 10*MB && block_size < 128{ + if data_size <= 10 * MB && block_size < 128 { 4 - } else if data_size < 2*GB && block_size < 256{ + } else if data_size < 2 * GB && block_size < 256 { 3 } else { 2