diff --git a/Cargo.lock b/Cargo.lock index 5410956..a3a44db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,7 +133,7 @@ checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "cooltest" -version = "0.1.1" +version = "0.1.2" dependencies = [ "clap", "itertools", diff --git a/Cargo.toml b/Cargo.toml index 199837c..ac04a04 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cooltest" -version = "0.1.1" +version = "0.1.2" edition = "2021" diff --git a/src/autotest.rs b/src/autotest.rs new file mode 100644 index 0000000..e548fd4 --- /dev/null +++ b/src/autotest.rs @@ -0,0 +1,69 @@ +use crate::bottomup::bottomup; +use crate::common::{prepare_data, Args}; +use crate::results::results; +use std::time::Instant; + +const GB: usize = 1000000000; +const MB: usize = 1000000; + +fn choose_k(block_size: usize, data_size: usize) -> usize { + if data_size <= 10 * MB && block_size < 128 { + 4 + } else if data_size < 2 * GB && block_size < 256 { + 3 + } else { + 2 + } +} + +pub(crate) fn autotest(mut args: Args) { + let (training_data, testing_data) = prepare_data(&args.data_source, args.block, true); + let mut testing_data = testing_data.unwrap(); + let mut tested_cases = 0; + let start = Instant::now(); + let data_size = training_data.len(); + + let mut k = choose_k(args.block, data_size); + + tested_cases += 1; + let mut hist = bottomup( + &training_data, + args.block, + k, + args.top, + args.max_bits, + args.threads, + ); + let testing_data2; + if args.block <= 256 { + tested_cases += 1; + let (training_data, testing_data_opt2) = + prepare_data(&args.data_source, 2 * args.block, true); + testing_data2 = testing_data_opt2.unwrap(); + k = choose_k(2 * args.block, data_size); + let hist2 = bottomup( + &training_data, + args.block * 2, + k, + args.top, + args.max_bits, + args.threads, + ); + if hist2.z_score.abs() > hist.z_score.abs() { + hist = hist2; + testing_data = testing_data2; + } + } + println!("training 
finished in {:?}", start.elapsed()); + + if tested_cases > 1 { + let new_alpha = args.alpha / (tested_cases as f64); + println!( + "Adjusting significance level based on the number of tests from {} to {}", + args.alpha, new_alpha + ); + args.alpha = new_alpha; + } + + results(hist, &testing_data, args) +} diff --git a/src/bottomup.rs b/src/bottomup.rs index e36b3ca..f137e97 100644 --- a/src/bottomup.rs +++ b/src/bottomup.rs @@ -9,7 +9,9 @@ pub(crate) struct Histogram { pub(crate) bits: Vec, pub(crate) sorted_indices: Vec, pub(crate) best_division: usize, + #[serde(skip_serializing, default)] pub(crate) z_score: f64, + pub(crate) block_size: usize, } impl Histogram { @@ -42,10 +44,11 @@ impl Histogram { sorted_indices: indices, best_division: best_i, z_score: max_z, + block_size: data[0].len(), } } - pub(crate) fn from_bins(bits: Vec, bins: &[usize]) -> Histogram { + pub(crate) fn from_bins(bits: Vec, bins: &[usize], block_size: usize) -> Histogram { let mut indices = (0..2_usize.pow(bits.len() as u32)).collect_vec(); indices.sort_by(|a, b| bins[*b].cmp(&bins[*a])); @@ -69,6 +72,7 @@ impl Histogram { sorted_indices: indices, best_division: best_i, z_score: max_z, + block_size, } } @@ -179,11 +183,11 @@ fn brute_force(data: &Data, block_size: usize, k: usize, top: usize) -> Vec 1 { - let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1]); top]; + let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1], block_size); top]; let mut bins = vec![0; 2_usize.pow(k as u32)]; for bits in (0..block_size).combinations(k) { compute_bins(&bits, data, k, &hists, &mut bins, block_size); - let hist = Histogram::from_bins(bits, &bins); + let hist = Histogram::from_bins(bits, &bins, block_size); best_hists.push(hist); best_hists.sort_by(|a, b| b.z_score.abs().partial_cmp(&a.z_score.abs()).unwrap()); best_hists.pop(); @@ -194,7 +198,7 @@ fn brute_force(data: &Data, block_size: usize, k: usize, top: usize) -> Vec = hists .into_iter() .enumerate() - .map(|(i, bins)| 
Histogram::from_bins(bits[i].clone(), &bins)) + .map(|(i, bins)| Histogram::from_bins(bits[i].clone(), &bins, block_size)) .collect(); best.sort_by(|a, b| b.z_score.partial_cmp(&a.z_score).unwrap()); @@ -203,7 +207,7 @@ fn brute_force(data: &Data, block_size: usize, k: usize, top: usize) -> Vec]) -> Histogram { - let mut best_hist = Histogram::from_bins(vec![0], &[1, 1]); + let mut best_hist = Histogram::from_bins(vec![0], &[1, 1], data[0].len()); for comb in hists.iter().combinations(n) { let mut bits = comb.iter().flat_map(|x| x.bits.clone()).collect_vec(); bits.sort(); @@ -352,14 +356,14 @@ fn brute_force_threads( .map(|i| { let combs = (0..block_size).combinations(k).skip(i); - let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1]); top]; + let mut best_hists = vec![Histogram::from_bins(vec![0], &[1, 1], block_size); top]; for bits in combs.step_by(threads) { let mut bins = vec![0; 2_usize.pow(k as u32)]; for (i, bin) in bins.iter_mut().enumerate() { *bin = multi_eval_neg(&bits, data, &neg_data, i); } - let new_hist = Histogram::from_bins(bits, &bins); + let new_hist = Histogram::from_bins(bits, &bins, block_size); best_hists.push(new_hist); best_hists.sort_by(|a, b| b.z_score.abs().partial_cmp(&a.z_score.abs()).unwrap()); best_hists.pop(); diff --git a/src/common.rs b/src/common.rs index 4954479..0743878 100644 --- a/src/common.rs +++ b/src/common.rs @@ -14,7 +14,7 @@ pub(crate) struct Args { pub(crate) data_source: String, /// Length of block of data. - #[arg(short, long, default_value_t = 128)] + #[arg(short, long, default_value_t = 128)] // Changing the default value changes autotest pub(crate) block: usize, /// Number of bits in histograms in brute-force search. 
@@ -53,6 +53,7 @@ pub(crate) enum SubCommand { #[arg(short, long)] dis_path: String, }, + Autotest {}, } pub(crate) fn bits_block_eval(bits: &[usize], block: &[u8]) -> usize { let mut result = 0; diff --git a/src/main.rs b/src/main.rs index 2a7c535..5d625ed 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,94 +1,18 @@ +mod autotest; mod bottomup; mod common; +mod results; use crate::bottomup::bottomup; -use crate::common::{p_value, z_score, Args}; +use crate::common::Args; +use autotest::autotest; use bottomup::Histogram; use clap::Parser; use common::{prepare_data, SubCommand}; -use serde_json::json; -use std::fs::{self, File}; -use std::io::Write; +use results::results; +use std::fs; use std::time::Instant; -fn print_results(p_value: f64, z_score: f64, alpha: f64, hist: &Histogram, bins: Vec) { - println!("----------------------------------------------------------------------"); - println!("RESULTS:\n"); - - println!("Histogram(the discovered Boolean function returns 1 for values before the separator and 0 for values after the separator.):\n"); - let m = bins.iter().max().unwrap(); - let unit = (m / 50).max(1); - for (i, ind) in hist.sorted_indices.iter().enumerate() { - for x in &hist.bits { - print!("x{} ", x); - } - let mut j = *ind; - print!("| ["); - for _ in 0..hist.bits.len() { - print!("{}", j % 2); - j /= 2; - } - print!("] | "); - for _ in 0..bins[*ind] / unit { - print!("∎"); - } - println!(); - if i == (hist.best_division - 1) { - for _ in 0..80 { - print!("—"); - } - println!(); - } - } - println!(); - println!("Z-score: {z_score}"); - println!("P-value: {p_value:.0e}"); - if p_value >= alpha { - println!( - "As the p-value >= alpha {alpha:.0e}, the randomness hypothesis cannot be rejected." 
- ); - println!("= CoolTest could not find statistically significant non-randomness."); - } else { - println!("As the p-value < alpha {alpha:.0e}, the randomness hypothesis is REJECTED."); - println!("= Data is not random."); - } -} - -fn results(hist: Histogram, testing_data: &[Vec], args: Args) { - let (count, bins) = hist.evaluate(testing_data); - let prob = 2.0_f64.powf(-(hist.bits.len() as f64)); - let z = z_score( - testing_data.len(), - count, - prob * (hist.best_division as f64), - ); - let p_val = p_value( - count, - testing_data.len(), - prob * (hist.best_division as f64), - ); - print_results(p_val, z, args.alpha, &hist, bins); - - if let Some(path) = args.json.clone() { - let mut file = - File::create(&path).unwrap_or_else(|_| panic!("File {} couldn't be created", path)); - - let output = json!({ - "args": args, - "dis": hist, - "result": if p_val < args.alpha {"random"} else {"non-random"}, - "p-value": p_val - }); - - file.write_all( - serde_json::to_string_pretty(&output) - .expect("Failed to produce json!") - .as_bytes(), - ) - .unwrap(); - } -} - fn run_bottomup(args: Args) { let (training_data, testing_data) = prepare_data(&args.data_source, args.block, true); let testing_data = testing_data.unwrap(); @@ -108,18 +32,25 @@ fn run_bottomup(args: Args) { } fn main() { - let args = Args::parse(); + let mut args = Args::parse(); println!("\n{args:?}\n"); + if args.block > 600 { + println!("With block size {}, the computation can take long time, consider using smaller block size.", args.block); + } + match args.subcommand.clone() { Some(SubCommand::Evaluate { dis_path }) => { let contents = fs::read_to_string(&dis_path) .unwrap_or_else(|_| panic!("Failed to read contents of {}", &dis_path)); let hist: Histogram = serde_json::from_str(&contents).expect("Invalid distinguisher json!"); - let (testing_data, _) = prepare_data(&args.data_source, args.block, false); + args.block = hist.block_size; + args.k = hist.bits.len(); + let (testing_data, _) = 
prepare_data(&args.data_source, hist.block_size, false); results(hist, &testing_data, args) } + Some(SubCommand::Autotest {}) => autotest(args), None => run_bottomup(args), } } diff --git a/src/results.rs b/src/results.rs new file mode 100644 index 0000000..5c9a9ea --- /dev/null +++ b/src/results.rs @@ -0,0 +1,85 @@ +use crate::{ + bottomup::Histogram, + common::{p_value, z_score, Args}, +}; +use serde_json::json; +use std::fs::File; +use std::io::Write; + +pub(crate) fn results(hist: Histogram, testing_data: &[Vec], args: Args) { + let (count, bins) = hist.evaluate(testing_data); + let prob = 2.0_f64.powf(-(hist.bits.len() as f64)); + let z = z_score( + testing_data.len(), + count, + prob * (hist.best_division as f64), + ); + let p_val = p_value( + count, + testing_data.len(), + prob * (hist.best_division as f64), + ); + print_results(p_val, z, args.alpha, &hist, bins); + + if let Some(path) = args.json.clone() { + let mut file = + File::create(&path).unwrap_or_else(|_| panic!("File {} couldn't be created", path)); + + let output = json!({ + "args": args, + "dis": hist, + "result": if p_val < args.alpha {"non-random"} else {"random"}, + "p-value": p_val + }); + + file.write_all( + serde_json::to_string_pretty(&output) + .expect("Failed to produce json!") + .as_bytes(), + ) + .unwrap(); + } +} + +fn print_results(p_value: f64, z_score: f64, alpha: f64, hist: &Histogram, bins: Vec) { + println!("----------------------------------------------------------------------"); + println!("RESULTS:\n"); + + println!("Histogram(the discovered Boolean function returns 1 for values before the separator and 0 for values after the separator.):\n"); + let m = bins.iter().max().unwrap(); + let unit = (m / 50).max(1); + for (i, ind) in hist.sorted_indices.iter().enumerate() { + for x in &hist.bits { + print!("x{} ", x); + } + let mut j = *ind; + print!("| ["); + for _ in 0..hist.bits.len() { + print!("{}", j % 2); + j /= 2; + } + print!("] | "); + for _ in 0..bins[*ind] / unit { +
print!("∎"); + } + println!(); + if i == (hist.best_division - 1) { + for _ in 0..80 { + print!("—"); + } + println!(); + } + } + println!(); + println!("Z-score: {z_score}"); + println!("P-value: {p_value:.0e}"); + if p_value >= alpha { + println!( + "As the p-value >= alpha {alpha:.0e}, the randomness hypothesis cannot be rejected." + ); + println!("= CoolTest could not find statistically significant non-randomness."); + } else { + println!("As the p-value < alpha {alpha:.0e}, the randomness hypothesis is REJECTED."); + println!("= Data is not random."); + } +}