triarius · triarius · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,4 +13,5 @@ rand = "0.8.5"
 regex = "1.10.2"
 
 [dev-dependencies]
+rayon = "1.8.0"
 statrs = "0.16.0"
diff --git a/src/passphrase.rs b/src/passphrase.rs
@@ -28,44 +28,72 @@ pub fn new(
 
 mod test {
  #[test]
+ // Uses [Pearson's chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test#Pearson's_chi-squared_test)
+ // to test that the passphrases are uniformly distributed.
  fn chi_squared() {
  use crate::{passphrase, words};
+ use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
  use statrs::distribution::{ChiSquared, ContinuousCDF};
  use std::collections::HashMap;
 
- let n = 4;
- let n_fact = 24;
- // this test file has n = 4 words, which can have 24 permutations
+ // The test fixture has W = 4 words, which can be ordered in W! = 24 permutations
+ const W: usize = 4;
+ const W_FACTORIAL: usize = 24;
+ const N: usize = 12_000_000; // number of samples
+
  let words = words::list(Some("src/fixtures/test")).unwrap();
 
- let trials = 1_200_000;
- let mut rng = rand::thread_rng();
+ let histogram = Vec::from_iter(0..N)
+ .par_iter()
+ .fold_chunks(
+ N / std::thread::available_parallelism().unwrap(),
+ || HashMap::new(),
+ |mut acc, _| {
+ let mut rng = rand::thread_rng();
+ let mut words = words.clone();
+ let s = passphrase::new(&mut rng, &mut words, W, " ");
+ *acc.entry(s).or_insert(0) += 1 as usize;
+ acc
+ },
+ )
+ .collect::<Vec<HashMap<String, usize>>>()
+ .iter()
+ .fold(HashMap::new(), |mut acc, h| {
+ h.iter().for_each(|(k, v)| {
+ *acc.entry(k.to_owned()).or_insert(0) += v;
+ });
+ acc
+ });
 
- let mut histogram: HashMap<String, u32> = HashMap::new();
- (1..trials).for_each(|_| {
- let mut words = words.clone();
- let s = passphrase::new(&mut rng, &mut words, n, " ");
- *histogram.entry(s).or_insert(0) += 1;
- });
+ assert_eq!(histogram.values().sum::<usize>(), N, "missing samples");
 
- assert_eq!(histogram.len(), n_fact);
+ // There can be at most W! different passphrases. If, by chance, any of them was never
+ // generated, the chi-squared test below would almost certainly reject uniformity anyway,
+ // so asserting here first reports the problem with a more specific message.
+ assert_eq!(W_FACTORIAL, histogram.len(), "missing a permutation");
 
- let expected_frequency = trials as f64 / n_fact as f64;
+ let expected_frequency = N as f64 / W_FACTORIAL as f64;
  let chi_squared_stat: f64 = histogram
- .iter()
- .map(|(_, v)| (*v as f64 - expected_frequency).powi(2) / expected_frequency)
+ .values()
+ .map(|v| (*v as f64 - expected_frequency).powi(2) / expected_frequency)
  .sum();
 
- // degrees of freedom = (number of rows - 1) * (number of columns - 1)
- let df = ((2 - 1) * (24 - 1)) as f64;
- let dist = ChiSquared::new(df).unwrap();
- let p = 1.0 - dist.cdf(chi_squared_stat);
+ // Since the count for any one permutation is determined by the counts of all the others
+ // (they must sum to N), degrees of freedom = number of permutations - 1
+ const DF: f64 = (W_FACTORIAL - 1) as f64;
+ let dist = ChiSquared::new(DF).unwrap();
 
- eprintln!("χ^2: {}", chi_squared_stat);
- eprintln!("p: {}", p);
+ // The p-value is the area under the chi-squared pdf to the right of the chi_squared_stat
+ let p = 1.0 - dist.cdf(chi_squared_stat);
 
- // the p-value should be greater than 0.05 so that we can't reject the null hypothesis
- // if we can reject the null hypothesis, then the passphrase generator is not uniform
- assert_eq!(p > 0.05, true);
+ // The p-value should be greater than 0.05 so that we can't reject the null hypothesis that
+ // the values are from a uniform distribution.
+ // If we can reject the null hypothesis, then the passphrase generator may not be uniform.
+ assert!(
+ p > 0.05,
+ "passphrase may not be uniformly random. (p = {} <= 0.05, χ^2 = {}).",
+ p,
+ chi_squared_stat,
+ );
  }
 }