diff --git a/Cargo.toml b/Cargo.toml index 36563d3..ef8d848 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ categories = ["science","mathematics","algorithms"] include = [ "src/lib.rs", "src/algos.rs", - "src/implementations.rs", + "src/error.rs", "Cargo.toml", "README.md", "LICENSE" diff --git a/README.md b/README.md index 9c7c3f3..689a7bc 100644 --- a/README.md +++ b/README.md @@ -25,22 +25,21 @@ Short primitive types are best dealt with by radix search. We have implemented i pub fn medianu8( s:&[u8] ) -> Result { ... } ``` -More complex types require general comparison search. Median can be found naively by sorting the list of data and then picking its midpoint. The best comparison sort algorithm(s) have complexity `O(n*log(n))`. However, faster median algorithms, with complexity `O(n)` are possible. They are based on the observation that data need to be sorted, only partitioned and counted off. Therefore, the sort method can not compete. It has been deleted as of version 2.0.0. +More complex data types require general comparison search. Median can be found naively by sorting the list of data and then picking its midpoint. The best comparison sort algorithms have complexity `O(n*log(n))`. However, faster median algorithms, with complexity `O(n)` are possible. They are based on the observation that data need to be sorted, only partitioned and counted off. Therefore, the naive sort method can not compete and has been deleted as of version 2.0.0. -Currently considered to be the 'state of the art' comparison algorithm is Floyd-Rivest (1975): Median of Medians. This divides the data into groups of five items, finds a median of each group by sort and then recursively finds medians of five of these medians, and so on, until only one is left. This is then used as a pivot for the partitioning of the original data. Such pivot is guaranteed to produce 'pretty good' partitioning, though not necessarily perfect, so iteration is still necessary. +Currently considered to be the 'state of the art' comparison algorithm is Floyd-Rivest (1975): Median of Medians. This divides the data into groups of five items, finds a median of each group by sort and then recursively finds medians of five of these medians, and so on, until only one is left. This is then used as a pivot for the partitioning of the original data. Such pivot will produce reasonably good partitioning, though not necessarily perfect. Therefore, iteration is still necessary. -To find the best pivot is not the main overall objective. Rather, it is to eliminate (count off) eccentric data items as fast as possible. Therefore, the expense of choosing the pivot must be considered. It is possible to allow less optimal pivots, as we do here and yet, on average, to find the median faster. +However, finding the best pivot is not the main objective. Rather, it is to eliminate (count off) eccentric data items as fast as possible. Therefore, the expense of choosing the pivot must be carefully considered. It is possible to use less optimal pivot, and yet to find the median faster (on average). Let our average ratio of items remaining after one partitioning be `rs` and the Floyd-Rivest's be `rf`. Typically, `1/2 <= rf <= rs < 1`, i.e. `rf` is more optimal, being nearer to the perfect partitioning ratio of `1/2`. However, suppose that we can perform two partitions in the time it takes Floyd-Rivest to do one (because of their expensive pivot selection process). Then it is enough for better performance that `rs^2 < rf`, which is perfectly possible and seems to be born out in practice. For example, `rf=0.65` (nearly optimal), `rs=0.8` (deeply suboptimal), yet `rs^2 < rf`. -On large datasets it pays to select the pivot more carefully. Here we depart from Floyd-Rivest and their recursive sorting of ad-hoc groups of five items. Instead, we recursively find the actual median of a subsample of data (currently 1/20). +Nonetheless, especially on large datasets, one should devote certain limited fraction of the overall computational effort to pivot selection. ### Summary of he main features of our median algorithm * Linear complexity. * Fast (in-place) iterative partitioning into three subranges (lesser,equal,greater), minimising data movements and memory management. * Simple pivot selection on small datasets. We define the `middling` value of a sample of four as one of the middle pair of ordered items. This is found in only three comparisons. A `middling` pivot is enough to guarantee convergence of iterative search for the median. Really poor pivots occur only rarely. -* Recursive deployment on subsamples of large datasets for more accurate pivots. ## Trait Medianf64 @@ -114,7 +113,7 @@ pub trait Median<'a, T> { ## Release Notes -**Version 3.0.9** - Improved performance on large data sets by recursive pivot estimation. +**Version 3.0.9** - Improved pivot estimation for large data sets. **Version 3.0.8** - Added `implementation.rs` module and reorganized the source. diff --git a/src/algos.rs b/src/algos.rs index d9ea1ce..640323c 100644 --- a/src/algos.rs +++ b/src/algos.rs @@ -1,38 +1,7 @@ -use crate::{merror, Me}; use core::cmp::{Ordering, Ordering::*}; use std::ops::Range; - -/// Partitions `s: &mut [&T]` within range `rng`, using comparator `c`. -/// Returns the boundaries of the rearranged partitions, (eqstart,gtstart), where -/// `rng.start..eqstart` (may be empty) contains references to items lesser than the pivot, -/// `eqstart..gtstart`are items equal to the pivot, -/// `gtstart..rng.end` (may be empty) contains references to items greater than the pivot. -pub fn partit<'a,T>( - s: &mut [&'a T], - pivot: &'a T, - rng: &Range, - c: &mut impl FnMut(&T, &T) -> Ordering, -) -> (usize, usize) { - let mut eqstart = rng.start; - let mut gtstart = eqstart; - for t in rng.start..rng.end { - match c(s[t], pivot) { - Less => { - s[eqstart] = s[t]; - eqstart += 1; - s[t] = s[gtstart]; - gtstart += 1; - } - Equal => { - s[t] = s[gtstart]; - gtstart += 1; - } - Greater => (), - } - } - for e in eqstart..gtstart { s[e] = pivot; } - (eqstart, gtstart) -} +use indxvec::Mutops; +use crate::{Me,merror}; /// Scan a slice of f64s for NANs pub fn nans(v: &[f64]) -> bool { @@ -45,28 +14,28 @@ pub fn nans(v: &[f64]) -> bool { } /// kth item from rng (ascending or descending, depending on `c`) -pub fn best1_k(s: &[T], k: usize, rng: Range, c: F) -> &T -where - F: Fn(&T, &T) -> Ordering, -{ - let n = rng.len(); - assert!((k > 0) & (k <= n)); - let mut k_sorted: Vec<&T> = s.iter().skip(rng.start).take(k).collect(); - k_sorted.sort_unstable_by(|&a, &b| c(a, b)); - let mut k_max = k_sorted[k - 1]; - for si in s.iter() { - if c(si, k_max) == Less { - let insert_pos = match k_sorted.binary_search_by(|j| c(j, si)) { - Ok(ins) => ins + 1, - Err(ins) => ins, - }; - k_sorted.insert(insert_pos, si); - k_sorted.pop(); - k_max = k_sorted[k - 1]; - }; - } - k_max -} +pub fn best1_k(s: &[T], k: usize, rng: Range, c: F) -> &T + where + F: Fn(&T, &T) -> Ordering, + { + let n = rng.len(); + assert!((k > 0) & (k <= n)); + let mut k_sorted: Vec<&T> = s.iter().skip(rng.start).take(k).collect(); + k_sorted.sort_unstable_by(|&a, &b| c(a, b)); + let mut k_max = k_sorted[k - 1]; + for si in s.iter() { + if c(si, k_max) == Less { + let insert_pos = match k_sorted.binary_search_by(|j| c(j, si)) { + Ok(ins) => ins + 1, + Err(ins) => ins, + }; + k_sorted.insert(insert_pos, si); + k_sorted.pop(); + k_max = k_sorted[k - 1]; + }; + } + k_max + } /// Index of the middling value of four refs. Makes only three comparisons fn middling( @@ -85,7 +54,7 @@ fn middling( } } -/// Minimum value within a range in a slice. +/// Minimum value within a range in a slice /// Finds maximum, when arguments of c are swapped in the function call: `|a,b| c(b,a)` pub fn min<'a, T>(s: &[&'a T], rng: Range, c: &mut impl FnMut(&T, &T) -> Ordering) -> &'a T { let mut min = s[rng.start]; @@ -121,8 +90,8 @@ pub fn min2<'a, T>( (min1, min2) } -/// Measures errors from centre (for testing purposes). -/// Requires quantising to f64 for accuracy. +/// measure errors from centre (for testing) +/// requires quantising to f64 for accuracy pub fn qbalance(s: &[T], centre: &f64, q: impl Fn(&T) -> f64) -> i64 { let mut bal = 0_i64; let mut eq = 0_i64; @@ -179,29 +148,24 @@ fn evenmedianu8(s: &[u8]) -> f64 { continue; }; cummulator += hist; - if firstres { - if need < cummulator { - res = i as f64; - break; - }; // cummulator exceeds need, found both items - if need == cummulator { - // found first item (last in this bucket) - res = i as f64; + if firstres { + if need < cummulator { res = i as f64; break; }; // cummulator exceeds need, found both items + if need == cummulator { // found first item (last in this bucket) + res = i as f64; firstres = false; continue; // search for the second item }; - } else { - // the second item is in the first following non-zero bucket + } else { // the second item is in the first following non-zero bucket res += i as f64; res /= 2.0; break; }; // found the second - } + }; res } /// Median of primitive type u8 by fast radix search -pub fn medianu8(s: &[u8]) -> Result { +pub fn medianu8(s:&[u8]) -> Result { let n = s.len(); match n { 0 => return merror("size", "median: zero length data")?, @@ -217,26 +181,22 @@ pub fn medianu8(s: &[u8]) -> Result { } /// Median of odd sized generic data with Odering comparisons by custom closure -pub(super) fn oddmed_by<'a, T>( - s: &mut [&'a T], - mut rng: Range, - c: &mut impl FnMut(&T, &T) -> Ordering, -) -> &'a T { - let need = rng.len() / 2; // median target position in fully partitioned set +pub(super) fn oddmedian_by<'a, T>(s: &mut [&'a T], c: &mut impl FnMut(&T, &T) -> Ordering) -> &'a T { + let mut rng = 0..s.len(); + let need = s.len() / 2; // median target position in fully partitioned set loop { - let n = rng.len(); - let pivotref = if n > 99 { - oddmed_by(s, rng.start..rng.start + n / 20, c) - } else { - s[middling( - rng.start, - rng.start + 1, - rng.end - 2, - rng.end - 1, - &mut |a, b| c(s[a], s[b]), - )] + let pivotsub = middling( + rng.start, + rng.start + 1, + rng.end - 2, + rng.end - 1, + &mut |a, b| c(s[a], s[b]), + ); + if pivotsub != rng.start { + s.swap(rng.start, pivotsub); }; - let (eqsub, gtsub) = partit(s, pivotref, &rng, c); + let pivotref = s[rng.start]; + let (eqsub, gtsub) = <&mut [T]>::part(s, &rng, c); // well inside lt partition, iterate on it if need + 2 < eqsub { rng.end = eqsub; @@ -268,43 +228,39 @@ pub(super) fn oddmed_by<'a, T>( rng.start = gtsub; } } + /// Median of even sized generic data with Odering comparisons by custom closure -pub(super) fn evenmed_by<'a, T>( +pub(super) fn evenmedian_by<'a, T>( s: &mut [&'a T], - mut rng: Range, c: &mut impl FnMut(&T, &T) -> Ordering, ) -> (&'a T, &'a T) { + let mut rng = 0..s.len(); let need = s.len() / 2 - 1; // median target position in fully partitioned set - loop { - let n = rng.len(); - let pivotref = if n > 99 { - oddmed_by(s, rng.start..rng.start + n / 20, c) - } else { - s[middling( - rng.start, - rng.start + 1, - rng.end - 2, - rng.end - 1, - &mut |a, b| c(s[a], s[b]), - )] - }; - let (eqsub, gtsub) = partit(s, pivotref, &rng, c); + loop { + let pivotsub = middling( + rng.start, + rng.start + 1, + rng.end - 2, + rng.end - 1, + &mut |a, b| c(s[a], s[b]), + ); + if pivotsub != rng.start { + s.swap(rng.start, pivotsub); + }; + let pivotref = s[rng.start]; + let (eqsub, gtsub) = <&mut [T]>::part(s, &rng, c); // well inside lt partition, iterate on it narrowing the range if need + 2 < eqsub { rng.end = eqsub; continue; }; - // penultimate place in lt partition: + // penultimate place in lt partition, solution: if need + 2 == eqsub { // swapping comparison arguments to get two maxima let (m1, m2) = min2(s, rng.start..eqsub, &mut |a, b| c(b, a)); return (m2, m1); }; - // first place in gt partition, the solution are its two minima - if need == gtsub { - return min2(s, gtsub..rng.end, c); - }; - // last place in the lt partition: + // last place in the lt partition, solution: if need + 1 == eqsub { // swapped comparison arguments to get maximum return (min(s, rng.start..eqsub, &mut |a, b| c(b, a)), pivotref); @@ -317,6 +273,10 @@ pub(super) fn evenmed_by<'a, T>( if need + 1 == gtsub { return (pivotref, min(s, gtsub..rng.end, c)); }; + // first place in gt partition, the solution are its two minima + if need == gtsub { + return min2(s, gtsub..rng.end, c); + }; // inside gt partition, iterate on it, narrowing the range rng.start = gtsub; } diff --git a/src/implementations.rs b/src/implementations.rs index 96c57a1..07743d4 100644 --- a/src/implementations.rs +++ b/src/implementations.rs @@ -56,12 +56,12 @@ impl Medianf64 for &[f64] { Ok(x) } }) - .collect::, Me>>()?; + .collect::, Me>>()?; if (n & 1) == 1 { - let oddm = oddmed_by(&mut s, 0..n, &mut ::total_cmp); + let oddm = oddmedian_by(&mut s, &mut ::total_cmp); Ok(*oddm) } else { - let (&med1, &med2) = evenmed_by(&mut s, 0..n, &mut ::total_cmp); + let (&med1, &med2) = evenmedian_by(&mut s, &mut ::total_cmp); Ok((med1+med2) / 2.0) } } @@ -76,15 +76,15 @@ impl Medianf64 for &[f64] { 2 => return (self[0] + self[1]) / 2.0, _ => (), }; - let mut s = self.ref_vec(0..n); + let mut s = self.ref_vec(0..self.len()); if (n & 1) == 1 { - let oddm = oddmed_by(&mut s, 0..n,&mut ::total_cmp); + let oddm = oddmedian_by(&mut s, &mut ::total_cmp); *oddm } else { - let (&med1, &med2) = evenmed_by(&mut s, 0..n,&mut ::total_cmp); + let (&med1, &med2) = evenmedian_by(&mut s, &mut ::total_cmp); (med1 + med2) / 2.0 } - } + } /// Iterative weighted median with accuracy eps fn medf_weighted(self, ws: Self, eps: f64) -> Result { if self.len() != ws.len() { @@ -175,9 +175,9 @@ impl<'a, T> Median<'a, T> for &'a [T] { }; let mut s = self.ref_vec(0..self.len()); if (n & 1) == 1 { - Ok(q(oddmed_by(&mut s, 0..n,c))) + Ok(q(oddmedian_by(&mut s, c))) } else { - let (med1, med2) = evenmed_by(&mut s, 0..n, c); + let (med1, med2) = evenmedian_by(&mut s, c); Ok((q(med1) + q(med2)) / 2.0) } } @@ -193,9 +193,9 @@ impl<'a, T> Median<'a, T> for &'a [T] { }; let mut s = self.ref_vec(0..self.len()); if (n & 1) == 1 { - Ok(Medians::Odd(oddmed_by(&mut s, 0..n, c))) + Ok(Medians::Odd(oddmedian_by(&mut s, c))) } else { - Ok(Medians::Even(evenmed_by(&mut s, 0..n, c))) + Ok(Medians::Even(evenmedian_by(&mut s, c))) } } @@ -204,7 +204,17 @@ impl<'a, T> Median<'a, T> for &'a [T] { Ok(self.iter().map(|s| q(s) - centre).collect()) } /// We define median based correlation as cosine of an angle between two - /// zero median vectors (analogously to Pearson's zero mean vectors) + /// zero median vectors (analogously to Pearson's zero mean vectors) + /// # Example + /// ``` + /// use medians::{Medianf64,Median}; + /// use core::convert::identity; + /// use core::cmp::Ordering::*; + /// let v1 = vec![1_f64,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.,13.,14.]; + /// let v2 = vec![14_f64,1.,13.,2.,12.,3.,11.,4.,10.,5.,9.,6.,8.,7.]; + /// assert_eq!(v1.medf_correlation(&v2).unwrap(),-0.1076923076923077); + /// assert_eq!(v1.med_correlation(&v2,&mut |a,b| a.total_cmp(b),|&a| identity(a)).unwrap(),-0.1076923076923077); + /// ``` fn med_correlation( self, v: Self, diff --git a/tests/tests.rs b/tests/tests.rs index d6da7b1..5955d07 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -12,15 +12,35 @@ use times::{benchf64, benchu64, benchu8, mutbenchf64}; #[test] fn parting() -> Result<(), Me> { let data = [ - 5.,8.,7.,6.,5.,4.,3.,2.,-f64::NAN,1.,0.,1.,-2.,3.,4.,-5.,f64::NAN,f64::NAN,6.,7.,7. + 5., + 8., + 7., + 6., + 5., + 4., + 3., + 2., + -f64::NAN, + 1., + 0., + 1., + -2., + 3., + 4., + -5., + f64::NAN, + f64::NAN, + 6., + 7., + 7., ]; // println!("To u64s: {}",to_u64s(&data).gr()); // println!("To f64s: {}",to_f64s(&to_u64s(&data)).gr()); // println!("Scrubbed: {}", scrub_nans(&to_f64s(&to_u64s(&data))).gr()); let len = data.len(); - println!("Pivot {}: {}", 7.0.yl(), data.gr()); + println!("Pivot {}: {}", data[0].yl(), data.gr()); let mut refdata = data.ref_vec(0..len); - let (eqsub, gtsub) = partit(&mut refdata, &7.0, &(0..len), &mut ::total_cmp); + let (eqsub, gtsub) = <&mut [f64]>::part(&mut refdata, &(0..len), &mut ::total_cmp); println!( "Result: {}\nCommas show the subranges:\n\ {GR}[{}, {}, {}]{UN}\n{} items equal to pivot {}", @@ -154,13 +174,18 @@ fn errors() -> Result<(), Me> { Ok(()) } -const NAMES: [&str; 2] = ["median_by","medf_unchecked"]; +const NAMES: [&str; 3] = ["median_by", "best_k", "medf_unchecked"]; -const CLOSURESF64: [fn(&[f64]); 2] = [ +const CLOSURESF64: [fn(&[f64]); 3] = [ |v: &[_]| { v.median_by(&mut ::total_cmp) .expect("even median closure failed"); }, + |v: &[_]| { + let mut sorted: Vec<&f64> = v.iter().collect(); + sorted.sort_unstable_by(|&a, &b| a.total_cmp(b)); + // sorted[sorted.len()/2]; + }, |v: &[_]| { v.medf_unchecked(); }, @@ -183,5 +208,5 @@ const CLOSURESF64: [fn(&[f64]); 2] = [ fn comparison() { // set_seeds(0); // intialise random numbers generator // Rnum encapsulates the type of random data to be generated - benchf64(901..920, 1, 10, &NAMES, &CLOSURESF64); + benchf64(3..100, 1, 10, &NAMES, &CLOSURESF64); }