From ebfe8ba86e3c89156249475f6a45b2ca18d47704 Mon Sep 17 00:00:00 2001 From: L <457124+liborty@users.noreply.github.com> Date: Sun, 29 Oct 2023 22:14:05 +1000 Subject: [PATCH] v 3.0.2 --- Cargo.toml | 2 +- README.md | 30 +++-- src/algos.rs | 126 ++++++++++++------ src/lib.rs | 67 +++++++--- src/oldalgos.rs | 336 ------------------------------------------------ tests/tests.rs | 71 +++++----- 6 files changed, 197 insertions(+), 435 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 071e914..437b353 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "medians" -version = "3.0.1" +version = "3.0.2" authors = ["Libor Spacek"] edition = "2021" description = "Median, Statistical Measures, Mathematics, Statistics" diff --git a/README.md b/README.md index 9f0df29..9cb97b0 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,12 @@ Fast algorithm for finding medians of one dimensional data, implemented in 100% safe Rust. ```rust -use medians::{Medians,Medianf64,Median}; +use medians::{medianu8,Medians,Medianf64,Median}; ``` ## Introduction -Finding medians is a common task in statistics and data analysis. At least it ought to be, because median is a more stable measure of central tendency than mean. Similarly, `mad` (median of absolute differences) is a more stable measure of data spread than standard deviation, dominated by squared outliers. Median and mad are not used nearly enough mostly for practical historical reasons: they are more difficult to compute. The fast algorithms presented here provide a practical remedy for this situation. +Finding medians is a common task in statistics and data analysis. At least it ought to be, because median is a more stable measure of central tendency than mean. Similarly, `mad` (median of absolute differences) is a more stable measure of data spread than is standard deviation, which is dominated by squared outliers. Median and mad are not used nearly enough mostly for practical reasons: they are more difficult to compute. The fast algorithms presented here provide remedy for this situation. We argued in [`rstats`](https://github.com/liborty/rstats), that using the Geometric Median is the most stable way to characterise multidimensional data. The one dimensional case is addressed here. @@ -18,19 +18,26 @@ See [`tests.rs`](https://github.com/liborty/medians/blob/main/tests/tests.rs) fo ## Algorithms Analysis -Median can be found naively by sorting the list of data and then picking its midpoint. The best comparison sort algorithm(s) have complexity `O(nlog(n))`. However, faster median algorithms, with complexity `O(n)` are possible. They are based on the observation that not all data need to be sorted, only partitioned and counted off. Therefore, the sort method can not compete, as is demonstrated by the tests. It has been deleted as of version 2.0.0. +Simple primitive types are best dealt with by extra fast radix search. We have implemented it for `u8`, other primitive types to follow soon: -Currently considered to be the 'state of the art' algorithm is Floyd-Rivest (1975) Median of Medians. This divides the data into groups of five items, finds a median of each group and then recursively finds medians of five of these medians, and so on, until only one is left. This is then used as a pivot for the partitioning of the original data. Such pivot is guaranteed to produce 'pretty good' partitioning, though not necessarily perfect, so iteration is necessary. +```rust +/// Median of primitive type u8 by fast radix search +pub fn medianu8( s:&[u8] ) -> Result { ... } +``` + +More complex types require general comparison search. Median can be found naively by sorting the list of data and then picking its midpoint. The best comparison sort algorithm(s) have complexity `O(nlog(n))`. However, faster median algorithms, with complexity `O(n)` are possible. They are based on the observation that not all data need to be sorted, only partitioned and counted off. Therefore, the sort method can not compete, as is demonstrated by the tests. It has been deleted as of version 2.0.0. -However, to find the best pivot is not the main overall objective. Rather, it is to eliminate (count off) eccentric data items as fast as possible. Therefore, the expense of choosing the pivot must be considered. It is possible to allow less optimal pivots, as we do here and yet, on average, to find the median faster. +Currently considered to be the 'state of the art' comparison algorithm is Floyd-Rivest (1975) Median of Medians. This divides the data into groups of five items, finds a median of each group by sort and then recursively finds medians of five of these medians, and so on, until only one is left. This is then used as a pivot for the partitioning of the original data. Such pivot is guaranteed to produce 'pretty good' partitioning, though not necessarily perfect, so iteration is still necessary. -Let our average ratio of items remaining after one partitioning be rs and the Floyd-Rivest be rf. Where `1/2 <= rf <= rs < 1` and rf is more optimal, being nearer to the perfect ratio `1/2`. However, suppose that we can perform two partitions in the time it takes Floyd-Rivest to do one (because of their expensive pivot selection process). Then it is enough for better performance that `rs^2 < rf`, which is entirely possible and seems to be confirmed in practice. For example, `rf=0.65` (nearly optimal), `rs=0.8` (deeply suboptimal), yet `rs^2 < rf`. +To find the best pivot is not the main overall objective. Rather, it is to eliminate (count off) eccentric data items as fast as possible. Therefore, the expense of choosing the pivot must be considered. It is possible to allow less optimal pivots, as we do here and yet, on average, to find the median faster. + +Let our average ratio of items remaining after one partitioning be `rs` and the Floyd-Rivest's be `rf`. Typically, `1/2 <= rf <= rs < 1`, i.e. `rf` is more optimal, being nearer to the perfect partitioning ratio `1/2`. However, suppose that we can perform two partitions in the time it takes Floyd-Rivest to do one (because of their expensive pivot selection process). Then it is enough for better performance that `rs^2 < rf`, which is entirely possible and seems to be confirmed in practice. For example, `rf=0.65` (nearly optimal), `rs=0.8` (deeply suboptimal), yet `rs^2 < rf`. The main features of our median algorithm are: * Linear complexity. -* Fast (in place) iterative partitioning into three subranges (lesser,equal,greater), minimising data movements and memory management. -* Simple pivot selection strategy. We define the `middling` value of a sample of four as one of the middle pair of sorted items. Whereas full sort of four items takes at least five comparisons, we only need three. A `middling` pivot is enough to guarantee convergence of iterative schemes, such as the search for the median. Also, poor pivots are unlikely to be picked repeatedly. +* Fast (in-place) iterative partitioning into three subranges (lesser,equal,greater), minimising data movements and memory management. +* Simple pivot selection strategy. We define the `middling` value of a sample of four as one of the middle pair of items in order. Whereas full (merge) sort of four items takes five comparisons, we only need three. A `middling` pivot is enough to guarantee convergence of iterative schemes, such as the search for the median. Also, poor pivots are unlikely to be picked repeatedly. ## Trait Medianf64 @@ -60,7 +67,8 @@ Most of its methods take a quantify closure `q`, which converts its generic argu Weaker partial ordinal comparison is used instead of numerical comparison. The search algorithm remains the same. The only additional cost is the extra layer of referencing to prevent the copying of data. -**`median_by()`** +**`median_by()`** + For all end-types quantifiable to f64, we simply averaged the two midpoints of even length data to obtain a single median (of type `f64`). When the data items are unquantifiable to `f64`, this is no longer possible. Then `median_by` should be used. It returns both middle values within `Medians` enum type, the lesser one first: ```rust @@ -101,9 +109,11 @@ pub trait Median<'a, T> { ## Release Notes +**Version 3.0.2** - Added function `medianu8` that finds median byte by superfast radix search. More primitive types to follow. + **Version 3.0.1** - Renamed `correlation` to `med_correlation` to avoid name clashes elsewhere. -**Version 3.0.0** - Numerous improvements to speed and generality and renaming. +**Version 3.0.0** - Numerous improvements to speed and generality and renaming. **Version 2.3.1** - Further speed optimisation of `partf64`. diff --git a/src/algos.rs b/src/algos.rs index 624dc90..76669be 100644 --- a/src/algos.rs +++ b/src/algos.rs @@ -83,19 +83,14 @@ pub fn isort_refs(s: &mut [&T], rng: Range, c: impl Fn(&T, &T) -> Orde /// Index of the middling value of four refs. Makes only three comparisons fn middling( - idx0:usize,idx1:usize,idx2:usize,idx3:usize, - c: &mut impl FnMut(usize,usize) -> Ordering, + idx0: usize, + idx1: usize, + idx2: usize, + idx3: usize, + c: &mut impl FnMut(usize, usize) -> Ordering, ) -> usize { - let max1 = if c(idx0,idx1) == Less { - idx1 - } else { - idx0 - }; - let max2 = if c(idx2,idx3) == Less { - idx3 - } else { - idx2 - }; + let max1 = if c(idx0, idx1) == Less { idx1 } else { idx0 }; + let max2 = if c(idx2, idx3) == Less { idx3 } else { idx2 }; if c(max1, max2) == Less { max1 } else { @@ -115,9 +110,9 @@ pub fn min<'a, T>(s: &[&'a T], rng: Range, c: &mut impl FnMut(&T, &T) -> min } -/// Two minimum values within rng in slice s -/// Finds maxima, when invoked with arguments of c swapped: `|a,b| c(b,a)` -/// min1 is the overall minimum/maximum. +/// Two minimum values within rng in slice s. +/// Finds maxima, when invoked with arguments of c swapped: `|a,b| c(b,a)`. +/// The first returned item refers to the primary minimum/maximum. pub fn min2<'a, T>( s: &[&'a T], rng: Range, @@ -153,11 +148,11 @@ pub fn qbalance(s: &[T], centre: &f64, q: impl Fn(&T) -> f64) -> i64 { let mut eq = 0_i64; for si in s { match &q(si).total_cmp(centre) { - Less => bal -= 1, - Equal => eq += 1, + Less => bal -= 1, Greater => bal += 1, + _ => eq += 1, }; - } + }; if bal == 0 { return 0; }; @@ -167,22 +162,21 @@ pub fn qbalance(s: &[T], centre: &f64, q: impl Fn(&T) -> f64) -> i64 { 1 } -/// Partitions `s: &mut [&T]` within range `rng` by pivot, which is the first item: s[rng.start] +/// Partitions `s: &mut [&T]` within range `rng` by pivot, which is the first item: `s[rng.start]`. /// The three resulting partitions are divided by eqstart,gtstart, where: -/// `rng.start..eqstart` (may be empty) contains refs to items lesser than the pivot, -/// `gtstart-eqstart` is the number (>= 1) of items equal to the pivot, -/// values of s within this subrange are undefined, -/// `gtstart..rng.end` (may be empty) contains refs to items greater than the pivot +/// `rng.start..eqstart` (may be empty) contains refs to items lesser than the pivot, +/// `gtstart-eqstart` is the number (>= 1) of items equal to the pivot, values within this subrange are undefined, +/// `gtstart..rng.end` (may be empty) contains refs to items greater than the pivot. pub fn part( s: &mut [&T], - rng: &Range, + rng: &Range, c: &mut impl FnMut(&T, &T) -> Ordering, -) -> (usize, usize) { - // place pivot in the first location +) -> (usize, usize) { + // get pivot from the first location let pivot = s[rng.start]; let mut eqstart = rng.start; - let mut gtstart = eqstart+1; - for t in rng.start+1..rng.end { + let mut gtstart = eqstart + 1; + for t in rng.start + 1..rng.end { match c(s[t], pivot) { Less => { s[eqstart] = s[t]; @@ -200,14 +194,63 @@ pub fn part( (eqstart, gtstart) } +/// Odd median of `&[u8]` +pub fn oddmedianu8(s: &[u8]) -> f64 { + let need = s.len() / 2; // median target position + let mut histogram = [0_usize; 256]; + let mut cummulator = 0_usize; + let mut resbucket = 0_u8; + for &u in s.iter() { + histogram[u as usize] += 1; + }; + for &hist in histogram.iter() { + cummulator += hist; + if need < cummulator { break; }; + resbucket += 1; + }; + resbucket as f64 +} + +/// Even median of `&[u8]` +pub fn evenmedianu8(s: &[u8]) -> f64 { + let mut need = s.len() / 2; // first median target position + let mut histogram = [0_usize; 256]; + let mut cummulator = 0_usize; + let mut resbucket = 0u8; + let mut res = f64::MIN; + for &u in s.iter() { + histogram[u as usize] += 1; + } + for &hist in histogram.iter() { + cummulator += hist; + if need < cummulator { break; }; // cummulator exceeds need, found at least two items + if need == cummulator { // the last (possibly only) item in this bucket + if res == f64::MIN { // is the first median + res = resbucket as f64; // save it + need += 1; // next look for the second one (in the following buckets) + } else { break; }; // this item is already the second median + }; + resbucket += 1; + continue; + }; + if res == f64::MIN { resbucket as f64 } else { (res + resbucket as f64)/ 2.0 } +} + /// Median of odd sized generic data with Odering comparisons by custom closure pub fn oddmedian_by<'a, T>(s: &mut [&'a T], c: &mut impl FnMut(&T, &T) -> Ordering) -> &'a T { let mut rng = 0..s.len(); let need = s.len() / 2; // median target position in fully partitioned set loop { - let pivotsub = middling(rng.start,rng.start + 1,rng.end - 2,rng.end - 1, - &mut |a,b| c(s[a],s[b])); - if pivotsub != rng.start { s.swap(rng.start,pivotsub); }; + let pivotsub = middling( + rng.start, + rng.start + 1, + rng.end - 2, + rng.end - 1, + &mut |a, b| c(s[a], s[b]), + ); + if pivotsub != rng.start { + s.swap(rng.start, pivotsub); + }; let pivotref = s[rng.start]; let (eqsub, gtsub) = part(s, &rng, c); // well inside lt partition, iterate on it @@ -235,7 +278,7 @@ pub fn oddmedian_by<'a, T>(s: &mut [&'a T], c: &mut impl FnMut(&T, &T) -> Orderi }; // second place in gt partition, the solution is the next minimum if need == gtsub + 1 { - return min2(s, gtsub..rng.end, c).1; + return min2(s, gtsub..rng.end, c).1; }; // well inside gt partition, iterate on it rng.start = gtsub; @@ -250,11 +293,18 @@ pub fn evenmedian_by<'a, T>( let mut rng = 0..s.len(); let need = s.len() / 2 - 1; // median target position in fully partitioned set loop { - let pivotsub = middling(rng.start,rng.start + 1,rng.end - 2,rng.end - 1, - &mut |a,b| c(s[a],s[b])); - if pivotsub != rng.start { s.swap(rng.start,pivotsub); }; + let pivotsub = middling( + rng.start, + rng.start + 1, + rng.end - 2, + rng.end - 1, + &mut |a, b| c(s[a], s[b]), + ); + if pivotsub != rng.start { + s.swap(rng.start, pivotsub); + }; let pivotref = s[rng.start]; - let (eqsub,gtsub) = part(s, &rng, c); + let (eqsub, gtsub) = part(s, &rng, c); // well inside lt partition, iterate on it narrowing the range if need + 2 < eqsub { rng.end = eqsub; @@ -263,7 +313,7 @@ pub fn evenmedian_by<'a, T>( // penultimate place in lt partition, solution: if need + 2 == eqsub { // swapping comparison arguments to get two maxima - let (m1,m2) = min2(s, rng.start..eqsub, &mut |a, b| c(b, a)); + let (m1, m2) = min2(s, rng.start..eqsub, &mut |a, b| c(b, a)); return (m2, m1); }; // last place in the lt partition, solution: @@ -284,7 +334,7 @@ pub fn evenmedian_by<'a, T>( return min2(s, gtsub..rng.end, c); }; // inside gt partition, iterate on it, narrowing the range - rng.start = gtsub; + rng.start = gtsub; } } diff --git a/src/lib.rs b/src/lib.rs index b76f071..70dbd1b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,7 +14,7 @@ pub mod error; use crate::algos::*; use crate::error::{merror, MedError}; -use core::{fmt::Display,cmp::Ordering}; +use core::{cmp::Ordering, fmt::Display}; use indxvec::printing::{GR, UN, YL}; /// Shorthand type for medians errors with message payload specialized to String @@ -27,15 +27,38 @@ pub enum Medians<'a, T> { /// Even sized data results in a pair of (centered) medians Even((&'a T, &'a T)), } -impl std::fmt::Display for Medians<'_,T> where T:Display { +impl std::fmt::Display for Medians<'_, T> +where + T: Display, +{ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - Medians::Odd(m) => { write!( f,"{YL}odd median: {GR}{}{UN}",*m ) }, - Medians::Even((m1, m2)) => { write!(f,"{YL}even medians: {GR}{} {}{UN}",*m1,*m2) } + Medians::Odd(m) => { + write!(f, "{YL}odd median: {GR}{}{UN}", *m) + } + Medians::Even((m1, m2)) => { + write!(f, "{YL}even medians: {GR}{} {}{UN}", *m1, *m2) + } } } } +/// Median of primitive type u8 by fast radix search +pub fn medianu8(s:&[u8]) -> Result { + let n = s.len(); + match n { + 0 => return Err(merror("size", "median: zero length data")), + 1 => return Ok(s[0] as f64), + 2 => return Ok((s[0] as f64 + s[1] as f64) / 2.0), + _ => (), + }; + if (n & 1) == 1 { + Ok(oddmedianu8(s)) + } else { + Ok(evenmedianu8(s)) + } +} + /// Fast 1D medians of floating point data, plus related methods pub trait Medianf64 { /// Median of f64s, checked for NaNs @@ -95,11 +118,11 @@ impl Medianf64 for &[f64] { }) .collect::, Me>>()?; if (n & 1) == 1 { - let oddm = oddmedian_by(&mut s, &mut::total_cmp); + let oddm = oddmedian_by(&mut s, &mut ::total_cmp); Ok(*oddm) } else { - let (&med1, &med2) = evenmedian_by(&mut s, &mut::total_cmp); - Ok((med1+med2)/2.0) + let (&med1, &med2) = evenmedian_by(&mut s, &mut ::total_cmp); + Ok((med1 + med2) / 2.0) } } /// Use this when your data does not contain any NaNs. @@ -115,17 +138,17 @@ impl Medianf64 for &[f64] { }; let mut s = ref_vec(self, 0..self.len()); if (n & 1) == 1 { - let oddm = oddmedian_by(&mut s, &mut::total_cmp); + let oddm = oddmedian_by(&mut s, &mut ::total_cmp); *oddm } else { - let (&med1, &med2) = evenmedian_by(&mut s, &mut::total_cmp); - (med1+med2)/2.0 + let (&med1, &med2) = evenmedian_by(&mut s, &mut ::total_cmp); + (med1 + med2) / 2.0 } } /// Zero mean/median data produced by subtracting the centre, /// typically the mean or the median. fn medf_zeroed(self, centre: f64) -> Vec { - self.iter().map(|&s| s-centre).collect() + self.iter().map(|&s| s - centre).collect() } /// Median correlation = cosine of an angle between two zero median vectors, /// (where the two data samples are interpreted as n-dimensional vectors). @@ -146,8 +169,11 @@ impl Medianf64 for &[f64] { }) .sum(); let res = sxy / (sx2 * sy2).sqrt(); - if res.is_nan() { Err(merror("Nan", "medf_correlation: Nan result!")) } - else { Ok(res) } + if res.is_nan() { + Err(merror("Nan", "medf_correlation: Nan result!")) + } else { + Ok(res) + } } /// Data dispersion estimator MAD (Median of Absolute Differences). /// MAD is more stable than standard deviation and more general than quartiles. @@ -160,9 +186,7 @@ impl Medianf64 for &[f64] { } } -/// Medians of &mut [T]. Data will get rearranged. -/// This is generally faster than copying it unnecessarily. -/// When this is not acceptable, make sure to make your own copy first! +/// Medians of &[T] impl<'a, T> Median<'a, T> for &'a [T] { /// Median of `&[T]` by comparison `c`, quantified to a single f64 by `q`. /// When T is a primitive type directly convertible to f64, pass in `as f64` for `q`. @@ -177,7 +201,7 @@ impl<'a, T> Median<'a, T> for &'a [T] { ) -> Result { let n = self.len(); match n { - 0 => return Err(merror("size", "median_ord: zero length data")), + 0 => return Err(merror("size", "qmedian_by: zero length data")), 1 => return Ok(q(&self[0])), 2 => return Ok((q(&self[0]) + q(&self[1])) / 2.0), _ => (), @@ -190,6 +214,7 @@ impl<'a, T> Median<'a, T> for &'a [T] { Ok((q(med1) + q(med2)) / 2.0) } } + /// Median(s) of unquantifiable type by general comparison closure fn median_by(self, c: &mut impl FnMut(&T, &T) -> Ordering) -> Result, Me> { let n = self.len(); @@ -206,6 +231,7 @@ impl<'a, T> Median<'a, T> for &'a [T] { Ok(Medians::Even(evenmedian_by(&mut s, c))) } } + /// Zero mean/median data produced by subtracting the centre fn zeroed(self, centre: f64, q: impl Fn(&T) -> f64) -> Result, Me> { Ok(self.iter().map(|s| q(s) - centre).collect()) @@ -244,8 +270,11 @@ impl<'a, T> Median<'a, T> for &'a [T] { }) .sum(); let res = sxy / (sx2 * sy2).sqrt(); - if res.is_nan() { Err(merror("Nan", "correlation: Nan result!")) } - else { Ok(res) } + if res.is_nan() { + Err(merror("Nan", "correlation: Nan result!")) + } else { + Ok(res) + } } /// Data dispersion estimator MAD (Median of Absolute Differences). /// MAD is more stable than standard deviation and more general than quartiles. diff --git a/src/oldalgos.rs b/src/oldalgos.rs index cc98606..c856594 100644 --- a/src/oldalgos.rs +++ b/src/oldalgos.rs @@ -136,132 +136,6 @@ pub fn minmax( } */ -/// Partitions by pivot, in a single pass, mutable set `s` within `rng`. -/// s[rng.end-1] is used as the ref of the pivot. Any prior pivot -/// selection scheme must place it there. -/// The three partitions are divided by returned `(gtstart,eqstart)`, where: -/// `rng.start..gtstart` (may be empty) contains refs to items less than the pivot, -/// `gtstart..eqstart` (may be empty) contains refs to items greater than the pivot, -/// `eqstart..rng.end` contains zero or more undefined refs, ending with the pivot. -/// The total number of items equal to the pivot is: `rng.end-eqstart` > 0. -pub fn part( - s: &mut [&T], - rng: &Range, - c: &mut impl FnMut(&T, &T) -> Ordering, -) -> (usize, usize) { - let pivot = s[rng.end - 1]; - let mut ltlast = rng.start; - let mut gtstart = rng.end - 2; - let mut eqstart = rng.end - 1; - loop { - match c(pivot, s[gtstart]) { - Less => { - if ltlast == gtstart { - return (gtstart, eqstart); - }; - gtstart -= 1; - continue; - } - Equal => { - eqstart -= 1; - s[gtstart] = s[eqstart]; // shifts gt range one down - if ltlast == gtstart { - return (ltlast, eqstart); - }; - gtstart -= 1; - continue; - } - Greater => (), // *s[gtstart] < *pivot - }; - 'lt: loop { - match c(s[ltlast], pivot) { - Less => { - if gtstart == ltlast { - return (gtstart + 1, eqstart); - }; - ltlast += 1; - continue 'lt; - } - Equal => { - s[ltlast] = s[gtstart]; // making ltlast lt - eqstart -= 1; - s[gtstart] = s[eqstart]; // shifts gt range one down - } - Greater => { - s.swap(ltlast, gtstart); - } - }; - ltlast += 1; - gtstart -= 1; - if gtstart < ltlast { - return (gtstart + 1, eqstart); - }; - break 'lt; - } - } -} - -/// Partitions in-place mutable set `s` within range `rng`, -/// using the given (ref of a) pivot. -/// The two partitions are divided by gtstart, where: -/// for all items x in `rng.start..gtstart`: `x < pivot`, -/// for all items y in `gtstart..gtend`: `y > pivot`, -/// `rng.end-gtend` is the number of items equal to pivot. -/// The actual values in this last subrange are undefined. -pub fn part(s: &mut [T], rng: &Range, pivot: &T, mut c: impl FnMut(&T, &T) -> Ordering) -> usize { - let mut gtstart = rng.end; - for ltend in rng.start..gtstart { - if c(pivot,&s[ltend]) == Less { - gtstart -= 1; - swap(&s[ltend],&s[gtstart]); - }; - }; - gtstart -} - -/// Partitions in a single pass mutable set `s` within range `rng`, -/// into three non empty subranges. Uses middle pair of a sample of four data values -/// as `min_pivot` and `max_pivot`, moved to the last two positions. -/// The partitions are divided by `(gtstart,midstart)`, where: -/// for all items x in `rng.start..gtstart`: `x < min_pivot`, -/// for all items y in `gtstart..midstart`: `y > max_pivot`, -/// for all items z in midstart..rng.end `min_pivot <= z <= max_pivot`. -pub fn part(s: &mut [T], rng: &Range, mut c: impl FnMut(&T, &T) -> Ordering) - -> (usize, usize) { - let mut midstart = rng.end-2; - let mut ltend = rng.start; - let mut gtstart = midstart-1; - // sort in place four samples of data - insert_sort(s, &[ ltend, midstart, midstart+1, gtstart ], c); - let mut minpivot = &s[midstart]; - let mut maxpivot = &s[midstart+1]; - loop { - gtstart -= 1; - if gtstart < ltend { return (ltend,midstart); }; - if c(maxpivot,&s[gtstart]) == Less { continue; }; - if c(&s[gtstart],minpivot) != Less { - midstart -= 1; - s.swap(gtstart,midstart); - continue; - }; - // s[gtstart] < minpivot - loop { - ltend += 1; - if gtstart < ltend { return (ltend,midstart); }; - if c(&s[ltend],minpivot) == Less { continue; }; - if c(maxpivot,&s[ltend]) == Less { - s.swap(ltend,gtstart); - } else { - let savedmidval = s[ltend]; - s[ltend] = s[gtstart]; - midstart -= 1; - s[gtstart] = s[midstart]; - s[midstart] = savedmidval; - }; - break; - }; - }; -} /// Finds the item at sort index k using the heap method /// To find the median, use k = self.len()/2 @@ -285,213 +159,3 @@ fn strict_even(&self, k:usize) -> Result<(&T, &T),Me> return Err(merror("other","strict_even: failed to peek smallest_k heap")); }; Ok((m2,m1)) } - -/// Fast partial pivoting. -/// Reorders mutable set within the given range so that all items less than or equal to pivot come first, -/// followed by items greater than or equal to pivot. -pub fn spart(s: &mut [T], mut ltsub: usize, mut gtsub: usize, pivot: &T) -> usize -where - T: PartialOrd, -{ - gtsub -= 1; - loop { - while s[ltsub] <= *pivot { - ltsub += 1; - if ltsub > gtsub { - return ltsub; - }; - } - while s[gtsub] >= *pivot { - gtsub -= 1; - if gtsub <= ltsub { - return ltsub; - }; - } - s.swap(ltsub, gtsub); - ltsub += 1; - gtsub -= 1; - if gtsub <= ltsub { - return ltsub; - }; - } -} - -/// Pivoting: reorders mutable set s within ltsub..gtsub so that all items equal to pivot come first. -/// Can be used after `part` on geset for total partition: `[ltset,eqset,gtset]` -pub fn eqpart(s: &mut [T], mut ltsub: usize, mut gtsub: usize, pivot: &T) -> usize -where - T: PartialOrd, -{ - gtsub -= 1; - assert!(ltsub < gtsub); - loop { - while s[ltsub] == *pivot { - ltsub += 1; - if ltsub > gtsub { - return ltsub; - }; - } - while s[gtsub] != *pivot { - gtsub -= 1; - if gtsub == ltsub { - return gtsub; - }; - } - s.swap(ltsub, gtsub); - ltsub += 1; - gtsub -= 1; - if ltsub >= gtsub { - return ltsub; - }; - } -} - -fn fmin(s: &[f64], rng: Range) -> f64 { - let mut min = s[rng.start]; - for &si in s.iter().take(rng.end).skip(rng.start + 1) { - if si < min { - min = si; - }; - } - min -} - -fn fmin2(s: &[f64], rng: Range) -> f64 { - let mut min1 = s[rng.start]; - let mut min2 = min1; - for &si in s.iter().take(rng.end).skip(rng.start + 1) { - if si < min1 { - min2 = min1; - min1 = si; - } else if si < min2 { - min2 = si; - } - } - (min1 + min2) / 2.0 -} - -fn fmax2(s: &[f64], rng: Range) -> f64 { - let mut max1 = s[rng.start]; - let mut max2 = max1; - for &si in s.iter().take(rng.end).skip(rng.start + 1) { - if si > max1 { - max2 = max1; - max1 = si; - } else if si > max2 { - max2 = si; - } - } - (max1 + max2) / 2.0 -} - -fn fmax(s: &[f64], rng: Range) -> f64 { - let mut max = s[rng.start]; - for &si in s.iter().take(rng.end).skip(rng.start + 1) { - if si > max { - max = si; - }; - } - max -} - -/// Median of an odd sized set is the central value. -/// Need is the subscript of the item required. -/// For median, it should be the midpoint. -pub fn med_odd(set: &mut [f64]) -> f64 { - let mut firsttime = true; - let mut max = 0_f64; - let mut min = 0_f64; - let n = set.len(); - let mut rngstart = 0_usize; - let mut rngend = n; - let need = n / 2; - let mut pivot = set.iter().sum::() / (n as f64); // initially the mean - println!(); - loop { - let (gtsub, eq) = fpart(set, rngstart, rngend, pivot); - print!("{gtsub} "); - if gtsub == rngend { - return set[rngend - 1]; - }; - if gtsub == rngstart { - return set[rngstart]; - }; - if need < gtsub { - rngend = gtsub; - if need + 1 == gtsub { - return fmax(set, rngstart..rngend); - }; - max = pivot; - if firsttime { - min = fmin(set, rngstart..rngend); - firsttime = false; - }; - } else { - rngstart = gtsub; - if need < gtsub + eq { - return pivot; - }; // in equal set - min = pivot; - if firsttime { - max = fmax(set, rngstart..rngend); - firsttime = false; - }; - }; - pivot = min + (max - min) * ((need - rngstart) as f64) / ((rngend - rngstart) as f64); - } -} - -/// Median of an even sized set is half of the sum of the two central values. -pub fn med_even(set: &mut [f64]) -> f64 { - let mut firsttime = true; - let mut max = 0_f64; - let mut min = 0_f64; - let n = set.len(); - let mut rngstart = 0_usize; - let mut rngend = n; - let need = n / 2 - 1; - let mut pivot = set.iter().sum::() / (n as f64); // initially the mean - loop { - // print!("{pivot} "); - let (gtsub, eq) = fpart(set, rngstart, rngend, pivot); - if need < gtsub { - //rngend = gtsub; - if rngend == gtsub { - return pivot; - } else { - rngend = gtsub; - } - if need + 2 == gtsub { - return fmax2(set, rngstart..rngend); - }; - if need + 1 == gtsub { - return (fmax(set, rngstart..rngend) + fmin(set, rngstart..rngend)) / 2.; - }; - max = pivot; - if firsttime { - min = fmin(set, rngstart..rngend); - firsttime = false; - }; - } else { - if need + 1 < gtsub + eq { - return pivot; - }; // in equal set - //rngstart = gtsub; - if rngstart == gtsub { - return pivot; - } else { - rngstart = gtsub; - }; - if need == gtsub + eq { - return fmin2(set, rngstart..rngend); - } - min = pivot; - if firsttime { - max = fmax(set, rngstart..rngend); - firsttime = false; - }; - }; - pivot = min + (max - min) * ((need - rngstart) as f64) / ((rngend - rngstart) as f64); - // pivot = (max+min)/2.; - } -} diff --git a/tests/tests.rs b/tests/tests.rs index bf5d8a0..4f01028 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -4,7 +4,7 @@ use indxvec::{here, printing::*, Indices, Mutops, Printing, Vecops}; use medians::algos::{scrub_nans, to_u64s, to_f64s, qbalance, part, ref_vec, min, min2, isort_refs}; // use medians::algosf64::partord; -use medians::{Me, Median, Medianf64}; +use medians::{Me, medianu8, Median, Medianf64}; use ran::{generators::*, *}; // use std::io::{stdout,Write}; use std::convert::From; @@ -70,7 +70,7 @@ fn text() { #[test] fn medf64() { let v = [ - 10., 18., 17., 16., 15., 14., 1., 2., 3., 4., 5., 6., 7., 8., 17., 10., 11., 12., 13., 14., 15., 16., 18., 9. + 9., 10., 18., 17., 16., 15., 14., 1., 2., 3., 4., 5., 6., 7., 8., 17., 10., 11., 12., 13., 14., 15., 16., 18., 9. ]; println!("Data: {}",v.gr()); let len = v.len(); @@ -83,7 +83,7 @@ fn medf64() { &mut ::total_cmp, ); println!( - "Result: {}\nCommas show the subranges:\n\ + "Result: {}\nCommas separate the subranges:\n\ {GR}[{}, {}, {}]{UN}\n{} items equal to the pivot {}", (eqsub,gtsub).yl(), vr[0..eqsub].to_plainstr(), @@ -110,59 +110,68 @@ fn correlation() -> Result<(), Me> { fn errors() -> Result<(), Me> { let n = 10_usize; // number of vectors to test for each magnitude // set_seeds(33333); - let rv = Rnum::newf64(); + let rv = Rnum::newu8(); for d in [10, 50, 100, 1000, 10000, 100000] { let mut error = 0_i64; trait Eq: PartialEq {} impl Eq for f64 {} for _ in 0..n { - let v = rv.ranv(d).expect("Random vec genertion failed").getvf64()?; // random vector - let med = v - .as_slice() - .medf_unchecked(); - error += qbalance(&v, &med, |&f| f); + let v = rv.ranv(d).expect("Random vec genertion failed").getvu8()?; // random vector + let med = medianu8(&v)?; + // v + //.as_slice() + //.medf_unchecked(); + error += qbalance(&v, &med, |&f| f as f64); } println!("Even length {GR}{d}{UN}, repeats: {GR}{n}{UN}, errors: {GR}{error}{UN}"); error = 0_i64; - for _ in 0..n { - let v = rv.ranv(d + 1).expect("Random vec genertion failed").getvf64()?; // random vector - let med = v - .as_slice() - .medf_unchecked(); - error += qbalance(&v, &med, |&f| f); + let v = rv.ranv(d + 1).expect("Random vec genertion failed").getvu8()?; // random vector + let med = medianu8(&v)?; + // v + // .as_slice() + // .medf_unchecked(); + error += qbalance(&v, &med, |&f| f as f64); } println!( - "Odd length {GR}{}{UN}, repeats: {GR}{}{UN}, errors: {GR}{}{UN}", - d + 1, - n, - error + "Odd length {GR}{}{UN}, repeats: {GR}{n}{UN}, errors: {GR}{error}{UN}", + d + 1 ); } Ok(()) } const NAMES: [&str; 4] = [ - "medf_unchecked", - "medf_checked", + // "medf_unchecked", + "qmedian", "median_by", - "mutisort" + "mutisort", + "medianu8" ]; -const CLOSURESF64: [fn(&[f64]); 4] = [ - |v: &[_]| { - v.medf_unchecked(); - }, +const CLOSURESU8: [fn(&[u8]); 4] = [ +// |v: &[_]| { +// v.medf_unchecked(); +// }, +// |v: &[_]| { +// v.medf_checked().expect("median closure failed"); + // }, + |v: &[_]| { - v.medf_checked().expect("median closure failed"); - }, + v.qmedian_by(&mut ::cmp,|&x| x as f64) + .expect("even median closure failed"); + }, |v: &[_]| { - v.median_by(&mut ::total_cmp) + v.median_by(&mut ::cmp) .expect("even median closure failed"); }, |v: &[_]| { let mut vm = v.to_owned(); - vm.mutisort( 0..v.len(), |a:&f64,b:&f64| a.total_cmp(b)); + vm.mutisort( 0..v.len(), |a:&u8,b| a.cmp(b));// |a:&f64,b:&f64| a.total_cmp(b)); + }, + |v: &[_]| { + medianu8(v) + .expect("medianu8 closure failed"); }, ]; @@ -170,5 +179,5 @@ const CLOSURESF64: [fn(&[f64]); 4] = [ fn comparison() { // set_seeds(0); // intialise random numbers generator // Rnum encapsulates the type of random data to be generated - benchf64(Rnum::newf64(), 2..5000, 100, 10, &NAMES, &CLOSURESF64); + benchu8(Rnum::newu8(), 2..5000, 100, 10, &NAMES, &CLOSURESU8); }