From bd1ec47d712e1bcac445d7d3e895c388a8e687e3 Mon Sep 17 00:00:00 2001 From: L <457124+liborty@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:18:57 +1000 Subject: [PATCH] 3.0.12 --- Cargo.toml | 4 ++-- README.md | 30 +++++++++++------------ src/algos.rs | 32 ++++--------------------- src/lib.rs | 4 ++-- tests/tests.rs | 64 +++++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 77 insertions(+), 57 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3ebe204..f5dbc94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "medians" -version = "3.1.0" +version = "3.0.12" authors = ["Libor Spacek"] edition = "2021" description = "Median, Statistical Measures, Mathematics, Statistics" @@ -26,7 +26,7 @@ missing_docs = "warn" maintenance = { status = "actively-developed" } [lib] [dependencies] -indxvec = "^1.8.9" +indxvec = "^1.9.4" [dev-dependencies] ran = "2" times = "1" diff --git a/README.md b/README.md index ca152d5..fcf430c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## **by Libor Spacek** -Fast algorithm for finding medians of one dimensional data, implemented in 100% safe Rust. +Fast new algorithms for finding medians, implemented in 100% safe Rust. ```rust use medians::{*,algos::*}; @@ -12,9 +12,9 @@ use medians::{*,algos::*}; ## Introduction -Finding medians is a common task in statistics and data analysis. At least it ought to be, because median is a more stable measure of central tendency than mean. Similarly, `mad` (median of absolute differences) is a more stable measure of data spread than standard deviation, which is dominated by squared outliers. Median and `mad` are not used nearly enough mostly for practical historical reasons: they are more difficult to compute. The fast algorithms presented here provide remedy for this situation. +Finding medians is a common task in statistics and data analysis. At least it ought to be, because median is a more stable measure of central tendency than mean. 
Similarly, `mad` (median of absolute differences) is a more stable measure of data spread than standard deviation, which is dominated by squared outliers. Median and `mad` are not used nearly enough mostly for practical historical reasons: they are more difficult to compute. The fast algorithms presented here provide a remedy for this situation. -We argued in [`rstats`](https://github.com/liborty/rstats), that using the Geometric Median is the most stable way to characterise multidimensional data. The one dimensional case is addressed here. +We argued in [`rstats`](https://github.com/liborty/rstats), that using the Geometric Median is the most stable way to characterise multidimensional data. The one dimensional case is addressed in this crate. See [`tests.rs`](https://github.com/liborty/medians/blob/main/tests/tests.rs) for examples of usage. Their automatically generated output can also be found by clicking the 'test' icon at the top of this document and then examining the latest log. @@ -27,26 +27,26 @@ Best methods/functions to be deployed, depending on the end type of data (i.e. t - `f64` -> methods of trait Medianf64 - `T` custom quantifiable to u64 -> method `uqmedian` of trait `Median` - `T` custom comparable by `c` -> method `qmedian_by` of trait `Median` -- `T` custom comparable but not quantifiable -> method `median_by` of trait `Median`. +- `T` custom comparable but not quantifiable -> general method `median_by` of trait `Median`. ## Algorithms Analysis -Short primitive types are best dealt with by radix search. We have implemented it for `u8`: +Short primitive types are best dealt with by radix search. 
We have implemented it for `u8` and for `u64`: ```rust -/// Median of primitive type u8 by fast radix search -pub fn medianu8(s: &[u8]) -> Result, Me> +/// Medians of u8 end type by fast radix search +pub fn medianu8(s: &[u8]) -> Result, Me>; +/// Medians of u64 end type by fast recursive radix search +pub fn medu64(s: &mut [u64]) -> Result<(u64, u64), Me>; ``` -More complex data types require general comparison search, see `median_by`. Median can be found naively by sorting the list of data and then picking its midpoint. The best comparison sort algorithms have complexity `O(n*log(n))`. However, faster median algorithms with complexity `O(n)` are possible. They are based on the observation that data need to be all sorted, only partitioned and counted off. Therefore, the naive sort method can not compete and has been deleted as of version 2.0.0. +More complex data types require general comparison search, see `median_by`. Median can be found naively by sorting the list of data and then picking its midpoint. The best comparison sort algorithms have complexity `O(n*log(n))`. However, faster median algorithms with complexity `O(n)` are possible. They are based on the observation that data need not be fully sorted, only partitioned and counted off. Therefore, the naive sort method can not compete and has been deleted as of version 2.0.0. -Currently considered to be the 'state of the art' comparison algorithm is Floyd-Rivest (1975): Median of Medians. This divides the data into groups of five items, finds median of each group by sort, then finds medians of five of these medians, and so on, until only one remains. This is then used as the pivot for partitioning of the original data. Such pivot will produce good partitioning, though not perfect. Counting off and iterating is still necessary. +Floyd-Rivest (1975): Median of Medians is currently considered to be 'the state of the art' comparison algorithm. 
It divides the data into groups of five items, finds median of each group by sort, then finds medians of five of these medians, and so on, until only one remains. This is then used as the pivot for partitioning of the original data. Such pivot will produce good partitioning, though not perfect halving. Counting off and iterating is therefore still necessary. -However, finding the best pivot estimate is not the main objective. The real objective is to eliminate (count off) eccentric data items as fast as possible. Therefore, the expense of estimating the pivot is highly relevant. It is possible to use less optimal pivots, yet to find the medians faster on average. In any case, efficient partitioning is a must. +Finding the best possible pivot estimate is not the main objective. The real objective is to eliminate (count off) eccentric data items as fast as possible, overall. Therefore, the time spent estimating the pivot has to be taken into account. It is possible to settle for less optimal pivots, yet to find the medians faster on average. In any case, efficient partitioning is a must. -Let our average ratio of items remaining after one partitioning be `rs` and the Floyd-Rivest's be `rf`. Typically, `1/2 <= rf <= rs < 1`, i.e. `rf` is more optimal, being nearer to the perfect partitioning ratio of `1/2`. However, suppose that we can perform two partitions in the time it takes Floyd-Rivest to do one (because of their expensive pivot selection process). Then it is enough for better performance that `rs^2 < rf`, which is perfectly possible and seems to be born out in practice. For example, `rf=0.65` (nearly optimal), `rs=0.8` (deeply suboptimal), yet `rs^2 < rf`. - -Nonetheless, on large datasets, we do devote some of the overall computational effort to pivot selection. +Let our average ratio of items remaining after one partitioning be `rs` and the Floyd-Rivest's be `rf`. Typically, `1/2 <= rf <= rs < 1`, i.e. 
`rf` is more optimal, being nearer to the perfect halving (ratio of `1/2`). Suppose that we can perform two partitions in the time it takes Floyd-Rivest to do one (because of their slow pivot selection). Then it is enough for better performance that `rs^2 < rf`, which is perfectly possible and seems to be born out in practice. For example, `rf=0.65` (nearly optimal), `rs=0.8` (deeply suboptimal), yet `rs^2 < rf`. Nonetheless, some computational effort devoted to the pivot selection, proportional to the data length, is worth it. We introduce another new algorithm, implemented as function `medianu64`: @@ -140,9 +140,9 @@ pub trait Median<'a, T> { ## Release Notes -**Version 3.1.0** - Adding faster `medu64`, even variant is still work in progress. +**Version 3.0.12** - Adding faster `medu64`, even variant is still work in progress. Fixed a bug. -**Version 3.0.11** - Added method `uqmedian` to trait `Median` for types quantifiable to `u64` by some closure `q`. Fixed a recent bug in `oddmedian_by`, whereby the pivot reference was not timely saved. Sorry about that. This was affecting results where the pivot coincident with the median was guessed early on. +**Version 3.0.11** - Added method `uqmedian` to trait `Median` for types quantifiable to `u64` by some closure `q`. Fixed a recent bug in `oddmedian_by`, whereby the pivot reference was not timely saved. **Version 3.0.10** - Added `medianu64`. It is faster on u64 data than the general purpose `median_by`. It is using a new algorithm that partitions by bits, thus avoiding the complexities of pivot estimation. diff --git a/src/algos.rs b/src/algos.rs index 520f0c1..074c1c9 100644 --- a/src/algos.rs +++ b/src/algos.rs @@ -17,28 +17,6 @@ pub fn tobebytes(v: &[u64]) -> Vec { } */ -/// Partitions `s: &mut [u64]` within range `rng`, using bitmask. 
-/// Returns the boundary of the rearranged partitions gtstart, where -/// `rng.start..gtstart` (may be empty) contains items with zero bit(s) corresponding to bitmask, -/// `gtstart..rng.end` (may be empty) contains items with one (or more) set bit(s). -pub fn part_binary(s: &mut [u64], rng: &Range<usize>, bitmask: u64) -> usize { - let mut gtstart = rng.start; - for lt in s.iter().take(rng.end).skip(rng.start) { - if (lt & bitmask) == 0 { - gtstart += 1; - } else { - break; - }; - } - for i in gtstart + 1..rng.end { - if (s[i] & bitmask) == 0 { - s.swap(gtstart, i); - gtstart += 1; - }; - } - gtstart -} - /// index of middle valued ref of three, using at most three comparisons { pub fn midof3( s: &[T], @@ -275,7 +253,7 @@ pub fn oddmedianu64(s: &mut [u64]) -> &u64 { let need = s.len() / 2; // median target position in fully partitioned let mut bitval = FIRST_BIT; // set the most significant bit loop { - let gtsub = part_binary(s, &rng, bitval); + let gtsub = s.part_binary(&rng, bitval); if bitval == 1 { // termination of bit iterations: same values left if need < gtsub { @@ -318,13 +296,13 @@ pub fn evenmedianu64(s: &mut [u64]) -> (&u64, &u64) { let need = s.len() / 2 - 1; // first median target position let mut bitval = FIRST_BIT; // set the most significant bit loop { - let gtsub = part_binary(s, &rng, bitval); + let gtsub = s.part_binary(&rng, bitval); if bitval == 1 { // termination of bit iterations: same values left - if need < gtsub - 1 { + if need + 1 < gtsub { return (&s[gtsub - 2], &s[gtsub - 1]); }; - if need == gtsub - 1 { + if need + 1 == gtsub { return (&s[gtsub - 1], &s[gtsub]); }; return (&s[gtsub], &s[gtsub + 1]); @@ -411,7 +389,7 @@ pub(super) fn oddmedu64(bytes: &[[u8; 8]], byteno: usize, need: usize) -> u64 { } /* -/// Median of even sized u64 data (recursive) +/// Median of even sized u64 data (recursive) - work in progress pub(super) fn evenmedu64(bytes: &[[u8;8]], byteno:usize, need:usize) -> (u64, u64) { let n = bytes.len(); 
assert_ne!(n,0,"evenmedu64 failed to find median"); diff --git a/src/lib.rs b/src/lib.rs index edc93ae..0991cc7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,7 +48,7 @@ pub enum Medians<'a, T> { Even((&'a T, &'a T)), } -/// Fast medians of u8 end type by fast radix search +/// Medians of u8 end type by fast radix search pub fn medianu8(s: &[u8]) -> Result<(u8, u8), Me> { let n = s.len(); if n == 0 { @@ -91,7 +91,7 @@ pub fn medianu64(s: &mut [u64]) -> Result, Me> { } } -/// Fast medians of u64 end type by radix search +/// Medians of u64 end type by fast recursive radix search pub fn medu64(s: &mut [u64]) -> Result<(u64, u64), Me> { if (s.len() & 1) == 1 { let byteslice = s.iter().map(|x| x.to_be_bytes()).collect::>(); diff --git a/tests/tests.rs b/tests/tests.rs index ea5eb8c..2bb587a 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -14,12 +14,48 @@ fn partbin() -> Result<(), Me> { let mut data = [257_u64,9,8,7,6,5,4,3,2,1]; println!("Data: {}",data.gr()); let n = data.len(); - let gtsub = part_binary(&mut data, &(0..n), 3); + let gtsub = data.part_binary( &(0..n), 3); println!("Partitioned by bit 3: {},{}",data[..gtsub].gr(),data[gtsub..].gr()); println!("Median: {}",evenmedianu64(&mut data).gr()); Ok(()) } +#[test] +fn ftest() -> Result<(), Me> { +let a = vec![ + 100.0, + 163.6170150950381, + 224.6127142531872, + 239.91368100304916, + 345.1674002412572, + 402.88833594261706, + 423.6406741377381, + 472.6292699764225, + 487.23306678749594, + 490.94434592125606, + 511.16658896980687, + 516.3472076946555, + 523.052566308903, + 563.6784311991111, + 586.7283185517608, + 633.5580942760708, + 678.4956618813414, + 708.2452516626092, + 741.9710552209048, + 763.476192474483, + 768.6249939324011, + 777.1952444919513, + 785.2192860329102, + 785.3178558989187, + 858.0319001781837, + 927.4228569429413, + 952.453888947949, + 1067.6089037099757, +]; +eprintln!("Median: {} ", a.medf_unchecked()); +Ok(()) +} + #[test] fn parting() -> Result<(), Me> { let data = [ @@ -107,10 
+143,13 @@ fn medf64() -> Result<(), Me> { let median = v.medf_checked()?; let mad = v.madf(median); println!("Median±mad: {GR}{}±{}{UN}", median, mad); + println!("Mean: {GR}{}{UN}", v.iter().sum::()/(len as f64)); println!( "Weighted median: {GR}{}{UN} ", - v.medf_weighted(&weights, 0.0001)? + v.medf_weighted(&weights, 0.00001)? ); + let prodsum:f64 = v.iter().zip(weights.iter()).map(|(x,w)| x*w ).sum(); + println!("Weighted mean: {GR}{}{UN}", prodsum/weights.iter().sum::()); Ok(()) } @@ -157,9 +196,12 @@ fn errors() -> Result<(), Me> { Ok(()) } -const NAMES: [&str; 4] = ["median_by","medf_checked","uqmedian","medu64"]; +#[test] +fn comparison() { +println!("Comparison tests running, please wait...."); +const NAMES: [&str; 5] = ["median_by","medf_unchecked","uqmedian","medianu64","medu64"]; -const CLOSURESU64: [fn(&mut [u64]); 4] = [ +const CLOSURESU64: [fn(&mut [u64]); 5] = [ |v: &mut [_]| { v.median_by(&mut ::cmp) .expect("median_by closure failed"); @@ -167,8 +209,8 @@ const CLOSURESU64: [fn(&mut [u64]); 4] = [ |v: &mut [_]| { let vf:Vec = v.iter().map(|&x| x as f64).collect(); - vf.medf_checked() - .expect("medf_checked found NaN"); + vf.medf_unchecked(); + // .expect("medf_checked found NaN"); }, |v: &mut [_]| { // already in u64, so using identity quantifier @@ -176,6 +218,11 @@ const CLOSURESU64: [fn(&mut [u64]); 4] = [ .expect("uqmedian error"); }, + |v: &mut [_]| { + medianu64(v) + .expect("uqmedian error"); + }, + |v: &mut [_]| { medu64(v) .expect("uqmedian error"); @@ -201,10 +248,5 @@ const CLOSURESU64: [fn(&mut [u64]); 4] = [ } */ ]; - -#[test] -fn comparison() { - // set_seeds(0); // intialise random numbers generator - // Rnum encapsulates the type of random data to be generated mutbenchu64(100000..100010, 1, 10, &NAMES, &CLOSURESU64); }