From abd758653c02a18e5b6d5bdbd8cbdcf7a0ff0433 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 13 Oct 2023 14:37:08 +0200 Subject: [PATCH] perf: rechunk before grouping on multiple keys (#11711) --- crates/polars-plan/src/dsl/string.rs | 2 +- .../polars-plan/src/logical_plan/optimizer/delay_rechunk.rs | 5 +++-- crates/polars-sql/src/functions.rs | 6 +++--- py-polars/src/expr/string.rs | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index 1fdebe23f676..7e74806d7f31 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -395,7 +395,7 @@ impl StringNameSpace { } /// Slice the string values. - pub fn str_slice(self, start: i64, length: Option) -> Expr { + pub fn slice(self, start: i64, length: Option) -> Expr { self.0 .map_private(FunctionExpr::StringExpr(StringFunction::Slice( start, length, diff --git a/crates/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs b/crates/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs index 14f004f32916..6c3705ca93ee 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/delay_rechunk.rs @@ -25,8 +25,9 @@ impl OptimizationRule for DelayRechunk { match lp_arena.get(node) { // An aggregation can be partitioned, its wasteful to rechunk before that partition. #[allow(unused_mut)] - ALogicalPlan::Aggregate { input, .. } => { - if !self.processed.insert(node.0) { + ALogicalPlan::Aggregate { input, keys, .. } => { + // Multiple keys on multiple chunks is much slower, so rechunk. + if !self.processed.insert(node.0) || keys.len() > 1 { return None; }; diff --git a/crates/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs index c605271b9f4e..70f9cabe7669 100644 --- a/crates/polars-sql/src/functions.rs +++ b/crates/polars-sql/src/functions.rs @@ -683,7 +683,7 @@ impl SqlFunctionVisitor<'_> { EndsWith => self.visit_binary(|e, s| e.str().ends_with(s)), InitCap => self.visit_unary(|e| e.str().to_titlecase()), Left => self.try_visit_binary(|e, length| { - Ok(e.str().str_slice(0, match length { + Ok(e.str().slice(0, match length { Expr::Literal(LiteralValue::Int64(n)) => Some(n as u64), _ => { polars_bail!(InvalidOperation: "Invalid 'length' for Left: {}", function.args[1]); @@ -729,7 +729,7 @@ impl SqlFunctionVisitor<'_> { StartsWith => self.visit_binary(|e, s| e.str().starts_with(s)), Substring => match function.args.len() { 2 => self.try_visit_binary(|e, start| { - Ok(e.str().str_slice(match start { + Ok(e.str().slice(match start { Expr::Literal(LiteralValue::Int64(n)) => n, _ => { polars_bail!(InvalidOperation: "Invalid 'start' for Substring: {}", function.args[1]); @@ -737,7 +737,7 @@ impl SqlFunctionVisitor<'_> { }, None)) }), 3 => self.try_visit_ternary(|e, start, length| { - Ok(e.str().str_slice( + Ok(e.str().slice( match start { Expr::Literal(LiteralValue::Int64(n)) => n, _ => { diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index 85ed5a996a53..41c39099b8ee 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -91,7 +91,7 @@ impl PyExpr { } fn str_slice(&self, start: i64, length: Option) -> Self { - self.inner.clone().str().str_slice(start, length).into() + self.inner.clone().str().slice(start, length).into() } fn str_explode(&self) -> Self {