Skip to content

Commit

Permalink
Doc gen: Migrate aggregate functions doc to attribute based. (#13646)
Browse files Browse the repository at this point in the history
* Doc gen: Migrate aggregate functions doc to attribute based.
  • Loading branch information
comphead authored Dec 5, 2024
1 parent 2ac8af8 commit b312ac1
Show file tree
Hide file tree
Showing 28 changed files with 689 additions and 725 deletions.
150 changes: 81 additions & 69 deletions datafusion-cli/Cargo.lock

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions datafusion/core/src/bin/print_functions_docs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,29 @@ fn print_window_docs() -> Result<String> {
print_docs(providers, window_doc_sections::doc_sections())
}

// Temporary helper used to semi automate the migration of UDF documentation
// generation from code based to attribute based.
// To be removed once the migration is done.
//
// Renders `documentation` with `Documentation::to_doc_attribute` and stores the
// resulting text at the path `<name>.txt`, replacing whatever that file
// contained before.
//
// Panics if the file cannot be written.
fn save_doc_code_text(documentation: &Documentation, name: &str) {
    let attr_text = documentation.to_doc_attribute();
    let file_path = format!("{}.txt", name);

    // `fs::write` creates the file when it is missing and truncates it when it
    // already exists, so no separate exists/remove/append sequence is needed
    // (and there is no window between the existence check and the removal).
    std::fs::write(&file_path, attr_text.as_bytes()).unwrap_or_else(|err| {
        panic!("failed to write doc attribute text to {}: {}", file_path, err)
    });
}

fn print_docs(
providers: Vec<Box<dyn DocProvider>>,
doc_sections: Vec<DocSection>,
Expand Down Expand Up @@ -158,6 +181,9 @@ fn print_docs(
unreachable!()
};

// Temporary for doc gen migration, see `save_doc_code_text` comments
save_doc_code_text(documentation, &name);

// first, the name, description and syntax example
let _ = write!(
docs,
Expand Down
84 changes: 84 additions & 0 deletions datafusion/doc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,90 @@ impl Documentation {
) -> DocumentationBuilder {
DocumentationBuilder::new(doc_section, description, syntax_example)
}

/// Output the `Documentation` struct in form of a custom `#[user_doc(...)]`
/// Rust documentation attribute.
///
/// It is useful to semi automate the migration of UDF documentation
/// generation from code based to attribute based and can be safely removed after.
///
/// Entries are emitted in this order: `doc_section`, `description`,
/// `syntax_example`, `sql_example` (only when set), every `standard_argument`,
/// every plain `argument`, every `alternative_syntax`, every `related_udf`.
///
/// NOTE(review): all values are interpolated verbatim. A value containing `"`
/// (or `"#` inside the SQL example) produces text that has to be fixed by hand
/// before it is valid Rust.
pub fn to_doc_attribute(&self) -> String {
    // Text shared by every "standard" argument description; whatever precedes
    // it in the description is treated as the argument's prefix.
    let st_arg_token = " expression to operate on. Can be a constant, column, or function, and any combination of operators.";

    let mut result = String::from("#[user_doc(");

    // Doc Section
    let include_entry = if self.doc_section.include {
        ""
    } else {
        "include = \"false\", "
    };
    let section_description = self
        .doc_section
        .description
        .map(|s| format!(", description = \"{}\"", s))
        .unwrap_or_default();
    result.push_str(&format!(
        "\n doc_section({}label = \"{}\"{}),",
        include_entry, self.doc_section.label, section_description
    ));

    // Description
    result.push_str(&format!("\n description=\"{}\",", self.description));

    // Syntax Example
    result.push_str(&format!("\n syntax_example=\"{}\",", self.syntax_example));

    // SQL Example
    if let Some(sql) = &self.sql_example {
        result.push_str(&format!("\n sql_example = r#\"{}\"#,", sql));
    }

    // Standard Arguments
    if let Some(args) = &self.arguments {
        for (name, value) in args.iter() {
            if !value.contains(st_arg_token) {
                continue;
            }
            let prefix = value.replace(st_arg_token, "");
            // The prefix is omitted only when it is the bare article "The",
            // presumably the default used when no explicit prefix was given
            // (TODO confirm against `with_standard_argument`). The check is
            // made on the description: the previous `name.starts_with("The ")`
            // test inspected the argument name instead.
            if prefix == "The" {
                result.push_str(&format!(
                    "\n standard_argument(\n name = \"{}\"),",
                    name
                ));
            } else {
                result.push_str(&format!(
                    "\n standard_argument(\n name = \"{}\",\n prefix = \"{}\"\n ),",
                    name, prefix
                ));
            }
        }
    }

    // Arguments
    if let Some(args) = &self.arguments {
        for (name, value) in args.iter() {
            if !value.contains(st_arg_token) {
                result.push_str(&format!(
                    "\n argument(\n name = \"{}\",\n description = \"{}\"\n ),",
                    name, value
                ));
            }
        }
    }

    // Alternative syntax
    if let Some(alt_syntax) = &self.alternative_syntax {
        for syntax in alt_syntax.iter() {
            result.push_str(&format!("\n alternative_syntax = \"{}\",", syntax));
        }
    }

    // Related UDFs
    if let Some(related_udf) = &self.related_udfs {
        for udf in related_udf.iter() {
            result.push_str(&format!("\n related_udf(name = \"{}\"),", udf));
        }
    }

    result.push_str("\n)]");

    result
}
}

#[derive(Debug, Clone, PartialEq)]
Expand Down
2 changes: 2 additions & 0 deletions datafusion/functions-aggregate/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ ahash = { workspace = true }
arrow = { workspace = true }
arrow-schema = { workspace = true }
datafusion-common = { workspace = true }
datafusion-doc = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-functions-aggregate-common = { workspace = true }
datafusion-macros = { workspace = true }
datafusion-physical-expr = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
half = { workspace = true }
Expand Down
39 changes: 18 additions & 21 deletions datafusion/functions-aggregate/src/approx_distinct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,19 @@ use datafusion_common::ScalarValue;
use datafusion_common::{
downcast_value, internal_err, not_impl_err, DataFusionError, Result,
};
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_doc::DocSection;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
};
use datafusion_macros::user_doc;
use std::any::Any;
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
use std::marker::PhantomData;
use std::sync::OnceLock;

make_udaf_expr_and_func!(
ApproxDistinct,
approx_distinct,
Expand Down Expand Up @@ -243,6 +245,20 @@ impl Default for ApproxDistinct {
}
}

/// APPROX_DISTINCT aggregate expression
///
/// Backs the SQL function `approx_distinct(expression)`. As stated in its
/// `user_doc` description, it returns the approximate number of distinct input
/// values calculated using the HyperLogLog algorithm.
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
syntax_example = "approx_distinct(expression)",
sql_example = r#"```sql
> SELECT approx_distinct(column_name) FROM table_name;
+-----------------------------------+
| approx_distinct(column_name) |
+-----------------------------------+
| 42 |
+-----------------------------------+
```"#,
standard_argument(name = "expression",)
)]
pub struct ApproxDistinct {
signature: Signature,
}
Expand Down Expand Up @@ -309,25 +325,6 @@ impl AggregateUDFImpl for ApproxDistinct {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_approx_distinct_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_approx_distinct_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(DOC_SECTION_APPROXIMATE, "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", "approx_distinct(expression)")
.with_sql_example(r#"```sql
> SELECT approx_distinct(column_name) FROM table_name;
+-----------------------------------+
| approx_distinct(column_name) |
+-----------------------------------+
| 42 |
+-----------------------------------+
```"#,
)
.with_standard_argument("expression", None)
.build()
})
}
42 changes: 17 additions & 25 deletions datafusion/functions-aggregate/src/approx_median.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ use arrow::{datatypes::DataType, datatypes::Field};
use arrow_schema::DataType::{Float64, UInt64};

use datafusion_common::{not_impl_err, plan_err, Result};
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_doc::DocSection;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::NUMERICS;
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
};
use datafusion_macros::user_doc;

use crate::approx_percentile_cont::ApproxPercentileAccumulator;

Expand All @@ -44,6 +45,20 @@ make_udaf_expr_and_func!(
);

/// APPROX_MEDIAN aggregate expression
///
/// Backs the SQL function `approx_median(expression)`. As stated in its
/// `user_doc` description, it returns the approximate median (50th percentile)
/// of the input values and is an alias of `approx_percentile_cont(x, 0.5)`.
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
syntax_example = "approx_median(expression)",
sql_example = r#"```sql
> SELECT approx_median(column_name) FROM table_name;
+-----------------------------------+
| approx_median(column_name) |
+-----------------------------------+
| 23.5 |
+-----------------------------------+
```"#,
standard_argument(name = "expression",)
)]
pub struct ApproxMedian {
signature: Signature,
}
Expand Down Expand Up @@ -122,29 +137,6 @@ impl AggregateUDFImpl for ApproxMedian {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_approx_median_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_approx_median_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_APPROXIMATE,
"Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",

"approx_median(expression)")
.with_sql_example(r#"```sql
> SELECT approx_median(column_name) FROM table_name;
+-----------------------------------+
| approx_median(column_name) |
+-----------------------------------+
| 23.5 |
+-----------------------------------+
```"#,
)
.with_standard_argument("expression", None)
.build()
})
}
50 changes: 25 additions & 25 deletions datafusion/functions-aggregate/src/approx_percentile_cont.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use datafusion_common::{
downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err,
DataFusionError, Result, ScalarValue,
};
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_doc::DocSection;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
use datafusion_expr::utils::format_state_name;
Expand All @@ -46,6 +46,7 @@ use datafusion_expr::{
use datafusion_functions_aggregate_common::tdigest::{
TDigest, TryIntoF64, DEFAULT_MAX_SIZE,
};
use datafusion_macros::user_doc;
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;

create_func!(ApproxPercentileCont, approx_percentile_cont_udaf);
Expand All @@ -64,6 +65,28 @@ pub fn approx_percentile_cont(
approx_percentile_cont_udaf().call(args)
}

/// APPROX_PERCENTILE_CONT aggregate expression
///
/// Backs the SQL function
/// `approx_percentile_cont(expression, percentile, centroids)`. As stated in
/// its `user_doc` description, it returns the approximate percentile of the
/// input values using the t-digest algorithm.
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate percentile of input values using the t-digest algorithm.",
syntax_example = "approx_percentile_cont(expression, percentile, centroids)",
sql_example = r#"```sql
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
+-------------------------------------------------+
| approx_percentile_cont(column_name, 0.75, 100) |
+-------------------------------------------------+
| 65.0 |
+-------------------------------------------------+
```"#,
standard_argument(name = "expression",),
argument(
name = "percentile",
description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)."
),
argument(
name = "centroids",
description = "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory."
)
)]
pub struct ApproxPercentileCont {
signature: Signature,
}
Expand Down Expand Up @@ -272,33 +295,10 @@ impl AggregateUDFImpl for ApproxPercentileCont {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_approx_percentile_cont_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_approx_percentile_cont_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_APPROXIMATE,
"Returns the approximate percentile of input values using the t-digest algorithm.",
"approx_percentile_cont(expression, percentile, centroids)")
.with_sql_example(r#"```sql
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
+-------------------------------------------------+
| approx_percentile_cont(column_name, 0.75, 100) |
+-------------------------------------------------+
| 65.0 |
+-------------------------------------------------+
```"#)
.with_standard_argument("expression", None)
.with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).")
.with_argument("centroids", "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory.")
.build()
})
}

#[derive(Debug)]
pub struct ApproxPercentileAccumulator {
digest: TDigest,
Expand Down
Loading

0 comments on commit b312ac1

Please sign in to comment.