Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Doc gen: Migrate aggregate functions doc to attribute based. #13646

Merged
merged 5 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 81 additions & 69 deletions datafusion-cli/Cargo.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions datafusion/core/src/bin/print_functions_docs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,23 @@ fn print_docs(
unreachable!()
};

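// Migration aid, intentionally left commented out: when enabled, the lines below
// write each function's generated `#[user_doc]` attribute text to a `<name>.txt` file,
// which makes migrating the remaining functions semi-automatic rather than manual.
// let attr_text = documentation.to_doc_attribute();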
//
// let file_path = format!("{}.txt", name);
// if std::path::Path::new(&file_path).exists() {
// std::fs::remove_file(&file_path).unwrap();
Copy link
Contributor Author

@comphead comphead Dec 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking of leaving it for now, as this part generates text files with the attribute text, so it's easier to migrate the remaining parts (window/builtin UDFs) in a semi-automatic way instead of manually.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just add a small comment noting this is useful for migration purposes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree a comment explaining what this is used for would be most helpful

// }
//
// // Open the file in append mode, create it if it doesn't exist
// let mut file = std::fs::OpenOptions::new()
// .append(true) // Open in append mode
// .create(true) // Create the file if it doesn't exist
// .open(file_path)
// .unwrap();
//
// use std::io::Write;
// file.write_all(attr_text.as_bytes()).unwrap();

// first, the name, description and syntax example
let _ = write!(
docs,
Expand Down
82 changes: 82 additions & 0 deletions datafusion/doc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,88 @@ impl Documentation {
) -> DocumentationBuilder {
DocumentationBuilder::new(doc_section, description, syntax_example)
}

/// Output the `Documentation` struct in the form of custom Rust documentation attributes.
/// Intended as an aid for migrating existing function docs to the `#[user_doc]` attribute.
pub fn to_doc_attribute(&self) -> String {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it would also be helpful here to note that this is to help the migration to doc comment attributes.

let mut result = String::new();

result.push_str("#[user_doc(");
// Doc Section
result.push_str(
format!(
"\n doc_section({}label = \"{}\"{}),",
if !self.doc_section.include {
"include = \"false\", "
} else {
""
},
self.doc_section.label,
self.doc_section
.description
.map(|s| format!(", description = \"{}\"", s))
.unwrap_or_default(),
)
.as_ref(),
);

// Description
result.push_str(format!("\n description=\"{}\",", self.description).as_ref());
// Syntax Example
result.push_str(
format!("\n syntax_example=\"{}\",", self.syntax_example).as_ref(),
);
// SQL Example
result.push_str(
&self
.sql_example
.clone()
.map(|s| format!("\n sql_example = r#\"{}\"#,", s))
.unwrap_or_default(),
);

let st_arg_token = " expression to operate on. Can be a constant, column, or function, and any combination of operators.";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this could be extracted to a constant and that used instead of having text copies of the string in 2 places.

// Standard Arguments
if let Some(args) = self.arguments.clone() {
args.iter().for_each(|(name, value)| {
if value.contains(st_arg_token) {
if name.starts_with("The ") {
result.push_str(format!("\n standard_argument(\n name = \"{}\"),", name).as_ref());
} else {
result.push_str(format!("\n standard_argument(\n name = \"{}\",\n prefix = \"{}\"\n ),", name, value.replace(st_arg_token, "")).as_ref());
}
}
});
}

// Arguments
if let Some(args) = self.arguments.clone() {
args.iter().for_each(|(name, value)| {
if !value.contains(st_arg_token) {
result.push_str(format!("\n argument(\n name = \"{}\",\n description = \"{}\"\n ),", name, value).as_ref());
}
});
}

if let Some(alt_syntax) = self.alternative_syntax.clone() {
alt_syntax.iter().for_each(|syntax| {
result.push_str(
format!("\n alternative_syntax = \"{}\",", syntax).as_ref(),
);
});
}

// Related UDFs
if let Some(related_udf) = self.related_udfs.clone() {
related_udf.iter().for_each(|udf| {
result
.push_str(format!("\n related_udf(name = \"{}\"),", udf).as_ref());
});
}

result.push_str("\n)]");

result
}
}

#[derive(Debug, Clone, PartialEq)]
Expand Down
2 changes: 2 additions & 0 deletions datafusion/functions-aggregate/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ ahash = { workspace = true }
arrow = { workspace = true }
arrow-schema = { workspace = true }
datafusion-common = { workspace = true }
datafusion-doc = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-functions-aggregate-common = { workspace = true }
datafusion-macros = { workspace = true }
datafusion-physical-expr = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
half = { workspace = true }
Expand Down
39 changes: 18 additions & 21 deletions datafusion/functions-aggregate/src/approx_distinct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,19 @@ use datafusion_common::ScalarValue;
use datafusion_common::{
downcast_value, internal_err, not_impl_err, DataFusionError, Result,
};
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_doc::DocSection;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
};
use datafusion_macros::user_doc;
use std::any::Any;
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
use std::marker::PhantomData;
use std::sync::OnceLock;

make_udaf_expr_and_func!(
ApproxDistinct,
approx_distinct,
Expand Down Expand Up @@ -243,6 +245,20 @@ impl Default for ApproxDistinct {
}
}

/// APPROX_DISTINCT aggregate expression.
///
/// The user-facing documentation (section, description, syntax, SQL example and
/// arguments) is declared in the `#[user_doc]` attribute below rather than being
/// built by hand with `Documentation::builder`.
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
syntax_example = "approx_distinct(expression)",
sql_example = r#"```sql
> SELECT approx_distinct(column_name) FROM table_name;
+-----------------------------------+
| approx_distinct(column_name) |
+-----------------------------------+
| 42 |
+-----------------------------------+
```"#,
standard_argument(name = "expression",)
)]
pub struct ApproxDistinct {
// Signature of this function (`datafusion_expr::Signature`).
signature: Signature,
}
Expand Down Expand Up @@ -309,25 +325,6 @@ impl AggregateUDFImpl for ApproxDistinct {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_approx_distinct_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_approx_distinct_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(DOC_SECTION_APPROXIMATE, "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", "approx_distinct(expression)")
.with_sql_example(r#"```sql
> SELECT approx_distinct(column_name) FROM table_name;
+-----------------------------------+
| approx_distinct(column_name) |
+-----------------------------------+
| 42 |
+-----------------------------------+
```"#,
)
.with_standard_argument("expression", None)
.build()
})
}
42 changes: 17 additions & 25 deletions datafusion/functions-aggregate/src/approx_median.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ use arrow::{datatypes::DataType, datatypes::Field};
use arrow_schema::DataType::{Float64, UInt64};

use datafusion_common::{not_impl_err, plan_err, Result};
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_doc::DocSection;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::NUMERICS;
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
};
use datafusion_macros::user_doc;

use crate::approx_percentile_cont::ApproxPercentileAccumulator;

Expand All @@ -44,6 +45,20 @@ make_udaf_expr_and_func!(
);

/// APPROX_MEDIAN aggregate expression.
///
/// The user-facing documentation (section, description, syntax, SQL example and
/// arguments) is declared in the `#[user_doc]` attribute below rather than being
/// built by hand with `Documentation::builder`.
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
syntax_example = "approx_median(expression)",
sql_example = r#"```sql
> SELECT approx_median(column_name) FROM table_name;
+-----------------------------------+
| approx_median(column_name) |
+-----------------------------------+
| 23.5 |
+-----------------------------------+
```"#,
standard_argument(name = "expression",)
)]
pub struct ApproxMedian {
// Signature of this function (`datafusion_expr::Signature`).
signature: Signature,
}
Expand Down Expand Up @@ -122,29 +137,6 @@ impl AggregateUDFImpl for ApproxMedian {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_approx_median_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_approx_median_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_APPROXIMATE,
"Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",

"approx_median(expression)")
.with_sql_example(r#"```sql
> SELECT approx_median(column_name) FROM table_name;
+-----------------------------------+
| approx_median(column_name) |
+-----------------------------------+
| 23.5 |
+-----------------------------------+
```"#,
)
.with_standard_argument("expression", None)
.build()
})
}
50 changes: 25 additions & 25 deletions datafusion/functions-aggregate/src/approx_percentile_cont.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use datafusion_common::{
downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err,
DataFusionError, Result, ScalarValue,
};
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
use datafusion_doc::DocSection;
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
use datafusion_expr::utils::format_state_name;
Expand All @@ -46,6 +46,7 @@ use datafusion_expr::{
use datafusion_functions_aggregate_common::tdigest::{
TDigest, TryIntoF64, DEFAULT_MAX_SIZE,
};
use datafusion_macros::user_doc;
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;

create_func!(ApproxPercentileCont, approx_percentile_cont_udaf);
Expand All @@ -64,6 +65,28 @@ pub fn approx_percentile_cont(
approx_percentile_cont_udaf().call(args)
}

/// APPROX_PERCENTILE_CONT aggregate expression.
///
/// The user-facing documentation (section, description, syntax, SQL example and
/// the `expression` / `percentile` / `centroids` arguments) is declared in the
/// `#[user_doc]` attribute below rather than being built by hand with
/// `Documentation::builder`.
#[user_doc(
doc_section(label = "Approximate Functions"),
description = "Returns the approximate percentile of input values using the t-digest algorithm.",
syntax_example = "approx_percentile_cont(expression, percentile, centroids)",
sql_example = r#"```sql
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
+-------------------------------------------------+
| approx_percentile_cont(column_name, 0.75, 100) |
+-------------------------------------------------+
| 65.0 |
+-------------------------------------------------+
```"#,
standard_argument(name = "expression",),
argument(
name = "percentile",
description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)."
),
argument(
name = "centroids",
description = "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory."
)
)]
pub struct ApproxPercentileCont {
// Signature of this function (`datafusion_expr::Signature`).
signature: Signature,
}
Expand Down Expand Up @@ -272,33 +295,10 @@ impl AggregateUDFImpl for ApproxPercentileCont {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_approx_percentile_cont_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_approx_percentile_cont_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_APPROXIMATE,
"Returns the approximate percentile of input values using the t-digest algorithm.",
"approx_percentile_cont(expression, percentile, centroids)")
.with_sql_example(r#"```sql
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
+-------------------------------------------------+
| approx_percentile_cont(column_name, 0.75, 100) |
+-------------------------------------------------+
| 65.0 |
+-------------------------------------------------+
```"#)
.with_standard_argument("expression", None)
.with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).")
.with_argument("centroids", "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory.")
.build()
})
}

#[derive(Debug)]
pub struct ApproxPercentileAccumulator {
digest: TDigest,
Expand Down
Loading
Loading