From 50032f298b2815e41fa7ca1cea41562fd0e804ee Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 22 Sep 2023 12:26:44 +0300 Subject: [PATCH] feat: implement the non-proc macro version of resolve_tokens --- crates/parser/src/get_children_codegen.rs | 30 +++++ crates/parser/src/get_location_codegen.rs | 3 + crates/parser/src/lib.rs | 3 + crates/parser/src/resolve_tokens.rs | 141 ++++++++++++++++++++++ crates/parser/src/statement_parser.rs | 9 +- 5 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 crates/parser/src/get_children_codegen.rs create mode 100644 crates/parser/src/get_location_codegen.rs create mode 100644 crates/parser/src/resolve_tokens.rs diff --git a/crates/parser/src/get_children_codegen.rs b/crates/parser/src/get_children_codegen.rs new file mode 100644 index 00000000..13b895f2 --- /dev/null +++ b/crates/parser/src/get_children_codegen.rs @@ -0,0 +1,30 @@ +use codegen::get_children; + +get_children!(); + +#[cfg(test)] +mod tests { + use crate::get_children_codegen::get_children; + + #[test] + fn test_get_children() { + let input = "with c as (insert into contact (id) values ('id')) select * from c;"; + + let pg_query_root = match pg_query::parse(input) { + Ok(parsed) => Some( + parsed + .protobuf + .nodes() + .iter() + .find(|n| n.1 == 1) + .unwrap() + .0 + .to_enum(), + ), + Err(_) => None, + }; + + let children = get_children(&pg_query_root.unwrap(), input.to_string(), 1); + assert_eq!(children.len(), 13); + } +} diff --git a/crates/parser/src/get_location_codegen.rs b/crates/parser/src/get_location_codegen.rs new file mode 100644 index 00000000..fcc6685d --- /dev/null +++ b/crates/parser/src/get_location_codegen.rs @@ -0,0 +1,3 @@ +use codegen::get_location; + +get_location!(); diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 90b2f9a2..8be85d21 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -16,7 +16,10 @@ //! 
To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. mod ast_node; +mod get_children_codegen; +mod get_location_codegen; mod parser; +mod resolve_tokens; mod sibling_token; mod source_parser; mod statement_parser; diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs new file mode 100644 index 00000000..d2dde198 --- /dev/null +++ b/crates/parser/src/resolve_tokens.rs @@ -0,0 +1,141 @@ +use crate::get_children_codegen::ChildrenNode; +use crate::get_location_codegen::get_location; +use cstree::text::{TextRange, TextSize}; +use pg_query::{protobuf::ScanToken, NodeEnum}; + +// all tokens of a node beneath it +// get estimation for each node location from tokens +// and also node range +// +// how to handle tokens that cannot be put beneath node based on the ast? +// pass token -> if not beneath current node, apply immediately + +#[derive(Debug, Clone)] +pub struct NestedNode { + pub node: NodeEnum, + pub depth: i32, + pub path: String, + pub tokens: Vec<i32>, + pub range: TextRange, +} + +/// Turns a `Vec<ChildrenNode>` into a `Vec<NestedNode>` by adding `tokens` and `range` to each node. +/// +/// For each node, we walk all properties and search for tokens that match the property value. The +/// token that is closest to the node or a parent is used. +/// +/// The node range is the minimum start and maximum end of all tokens. 
+pub fn resolve_tokens( + children: &Vec<ChildrenNode>, + tokens: &Vec<ScanToken>, + text: &str, +) -> Vec<NestedNode> { + children + .iter() + .map(|c| { + let nearest_parent_location = get_nearest_parent_location(&c, children); + let furthest_child_location = get_furthest_child_location(&c, children); + + let mut child_tokens = Vec::new(); + + let mut find_token = |property: String| { + child_tokens.push( + tokens + .iter() + .filter_map(|t| { + if get_token_text( + usize::try_from(t.start).unwrap(), + usize::try_from(t.end).unwrap(), + text, + ) != property + { + return None; + } + + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + return None; + } + + let distance = t.start - nearest_parent_location; + if distance > 0 { + Some((distance, t)) + } else { + None + } + }) + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t) + .unwrap(), + ); + }; + + match &c.node { + NodeEnum::RangeVar(n) => { + find_token(n.relname.to_owned()); + } + _ => {} + }; + + NestedNode { + node: c.node.to_owned(), + depth: c.depth, + path: c.path.to_owned(), + tokens: child_tokens.iter().map(|t| t.token).collect(), + range: TextRange::new( + TextSize::from( + child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32, + ), + TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), + ), + } + }) + .collect() +} + +fn get_token_text(start: usize, end: usize, text: &str) -> String { + text.chars() + .skip(start) + .take(end - start) + .collect::<String>() +} + +fn get_furthest_child_location(c: &ChildrenNode, children: &Vec<ChildrenNode>) -> Option<i32> { + children + .iter() + .filter_map(|n| { + if !n.path.starts_with(c.path.as_str()) { + return None; + } + get_location(&n.node) + }) + .max() +} + +fn get_nearest_parent_location(n: &ChildrenNode, children: &Vec<ChildrenNode>) -> i32 { + // if location is set, return it + let location = get_location(&n.node); + if location.is_some() { + return location.unwrap(); + } + + // go up in the tree and check if location exists on any parent + 
let mut path_elements = n.path.split(".").collect::<Vec<&str>>(); + path_elements.pop(); + while path_elements.len() > 0 { + let parent_path = path_elements.join("."); + let node = children.iter().find(|c| c.path == parent_path); + if node.is_some() { + let location = get_location(&node.unwrap().node); + if location.is_some() { + return location.unwrap(); + } + } + + path_elements.pop(); + } + + // fallback to 0 + return 0; +} diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index d0f6a25c..68644702 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,7 +1,7 @@ use cstree::text::{TextRange, TextSize}; use logos::{Logos, Span}; -use crate::{parser::Parser, syntax_kind_codegen::SyntaxKind}; +use crate::{get_children_codegen::get_children, parser::Parser, syntax_kind_codegen::SyntaxKind}; /// A super simple lexer for sql statements. /// @@ -83,6 +83,13 @@ impl Parser { } }; + let mut pg_query_nodes = match &pg_query_root { + Some(root) => get_children(root, text.to_string(), 1) + .into_iter() + .peekable(), + None => Vec::new().into_iter().peekable(), + }; + let mut lexer = StatementToken::lexer(&text); // parse root node if no syntax errors