From 2b7729d47691a6e5d3ab3c3edc89c284449cb918 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 1 Oct 2023 16:31:48 +0200 Subject: [PATCH] feat: refactor and improve all over the place --- .../src/{get_children.rs => get_nodes.rs} | 12 +- crates/codegen/src/lib.rs | 8 +- ...ildren_codegen.rs => get_nodes_codegen.rs} | 12 +- crates/parser/src/lib.rs | 4 +- crates/parser/src/parser.rs | 72 +--- crates/parser/src/resolve_tokens.rs | 185 +++++---- crates/parser/src/sibling_token.rs | 31 -- crates/parser/src/source_parser.rs | 17 +- crates/parser/src/statement_parser.rs | 380 ++++++++++++++---- 9 files changed, 429 insertions(+), 292 deletions(-) rename crates/codegen/src/{get_children.rs => get_nodes.rs} (92%) rename crates/parser/src/{get_children_codegen.rs => get_nodes_codegen.rs} (67%) delete mode 100644 crates/parser/src/sibling_token.rs diff --git a/crates/codegen/src/get_children.rs b/crates/codegen/src/get_nodes.rs similarity index 92% rename from crates/codegen/src/get_children.rs rename to crates/codegen/src/get_nodes.rs index e92c5f6e..26fd5a53 100644 --- a/crates/codegen/src/get_children.rs +++ b/crates/codegen/src/get_nodes.rs @@ -2,7 +2,7 @@ use pg_query_proto_parser::{FieldType, Node, ProtoParser}; use proc_macro2::{Ident, TokenStream}; use quote::{format_ident, quote}; -pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { +pub fn get_nodes_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream { let parser = ProtoParser::new("./libpg_query/protobuf/pg_query.proto"); let proto_file = parser.parse(); @@ -16,7 +16,7 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt use std::collections::VecDeque; #[derive(Debug, Clone)] - pub struct ChildrenNode { + pub struct Node { pub node: NodeEnum, pub depth: i32, pub path: String, @@ -24,8 +24,10 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt /// Returns all children of the node, recursively /// location is resolved manually - pub fn get_children(node: &NodeEnum, text: String, current_depth: i32) -> Vec { - let mut nodes: Vec = vec![]; + pub fn get_nodes(node: &NodeEnum, text: String, current_depth: i32) -> Vec { + let mut nodes: Vec = vec![ + Node { node: node.to_owned(), depth: current_depth, path: "0".to_string() } + ]; // Node, depth, path let mut stack: VecDeque<(NodeEnum, i32, String)> = VecDeque::from(vec![(node.to_owned(), current_depth, "0".to_string())]); @@ -37,7 +39,7 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt let path = path.clone() + "." + child_ctr.to_string().as_str(); child_ctr = child_ctr + 1; stack.push_back((c.to_owned(), current_depth, path.clone())); - nodes.push(ChildrenNode { + nodes.push(Node { node: c, depth: current_depth, path: path.clone(), diff --git a/crates/codegen/src/lib.rs b/crates/codegen/src/lib.rs index fba42ea7..bc63d4f6 100644 --- a/crates/codegen/src/lib.rs +++ b/crates/codegen/src/lib.rs @@ -1,14 +1,14 @@ -mod get_children; mod get_location; +mod get_nodes; mod syntax_kind; -use get_children::get_children_mod; use get_location::get_location_mod; +use get_nodes::get_nodes_mod; use syntax_kind::syntax_kind_mod; #[proc_macro] -pub fn get_children(item: proc_macro::TokenStream) -> proc_macro::TokenStream { - get_children_mod(item.into()).into() +pub fn get_nodes(item: proc_macro::TokenStream) -> proc_macro::TokenStream { + get_nodes_mod(item.into()).into() } #[proc_macro] diff --git a/crates/parser/src/get_children_codegen.rs b/crates/parser/src/get_nodes_codegen.rs similarity index 67% rename from crates/parser/src/get_children_codegen.rs rename to crates/parser/src/get_nodes_codegen.rs index 13b895f2..3305baab 100644 --- a/crates/parser/src/get_children_codegen.rs +++ b/crates/parser/src/get_nodes_codegen.rs @@ -1,13 +1,13 @@ -use codegen::get_children; +use codegen::get_nodes; -get_children!(); +get_nodes!(); #[cfg(test)] mod tests { - use crate::get_children_codegen::get_children; + use crate::get_nodes_codegen::get_nodes; #[test] - fn test_get_children() { + fn test_get_nodes() { let input = "with c as (insert into contact (id) values ('id')) select * from c;"; let pg_query_root = match pg_query::parse(input) { @@ -24,7 +24,7 @@ mod tests { Err(_) => None, }; - let children = get_children(&pg_query_root.unwrap(), input.to_string(), 1); - assert_eq!(children.len(), 13); + let nodes = get_nodes(&pg_query_root.unwrap(), input.to_string(), 1); + assert_eq!(nodes.len(), 14); } } diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 8be85d21..5344ff27 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -16,17 +16,15 @@ //! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. mod ast_node; -mod get_children_codegen; mod get_location_codegen; +mod get_nodes_codegen; mod parser; mod resolve_tokens; -mod sibling_token; mod source_parser; mod statement_parser; mod syntax_error; mod syntax_kind_codegen; mod syntax_node; -pub use crate::parser::{Parse, Parser}; pub use crate::syntax_kind_codegen::SyntaxKind; pub use crate::syntax_node::{SyntaxElement, SyntaxNode, SyntaxToken}; diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs index 80219f8d..2f311ba5 100644 --- a/crates/parser/src/parser.rs +++ b/crates/parser/src/parser.rs @@ -1,6 +1,5 @@ use cstree::syntax::ResolvedNode; use cstree::{build::GreenNodeBuilder, text::TextRange}; -use log::debug; use pg_query::NodeEnum; use crate::ast_node::RawStmt; @@ -8,7 +7,7 @@ use crate::syntax_error::SyntaxError; use crate::syntax_kind_codegen::SyntaxKind; use crate::syntax_node::SyntaxNode; -/// Main parser that controls the cst building process, and collects errors and statements +/// Main parser that exposes the `cstree` api, and collects errors and statements #[derive(Debug)] pub struct Parser { /// The cst builder @@ -17,16 +16,9 @@ pub struct Parser { errors: Vec, /// The pg_query statements representing the abtract syntax tree stmts: Vec, - /// The current checkpoint depth, if any - checkpoint: Option, - /// Whether the parser is currently parsing a flat node - is_parsing_flat_node: bool, - /// Keeps track of currently open nodes - /// Latest opened is last - open_nodes: Vec<(SyntaxKind, i32)>, } -/// Result of parsing +/// Result of Building #[derive(Debug)] pub struct Parse { /// The concrete syntax tree @@ -43,72 +35,16 @@ impl Parser { inner: GreenNodeBuilder::new(), errors: Vec::new(), stmts: Vec::new(), - checkpoint: None, - is_parsing_flat_node: false, - open_nodes: Vec::new(), } } - /// close all nodes until the specified depth is reached - pub fn close_until_depth(&mut self, depth: i32) { - debug!("close until depth {}", depth); - if self.open_nodes.is_empty() || self.get_current_depth() < depth { - return; - } - loop { - if self.open_nodes.is_empty() || self.get_current_depth() < depth { - break; - } - self.finish_node(); - } - } - - fn get_current_depth(&self) -> i32 { - self.open_nodes[self.open_nodes.len() - 1].1 - } - - /// set a checkpoint at current depth - /// - /// if `is_parsing_flat_node` is true, all tokens parsed until this checkpoint is closed will be applied immediately - pub fn set_checkpoint(&mut self) { - assert!( - self.checkpoint.is_none(), - "Must close previouos checkpoint before setting new one" - ); - self.checkpoint = Some(self.get_current_depth()); - } - - /// close all nodes until checkpoint depth is reached - pub fn close_checkpoint(&mut self) { - if self.checkpoint.is_some() { - self.close_until_depth(self.checkpoint.unwrap()); - } - self.checkpoint = None; - self.is_parsing_flat_node = false; - } - - /// start a new node of `SyntaxKind` at `depth` - /// handles closing previous nodes if necessary - pub fn start_node_at(&mut self, kind: SyntaxKind, depth: i32) { - debug!("starting node at depth {} {:?}", depth, kind); - // close until target depth - self.close_until_depth(depth); - - self.open_nodes.push((kind, depth)); - debug!("start node {:?}", kind); + /// start a new node of `SyntaxKind` + pub fn start_node(&mut self, kind: SyntaxKind) { self.inner.start_node(kind); } /// finish current node pub fn finish_node(&mut self) { - debug!("finish_node"); - - let n = self.open_nodes.pop(); - if n.is_none() { - panic!("No node to finish"); - } - - debug!("finish node {:?}", n.unwrap().0); self.inner.finish_node(); } diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs index 93a8a877..9c01b471 100644 --- a/crates/parser/src/resolve_tokens.rs +++ b/crates/parser/src/resolve_tokens.rs @@ -1,88 +1,96 @@ -use crate::get_children_codegen::ChildrenNode; +use std::{ + cmp::{max, min}, + convert::identity, +}; + use crate::get_location_codegen::get_location; +use crate::get_nodes_codegen::Node; use cstree::text::{TextRange, TextSize}; use pg_query::{protobuf::ScanToken, NodeEnum}; #[derive(Debug, Clone)] -pub struct NestedNode { - pub id: usize, - pub inner: ChildrenNode, - // .start property of `ScanToken` - pub tokens: Vec, - pub range: TextRange, +pub struct RangedNode { + pub inner: Node, + pub estimated_range: TextRange, } -/// Turns a `Vec` into a `Vec` by adding `tokens` and `range` to each node. -/// -/// For each node, we walk all properties and search for tokens that match the property value. The -/// token that is closest to the node or a parent is used. -/// -/// The node range is the minimum start and maximum end of all tokens. -pub fn resolve_tokens( - children: &Vec, - tokens: &Vec, - text: &str, -) -> Vec { - children - .iter() - .enumerate() - .map(|(idx, c)| { - let nearest_parent_location = get_nearest_parent_location(&c, children); - let furthest_child_location = get_furthest_child_location(&c, children); - - let mut child_tokens = Vec::new(); - - let mut find_token = |property: String| { - child_tokens.push( - tokens - .iter() - .filter_map(|t| { - if get_token_text( - usize::try_from(t.start).unwrap(), - usize::try_from(t.end).unwrap(), - text, - ) != property - { - return None; - } - - if furthest_child_location.is_some() - && furthest_child_location.unwrap() < t.start as i32 - { - return None; - } - - let distance = t.start - nearest_parent_location; - if distance > 0 { - Some((distance, t)) - } else { - None - } - }) - .min_by_key(|(d, _)| d.to_owned()) - .map(|(_, t)| t) - .unwrap(), - ); - }; - - match &c.node { - NodeEnum::RangeVar(n) => { - find_token(n.relname.to_owned()); - } - _ => {} - }; - - NestedNode { - id: idx, - inner: c.to_owned(), - tokens: child_tokens.iter().map(|t| t.start).collect(), - range: TextRange::new( - TextSize::from( - child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32, - ), - TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), - ), +/// Turns a `Vec` into a `Vec` by estimating their range. +pub fn resolve_tokens(nodes: &Vec, tokens: &Vec, text: &str) -> Vec { + let mut ranged_nodes: Vec = Vec::new(); + + // we get an estimated range by searching for tokens that match the node property values + // and, if available, the `location` of the node itself + nodes.iter().for_each(|n| { + let nearest_parent_location = get_nearest_parent_location(&n, nodes); + let furthest_child_location = get_furthest_child_location(&n, nodes); + + let mut child_tokens = Vec::new(); + + let mut find_token = |property: String| { + child_tokens.push( + tokens + .iter() + .filter_map(|t| { + if get_token_text( + usize::try_from(t.start).unwrap(), + usize::try_from(t.end).unwrap(), + text, + ) != property + { + return None; + } + + if furthest_child_location.is_some() + && furthest_child_location.unwrap() < t.start as i32 + { + return None; + } + + let distance = t.start - nearest_parent_location; + if distance > 0 { + Some((distance, t)) + } else { + None + } + }) + .min_by_key(|(d, _)| d.to_owned()) + .map(|(_, t)| t) + .unwrap(), + ); + }; + + match &n.node { + NodeEnum::RangeVar(n) => { + find_token(n.relname.to_owned()); } + _ => {} + }; + + let from_locations: Vec = [ + get_location(&n.node), + Some(nearest_parent_location), + Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start), + ] + .into_iter() + .filter_map(|x| x) + .collect(); + + ranged_nodes.push(RangedNode { + inner: n.to_owned(), + estimated_range: TextRange::new( + TextSize::from(from_locations.iter().min().unwrap_or(&0).to_owned() as u32), + TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32), + ), + }); + }); + + // FIXME: this additional loop is not required if we order the nodes by path first + ranged_nodes + .iter() + .map(|n| RangedNode { + inner: n.inner.to_owned(), + // the range of a node must be larger than the range of all children nodes + estimated_range: get_largest_child_range(&n, &ranged_nodes), }) .collect() } @@ -94,7 +102,26 @@ fn get_token_text(start: usize, end: usize, text: &str) -> String { .collect::() } -fn get_furthest_child_location(c: &ChildrenNode, children: &Vec) -> Option { +fn get_largest_child_range(node: &RangedNode, nodes: &Vec) -> TextRange { + let mut start: TextSize = node.estimated_range.start().to_owned(); + let mut end: TextSize = node.estimated_range.end().to_owned(); + + nodes.iter().for_each(|n| { + if !n.inner.path.starts_with(node.inner.path.as_str()) { + return; + } + if start < n.estimated_range.start() { + start = n.estimated_range.start(); + } + if end > n.estimated_range.end() { + end = n.estimated_range.end(); + } + }); + + TextRange::new(start, end) +} + +fn get_furthest_child_location(c: &Node, children: &Vec) -> Option { children .iter() .filter_map(|n| { @@ -106,7 +133,7 @@ fn get_furthest_child_location(c: &ChildrenNode, children: &Vec) - .max() } -fn get_nearest_parent_location(n: &ChildrenNode, children: &Vec) -> i32 { +fn get_nearest_parent_location(n: &Node, children: &Vec) -> i32 { // if location is set, return it let location = get_location(&n.node); if location.is_some() { diff --git a/crates/parser/src/sibling_token.rs b/crates/parser/src/sibling_token.rs deleted file mode 100644 index 6a42dd0d..00000000 --- a/crates/parser/src/sibling_token.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::syntax_kind_codegen::SyntaxKind; - -impl SyntaxKind { - pub fn is_opening_sibling(&self) -> bool { - match self { - SyntaxKind::Ascii40 => true, - SyntaxKind::Ascii91 => true, - SyntaxKind::Case => true, - _ => false, - } - } - pub fn is_closing_sibling(&self) -> bool { - match self { - SyntaxKind::Ascii41 => true, - SyntaxKind::Ascii93 => true, - SyntaxKind::EndP => true, - _ => false, - } - } - pub fn sibling(&self) -> Option { - match self { - SyntaxKind::Case => Some(SyntaxKind::EndP), - SyntaxKind::EndP => Some(SyntaxKind::Case), - SyntaxKind::Ascii40 => Some(SyntaxKind::Ascii41), - SyntaxKind::Ascii41 => Some(SyntaxKind::Ascii40), - SyntaxKind::Ascii91 => Some(SyntaxKind::Ascii93), - SyntaxKind::Ascii93 => Some(SyntaxKind::Ascii91), - _ => None, - } - } -} diff --git a/crates/parser/src/source_parser.rs b/crates/parser/src/source_parser.rs index b7a727ec..341d6eb9 100644 --- a/crates/parser/src/source_parser.rs +++ b/crates/parser/src/source_parser.rs @@ -75,14 +75,15 @@ fn tokens(input: &str) -> Vec { } impl Parser { - /// Parse a source - pub fn parse_source_at(&mut self, text: &str, at_offset: Option) { + fn parse_source_at(&mut self, text: &str, at_offset: Option) { let offset = at_offset.unwrap_or(0); let tokens = tokens(&text); let mut tokens_iter = tokens.iter(); - self.start_node_at(SyntaxKind::SourceFile, 0); + // open root `SourceFile` node + self.start_node(SyntaxKind::SourceFile); + while let Some(token) = tokens_iter.next() { match token.kind { SourceFileToken::Comment => { @@ -92,13 +93,15 @@ impl Parser { self.token(SyntaxKind::Newline, token.text.as_str()); } SourceFileToken::Statement => { - self.parse_statement( - token.text.as_str(), - Some(offset + u32::from(token.span.start())), - ); + // self.parse_statement( + // token.text.as_str(), + // Some(offset + u32::from(token.span.start())), + // ); } }; } + + // close root `SourceFile` node self.finish_node(); } } diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index fde95be8..ee461391 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -1,8 +1,10 @@ +use std::collections::VecDeque; + use cstree::text::{TextRange, TextSize}; -use logos::{Logos, Span}; +use logos::Logos; use crate::{ - get_children_codegen::get_children, parser::Parser, resolve_tokens::resolve_tokens, + get_nodes_codegen::get_nodes, parser::Parser, resolve_tokens::resolve_tokens, syntax_kind_codegen::SyntaxKind, }; @@ -25,7 +27,6 @@ pub enum StatementToken { impl StatementToken { /// Creates a `SyntaxKind` from a `StatementToken`. - /// can be generated. pub fn syntax_kind(&self) -> SyntaxKind { match self { StatementToken::Whitespace => SyntaxKind::Whitespace, @@ -36,30 +37,48 @@ impl StatementToken { } } +struct TokenBuffer { + tokens: VecDeque<(SyntaxKind, String)>, +} + +impl TokenBuffer { + fn new() -> Self { + Self { + tokens: VecDeque::new(), + } + } + + fn push(&mut self, kind: SyntaxKind, text: String) { + self.tokens.push_back((kind, text)); + } + + fn drain(&mut self, until: Option) -> Vec<(SyntaxKind, String)> { + if self.tokens.is_empty() { + return Vec::new(); + } + let range = match until { + Some(u) => 0..u as usize, + None => 0..self.tokens.len(), + }; + self.tokens.drain(range).collect::>() + } +} + impl Parser { - /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file. - /// - /// On a high level, the algorithm works as follows: - /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information - /// about the nodes. The result will be a flat list of tokens under the generic `Stmt` node. - /// If successful, the first node in the ordered list will be the main node of the statement, - /// and serves as a root node. - /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors. - /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens - /// that are not parsed by pg_query.rs, such as whitespace. - /// 4. Define a pointer that starts at 0 and move it along the statement. - /// - first, check if the current pointer is within a pg_query token. If so, consume the - /// token. - /// - if not, consume the next token from the `StatementToken` lexer. - /// 5. Close all open nodes for that statement. - pub fn parse_statement(&mut self, text: &str, at_offset: Option) { + pub fn parse_statement_at(&mut self, text: &str, at_offset: Option) { + // 1. Collect as much information as possible from pg_query.rs and `StatementToken` lexer + + // offset of the statement in the source file. let offset = at_offset.unwrap_or(0); + + // range of the statement in the source file. let range = TextRange::new( TextSize::from(offset), TextSize::from(offset + text.len() as u32), ); - let mut pg_query_tokens = match pg_query::scan(text) { + // tokens from pg_query.rs + let pg_query_tokens = match pg_query::scan(text) { Ok(scanned) => scanned.tokens, Err(e) => { self.error(e.to_string(), range); @@ -67,8 +86,7 @@ impl Parser { } }; - // Get root node with depth 1 - // Since we are parsing only a single statement there can be only a single node at depth 1 + // root node of the statement, if no syntax errors let pg_query_root = match pg_query::parse(text) { Ok(parsed) => Some( parsed @@ -86,9 +104,11 @@ impl Parser { } }; + // ranged nodes from pg_query.rs, including the root node + // the nodes are ordered by starting range, starting with the root node let mut pg_query_nodes = match &pg_query_root { Some(root) => resolve_tokens( - &get_children(root, text.to_string(), 1), + &get_nodes(root, text.to_string(), 1), &pg_query_tokens, &text, ) @@ -99,109 +119,291 @@ impl Parser { let mut pg_query_tokens = pg_query_tokens.iter().peekable(); - let mut lexer = StatementToken::lexer(&text); + let mut statement_token_lexer = StatementToken::lexer(&text); + + // 2. Setup data structures required for the parsing algorithm + // A buffer for tokens that are not applied immediately to the cst + let mut token_buffer = TokenBuffer::new(); + // Keeps track of currently open nodes. Latest opened is last. + let mut open_nodes: Vec<(SyntaxKind, TextRange, i32)> = Vec::new(); - // parse root node if no syntax errors - if pg_query_root.is_some() { - let root_node = pg_query_root.unwrap(); - self.stmt(root_node.to_owned(), range); - self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1); + // 3. Parse the statement + + // Handle root node + if pg_query_nodes.len() > 0 { + // if there are no syntax errors, use the pg_query node as the root node + let root_node = pg_query_nodes + .find(|n| n.inner.path == "0".to_string()) + .unwrap(); + // can only be at depth 1 + assert_eq!( + root_node.inner.depth, 1, + "Root node must be at depth 1, but is at depth {}", + root_node.inner.depth + ); + self.stmt(root_node.inner.node.to_owned(), range); + self.start_node(SyntaxKind::new_from_pg_query_node(&root_node.inner.node)); + open_nodes.push(( + SyntaxKind::new_from_pg_query_node(&root_node.inner.node), + range, + 1, + )); } else { // fallback to generic node as root - self.start_node_at(SyntaxKind::Stmt, 1); + self.start_node(SyntaxKind::Stmt); + open_nodes.push((SyntaxKind::Stmt, range, 1)); } - self.set_checkpoint(); // start at 0, and increment by the length of the token let mut pointer: i32 = 0; - #[derive(Debug)] - struct Token { - syntax_kind: SyntaxKind, - span: Span, - } - + // main loop that walks through the statement token by token while pointer < text.len() as i32 { // Check if the pointer is within a pg_query token let next_pg_query_token = pg_query_tokens.peek(); - let token = if next_pg_query_token.is_some() + + let token_length = if next_pg_query_token.is_some() && next_pg_query_token.unwrap().start <= pointer && pointer <= next_pg_query_token.unwrap().end { let token = pg_query_tokens.next().unwrap(); - Token { - syntax_kind: SyntaxKind::new_from_pg_query_token(&token), - span: Span { - start: token.start as usize, - end: token.end as usize, - }, + + let token_text = text + .chars() + .skip(token.start as usize) + .take((token.end as usize) - (token.start as usize)) + .collect::(); + + // a node can only start and end with a pg_query token, so we can handle them here + + // before applying the token, close any node that ends before the token starts + while open_nodes.last().is_some() + && open_nodes.last().unwrap().1.end() <= TextSize::from(token.start as u32) + { + self.finish_node(); + open_nodes.pop(); } + + // drain token buffer + for (kind, text) in token_buffer.drain(None) { + self.token(kind, text.as_str()); + } + + // apply the token + self.token(SyntaxKind::new_from_pg_query_token(token), text); + + // consume all nodes that start at or before the token ends + while pg_query_nodes.peek().is_some() + && pg_query_nodes.peek().unwrap().estimated_range.start() + <= TextSize::from(token.end as u32) + { + let node = pg_query_nodes.next().unwrap(); + self.start_node(SyntaxKind::new_from_pg_query_node(&node.inner.node)); + open_nodes.push(( + SyntaxKind::new_from_pg_query_node(&node.inner.node), + node.estimated_range, + node.inner.depth, + )); + } + + token_text.len() as i32 } else { // fallback to statement token // move statement token lexer to before pointer - while (lexer.span().end as i32) < pointer { - lexer.next(); + while (statement_token_lexer.span().end as i32) < pointer { + statement_token_lexer.next(); } - let token = lexer.next(); - if token.is_none() || (lexer.span().start as i32) != pointer { + let token = statement_token_lexer.next(); + if token.is_none() || (statement_token_lexer.span().start as i32) != pointer { // if the token is not at the pointer, we have a syntax error panic!( "Expected token for '{}' at offset {}", - lexer.slice(), - lexer.span().start + statement_token_lexer.slice(), + statement_token_lexer.span().start ); } - Token { - syntax_kind: token.unwrap().unwrap().syntax_kind(), - span: lexer.span(), - } + let token_text = statement_token_lexer.slice().to_string(); + token_buffer.push(token.unwrap().unwrap().syntax_kind(), token_text.clone()); + token_text.len() as i32 }; - self.token( - token.syntax_kind, - text.chars() - .skip(token.span.start) - .take(token.span.end - token.span.start) - .collect::() - .as_str(), - ); - - pointer = pointer + (token.span.end - token.span.start) as i32; + pointer = pointer + token_length; } - // close up nodes - self.close_checkpoint(); + while open_nodes.last().is_some() { + self.finish_node(); + open_nodes.pop(); + } } } +// impl Parser { +// /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file. +// /// +// /// On a high level, the algorithm works as follows: +// /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information +// /// about the nodes. The result will be a flat list of tokens under the generic `Stmt` node. +// /// If successful, the first node in the ordered list will be the main node of the statement, +// /// and serves as a root node. +// /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors. +// /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens +// /// that are not parsed by pg_query.rs, such as whitespace. +// /// 4. Define a pointer that starts at 0 and move it along the statement. +// /// - first, check if the current pointer is within a pg_query token. If so, consume the +// /// token. +// /// - if not, consume the next token from the `StatementToken` lexer. +// /// 5. Close all open nodes for that statement. +// pub fn parse_statement(&mut self, text: &str, at_offset: Option) { +// let offset = at_offset.unwrap_or(0); +// let range = TextRange::new( +// TextSize::from(offset), +// TextSize::from(offset + text.len() as u32), +// ); +// +// let mut pg_query_tokens = match pg_query::scan(text) { +// Ok(scanned) => scanned.tokens, +// Err(e) => { +// self.error(e.to_string(), range); +// Vec::new() +// } +// }; +// +// // Get root node with depth 1 +// // Since we are parsing only a single statement there can be only a single node at depth 1 +// let pg_query_root = match pg_query::parse(text) { +// Ok(parsed) => Some( +// parsed +// .protobuf +// .nodes() +// .iter() +// .find(|n| n.1 == 1) +// .unwrap() +// .0 +// .to_enum(), +// ), +// Err(e) => { +// self.error(e.to_string(), range); +// None +// } +// }; +// +// let mut pg_query_nodes = match &pg_query_root { +// Some(root) => resolve_tokens( +// &get_nodes(root, text.to_string(), 1), +// &pg_query_tokens, +// &text, +// ) +// .into_iter() +// .peekable(), +// None => Vec::new().into_iter().peekable(), +// }; +// +// let mut pg_query_tokens = pg_query_tokens.iter().peekable(); +// +// let mut lexer = StatementToken::lexer(&text); +// +// // parse root node if no syntax errors +// if pg_query_root.is_some() { +// let root_node = pg_query_root.unwrap(); +// self.stmt(root_node.to_owned(), range); +// self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1); +// } else { +// // fallback to generic node as root +// self.start_node_at(SyntaxKind::Stmt, 1); +// } +// self.set_checkpoint(); +// +// // start at 0, and increment by the length of the token +// let mut pointer: i32 = 0; +// +// #[derive(Debug)] +// struct Token { +// syntax_kind: SyntaxKind, +// span: Span, +// } +// +// while pointer < text.len() as i32 { +// // Check if the pointer is within a pg_query token +// let next_pg_query_token = pg_query_tokens.peek(); +// let token = if next_pg_query_token.is_some() +// && next_pg_query_token.unwrap().start <= pointer +// && pointer <= next_pg_query_token.unwrap().end +// { +// let token = pg_query_tokens.next().unwrap(); +// Token { +// syntax_kind: SyntaxKind::new_from_pg_query_token(&token), +// span: Span { +// start: token.start as usize, +// end: token.end as usize, +// }, +// } +// } else { +// // fallback to statement token +// +// // move statement token lexer to before pointer +// while (lexer.span().end as i32) < pointer { +// lexer.next(); +// } +// let token = lexer.next(); +// if token.is_none() || (lexer.span().start as i32) != pointer { +// // if the token is not at the pointer, we have a syntax error +// panic!( +// "Expected token for '{}' at offset {}", +// lexer.slice(), +// lexer.span().start +// ); +// } +// Token { +// syntax_kind: token.unwrap().unwrap().syntax_kind(), +// span: lexer.span(), +// } +// }; +// +// self.token( +// token.syntax_kind, +// text.chars() +// .skip(token.span.start) +// .take(token.span.end - token.span.start) +// .collect::() +// .as_str(), +// ); +// +// pointer = pointer + (token.span.end - token.span.start) as i32; +// } +// +// // close up nodes +// self.close_checkpoint(); +// } +// } + #[cfg(test)] mod tests { use std::assert_eq; use super::*; - #[test] - fn test_invalid_statement() { - let input = "select select;"; - - let mut parser = Parser::new(); - parser.parse_statement(input, None); - let parsed = parser.finish(); - - assert_eq!(parsed.cst.text(), input); - } - - #[test] - fn test_create_sql_function() { - let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) - AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$ - LANGUAGE SQL;"; - - let mut parser = Parser::new(); - parser.parse_statement(input, None); - let parsed = parser.finish(); - - assert_eq!(parsed.cst.text(), input); - } + // #[test] + // fn test_invalid_statement() { + // let input = "select select;"; + // + // let mut parser = Parser::new(); + // parser.parse_statement(input, None); + // let parsed = parser.finish(); + // + // assert_eq!(parsed.cst.text(), input); + // } + // + // #[test] + // fn test_create_sql_function() { + // let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text) + // AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$ + // LANGUAGE SQL;"; + // + // let mut parser = Parser::new(); + // parser.parse_statement(input, None); + // let parsed = parser.finish(); + // + // assert_eq!(parsed.cst.text(), input); + // } }