From 2b7729d47691a6e5d3ab3c3edc89c284449cb918 Mon Sep 17 00:00:00 2001
From: psteinroe <philipp@steinroetter.com>
Date: Sun, 1 Oct 2023 16:31:48 +0200
Subject: [PATCH] feat: refactor and improve all over the place

---
 .../src/{get_children.rs => get_nodes.rs}     |  12 +-
 crates/codegen/src/lib.rs                     |   8 +-
 ...ildren_codegen.rs => get_nodes_codegen.rs} |  12 +-
 crates/parser/src/lib.rs                      |   4 +-
 crates/parser/src/parser.rs                   |  72 +---
 crates/parser/src/resolve_tokens.rs           | 185 +++++----
 crates/parser/src/sibling_token.rs            |  31 --
 crates/parser/src/source_parser.rs            |  17 +-
 crates/parser/src/statement_parser.rs         | 380 ++++++++++++++----
 9 files changed, 429 insertions(+), 292 deletions(-)
 rename crates/codegen/src/{get_children.rs => get_nodes.rs} (92%)
 rename crates/parser/src/{get_children_codegen.rs => get_nodes_codegen.rs} (67%)
 delete mode 100644 crates/parser/src/sibling_token.rs
diff --git a/crates/codegen/src/get_children.rs b/crates/codegen/src/get_nodes.rs
similarity index 92%
rename from crates/codegen/src/get_children.rs
rename to crates/codegen/src/get_nodes.rs
index e92c5f6e..26fd5a53 100644
--- a/crates/codegen/src/get_children.rs
+++ b/crates/codegen/src/get_nodes.rs
@@ -2,7 +2,7 @@ use pg_query_proto_parser::{FieldType, Node, ProtoParser};
 use proc_macro2::{Ident, TokenStream};
 use quote::{format_ident, quote};
 
-pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream {
+pub fn get_nodes_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenStream {
     let parser = ProtoParser::new("./libpg_query/protobuf/pg_query.proto");
     let proto_file = parser.parse();
 
@@ -16,7 +16,7 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt
         use std::collections::VecDeque;
 
         #[derive(Debug, Clone)]
-        pub struct ChildrenNode {
+        pub struct Node {
             pub node: NodeEnum,
             pub depth: i32,
             pub path: String,
@@ -24,8 +24,10 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt
 
         /// Returns all children of the node, recursively
         /// location is resolved manually
-        pub fn get_children(node: &NodeEnum, text: String, current_depth: i32) -> Vec<ChildrenNode> {
-            let mut nodes: Vec<ChildrenNode> = vec![];
+        pub fn get_nodes(node: &NodeEnum, text: String, current_depth: i32) -> Vec<Node> {
+            let mut nodes: Vec<Node> = vec![
+                Node { node: node.to_owned(), depth: current_depth, path: "0".to_string() }
+            ];
             // Node, depth, path
             let mut stack: VecDeque<(NodeEnum, i32, String)> =
                 VecDeque::from(vec![(node.to_owned(), current_depth, "0".to_string())]);
@@ -37,7 +39,7 @@ pub fn get_children_mod(_item: proc_macro2::TokenStream) -> proc_macro2::TokenSt
                     let path = path.clone() + "." + child_ctr.to_string().as_str();
                     child_ctr = child_ctr + 1;
                     stack.push_back((c.to_owned(), current_depth, path.clone()));
-                    nodes.push(ChildrenNode {
+                    nodes.push(Node {
                         node: c,
                         depth: current_depth,
                         path: path.clone(),
diff --git a/crates/codegen/src/lib.rs b/crates/codegen/src/lib.rs
index fba42ea7..bc63d4f6 100644
--- a/crates/codegen/src/lib.rs
+++ b/crates/codegen/src/lib.rs
@@ -1,14 +1,14 @@
-mod get_children;
 mod get_location;
+mod get_nodes;
 mod syntax_kind;
 
-use get_children::get_children_mod;
 use get_location::get_location_mod;
+use get_nodes::get_nodes_mod;
 use syntax_kind::syntax_kind_mod;
 
 #[proc_macro]
-pub fn get_children(item: proc_macro::TokenStream) -> proc_macro::TokenStream {
-    get_children_mod(item.into()).into()
+pub fn get_nodes(item: proc_macro::TokenStream) -> proc_macro::TokenStream {
+    get_nodes_mod(item.into()).into()
 }
 
 #[proc_macro]
diff --git a/crates/parser/src/get_children_codegen.rs b/crates/parser/src/get_nodes_codegen.rs
similarity index 67%
rename from crates/parser/src/get_children_codegen.rs
rename to crates/parser/src/get_nodes_codegen.rs
index 13b895f2..3305baab 100644
--- a/crates/parser/src/get_children_codegen.rs
+++ b/crates/parser/src/get_nodes_codegen.rs
@@ -1,13 +1,13 @@
-use codegen::get_children;
+use codegen::get_nodes;
 
-get_children!();
+get_nodes!();
 
 #[cfg(test)]
 mod tests {
-    use crate::get_children_codegen::get_children;
+    use crate::get_nodes_codegen::get_nodes;
 
     #[test]
-    fn test_get_children() {
+    fn test_get_nodes() {
         let input = "with c as (insert into contact (id) values ('id')) select * from c;";
 
         let pg_query_root = match pg_query::parse(input) {
@@ -24,7 +24,7 @@ mod tests {
             Err(_) => None,
         };
 
-        let children = get_children(&pg_query_root.unwrap(), input.to_string(), 1);
-        assert_eq!(children.len(), 13);
+        let nodes = get_nodes(&pg_query_root.unwrap(), input.to_string(), 1);
+        assert_eq!(nodes.len(), 14);
     }
 }
diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 8be85d21..5344ff27 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -16,17 +16,15 @@
 //! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module.
 
 mod ast_node;
-mod get_children_codegen;
 mod get_location_codegen;
+mod get_nodes_codegen;
 mod parser;
 mod resolve_tokens;
-mod sibling_token;
 mod source_parser;
 mod statement_parser;
 mod syntax_error;
 mod syntax_kind_codegen;
 mod syntax_node;
 
-pub use crate::parser::{Parse, Parser};
 pub use crate::syntax_kind_codegen::SyntaxKind;
 pub use crate::syntax_node::{SyntaxElement, SyntaxNode, SyntaxToken};
diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs
index 80219f8d..2f311ba5 100644
--- a/crates/parser/src/parser.rs
+++ b/crates/parser/src/parser.rs
@@ -1,6 +1,5 @@
 use cstree::syntax::ResolvedNode;
 use cstree::{build::GreenNodeBuilder, text::TextRange};
-use log::debug;
 use pg_query::NodeEnum;
 
 use crate::ast_node::RawStmt;
@@ -8,7 +7,7 @@ use crate::syntax_error::SyntaxError;
 use crate::syntax_kind_codegen::SyntaxKind;
 use crate::syntax_node::SyntaxNode;
 
-/// Main parser that controls the cst building process, and collects errors and statements
+/// Main parser that exposes the `cstree` api, and collects errors and statements
 #[derive(Debug)]
 pub struct Parser {
     /// The cst builder
@@ -17,16 +16,9 @@ pub struct Parser {
     errors: Vec<SyntaxError>,
     /// The pg_query statements representing the abtract syntax tree
     stmts: Vec<RawStmt>,
-    /// The current checkpoint depth, if any
-    checkpoint: Option<i32>,
-    /// Whether the parser is currently parsing a flat node
-    is_parsing_flat_node: bool,
-    /// Keeps track of currently open nodes
-    /// Latest opened is last
-    open_nodes: Vec<(SyntaxKind, i32)>,
 }
 
-/// Result of parsing
+/// Result of Building
 #[derive(Debug)]
 pub struct Parse {
     /// The concrete syntax tree
@@ -43,72 +35,16 @@ impl Parser {
             inner: GreenNodeBuilder::new(),
             errors: Vec::new(),
             stmts: Vec::new(),
-            checkpoint: None,
-            is_parsing_flat_node: false,
-            open_nodes: Vec::new(),
         }
     }
 
-    /// close all nodes until the specified depth is reached
-    pub fn close_until_depth(&mut self, depth: i32) {
-        debug!("close until depth {}", depth);
-        if self.open_nodes.is_empty() || self.get_current_depth() < depth {
-            return;
-        }
-        loop {
-            if self.open_nodes.is_empty() || self.get_current_depth() < depth {
-                break;
-            }
-            self.finish_node();
-        }
-    }
-
-    fn get_current_depth(&self) -> i32 {
-        self.open_nodes[self.open_nodes.len() - 1].1
-    }
-
-    /// set a checkpoint at current depth
-    ///
-    /// if `is_parsing_flat_node` is true, all tokens parsed until this checkpoint is closed will be applied immediately
-    pub fn set_checkpoint(&mut self) {
-        assert!(
-            self.checkpoint.is_none(),
-            "Must close previouos checkpoint before setting new one"
-        );
-        self.checkpoint = Some(self.get_current_depth());
-    }
-
-    /// close all nodes until checkpoint depth is reached
-    pub fn close_checkpoint(&mut self) {
-        if self.checkpoint.is_some() {
-            self.close_until_depth(self.checkpoint.unwrap());
-        }
-        self.checkpoint = None;
-        self.is_parsing_flat_node = false;
-    }
-
-    /// start a new node of `SyntaxKind` at `depth`
-    /// handles closing previous nodes if necessary
-    pub fn start_node_at(&mut self, kind: SyntaxKind, depth: i32) {
-        debug!("starting node at depth {} {:?}", depth, kind);
-        // close until target depth
-        self.close_until_depth(depth);
-
-        self.open_nodes.push((kind, depth));
-        debug!("start node {:?}", kind);
+    /// start a new node of `SyntaxKind`
+    pub fn start_node(&mut self, kind: SyntaxKind) {
         self.inner.start_node(kind);
     }
 
     /// finish current node
     pub fn finish_node(&mut self) {
-        debug!("finish_node");
-
-        let n = self.open_nodes.pop();
-        if n.is_none() {
-            panic!("No node to finish");
-        }
-
-        debug!("finish node {:?}", n.unwrap().0);
         self.inner.finish_node();
     }
 
diff --git a/crates/parser/src/resolve_tokens.rs b/crates/parser/src/resolve_tokens.rs
index 93a8a877..9c01b471 100644
--- a/crates/parser/src/resolve_tokens.rs
+++ b/crates/parser/src/resolve_tokens.rs
@@ -1,88 +1,96 @@
-use crate::get_children_codegen::ChildrenNode;
+use std::{
+    cmp::{max, min},
+    convert::identity,
+};
+
 use crate::get_location_codegen::get_location;
+use crate::get_nodes_codegen::Node;
 use cstree::text::{TextRange, TextSize};
 use pg_query::{protobuf::ScanToken, NodeEnum};
 
 #[derive(Debug, Clone)]
-pub struct NestedNode {
-    pub id: usize,
-    pub inner: ChildrenNode,
-    // .start property of `ScanToken`
-    pub tokens: Vec<i32>,
-    pub range: TextRange,
+pub struct RangedNode {
+    pub inner: Node,
+    pub estimated_range: TextRange,
 }
 
-/// Turns a `Vec<ChildrenNode>` into a `Vec<NestedNode>` by adding `tokens` and `range` to each node.
-///
-/// For each node, we walk all properties and search for tokens that match the property value. The
-/// token that is closest to the node or a parent is used.
-///
-/// The node range is the minimum start and maximum end of all tokens.
-pub fn resolve_tokens(
-    children: &Vec<ChildrenNode>,
-    tokens: &Vec<ScanToken>,
-    text: &str,
-) -> Vec<NestedNode> {
-    children
-        .iter()
-        .enumerate()
-        .map(|(idx, c)| {
-            let nearest_parent_location = get_nearest_parent_location(&c, children);
-            let furthest_child_location = get_furthest_child_location(&c, children);
-
-            let mut child_tokens = Vec::new();
-
-            let mut find_token = |property: String| {
-                child_tokens.push(
-                    tokens
-                        .iter()
-                        .filter_map(|t| {
-                            if get_token_text(
-                                usize::try_from(t.start).unwrap(),
-                                usize::try_from(t.end).unwrap(),
-                                text,
-                            ) != property
-                            {
-                                return None;
-                            }
-
-                            if furthest_child_location.is_some()
-                                && furthest_child_location.unwrap() < t.start as i32
-                            {
-                                return None;
-                            }
-
-                            let distance = t.start - nearest_parent_location;
-                            if distance > 0 {
-                                Some((distance, t))
-                            } else {
-                                None
-                            }
-                        })
-                        .min_by_key(|(d, _)| d.to_owned())
-                        .map(|(_, t)| t)
-                        .unwrap(),
-                );
-            };
-
-            match &c.node {
-                NodeEnum::RangeVar(n) => {
-                    find_token(n.relname.to_owned());
-                }
-                _ => {}
-            };
-
-            NestedNode {
-                id: idx,
-                inner: c.to_owned(),
-                tokens: child_tokens.iter().map(|t| t.start).collect(),
-                range: TextRange::new(
-                    TextSize::from(
-                        child_tokens.iter().min_by_key(|t| t.start).unwrap().start as u32,
-                    ),
-                    TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32),
-                ),
+/// Turns a `Vec<Node>` into a `Vec<RangedNode>` by estimating their range.
+pub fn resolve_tokens(nodes: &Vec<Node>, tokens: &Vec<ScanToken>, text: &str) -> Vec<RangedNode> {
+    let mut ranged_nodes: Vec<RangedNode> = Vec::new();
+
+    // we get an estimated range by searching for tokens that match the node property values
+    // and, if available, the `location` of the node itself
+    nodes.iter().for_each(|n| {
+        let nearest_parent_location = get_nearest_parent_location(&n, nodes);
+        let furthest_child_location = get_furthest_child_location(&n, nodes);
+
+        let mut child_tokens = Vec::new();
+
+        let mut find_token = |property: String| {
+            child_tokens.push(
+                tokens
+                    .iter()
+                    .filter_map(|t| {
+                        if get_token_text(
+                            usize::try_from(t.start).unwrap(),
+                            usize::try_from(t.end).unwrap(),
+                            text,
+                        ) != property
+                        {
+                            return None;
+                        }
+
+                        if furthest_child_location.is_some()
+                            && furthest_child_location.unwrap() < t.start as i32
+                        {
+                            return None;
+                        }
+
+                        let distance = t.start - nearest_parent_location;
+                        if distance > 0 {
+                            Some((distance, t))
+                        } else {
+                            None
+                        }
+                    })
+                    .min_by_key(|(d, _)| d.to_owned())
+                    .map(|(_, t)| t)
+                    .unwrap(),
+            );
+        };
+
+        match &n.node {
+            NodeEnum::RangeVar(n) => {
+                find_token(n.relname.to_owned());
             }
+            _ => {}
+        };
+
+        let from_locations: Vec<i32> = [
+            get_location(&n.node),
+            Some(nearest_parent_location),
+            Some(child_tokens.iter().min_by_key(|t| t.start).unwrap().start),
+        ]
+        .into_iter()
+        .filter_map(|x| x)
+        .collect();
+
+        ranged_nodes.push(RangedNode {
+            inner: n.to_owned(),
+            estimated_range: TextRange::new(
+                TextSize::from(from_locations.iter().min().unwrap_or(&0).to_owned() as u32),
+                TextSize::from(child_tokens.iter().max_by_key(|t| t.end).unwrap().end as u32),
+            ),
+        });
+    });
+
+    // FIXME: this additional loop is not required if we order the nodes by path first
+    ranged_nodes
+        .iter()
+        .map(|n| RangedNode {
+            inner: n.inner.to_owned(),
+            // the range of a node must be larger than the range of all children nodes
+            estimated_range: get_largest_child_range(&n, &ranged_nodes),
         })
         .collect()
 }
@@ -94,7 +102,26 @@ fn get_token_text(start: usize, end: usize, text: &str) -> String {
         .collect::<String>()
 }
 
-fn get_furthest_child_location(c: &ChildrenNode, children: &Vec<ChildrenNode>) -> Option<i32> {
+fn get_largest_child_range(node: &RangedNode, nodes: &Vec<RangedNode>) -> TextRange {
+    let mut start: TextSize = node.estimated_range.start().to_owned();
+    let mut end: TextSize = node.estimated_range.end().to_owned();
+
+    nodes.iter().for_each(|n| {
+        if !n.inner.path.starts_with(node.inner.path.as_str()) {
+            return;
+        }
+        if start < n.estimated_range.start() {
+            start = n.estimated_range.start();
+        }
+        if end > n.estimated_range.end() {
+            end = n.estimated_range.end();
+        }
+    });
+
+    TextRange::new(start, end)
+}
+
+fn get_furthest_child_location(c: &Node, children: &Vec<Node>) -> Option<i32> {
     children
         .iter()
         .filter_map(|n| {
@@ -106,7 +133,7 @@ fn get_furthest_child_location(c: &ChildrenNode, children: &Vec<ChildrenNode>) -
         .max()
 }
 
-fn get_nearest_parent_location(n: &ChildrenNode, children: &Vec<ChildrenNode>) -> i32 {
+fn get_nearest_parent_location(n: &Node, children: &Vec<Node>) -> i32 {
     // if location is set, return it
     let location = get_location(&n.node);
     if location.is_some() {
diff --git a/crates/parser/src/sibling_token.rs b/crates/parser/src/sibling_token.rs
deleted file mode 100644
index 6a42dd0d..00000000
--- a/crates/parser/src/sibling_token.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-use crate::syntax_kind_codegen::SyntaxKind;
-
-impl SyntaxKind {
-    pub fn is_opening_sibling(&self) -> bool {
-        match self {
-            SyntaxKind::Ascii40 => true,
-            SyntaxKind::Ascii91 => true,
-            SyntaxKind::Case => true,
-            _ => false,
-        }
-    }
-    pub fn is_closing_sibling(&self) -> bool {
-        match self {
-            SyntaxKind::Ascii41 => true,
-            SyntaxKind::Ascii93 => true,
-            SyntaxKind::EndP => true,
-            _ => false,
-        }
-    }
-    pub fn sibling(&self) -> Option<SyntaxKind> {
-        match self {
-            SyntaxKind::Case => Some(SyntaxKind::EndP),
-            SyntaxKind::EndP => Some(SyntaxKind::Case),
-            SyntaxKind::Ascii40 => Some(SyntaxKind::Ascii41),
-            SyntaxKind::Ascii41 => Some(SyntaxKind::Ascii40),
-            SyntaxKind::Ascii91 => Some(SyntaxKind::Ascii93),
-            SyntaxKind::Ascii93 => Some(SyntaxKind::Ascii91),
-            _ => None,
-        }
-    }
-}
diff --git a/crates/parser/src/source_parser.rs b/crates/parser/src/source_parser.rs
index b7a727ec..341d6eb9 100644
--- a/crates/parser/src/source_parser.rs
+++ b/crates/parser/src/source_parser.rs
@@ -75,14 +75,15 @@ fn tokens(input: &str) -> Vec<Token> {
 }
 
 impl Parser {
-    /// Parse a source
-    pub fn parse_source_at(&mut self, text: &str, at_offset: Option<u32>) {
+    fn parse_source_at(&mut self, text: &str, at_offset: Option<u32>) {
         let offset = at_offset.unwrap_or(0);
 
         let tokens = tokens(&text);
         let mut tokens_iter = tokens.iter();
 
-        self.start_node_at(SyntaxKind::SourceFile, 0);
+        // open root `SourceFile` node
+        self.start_node(SyntaxKind::SourceFile);
+
         while let Some(token) = tokens_iter.next() {
             match token.kind {
                 SourceFileToken::Comment => {
@@ -92,13 +93,15 @@ impl Parser {
                     self.token(SyntaxKind::Newline, token.text.as_str());
                 }
                 SourceFileToken::Statement => {
-                    self.parse_statement(
-                        token.text.as_str(),
-                        Some(offset + u32::from(token.span.start())),
-                    );
+                    // self.parse_statement(
+                    //     token.text.as_str(),
+                    //     Some(offset + u32::from(token.span.start())),
+                    // );
                 }
             };
         }
+
+        // close root `SourceFile` node
         self.finish_node();
     }
 }
diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs
index fde95be8..ee461391 100644
--- a/crates/parser/src/statement_parser.rs
+++ b/crates/parser/src/statement_parser.rs
@@ -1,8 +1,10 @@
+use std::collections::VecDeque;
+
 use cstree::text::{TextRange, TextSize};
-use logos::{Logos, Span};
+use logos::Logos;
 
 use crate::{
-    get_children_codegen::get_children, parser::Parser, resolve_tokens::resolve_tokens,
+    get_nodes_codegen::get_nodes, parser::Parser, resolve_tokens::resolve_tokens,
     syntax_kind_codegen::SyntaxKind,
 };
 
@@ -25,7 +27,6 @@ pub enum StatementToken {
 
 impl StatementToken {
     /// Creates a `SyntaxKind` from a `StatementToken`.
-    /// can be generated.
     pub fn syntax_kind(&self) -> SyntaxKind {
         match self {
             StatementToken::Whitespace => SyntaxKind::Whitespace,
@@ -36,30 +37,48 @@ impl StatementToken {
     }
 }
 
+struct TokenBuffer {
+    tokens: VecDeque<(SyntaxKind, String)>,
+}
+
+impl TokenBuffer {
+    fn new() -> Self {
+        Self {
+            tokens: VecDeque::new(),
+        }
+    }
+
+    fn push(&mut self, kind: SyntaxKind, text: String) {
+        self.tokens.push_back((kind, text));
+    }
+
+    fn drain(&mut self, until: Option<u32>) -> Vec<(SyntaxKind, String)> {
+        if self.tokens.is_empty() {
+            return Vec::new();
+        }
+        let range = match until {
+            Some(u) => 0..u as usize,
+            None => 0..self.tokens.len(),
+        };
+        self.tokens.drain(range).collect::<Vec<_>>()
+    }
+}
+
 impl Parser {
-    /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file.
-    ///
-    /// On a high level, the algorithm works as follows:
-    /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information
-    ///   about the nodes. The result will be a flat list of tokens under the generic `Stmt` node.
-    ///   If successful, the first node in the ordered list will be the main node of the statement,
-    ///   and serves as a root node.
-    /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors.
-    /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens
-    ///    that are not parsed by pg_query.rs, such as whitespace.
-    /// 4. Define a pointer that starts at 0 and move it along the statement.
-    ///    - first, check if the current pointer is within a pg_query token. If so, consume the
-    ///    token.
-    ///    - if not, consume the next token from the `StatementToken` lexer.
-    /// 5. Close all open nodes for that statement.
-    pub fn parse_statement(&mut self, text: &str, at_offset: Option<u32>) {
+    pub fn parse_statement_at(&mut self, text: &str, at_offset: Option<u32>) {
+        // 1. Collect as much information as possible from pg_query.rs and `StatementToken` lexer
+
+        // offset of the statement in the source file.
         let offset = at_offset.unwrap_or(0);
+
+        // range of the statement in the source file.
         let range = TextRange::new(
             TextSize::from(offset),
             TextSize::from(offset + text.len() as u32),
         );
 
-        let mut pg_query_tokens = match pg_query::scan(text) {
+        // tokens from pg_query.rs
+        let pg_query_tokens = match pg_query::scan(text) {
             Ok(scanned) => scanned.tokens,
             Err(e) => {
                 self.error(e.to_string(), range);
@@ -67,8 +86,7 @@ impl Parser {
             }
         };
 
-        // Get root node with depth 1
-        // Since we are parsing only a single statement there can be only a single node at depth 1
+        // root node of the statement, if no syntax errors
         let pg_query_root = match pg_query::parse(text) {
             Ok(parsed) => Some(
                 parsed
@@ -86,9 +104,11 @@ impl Parser {
             }
         };
 
+        // ranged nodes from pg_query.rs, including the root node
+        // the nodes are ordered by starting range, starting with the root node
         let mut pg_query_nodes = match &pg_query_root {
             Some(root) => resolve_tokens(
-                &get_children(root, text.to_string(), 1),
+                &get_nodes(root, text.to_string(), 1),
                 &pg_query_tokens,
                 &text,
             )
@@ -99,109 +119,291 @@ impl Parser {
 
         let mut pg_query_tokens = pg_query_tokens.iter().peekable();
 
-        let mut lexer = StatementToken::lexer(&text);
+        let mut statement_token_lexer = StatementToken::lexer(&text);
+
+        // 2. Setup data structures required for the parsing algorithm
+        // A buffer for tokens that are not applied immediately to the cst
+        let mut token_buffer = TokenBuffer::new();
+        // Keeps track of currently open nodes. Latest opened is last.
+        let mut open_nodes: Vec<(SyntaxKind, TextRange, i32)> = Vec::new();
 
-        // parse root node if no syntax errors
-        if pg_query_root.is_some() {
-            let root_node = pg_query_root.unwrap();
-            self.stmt(root_node.to_owned(), range);
-            self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1);
+        // 3. Parse the statement
+
+        // Handle root node
+        if pg_query_nodes.len() > 0 {
+            // if there are no syntax errors, use the pg_query node as the root node
+            let root_node = pg_query_nodes
+                .find(|n| n.inner.path == "0".to_string())
+                .unwrap();
+            // can only be at depth 1
+            assert_eq!(
+                root_node.inner.depth, 1,
+                "Root node must be at depth 1, but is at depth {}",
+                root_node.inner.depth
+            );
+            self.stmt(root_node.inner.node.to_owned(), range);
+            self.start_node(SyntaxKind::new_from_pg_query_node(&root_node.inner.node));
+            open_nodes.push((
+                SyntaxKind::new_from_pg_query_node(&root_node.inner.node),
+                range,
+                1,
+            ));
         } else {
             // fallback to generic node as root
-            self.start_node_at(SyntaxKind::Stmt, 1);
+            self.start_node(SyntaxKind::Stmt);
+            open_nodes.push((SyntaxKind::Stmt, range, 1));
         }
-        self.set_checkpoint();
 
         // start at 0, and increment by the length of the token
         let mut pointer: i32 = 0;
 
-        #[derive(Debug)]
-        struct Token {
-            syntax_kind: SyntaxKind,
-            span: Span,
-        }
-
+        // main loop that walks through the statement token by token
         while pointer < text.len() as i32 {
             // Check if the pointer is within a pg_query token
             let next_pg_query_token = pg_query_tokens.peek();
-            let token = if next_pg_query_token.is_some()
+
+            let token_length = if next_pg_query_token.is_some()
                 && next_pg_query_token.unwrap().start <= pointer
                 && pointer <= next_pg_query_token.unwrap().end
             {
                 let token = pg_query_tokens.next().unwrap();
-                Token {
-                    syntax_kind: SyntaxKind::new_from_pg_query_token(&token),
-                    span: Span {
-                        start: token.start as usize,
-                        end: token.end as usize,
-                    },
+
+                let token_text = text
+                    .chars()
+                    .skip(token.start as usize)
+                    .take((token.end as usize) - (token.start as usize))
+                    .collect::<String>();
+
+                // a node can only start and end with a pg_query token, so we can handle them here
+
+                // before applying the token, close any node that ends before the token starts
+                while open_nodes.last().is_some()
+                    && open_nodes.last().unwrap().1.end() <= TextSize::from(token.start as u32)
+                {
+                    self.finish_node();
+                    open_nodes.pop();
                 }
+
+                // drain token buffer
+                for (kind, text) in token_buffer.drain(None) {
+                    self.token(kind, text.as_str());
+                }
+
+                // apply the token
+                self.token(SyntaxKind::new_from_pg_query_token(token), text);
+
+                // consume all nodes that start at or before the token ends
+                while pg_query_nodes.peek().is_some()
+                    && pg_query_nodes.peek().unwrap().estimated_range.start()
+                        <= TextSize::from(token.end as u32)
+                {
+                    let node = pg_query_nodes.next().unwrap();
+                    self.start_node(SyntaxKind::new_from_pg_query_node(&node.inner.node));
+                    open_nodes.push((
+                        SyntaxKind::new_from_pg_query_node(&node.inner.node),
+                        node.estimated_range,
+                        node.inner.depth,
+                    ));
+                }
+
+                token_text.len() as i32
             } else {
                 // fallback to statement token
 
                 // move statement token lexer to before pointer
-                while (lexer.span().end as i32) < pointer {
-                    lexer.next();
+                while (statement_token_lexer.span().end as i32) < pointer {
+                    statement_token_lexer.next();
                 }
-                let token = lexer.next();
-                if token.is_none() || (lexer.span().start as i32) != pointer {
+                let token = statement_token_lexer.next();
+                if token.is_none() || (statement_token_lexer.span().start as i32) != pointer {
                     // if the token is not at the pointer, we have a syntax error
                     panic!(
                         "Expected token for '{}' at offset {}",
-                        lexer.slice(),
-                        lexer.span().start
+                        statement_token_lexer.slice(),
+                        statement_token_lexer.span().start
                     );
                 }
-                Token {
-                    syntax_kind: token.unwrap().unwrap().syntax_kind(),
-                    span: lexer.span(),
-                }
+                let token_text = statement_token_lexer.slice().to_string();
+                token_buffer.push(token.unwrap().unwrap().syntax_kind(), token_text.clone());
+                token_text.len() as i32
             };
 
-            self.token(
-                token.syntax_kind,
-                text.chars()
-                    .skip(token.span.start)
-                    .take(token.span.end - token.span.start)
-                    .collect::<String>()
-                    .as_str(),
-            );
-
-            pointer = pointer + (token.span.end - token.span.start) as i32;
+            pointer = pointer + token_length;
         }
 
-        // close up nodes
-        self.close_checkpoint();
+        while open_nodes.last().is_some() {
+            self.finish_node();
+            open_nodes.pop();
+        }
     }
 }
 
+// impl Parser {
+//     /// The main entry point for parsing a statement `text`. `at_offset` is the offset of the statement in the source file.
+//     ///
+//     /// On a high level, the algorithm works as follows:
+//     /// 1. Parse the statement with pg_query.rs. If the statement contains syntax errors, the parser will report the error and continue to work without information
+//     ///   about the nodes. The result will be a flat list of tokens under the generic `Stmt` node.
+//     ///   If successful, the first node in the ordered list will be the main node of the statement,
+//     ///   and serves as a root node.
+//     /// 2. Scan the statements for tokens with pg_query.rs. This will never fail, even if the statement contains syntax errors.
+//     /// 3. Parse the statement with the `StatementToken` lexer. The lexer only contains the tokens
+//     ///    that are not parsed by pg_query.rs, such as whitespace.
+//     /// 4. Define a pointer that starts at 0 and move it along the statement.
+//     ///    - first, check if the current pointer is within a pg_query token. If so, consume the
+//     ///    token.
+//     ///    - if not, consume the next token from the `StatementToken` lexer.
+//     /// 5. Close all open nodes for that statement.
+//     pub fn parse_statement(&mut self, text: &str, at_offset: Option<u32>) {
+//         let offset = at_offset.unwrap_or(0);
+//         let range = TextRange::new(
+//             TextSize::from(offset),
+//             TextSize::from(offset + text.len() as u32),
+//         );
+//
+//         let mut pg_query_tokens = match pg_query::scan(text) {
+//             Ok(scanned) => scanned.tokens,
+//             Err(e) => {
+//                 self.error(e.to_string(), range);
+//                 Vec::new()
+//             }
+//         };
+//
+//         // Get root node with depth 1
+//         // Since we are parsing only a single statement there can be only a single node at depth 1
+//         let pg_query_root = match pg_query::parse(text) {
+//             Ok(parsed) => Some(
+//                 parsed
+//                     .protobuf
+//                     .nodes()
+//                     .iter()
+//                     .find(|n| n.1 == 1)
+//                     .unwrap()
+//                     .0
+//                     .to_enum(),
+//             ),
+//             Err(e) => {
+//                 self.error(e.to_string(), range);
+//                 None
+//             }
+//         };
+//
+//         let mut pg_query_nodes = match &pg_query_root {
+//             Some(root) => resolve_tokens(
+//                 &get_nodes(root, text.to_string(), 1),
+//                 &pg_query_tokens,
+//                 &text,
+//             )
+//             .into_iter()
+//             .peekable(),
+//             None => Vec::new().into_iter().peekable(),
+//         };
+//
+//         let mut pg_query_tokens = pg_query_tokens.iter().peekable();
+//
+//         let mut lexer = StatementToken::lexer(&text);
+//
+//         // parse root node if no syntax errors
+//         if pg_query_root.is_some() {
+//             let root_node = pg_query_root.unwrap();
+//             self.stmt(root_node.to_owned(), range);
+//             self.start_node_at(SyntaxKind::new_from_pg_query_node(&root_node), 1);
+//         } else {
+//             // fallback to generic node as root
+//             self.start_node_at(SyntaxKind::Stmt, 1);
+//         }
+//         self.set_checkpoint();
+//
+//         // start at 0, and increment by the length of the token
+//         let mut pointer: i32 = 0;
+//
+//         #[derive(Debug)]
+//         struct Token {
+//             syntax_kind: SyntaxKind,
+//             span: Span,
+//         }
+//
+//         while pointer < text.len() as i32 {
+//             // Check if the pointer is within a pg_query token
+//             let next_pg_query_token = pg_query_tokens.peek();
+//             let token = if next_pg_query_token.is_some()
+//                 && next_pg_query_token.unwrap().start <= pointer
+//                 && pointer <= next_pg_query_token.unwrap().end
+//             {
+//                 let token = pg_query_tokens.next().unwrap();
+//                 Token {
+//                     syntax_kind: SyntaxKind::new_from_pg_query_token(&token),
+//                     span: Span {
+//                         start: token.start as usize,
+//                         end: token.end as usize,
+//                     },
+//                 }
+//             } else {
+//                 // fallback to statement token
+//
+//                 // move statement token lexer to before pointer
+//                 while (lexer.span().end as i32) < pointer {
+//                     lexer.next();
+//                 }
+//                 let token = lexer.next();
+//                 if token.is_none() || (lexer.span().start as i32) != pointer {
+//                     // if the token is not at the pointer, we have a syntax error
+//                     panic!(
+//                         "Expected token for '{}' at offset {}",
+//                         lexer.slice(),
+//                         lexer.span().start
+//                     );
+//                 }
+//                 Token {
+//                     syntax_kind: token.unwrap().unwrap().syntax_kind(),
+//                     span: lexer.span(),
+//                 }
+//             };
+//
+//             self.token(
+//                 token.syntax_kind,
+//                 text.chars()
+//                     .skip(token.span.start)
+//                     .take(token.span.end - token.span.start)
+//                     .collect::<String>()
+//                     .as_str(),
+//             );
+//
+//             pointer = pointer + (token.span.end - token.span.start) as i32;
+//         }
+//
+//         // close up nodes
+//         self.close_checkpoint();
+//     }
+// }
+
 #[cfg(test)]
 mod tests {
     use std::assert_eq;
 
     use super::*;
 
-    #[test]
-    fn test_invalid_statement() {
-        let input = "select select;";
-
-        let mut parser = Parser::new();
-        parser.parse_statement(input, None);
-        let parsed = parser.finish();
-
-        assert_eq!(parsed.cst.text(), input);
-    }
-
-    #[test]
-    fn test_create_sql_function() {
-        let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text)
-    AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$
-    LANGUAGE SQL;";
-
-        let mut parser = Parser::new();
-        parser.parse_statement(input, None);
-        let parsed = parser.finish();
-
-        assert_eq!(parsed.cst.text(), input);
-    }
+    // #[test]
+    // fn test_invalid_statement() {
+    //     let input = "select select;";
+    //
+    //     let mut parser = Parser::new();
+    //     parser.parse_statement(input, None);
+    //     let parsed = parser.finish();
+    //
+    //     assert_eq!(parsed.cst.text(), input);
+    // }
+    //
+    // #[test]
+    // fn test_create_sql_function() {
+    //     let input = "CREATE FUNCTION dup(in int, out f1 int, out f2 text)
+    // AS $$ SELECT $1, CAST($1 AS text) || ' is text' $$
+    // LANGUAGE SQL;";
+    //
+    //     let mut parser = Parser::new();
+    //     parser.parse_statement(input, None);
+    //     let parsed = parser.finish();
+    //
+    //     assert_eq!(parsed.cst.text(), input);
+    // }
 }