diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 493b49cb..2a24b634 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -3,17 +3,17 @@ //! This crate provides a parser for the Postgres SQL dialect. //! It is based in the pg_query.rs crate, which is a wrapper around the PostgreSQL query parser. //! The main `Parser` struct parses a source file and individual statements. -//! The `Parse` struct contains the resulting concrete syntax tree, syntax errors, and the abtract syntax tree, which is a list of pg_query statements and their positions. +//! The `Parse` result struct contains the resulting concrete syntax tree, syntax errors, and the abstract syntax tree, which is a list of pg_query statements and their positions. //! //! The idea is to offload the heavy lifting to the same parser that the PostgreSQL server uses, -//! and just fill in the gaps to be able to build both cst and ast from a a source file that +//! and just fill in the gaps to be able to build both cst and ast from a source file that //! potentially contains erroneous statements. //! //! The main drawbacks of the PostgreSQL query parser mitigated by this parser are: //! - it only parsed a full source text, and if there is any syntax error in a file, it will not parse anything and return an error. -//! - it does not parse whitespaces and newlines, so it is not possible to build a concrete syntax tree build a concrete syntax tree. +//! - it does not parse whitespaces and newlines, and it only returns ast nodes. The concrete syntax tree has to be reverse-engineered. +//! -//! To see how these drawbacks are mitigated, see the `statement.rs` and the `source_file.rs` module. +//! To see how these drawbacks are mitigated, see the `statement_parser.rs` and the `source_parser.rs` module. 
mod ast_node; mod estimate_node_range; diff --git a/crates/parser/src/statement_parser.rs b/crates/parser/src/statement_parser.rs index 46e7126d..25d35511 100644 --- a/crates/parser/src/statement_parser.rs +++ b/crates/parser/src/statement_parser.rs @@ -9,10 +9,7 @@ use crate::{ syntax_kind_codegen::SyntaxKind, }; -/// A super simple lexer for sql statements. -/// -/// One weakness of pg_query.rs is that it does not parse whitespace or newlines. We use a very -/// simple lexer to fill the gaps. +/// Super simple lexer that only catches the tokens that libpg_query ignores. #[derive(Logos, Debug, PartialEq)] pub enum StatementToken { // comments and whitespaces @@ -27,7 +24,7 @@ pub enum StatementToken { } impl StatementToken { - /// Creates a `SyntaxKind` from a `StatementToken`. + /// Create a `SyntaxKind` from a `StatementToken`. pub fn syntax_kind(&self) -> SyntaxKind { match self { StatementToken::Whitespace => SyntaxKind::Whitespace, @@ -39,6 +36,13 @@ impl StatementToken { } impl Parser { + /// Parse a single statement passed in `text`. If `at_offset` is `Some`, the statement is assumed to be at that offset in the source file. + /// + /// On a high level, the parser works as follows: + /// - 1. Collect all information from pg_query.rs and `StatementToken` lexer + /// - 2. Derive as much information as possible from the collected information + /// - 3. Collect AST node and errors, if any + /// - 4. Walk the statement token by token, and reverse-engineer the concrete syntax tree pub fn parse_statement_at(&mut self, text: &str, at_offset: Option) { // 1. Collect as much information as possible from pg_query.rs and `StatementToken` lexer @@ -98,6 +102,7 @@ impl Parser { let mut statement_token_lexer = StatementToken::lexer(&text); // 2. 
Setup data structures required for the parsing algorithm + // A buffer for tokens that are not applied immediately to the cst let mut token_buffer: VecDeque<(SyntaxKind, String)> = VecDeque::new(); // Keeps track of currently open nodes. Latest opened is last. diff --git a/crates/parser/src/syntax_error.rs b/crates/parser/src/syntax_error.rs index d8dee689..df6d670f 100644 --- a/crates/parser/src/syntax_error.rs +++ b/crates/parser/src/syntax_error.rs @@ -2,7 +2,7 @@ use std::fmt; use cstree::text::{TextRange, TextSize}; -/// Represents the result of unsuccessful tokenization, parsing +/// Represents the result of unsuccessful tokenization, parsing, /// or tree validation. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SyntaxError(String, TextRange);