Skip to content

Commit

Permalink
refactor: parser now uses a pointer into the token vector instead of …
Browse files Browse the repository at this point in the history
…popping and cloning
  • Loading branch information
psteinroe committed Oct 18, 2024
1 parent 58c0374 commit 3849cf7
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 33 deletions.
99 changes: 67 additions & 32 deletions crates/pg_statement_splitter/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ use crate::syntax_error::SyntaxError;
/// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html
pub struct Parser {
/// The ranges of the statements
ranges: Vec<TextRange>,
ranges: Vec<(usize, usize)>,
/// The syntax errors accumulated during parsing
errors: Vec<SyntaxError>,
/// The start of the current statement, if any
current_stmt_start: Option<TextSize>,
current_stmt_start: Option<usize>,
/// The tokens to parse
pub tokens: Vec<Token>,

eof_token: Token,

last_token_end: Option<TextSize>,
next_pos: usize,
}

/// Result of Building
Expand All @@ -46,66 +46,96 @@ impl Parser {
return !WHITESPACE_TOKENS.contains(&t.kind)
|| (t.kind == SyntaxKind::Newline && t.text.chars().count() > 1);
})
.rev()
.cloned()
.collect::<Vec<_>>();

let eof_token = Token::eof(usize::from(
tokens
.last()
.map(|t| t.span.start())
.unwrap_or(TextSize::from(0)),
));

// next_pos should already point at the first relevant token when the parser is constructed
let mut next_pos = 0;
loop {
let token = tokens.get(next_pos).unwrap_or(&eof_token);

if is_irrelevant_token(token) {
next_pos += 1;
} else {
break;
}
}

Self {
ranges: Vec::new(),
eof_token: Token::eof(usize::from(
tokens
.first()
.map(|t| t.span.start())
.unwrap_or(TextSize::from(0)),
)),
eof_token,
errors: Vec::new(),
current_stmt_start: None,
tokens,
last_token_end: None,
next_pos,
}
}

pub fn finish(self) -> Parse {
Parse {
ranges: self.ranges,
ranges: self
.ranges
.iter()
.map(|(start, end)| {
println!("{} {}", start, end);
let from = self.tokens.get(*start);
let to = self.tokens.get(*end).unwrap_or(&self.eof_token);

TextRange::new(from.unwrap().span.start(), to.span.end())
})
.collect(),
errors: self.errors,
}
}

/// Start statement
pub fn start_stmt(&mut self) -> Token {
pub fn start_stmt(&mut self) {
assert!(self.current_stmt_start.is_none());

let token = self.peek();

self.current_stmt_start = Some(token.span.start());

token
self.current_stmt_start = Some(self.next_pos);
}

/// Close statement
pub fn close_stmt(&mut self) {
self.ranges.push(TextRange::new(
assert!(self.next_pos > 0);

self.ranges.push((
self.current_stmt_start.expect("Expected active statement"),
self.last_token_end.expect("Expected last token end"),
self.next_pos - 1,
));

self.current_stmt_start = None;
}

fn advance(&mut self) -> Token {
let token = self.tokens.pop().unwrap_or(self.eof_token.clone());

self.last_token_end = Some(token.span.end());

token
/// Consumes the current relevant token and moves `next_pos` forward to the next relevant one.
///
/// Scanning starts at `next_pos`. The first token for which `is_irrelevant_token` is false is
/// remembered, and `next_pos` keeps advancing until a second such token is found; the
/// remembered token is then returned and `next_pos` is left pointing at the second one.
///
/// Positions past the end of `tokens` yield `eof_token`.
/// NOTE(review): the loop only terminates at the end of input if `eof_token` is never
/// classified as irrelevant — confirm against `WHITESPACE_TOKENS`.
fn advance(&mut self) -> &Token {
// the token to hand back to the caller, set once the first relevant token is seen
let mut first_relevant_token = None;
loop {
// out-of-range positions fall back to the EOF token instead of panicking
let token = self.tokens.get(self.next_pos).unwrap_or(&self.eof_token);

// we need to continue with next_pos until the next relevant token after we already
// found the first one
if !is_irrelevant_token(token) {
// second relevant token reached: return before incrementing so that `next_pos`
// stays on it
if let Some(t) = first_relevant_token {
return t;
}
first_relevant_token = Some(token);
}

self.next_pos += 1;
}
}

fn peek(&mut self) -> Token {
self.tokens
.last()
.cloned()
.unwrap_or(self.eof_token.clone())
/// Returns the token at `next_pos` without consuming it.
///
/// Once `next_pos` is past the end of `tokens`, the parser's EOF token is returned instead.
fn peek(&self) -> &Token {
    self.tokens.get(self.next_pos).unwrap_or(&self.eof_token)
}

/// checks if the current token is of `kind` and advances if true
Expand All @@ -132,3 +162,8 @@ impl Parser {
todo!();
}
}

/// Reports whether the parser should skip over `t`.
///
/// A token is irrelevant when its kind is listed in `WHITESPACE_TOKENS`, with one exception:
/// a `Newline` token is only irrelevant if its text is exactly one character long.
/// NOTE(review): longer newline tokens presumably mark blank lines that separate statements —
/// confirm against the lexer.
fn is_irrelevant_token(t: &Token) -> bool {
    // Anything that is not whitespace always matters to the parser.
    if !WHITESPACE_TOKENS.contains(&t.kind) {
        return false;
    }

    match t.kind {
        // Only a single-character newline is skipped.
        SyntaxKind::Newline => t.text.chars().count() == 1,
        _ => true,
    }
}
9 changes: 8 additions & 1 deletion crates/pg_statement_splitter/src/parser/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,14 @@ pub(crate) fn unknown(p: &mut Parser) {
loop {
match p.peek() {
Token {
kind: SyntaxKind::Newline | SyntaxKind::Ascii59 | SyntaxKind::Eof,
kind: SyntaxKind::Ascii59,
..
} => {
p.advance();
break;
}
Token {
kind: SyntaxKind::Newline | SyntaxKind::Eof,
..
} => {
break;
Expand Down

0 comments on commit 3849cf7

Please sign in to comment.