Generate ast when parsing regex

exellentcoin26 · Jul 6, 2023 · 8f6d7e3 · 8f6d7e3
1 parent 63d7dd2
commit 8f6d7e3
Show file tree

Hide file tree

Showing 2 changed files with 128 additions and 85 deletions.
diff --git a/src/regex/ast.rs b/src/regex/ast.rs
@@ -12,36 +12,44 @@
 //! character_group_item ::= CHARACTER_CLASS | character_range | CHARACTER;
 //! ```
 
-use std::collections::HashSet;
-
 use super::tokenizer;
 
 /// Regular expression ast.
-#[derive(Debug)]
-pub enum Expr {
+#[derive(Debug, Clone)]
+pub enum ExprKind {
+ /// Concatenation of regular expressions.
+ Concat(Vec<ExprKind>),
+ /// An empty regex expression.
+ Empty,
  /// An alternative expression (e.g., `<expression> | <expression>`).
- Alt(Box<Expr>, Box<Expr>),
- /// A literal (e.g., `a`, `[^ca]`, `[a-z]`).
- Lit(LiteralKind),
- /// A quantified expresion (e.g., `(ab[ac]){3,}`, `[0-1]*`).
- Quant(Box<Expr>, Option<tokenizer::QuantifierKind>),
+ Alt(Box<ExprKind>, Box<ExprKind>),
+ /// A literal (e.g., `a`, `[^ca]`, `[a-z]`, `[0-1]*`).
+ Lit(LiteralKind, Option<tokenizer::QuantifierKind>),
+ /// A grouped expression (e.g., `([a-z] | foo)`, `(ab[ac]){3,}`).
+ Group(Box<ExprKind>, Option<tokenizer::QuantifierKind>),
 }
 
 /// Literal kind that appears in an expression (e.g., `b`, `[^ab]`).
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub enum LiteralKind {
  /// A single token (unicode character constructs can consist of multiple characters).
  Match(char),
+ /// A shorthand for character groups (e.g., `\w`, `\D`, `.`).
+ Class(tokenizer::ClassKind),
  /// A group of characters (e.g. `[^a-cl47i]`).
  Group {
  negated: bool,
- kind: HashSet<GroupedLiteralKind>,
+ literals: Vec<GroupedLiteralKind>,
  },
 }
 
 /// Literal that appears in a group.
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 pub enum GroupedLiteralKind {
+ /// A single token (unicode character constructs can consist of multiple characters).
  Match(char),
+ /// A shorthand for character groups (e.g., `\w`, `\D`, `.`).
+ Class(tokenizer::ClassKind),
+ /// A character range (e.g., `0-1`, `a-z`).
  Range(char, char),
 }