feat: add query, getSemanticTokens api

Myriad-Dreamin · Nov 3, 2023 · 3af05ab · 3af05ab
1 parent ac13b9e
commit 3af05ab
Show file tree

Hide file tree

Showing 12 changed files with 593 additions and 7 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/compiler/Cargo.toml b/compiler/Cargo.toml
@@ -23,6 +23,7 @@ sha2.workspace = true
 flate2.workspace = true
 ecow.workspace = true
 instant.workspace = true
+strum = { version = "0.25.0", features = ["derive"] }
 
 serde.workspace = true
 serde_json.workspace = true

diff --git a/compiler/src/parser/mod.rs b/compiler/src/parser/mod.rs
@@ -1,7 +1,16 @@
+mod modifier_set;
+mod semantic_tokens;
+// mod token_encode;
+mod typst_tokens;
+
 use typst::{diag::FileResult, syntax::Source};
 
 use typst_ts_core::TypstFileId;
 
+pub use semantic_tokens::{
+    get_semantic_tokens_full, get_semantic_tokens_legend, SemanticToken, SemanticTokensLegend,
+};
+
 pub fn reparse(source_id: TypstFileId, prev: Option<Source>, next: String) -> FileResult<Source> {
     use dissimilar::Chunk;
     match prev {

diff --git a/compiler/src/parser/modifier_set.rs b/compiler/src/parser/modifier_set.rs
@@ -0,0 +1,33 @@
+use std::ops;
+
+use super::typst_tokens::Modifier;
+
+/// Compact set of [`Modifier`]s, stored as the bitwise OR of their bitmasks.
+#[derive(Default, Clone, Copy)]
+pub struct ModifierSet(u32);
+
+impl ModifierSet {
+    /// Returns a set with no modifier bits set.
+    pub fn empty() -> Self {
+        Self(0)
+    }
+
+    /// Builds a set by OR-ing together the bitmask of every entry in
+    /// `modifiers`. An empty slice yields the empty set.
+    pub fn new(modifiers: &[Modifier]) -> Self {
+        let mut bits = 0u32;
+        for &modifier in modifiers {
+            bits |= modifier.bitmask();
+        }
+        Self(bits)
+    }
+
+    /// Returns the raw bit pattern of this set.
+    pub fn bitset(self) -> u32 {
+        let Self(bits) = self;
+        bits
+    }
+}
+
+/// `a | b` yields the set whose bits are those of `a` together with those of `b`.
+impl ops::BitOr for ModifierSet {
+    type Output = Self;
+
+    fn bitor(self, rhs: Self) -> Self::Output {
+        let (Self(left), Self(right)) = (self, rhs);
+        Self(left | right)
+    }
+}
diff --git a/compiler/src/parser/semantic_tokens.rs b/compiler/src/parser/semantic_tokens.rs
@@ -0,0 +1,226 @@
+//! From <https://github.com/nvarner/typst-lsp/blob/cc7bad9bd9764bfea783f2fab415cb3061fd8bff/src/server/semantic_tokens/mod.rs>
+
+use strum::IntoEnumIterator;
+use typst::syntax::{ast, LinkedNode, Source, SyntaxKind};
+
+use super::modifier_set::ModifierSet;
+use super::typst_tokens::{Modifier, TokenType};
+
+/// Names of the token types and token modifiers, as produced by
+/// `get_semantic_tokens_legend`.
+#[derive(serde::Deserialize, serde::Serialize)]
+pub struct SemanticTokensLegend {
+    /// Token type names; (de)serialized under the key `tokenTypes`.
+    #[serde(rename = "tokenTypes")]
+    pub token_types: Vec<String>,
+    /// Token modifier names; (de)serialized under the key `tokenModifiers`.
+    #[serde(rename = "tokenModifiers")]
+    pub token_modifiers: Vec<String>,
+}
+
+/// Builds the legend by listing the static name of every [`TokenType`] and
+/// every [`Modifier`], each in the order its `iter()` yields the variants.
+pub fn get_semantic_tokens_legend() -> SemanticTokensLegend {
+    let token_types: Vec<String> = TokenType::iter()
+        .map(Into::<&'static str>::into)
+        .map(str::to_owned)
+        .collect();
+
+    let token_modifiers: Vec<String> = Modifier::iter()
+        .map(Into::<&'static str>::into)
+        .map(str::to_owned)
+        .collect();
+
+    SemanticTokensLegend {
+        token_types,
+        token_modifiers,
+    }
+}
+
+/// Tokenizes the whole syntax tree of `source` and returns the tokens with
+/// their positions expressed relative to the preceding token.
+///
+/// `tokenize_tree` yields tokens carrying absolute `(line, column)` positions.
+/// This function rewrites each one in place: `delta_line` becomes the line
+/// difference to the previous token, and `delta_start_character` becomes the
+/// column difference only when both tokens sit on the same line (otherwise the
+/// absolute column is kept). The first token is measured against `(0, 0)`.
+pub fn get_semantic_tokens_full(source: &Source) -> Vec<SemanticToken> {
+    let root = LinkedNode::new(source.root());
+    let mut tokens = tokenize_tree(source, &root, ModifierSet::empty());
+
+    // Absolute position of the token handled in the previous iteration.
+    let (mut prev_line, mut prev_column) = (0, 0);
+    for token in &mut tokens {
+        let (line, column) = (token.delta_line, token.delta_start_character);
+
+        token.delta_line = line - prev_line;
+        if line == prev_line {
+            token.delta_start_character = column - prev_column;
+        }
+
+        prev_line = line;
+        prev_column = column;
+    }
+
+    tokens
+}
+
+/// Produces the token for `node` itself, without looking at its descendants.
+///
+/// The token type comes from `token_from_node`. When that yields nothing, a
+/// node without children is tokenized as `Text`, while a node with children
+/// produces no token at all.
+fn tokenize_single_node(
+    ctx: &Source,
+    node: &LinkedNode,
+    modifiers: ModifierSet,
+) -> Option<SemanticToken> {
+    let token_type = match token_from_node(node) {
+        Some(token_type) => token_type,
+        // Childless nodes default to plain text.
+        None if node.children().next().is_none() => TokenType::Text,
+        None => return None,
+    };
+
+    Some(SemanticToken::new(ctx, token_type, modifiers, node))
+}
+
+/// Tokenizes `root` and then, recursively, each of its children, in order.
+///
+/// The modifiers of `root` are merged into `parent_modifiers` and the combined
+/// set is applied to `root`'s own token and handed down to every child. The
+/// children are visited whether or not `root` itself produced a token.
+fn tokenize_tree(
+    ctx: &Source,
+    root: &LinkedNode<'_>,
+    parent_modifiers: ModifierSet,
+) -> Vec<SemanticToken> {
+    let modifiers = parent_modifiers | modifiers_from_node(root);
+
+    let mut tokens: Vec<SemanticToken> = Vec::new();
+    // The node's own token (if any) comes before the tokens of its subtree.
+    tokens.extend(tokenize_single_node(ctx, root, modifiers));
+    for child in root.children() {
+        tokens.extend(tokenize_tree(ctx, &child, modifiers));
+    }
+    tokens
+}
+
+/// A single semantic token: where it is, how long it is, and how it is
+/// classified.
+///
+/// `SemanticToken::new` fills the two position fields with absolute values;
+/// `get_semantic_tokens_full` later rewrites them relative to the previous
+/// token.
+pub struct SemanticToken {
+    /// Line of the token (absolute after `new`, difference to the previous
+    /// token's line after `get_semantic_tokens_full`).
+    pub delta_line: u32,
+    /// Start column of the token in UTF-16 code units (absolute after `new`;
+    /// made relative to the previous token only when both are on the same
+    /// line).
+    pub delta_start_character: u32,
+    /// Length of the node's text in UTF-16 code units.
+    pub length: u32,
+    /// The `TokenType` of the token, cast to `u32`.
+    pub token_type: u32,
+    /// Bit pattern of the `ModifierSet` applied to the token.
+    pub token_modifiers: u32,
+}
+
+impl SemanticToken {
+    /// Creates the token for `node`, classified as `token_type` and carrying
+    /// `modifiers`.
+    ///
+    /// The position fields receive the node's absolute line and UTF-16 column
+    /// within `ctx` (see `offset_to_position`); `length` is the number of
+    /// UTF-16 code units in the node's text.
+    pub fn new(
+        ctx: &Source,
+        token_type: TokenType,
+        modifiers: ModifierSet,
+        node: &LinkedNode,
+    ) -> Self {
+        let text = node.get().clone().into_text();
+        let utf16_len = text.encode_utf16().count();
+
+        let (line, column) = offset_to_position(node.offset(), ctx);
+
+        Self {
+            delta_line: line,
+            delta_start_character: column,
+            length: utf16_len as u32,
+            token_type: token_type as u32,
+            token_modifiers: modifiers.bitset(),
+        }
+    }
+}
+
+/// Returns the [`Modifier`]s that `node` introduces for itself and for its
+/// whole subtree.
+///
+/// Only the node's own kind is inspected; modifiers inherited from ancestors
+/// are not included, so the caller has to merge them in.
+fn modifiers_from_node(node: &LinkedNode) -> ModifierSet {
+    let modifier = match node.kind() {
+        SyntaxKind::Emph => Modifier::Emph,
+        SyntaxKind::Strong => Modifier::Strong,
+        SyntaxKind::Math | SyntaxKind::Equation => Modifier::Math,
+        // Every other kind contributes nothing.
+        _ => return ModifierSet::empty(),
+    };
+
+    ModifierSet::new(&[modifier])
+}
+
+/// Picks the [`TokenType`] that describes `node` as a whole.
+///
+/// Returns `None` when the node's kind (or, for `Star` and `Underscore`, the
+/// combination of kind and parent kind) has no dedicated token type. In that
+/// case `tokenize_single_node` falls back to `Text` for childless nodes.
+///
+/// Note that `tokenize_tree` visits a node's children whether or not this
+/// function returned a token type for the node itself.
+fn token_from_node(node: &LinkedNode) -> Option<TokenType> {
+    use SyntaxKind::*;
+
+    let token_type = match node.kind() {
+        // `Star` is punctuation under `Strong` and an operator under
+        // `ModuleImport`; anywhere else it has no token type of its own.
+        Star => match node.parent_kind() {
+            Some(Strong) => TokenType::Punctuation,
+            Some(ModuleImport) => TokenType::Operator,
+            _ => return Option::None,
+        },
+        // `Underscore` is punctuation under `Emph` and an operator under
+        // `MathAttach`.
+        Underscore => match node.parent_kind() {
+            Some(Emph) => TokenType::Punctuation,
+            Some(MathAttach) => TokenType::Operator,
+            _ => return Option::None,
+        },
+
+        MathIdent | Ident => token_from_ident(node),
+        // A hash takes the token type of the expression following it, if any.
+        Hash => return token_from_hashtag(node),
+
+        LeftBrace | RightBrace | LeftBracket | RightBracket | LeftParen | RightParen | Comma
+        | Semicolon | Colon => TokenType::Punctuation,
+        Linebreak | Escape | Shorthand => TokenType::Escape,
+        Link => TokenType::Link,
+        Raw => TokenType::Raw,
+        Label => TokenType::Label,
+        RefMarker => TokenType::Ref,
+        Heading | HeadingMarker => TokenType::Heading,
+        ListMarker | EnumMarker | TermMarker => TokenType::ListMarker,
+        MathAlignPoint | Plus | Minus | Slash | Hat | Dot | Eq | EqEq | ExclEq | Lt | LtEq | Gt
+        | GtEq | PlusEq | HyphEq | StarEq | SlashEq | Dots | Arrow | Not | And | Or => {
+            TokenType::Operator
+        }
+        Dollar => TokenType::Delimiter,
+        // Here `None` is `SyntaxKind::None` (brought in by the glob import).
+        None | Auto | Let | Show | If | Else | For | In | While | Break | Continue | Return
+        | Import | Include | As | Set => TokenType::Keyword,
+        Bool => TokenType::Bool,
+        Int | Float | Numeric => TokenType::Number,
+        Str => TokenType::String,
+        LineComment | BlockComment => TokenType::Comment,
+        Error => TokenType::Error,
+
+        // Spelled out in full to disambiguate from `SyntaxKind::None`.
+        _ => return Option::None,
+    };
+
+    Some(token_type)
+}
+
+/// Reports whether `ident` looks like a function name, judged solely by the
+/// leaf that follows it: either a `LeftParen` whose parent is `Args` or
+/// `Params`, or a `LeftBracket` whose parent is a `ContentBlock`. Returns
+/// `false` when there is no following leaf.
+// TODO: differentiate also using tokens in scope, not just context
+fn is_function_ident(ident: &LinkedNode) -> bool {
+    match ident.next_leaf() {
+        Some(next) => matches!(
+            (next.kind(), next.parent_kind()),
+            (
+                SyntaxKind::LeftParen,
+                Some(SyntaxKind::Args | SyntaxKind::Params)
+            ) | (SyntaxKind::LeftBracket, Some(SyntaxKind::ContentBlock))
+        ),
+        None => false,
+    }
+}
+
+/// Classifies an identifier: `Function` when `is_function_ident` accepts it,
+/// `Interpolated` otherwise.
+fn token_from_ident(ident: &LinkedNode) -> TokenType {
+    if !is_function_ident(ident) {
+        return TokenType::Interpolated;
+    }
+
+    TokenType::Function
+}
+
+/// Returns the leftmost leaf of the sibling that follows `hashtag`, provided
+/// that sibling casts to an [`ast::Expr`] whose `hash()` is true.
+///
+/// Yields `None` when there is no next sibling, when it is not such an
+/// expression, or when it has no leftmost leaf.
+fn get_expr_following_hashtag<'a>(hashtag: &LinkedNode<'a>) -> Option<LinkedNode<'a>> {
+    let next = hashtag.next_sibling()?;
+
+    let is_hash_expr = next.cast::<ast::Expr>().map_or(false, |expr| expr.hash());
+    if is_hash_expr {
+        next.leftmost_leaf()
+    } else {
+        None
+    }
+}
+
+/// Gives a hash the token type of the first leaf of the expression that
+/// follows it, or `None` when there is no such expression or that leaf has no
+/// token type.
+fn token_from_hashtag(hashtag: &LinkedNode) -> Option<TokenType> {
+    let expr_leaf = get_expr_following_hashtag(hashtag)?;
+    token_from_node(&expr_leaf)
+}
+
+/// Converts a byte offset into `typst_source` to a `(line, column)` pair, the
+/// column being the distance from the start of the line as reported by
+/// `Source::byte_to_utf16`.
+///
+/// Panics if any of the `Source` lookups returns `None`.
+fn offset_to_position(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
+    let line = typst_source.byte_to_line(typst_offset).unwrap();
+
+    // See the implementation of `lsp_to_typst::position_to_offset` for discussion
+    // relevant to this function.
+    // NOTE(review): that function is not visible in this file; presumably it
+    // lives in the upstream project this module was taken from — confirm.
+
+    // TODO: Typst's `Source` could easily provide an implementation of the method
+    // we need here. Submit a PR to `typst` to add it, then update this if/when
+    // merged.
+
+    // Column = UTF-16 index of the offset minus UTF-16 index of its line start.
+    let utf16_at_offset = typst_source.byte_to_utf16(typst_offset).unwrap();
+    let line_start_byte = typst_source.line_to_byte(line).unwrap();
+    let utf16_at_line_start = typst_source.byte_to_utf16(line_start_byte).unwrap();
+    let column = utf16_at_offset - utf16_at_line_start;
+
+    (line as u32, column as u32)
+}