diff --git a/Cargo.lock b/Cargo.lock
index 92cf7fc..6495bff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -600,6 +600,7 @@ dependencies = [
  "lalrpop-util",
  "lasso",
  "lazy_static",
+ "line-index",
  "log",
  "regex",
  "salsa",
@@ -791,6 +792,16 @@ dependencies = [
  "redox_syscall 0.4.1",
 ]
 
+[[package]]
+name = "line-index"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67d61795376ae2683928c218fda7d7d7db136fd38c06b7552904667f0d55580a"
+dependencies = [
+ "nohash-hasher",
+ "text-size",
+]
+
 [[package]]
 name = "linked-hash-map"
 version = "0.5.6"
@@ -873,6 +884,12 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
 
+[[package]]
+name = "nohash-hasher"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451"
+
 [[package]]
 name = "num-traits"
 version = "0.2.17"
@@ -1500,6 +1517,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "text-size"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f18aa187839b2bdb1ad2fa35ead8c4c2976b64e4363c386d45ac0f7ee85c9233"
+
 [[package]]
 name = "thiserror"
 version = "1.0.51"
diff --git a/compiler/Cargo.toml b/compiler/Cargo.toml
index 5136980..d8bf9ed 100644
--- a/compiler/Cargo.toml
+++ b/compiler/Cargo.toml
@@ -30,6 +30,7 @@ tower-lsp = "0.20"
 tokio = { version = "1.35", features = ["full"] }
 simple_logger = { version = "4.3", features = ["stderr"] }
 time = "0.3"
+line-index = "0.1"
 
 [dev-dependencies]
 criterion = "0.5"
diff --git a/compiler/src/parser/humanizer.rs b/compiler/src/parser/humanizer.rs
index fd50f70..3d2f844 100644
--- a/compiler/src/parser/humanizer.rs
+++ b/compiler/src/parser/humanizer.rs
@@ -1,27 +1,29 @@
 use crate::location::SourceLocation;
+use line_index::{LineIndex, TextSize, WideEncoding};
 
+/// Translates byte offsets into a source text into [`SourceLocation`]s.
 #[derive(Debug, Eq, PartialEq)]
-pub struct Humanizer {
-    line_starts: Vec<usize>,
-}
+pub struct Humanizer(LineIndex);
 
 impl Humanizer {
+    /// Builds the line index for `input`.
     pub fn new(input: &str) -> Self {
-        let mut line_starts = Vec::new();
-        let mut index = 0;
-        line_starts.push(index);
-        for line in input.lines() {
-            // FIXME(MH): This assumes a newline character is just one byte,
-            // which is not true on Windows.
-            index += line.len() + 1;
-            line_starts.push(index);
-        }
-        Self { line_starts }
+        Self(LineIndex::new(input))
     }
 
+    /// Converts the byte offset `loc` into a zero-based line and column.
+    ///
+    /// The column is counted in UTF-32 code units, i.e. characters rather
+    /// than bytes. Returns line 0, column 0 when `loc` does not fit in a
+    /// `u32` or cannot be resolved, e.g. because it lies past the end of
+    /// the input or inside a multi-byte character.
     pub fn run(&self, loc: usize) -> SourceLocation {
-        let line = self.line_starts.binary_search(&loc).unwrap_or_else(|x| x - 1);
-        SourceLocation { line: line as u32, column: (loc - self.line_starts[line]) as u32 }
+        u32::try_from(loc)
+            .ok()
+            .and_then(|loc| self.0.try_line_col(TextSize::new(loc)))
+            .and_then(|utf8_pos| self.0.to_wide(WideEncoding::Utf32, utf8_pos))
+            .map(|utf32_pos| SourceLocation { line: utf32_pos.line, column: utf32_pos.col })
+            .unwrap_or(SourceLocation { line: 0, column: 0 })
     }
 }
 
@@ -29,42 +31,31 @@ impl Humanizer {
 mod tests {
     use super::*;
 
-    #[test]
-    fn test_line_starts() {
-        let cases = vec![
-            ("", vec![0]),
-            ("a", vec![0, 2]),
-            ("a\n", vec![0, 2]),
-            ("aa", vec![0, 3]),
-            ("a\nb", vec![0, 2, 4]),
-            ("a\nb\n", vec![0, 2, 4]),
-            ("ab\ncd\n", vec![0, 3, 6]),
-            ("\na", vec![0, 1, 3]),
-        ];
-        for (input, expected_line_starts) in cases {
-            let humanizer = Humanizer::new(input);
-            let expected_line_starts: Vec<_> = expected_line_starts.into_iter().collect();
-            assert_eq!(humanizer.line_starts, expected_line_starts);
-        }
-    }
-
     #[test]
     fn test_translation() {
-        let humanizer = Humanizer::new("ab\nc\nde\n\nf");
+        let humanizer = Humanizer::new("ab\nc\nde\n\nf\r\ng\näß");
         let cases = vec![
-            (0, 0, 0),
-            (1, 0, 1),
-            (2, 0, 2),
-            (3, 1, 0),
-            (4, 1, 1),
-            (5, 2, 0),
-            (6, 2, 1),
-            (7, 2, 2),
-            (8, 3, 0),
-            (9, 4, 0),
-            (10, 4, 1),
-            (11, 5, 0),
-            (100, 5, 89),
+            (0, 0, 0), // ^|a
+            (1, 0, 1), // a|b
+            (2, 0, 2), // b|\n
+            (3, 1, 0), // \n|c
+            (4, 1, 1), // c|\n
+            (5, 2, 0), // \n|d
+            (6, 2, 1), // d|e
+            (7, 2, 2), // e|\n
+            (8, 3, 0), // \n|\n
+            (9, 4, 0), // \n|f
+            (10, 4, 1), // f|\r
+            (11, 4, 2), // \r|\n
+            (12, 5, 0), // \n|g
+            (13, 5, 1), // g|\n
+            (14, 6, 0), // \n|ä
+            (15, 0, 0), // in ä
+            (16, 6, 1), // ä|ß
+            (17, 0, 0), // in ß
+            (18, 6, 2), // ß|$
+            (19, 0, 0), // $|
+            (100, 0, 0),
         ];
         for (loc, line, column) in cases {
             assert_eq!(humanizer.run(loc), SourceLocation { line, column });
diff --git a/compiler/src/tests/parser/mod.rs b/compiler/src/tests/parser/mod.rs
index ff811cd..331e29b 100644
--- a/compiler/src/tests/parser/mod.rs
+++ b/compiler/src/tests/parser/mod.rs
@@ -96,13 +96,14 @@ fn location_eol_lf() {
     "###);
 }
 
-// TODO(MH): This should have the same output as `location_eol_lf`.
 #[test]
 fn location_eol_crlf() {
     insta::assert_snapshot!(parse_error("\r\nx"), @r###"
     ---
     --------------------------------------------------
-      2:2-3:1: Unrecognized token `x` found at 2:2:3:1
+      2 | x
+          ~
+    Unrecognized token `x` found at 2:1:2:2
     Expected one of "fn" or "type"
     "###);
 }
@@ -119,15 +120,14 @@ fn location_comment_ascii() {
     "###);
 }
 
-// TODO(MH): This should have the same output as `location_comment_ascii`.
 #[test]
 fn location_comment_unlauts() {
     insta::assert_snapshot!(parse_error("/* äëïöü */ x"), @r###"
     ---
     --------------------------------------------------
       1 | /* äëïöü */ x
-                           ~
-    Unrecognized token `x` found at 1:18:1:19
+                      ~
+    Unrecognized token `x` found at 1:13:1:14
     Expected one of "fn" or "type"
     "###);
 }