Skip to content

Commit

Permalink
Properly handle unicode and grapheme clusters (#26)
Browse files Browse the repository at this point in the history
  • Loading branch information
terror authored Oct 1, 2024
1 parent 9d92b39 commit bc59418
Show file tree
Hide file tree
Showing 10 changed files with 524 additions and 291 deletions.
593 changes: 343 additions & 250 deletions Cargo.lock

Large diffs are not rendered by default.

21 changes: 11 additions & 10 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@ keywords = ["command-line", "productivity", "utility", "markdown", "bash"]
resolver = "2"

[dependencies]
clap = { version = "4.0.9", features = ["derive"] }
console = "0.15.2"
pulldown-cmark = { version = "0.9.2", default-features = false, features = ["simd"] }
ropey = "1.5.0"
similar = "2.2.0"
snafu = { version = "0.7.1", default-features = false, features = ["std"] }
termimad = "0.20.3"
walkdir = "2.3.2"
clap = { version = "4.5.19", features = ["derive"] }
console = "0.15.8"
pulldown-cmark = { version = "0.9.6", default-features = false, features = ["simd"] }
ropey = "1.6.1"
similar = "2.6.0"
snafu = { version = "0.7.5", default-features = false, features = ["std"] }
termimad = "0.20.6"
unicode-segmentation = "1.12.0"
walkdir = "2.5.0"

[dev-dependencies]
executable-path = "1.0.0"
pretty_assertions = "1.3.0"
pretty_assertions = "1.4.1"
tempdir = "0.3.7"
unindent = "0.1.10"
unindent = "0.1.11"
3 changes: 1 addition & 2 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
//! Re-exports commonly used items from the standard library and external crates
// std
pub(crate) use std::{
fs,
io::{self, Write},
Expand All @@ -9,12 +8,12 @@ pub(crate) use std::{
process, str,
};

// dependencies
pub(crate) use {
console::Style,
pulldown_cmark::{CodeBlockKind, Event, Parser as MarkdownParser, Tag},
ropey::Rope,
similar::{ChangeTag, TextDiff},
snafu::Snafu,
termimad::print_inline,
unicode_segmentation::UnicodeSegmentation,
};
36 changes: 31 additions & 5 deletions src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,20 @@ pub struct Diff {
}

impl Diff {
/// Adjusts the diff's range by the given offset.
///
/// This method modifies the start and end points of the diff's range
/// based on the provided offset. It handles both positive and negative
/// offsets, using saturating arithmetic to prevent underflow or overflow.
pub(crate) fn offset(&mut self, offset: isize) {
if offset < 0 {
self.range.start = self.range.start.saturating_sub(offset.unsigned_abs());
self.range.end = self.range.end.saturating_sub(offset.unsigned_abs());
if offset >= 0 {
let offset = offset as usize;
self.range.start = self.range.start.saturating_add(offset);
self.range.end = self.range.end.saturating_add(offset);
} else {
self.range.start += offset as usize;
self.range.end += offset as usize;
let abs_offset = offset.unsigned_abs();
self.range.start = self.range.start.saturating_sub(abs_offset);
self.range.end = self.range.end.saturating_sub(abs_offset);
}
}

Expand Down Expand Up @@ -73,4 +80,23 @@ mod tests {
diff.offset(-10);
assert_eq!(diff.range, 0..0);
}

#[test]
fn offset_positive_large() {
  // The largest positive offset must move both endpoints forward by exactly
  // `isize::MAX` without wrapping.
  let shift = isize::MAX as usize;

  let mut shifted = diff();
  shifted.offset(isize::MAX);

  let expected = (1 + shift)..(4 + shift);
  assert_eq!(shifted.range, expected);
}

#[test]
fn offset_negative_large() {
  // The most negative offset must clamp both endpoints at zero rather than
  // underflowing.
  let mut clamped = diff();

  clamped.offset(isize::MIN);

  let expected = 0..0;
  assert_eq!(clamped.range, expected);
}
}
6 changes: 3 additions & 3 deletions src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,12 @@ impl File {
/// If [`interactive`](File::interactive) is set to `true`, the user will be
/// asked if they want to apply the change for each diff.
pub fn present(&mut self) -> Result {
let mut offset = 0;
let mut offset: isize = 0;

let diffs = self.diffs().collect::<Result<Vec<Diff>>>()?;

for mut diff in diffs {
let prev = self.content.len_chars();
let prev = self.content.len_bytes();

diff.offset(offset);

Expand All @@ -111,7 +111,7 @@ impl File {
}

self.content.apply(diff.clone());
offset += self.content.len_chars() as isize - prev as isize;
offset += self.content.len_bytes() as isize - prev as isize;
}

Ok(())
Expand Down
20 changes: 20 additions & 0 deletions src/grapheme.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use crate::common::*;

/// Converts a byte offset into `s` to a grapheme cluster index.
///
/// The result is the number of grapheme clusters in `s` that begin strictly
/// before `byte_index`. A `byte_index` at or beyond `s.len()` therefore yields
/// the total number of grapheme clusters in `s`.
pub(crate) fn byte_index_to_grapheme_index(
  s: &str,
  byte_index: usize,
) -> usize {
  // Cluster start offsets are produced in ascending order, so counting stops
  // at the first cluster that starts at or after `byte_index`.
  s.grapheme_indices(true)
    .map(|(cluster_start, _)| cluster_start)
    .take_while(|&cluster_start| cluster_start < byte_index)
    .count()
}

/// Converts a grapheme cluster index into the byte offset where that cluster
/// starts in `s`.
///
/// When `grapheme_index` is past the last cluster, the length of `s` in bytes
/// is returned, i.e. the offset one past the final cluster.
pub(crate) fn grapheme_index_to_byte_index(
  s: &str,
  grapheme_index: usize,
) -> usize {
  match s.grapheme_indices(true).nth(grapheme_index) {
    Some((cluster_start, _)) => cluster_start,
    None => s.len(),
  }
}
11 changes: 9 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ mod common;
mod diff;
mod error;
mod file;
mod grapheme;
mod lexer;
mod parser;
mod position;
Expand All @@ -34,8 +35,14 @@ pub use crate::{diff::Diff, error::Error, file::File};

// Public only to crate
pub(crate) use crate::{
codeblock::Codeblock, command::Command, lexer::Lexer, parser::Parser,
position::Position, prompt::prompt, rope_ext::RopeExt,
codeblock::Codeblock,
command::Command,
grapheme::{byte_index_to_grapheme_index, grapheme_index_to_byte_index},
lexer::Lexer,
parser::Parser,
position::Position,
prompt::prompt,
rope_ext::RopeExt,
};

/// Present's internal result type
Expand Down
85 changes: 70 additions & 15 deletions src/parser.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::{common::*, Codeblock, Command, Position, Result};
use crate::{
byte_index_to_grapheme_index, common::*, grapheme_index_to_byte_index,
Codeblock, Command, Position, Result,
};

#[derive(Debug, Clone)]
pub(crate) struct Parser<'a> {
Expand Down Expand Up @@ -39,33 +42,63 @@ impl<'a> Parser<'a> {
let start_start = range.start;
let mut start_end = start_start;

while let Some(ch) = self.src.chars().nth(start_end) {
match ch {
'`' => start_end += 1,
let src_graphemes: Vec<&str> = self.src.graphemes(true).collect();

while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, start_end))
{
match *grapheme {
"`" => {
start_end = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, start_end) + 1,
)
}
_ => break,
}
}

while let Some(ch) = self.src.chars().nth(start_end) {
match ch {
'`' | '\n' => break,
_ => start_end += 1,
while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, start_end))
{
match *grapheme {
"`" | "\n" => break,
_ => {
start_end = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, start_end) + 1,
)
}
}
}

let end_end = range.end - 1;
let mut end_start = end_end;

while let Some(ch) = self.src.chars().nth(end_start) {
match ch {
'`' => break,
_ => end_start -= 1,
while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, end_start))
{
match *grapheme {
"`" => break,
_ => {
end_start = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, end_start) - 1,
)
}
}
}

while let Some(ch) = self.src.chars().nth(end_start) {
match ch {
'`' => end_start -= 1,
while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, end_start))
{
match *grapheme {
"`" => {
end_start = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, end_start) - 1,
)
}
_ => break,
}
}
Expand Down Expand Up @@ -137,4 +170,26 @@ mod tests {
}
);
}

#[test]
fn parse_codeblock_with_unicode() {
  // The info string ends in 🚀, which is 4 bytes in UTF-8: the first line is
  // 17 chars but 20 bytes, matching the expected `start` range of 0..20.
  // NOTE(review): the whole source is 24 bytes long, yet the range passed to
  // `parse_codeblock` is 0..23 — confirm the shorter range is intentional.
  let source = "```present echo 🚀\n```";
  let parser = Parser::new(source);

  let codeblock = parser.parse_codeblock(0..23).unwrap().unwrap();

  let expected_command =
    Command::from(vec!["present".into(), "echo".into(), "🚀".into()])
      .unwrap()
      .unwrap();
  assert_eq!(codeblock.command, expected_command);

  let expected_position = Position {
    start: 0..20,
    end: 20..22,
  };
  assert_eq!(codeblock.position, expected_position);
}
}
12 changes: 8 additions & 4 deletions src/rope_ext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@ pub(crate) trait RopeExt {

impl RopeExt for Rope {
  /// Replaces the text covered by `diff.range` with `diff.content`, in place.
  ///
  /// `diff.range` is interpreted as byte offsets into the rope. They are
  /// converted to char indices first, because the rope's `remove` and
  /// `insert` are called with char indices.
  ///
  /// NOTE(review): `byte_to_char` is expected to panic when a byte offset is
  /// out of bounds for the rope — confirm against the ropey documentation.
  fn apply(&mut self, diff: Diff) {
    // Resolve both endpoints before mutating: removing text would invalidate
    // the byte offset of `end`.
    let start = self.byte_to_char(diff.range.start);
    let end = self.byte_to_char(diff.range.end);
    self.remove(start..end);
    self.insert(start, &diff.content);
  }

  /// Returns a copy of the rope with `diff` applied, leaving `self` untouched.
  ///
  /// Delegates to [`apply`](RopeExt::apply) so both methods always interpret
  /// the diff's range the same way.
  fn simulate(&self, diff: Diff) -> Rope {
    let mut clone = self.clone();
    clone.apply(diff);
    clone
  }
}
28 changes: 28 additions & 0 deletions tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -584,3 +584,31 @@ fn interactive_reject() -> Result {

Ok(())
}

#[test]
fn grapheme_handling() -> Result {
  // Input mixes multi-byte characters (CJK, emoji) and a multi-codepoint
  // emoji cluster both around and inside the `present` codeblock.
  let markdown = r#"
Hello, 世界! 👋
```present echo "🚀 Grapheme test: é, 世界, 👨‍👩‍👧‍👦"
```
Grapheme cluster: 👨‍👩‍👧‍👦
"#;

  // Same document with the echoed line inserted into the codeblock body; the
  // surrounding text must come through unchanged.
  let expected_stdout = r#"
Hello, 世界! 👋
```present echo "🚀 Grapheme test: é, 世界, 👨‍👩‍👧‍👦"
🚀 Grapheme test: é, 世界, 👨‍👩‍👧‍👦
```
Grapheme cluster: 👨‍👩‍👧‍👦
"#;

  let test = Test::new()?;

  test
    .markdown(markdown)
    .expected_status(0)
    .expected_stdout(expected_stdout)
    .run()
}

0 comments on commit bc59418

Please sign in to comment.