Fix CJK word navigation in the TUI composer (#16829)

Addresses #16584

Problem: TUI word-wise cursor movement treated entire CJK runs as a
single word, so Option/Alt+Left and Right skipped too far when editing
East Asian text.

Solution: Use Unicode word-boundary segments within each non-whitespace
run so CJK text advances one segment at a time while preserving
separator and delete-word behavior, and add regression coverage for CJK
and mixed-script navigation.

Testing: Manually tested the solution by pasting text that includes CJK
characters into the composer and confirming that keyboard navigation
works correctly (after first confirming that it did not before the change).
This commit is contained in:
Eric Traut
2026-04-06 08:37:42 -07:00
committed by GitHub
parent 624c69e840
commit 82b061afb2

View File

@@ -35,6 +35,32 @@ fn is_word_separator(ch: char) -> bool {
WORD_SEPARATORS.contains(ch)
}
/// Breaks `run` into navigation pieces, returned in order as
/// `(byte_offset_within_run, piece)` pairs.
///
/// `run` is first divided at Unicode word boundaries via
/// `split_word_bound_indices`. Each resulting segment is then divided again
/// wherever the text switches between separator and non-separator characters
/// (as classified by `is_word_separator`), so a segment that mixes the two
/// classes still yields one piece per class run.
fn split_word_pieces(run: &str) -> Vec<(usize, &str)> {
    let mut out = Vec::new();
    for (base, seg) in run.split_word_bound_indices() {
        // Nothing to emit for an empty segment.
        if seg.is_empty() {
            continue;
        }
        // `cut` is the byte offset (inside `seg`) where the current piece began.
        let mut cut = 0;
        // Separator class of the previous character; `None` before the first one.
        let mut prev_class: Option<bool> = None;
        for (offset, c) in seg.char_indices() {
            let class = is_word_separator(c);
            // A change of class closes the current piece and opens a new one here.
            if matches!(prev_class, Some(prev) if prev != class) {
                out.push((base + cut, &seg[cut..offset]));
                cut = offset;
            }
            prev_class = Some(class);
        }
        // Whatever remains after the last class change is the final piece.
        out.push((base + cut, &seg[cut..]));
    }
    out
}
#[derive(Debug, Clone)]
struct TextElement {
id: u64,
@@ -1216,36 +1242,55 @@ impl TextArea {
else {
return 0;
};
let is_separator = is_word_separator(ch);
let mut start = first_non_ws_idx;
for (idx, ch) in prefix[..first_non_ws_idx].char_indices().rev() {
if ch.is_whitespace() || is_word_separator(ch) != is_separator {
start = idx + ch.len_utf8();
break;
let run_start = prefix[..first_non_ws_idx]
.char_indices()
.rev()
.find(|&(_, ch)| ch.is_whitespace())
.map_or(0, |(idx, ch)| idx + ch.len_utf8());
let run_end = first_non_ws_idx + ch.len_utf8();
let pieces = split_word_pieces(&prefix[run_start..run_end]);
let mut pieces = pieces.into_iter().rev().peekable();
let Some((piece_start, piece)) = pieces.next() else {
return run_start;
};
let mut start = run_start + piece_start;
if piece.chars().all(is_word_separator) {
while let Some((idx, piece)) = pieces.peek() {
if !piece.chars().all(is_word_separator) {
break;
}
start = run_start + *idx;
pieces.next();
}
start = idx;
}
self.adjust_pos_out_of_elements(start, /*prefer_start*/ true)
}
pub(crate) fn end_of_next_word(&self) -> usize {
let Some(first_non_ws) = self.text[self.cursor_pos..].find(|c: char| !c.is_whitespace())
else {
let suffix = &self.text[self.cursor_pos..];
let Some(first_non_ws) = suffix.find(|ch: char| !ch.is_whitespace()) else {
return self.text.len();
};
let word_start = self.cursor_pos + first_non_ws;
let mut iter = self.text[word_start..].char_indices();
let Some((_, first_ch)) = iter.next() else {
return word_start;
let run = &suffix[first_non_ws..];
let run = &run[..run.find(char::is_whitespace).unwrap_or(run.len())];
let mut pieces = split_word_pieces(run).into_iter().peekable();
let Some((start, piece)) = pieces.next() else {
return self.cursor_pos + first_non_ws;
};
let is_separator = is_word_separator(first_ch);
let mut end = self.text.len();
for (idx, ch) in iter {
if ch.is_whitespace() || is_word_separator(ch) != is_separator {
end = word_start + idx;
break;
let word_start = self.cursor_pos + first_non_ws + start;
let mut end = word_start + piece.len();
if piece.chars().all(is_word_separator) {
while let Some((idx, piece)) = pieces.peek() {
if !piece.chars().all(is_word_separator) {
break;
}
end = self.cursor_pos + first_non_ws + *idx + piece.len();
pieces.next();
}
}
self.adjust_pos_out_of_elements(end, /*prefer_start*/ false)
}
@@ -2040,6 +2085,80 @@ mod tests {
assert_eq!(t.end_of_next_word(), t.text().len());
}
#[test]
fn word_navigation_cjk_each_char_is_boundary() {
    // Every ideograph here occupies three UTF-8 bytes, so moving backwards one
    // word at a time is expected to stop at each three-byte boundary in turn.
    let text = "你好世界";
    let mut t = ta_with(text);
    for (cursor, expected) in [(text.len(), 9), (9, 6), (6, 3), (3, 0)] {
        t.set_cursor(/*pos*/ cursor);
        assert_eq!(t.beginning_of_previous_word(), expected);
    }
}
#[test]
fn word_navigation_cjk_forward() {
    // Forward counterpart of the backward CJK test: starting from each
    // three-byte character boundary, the next word must end one character on.
    let text = "你好世界";
    let mut t = ta_with(text);
    for cursor in (0..text.len()).step_by(3) {
        t.set_cursor(/*pos*/ cursor);
        assert_eq!(t.end_of_next_word(), cursor + 3);
    }
}
#[test]
fn word_navigation_mixed_ascii_cjk() {
    // "hello" fills bytes 0..5; the two ideographs fill 5..8 and 8..11.
    let text = "hello你好";
    let mut t = ta_with(text);

    // Forward: the ASCII word is one stop, then the first ideograph.
    for (cursor, expected_end) in [(0, 5), (5, 8)] {
        t.set_cursor(/*pos*/ cursor);
        assert_eq!(t.end_of_next_word(), expected_end);
    }

    // Backward: one ideograph at a time, then the whole ASCII word.
    for (cursor, expected_start) in [(text.len(), 8), (8, 5), (5, 0)] {
        t.set_cursor(/*pos*/ cursor);
        assert_eq!(t.beginning_of_previous_word(), expected_start);
    }
}
#[test]
fn word_navigation_preserves_separator_breaks_within_unicode_segments() {
    // Byte layout: "can't" = 0..5, "32.3" = 6..10, "foo.bar" = 11..18.
    // Each case is (cursor position, expected start of the previous word).
    let cases = [
        (5, 4),   // end of "can't" -> start of "t"
        (4, 3),   // just after "'"  -> start of "'"
        (10, 9),  // end of "32.3"   -> start of the trailing "3"
        (18, 15), // end of "foo.bar" -> start of "bar"
    ];
    let mut t = ta_with("can't 32.3 foo.bar");
    for (cursor, expected) in cases {
        t.set_cursor(/*pos*/ cursor);
        assert_eq!(t.beginning_of_previous_word(), expected);
    }
}
#[test]
fn wrapping_and_cursor_positions() {
let mut t = ta_with("hello world here");