Mirror of https://github.com/openai/codex.git (synced 2026-04-29 10:53:24 +03:00) · 460 lines · 18 KiB · Markdown
# PR #1467: Fix Unicode handling in chat_composer "@" token detection
|
||
|
||
- URL: https://github.com/openai/codex/pull/1467
|
||
- Author: ryozi-tn
|
||
- Created: 2025-07-06 18:00:10 UTC
|
||
- Updated: 2025-07-07 20:43:39 UTC
|
||
- Changes: +175/-18, Files changed: 1, Commits: 4
|
||
|
||
## Description
|
||
|
||
## Issues Fixed
|
||
|
||
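- **Primary Issue (#1450)**: Unicode cursor positioning was incorrect because the cursor column (a character index) was used directly as a byte offset when slicing the line, so any multi-byte character before the cursor produced a wrong or invalid split point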
|
||
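- **Additional Issue**: Full-width spaces (CJK whitespace such as U+3000 IDEOGRAPHIC SPACE, "　") weren't properly handled as token boundaries, because the token start was computed as `idx + 1`, which assumes the whitespace character is exactly one byte long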
|
||
- ref: https://doc.rust-lang.org/std/primitive.char.html#method.is_whitespace
|
||
|
||
## Full Diff
|
||
|
||
```diff
|
||
diff --git a/codex-rs/tui/src/bottom_pane/chat_composer.rs b/codex-rs/tui/src/bottom_pane/chat_composer.rs
|
||
index 59d6e4579d..cd8e9fa17f 100644
|
||
--- a/codex-rs/tui/src/bottom_pane/chat_composer.rs
|
||
+++ b/codex-rs/tui/src/bottom_pane/chat_composer.rs
|
||
@@ -290,26 +290,28 @@ impl ChatComposer<'_> {
|
||
// Guard against out-of-bounds rows.
|
||
let line = textarea.lines().get(row)?.as_str();
|
||
|
||
- // Clamp the cursor column to the line length to avoid slicing panics
|
||
- // when the cursor is at the end of the line.
|
||
- let col = col.min(line.len());
|
||
+ // Calculate byte offset for cursor position
|
||
+ let cursor_byte_offset = line.chars().take(col).map(|c| c.len_utf8()).sum::<usize>();
|
||
|
||
// Split the line at the cursor position so we can search for word
|
||
// boundaries on both sides.
|
||
- let before_cursor = &line[..col];
|
||
- let after_cursor = &line[col..];
|
||
+ let before_cursor = &line[..cursor_byte_offset];
|
||
+ let after_cursor = &line[cursor_byte_offset..];
|
||
|
||
- // Find start index (first character **after** the previous whitespace).
|
||
+ // Find start index (first character **after** the previous multi-byte whitespace).
|
||
let start_idx = before_cursor
|
||
- .rfind(|c: char| c.is_whitespace())
|
||
- .map(|idx| idx + 1)
|
||
+ .char_indices()
|
||
+ .rfind(|(_, c)| c.is_whitespace())
|
||
+ .map(|(idx, c)| idx + c.len_utf8())
|
||
.unwrap_or(0);
|
||
|
||
- // Find end index (first whitespace **after** the cursor position).
|
||
+ // Find end index (first multi-byte whitespace **after** the cursor position).
|
||
let end_rel_idx = after_cursor
|
||
- .find(|c: char| c.is_whitespace())
|
||
+ .char_indices()
|
||
+ .find(|(_, c)| c.is_whitespace())
|
||
+ .map(|(idx, _)| idx)
|
||
.unwrap_or(after_cursor.len());
|
||
- let end_idx = col + end_rel_idx;
|
||
+ let end_idx = cursor_byte_offset + end_rel_idx;
|
||
|
||
if start_idx >= end_idx {
|
||
return None;
|
||
@@ -336,21 +338,25 @@ impl ChatComposer<'_> {
|
||
let mut lines: Vec<String> = self.textarea.lines().to_vec();
|
||
|
||
if let Some(line) = lines.get_mut(row) {
|
||
- let col = col.min(line.len());
|
||
+ // Calculate byte offset for cursor position
|
||
+ let cursor_byte_offset = line.chars().take(col).map(|c| c.len_utf8()).sum::<usize>();
|
||
|
||
- let before_cursor = &line[..col];
|
||
- let after_cursor = &line[col..];
|
||
+ let before_cursor = &line[..cursor_byte_offset];
|
||
+ let after_cursor = &line[cursor_byte_offset..];
|
||
|
||
// Determine token boundaries.
|
||
let start_idx = before_cursor
|
||
- .rfind(|c: char| c.is_whitespace())
|
||
- .map(|idx| idx + 1)
|
||
+ .char_indices()
|
||
+ .rfind(|(_, c)| c.is_whitespace())
|
||
+ .map(|(idx, c)| idx + c.len_utf8())
|
||
.unwrap_or(0);
|
||
|
||
let end_rel_idx = after_cursor
|
||
- .find(|c: char| c.is_whitespace())
|
||
+ .char_indices()
|
||
+ .find(|(_, c)| c.is_whitespace())
|
||
+ .map(|(idx, _)| idx)
|
||
.unwrap_or(after_cursor.len());
|
||
- let end_idx = col + end_rel_idx;
|
||
+ let end_idx = cursor_byte_offset + end_rel_idx;
|
||
|
||
// Replace the slice `[start_idx, end_idx)` with the chosen path and a trailing space.
|
||
let mut new_line =
|
||
@@ -618,3 +624,154 @@ impl WidgetRef for &ChatComposer<'_> {
|
||
}
|
||
}
|
||
}
|
||
+
|
||
+#[cfg(test)]
|
||
+mod tests {
|
||
+ use crate::bottom_pane::ChatComposer;
|
||
+ use tui_textarea::TextArea;
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_basic_cases() {
|
||
+ let test_cases = vec![
|
||
+ // Valid @ tokens
|
||
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
|
||
+ (
|
||
+ "@file.txt",
|
||
+ 4,
|
||
+ Some("file.txt".to_string()),
|
||
+ "ASCII with extension",
|
||
+ ),
|
||
+ (
|
||
+ "hello @world test",
|
||
+ 8,
|
||
+ Some("world".to_string()),
|
||
+ "ASCII token in middle",
|
||
+ ),
|
||
+ (
|
||
+ "@test123",
|
||
+ 5,
|
||
+ Some("test123".to_string()),
|
||
+ "ASCII with numbers",
|
||
+ ),
|
||
+ // Unicode examples
|
||
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
|
||
+ (
|
||
+ "@testЙЦУ.rs",
|
||
+ 8,
|
||
+ Some("testЙЦУ.rs".to_string()),
|
||
+ "Mixed ASCII and Cyrillic",
|
||
+ ),
|
||
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
|
||
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
|
||
+ // Invalid cases (should return None)
|
||
+ ("hello", 2, None, "No @ symbol"),
|
||
+ ("@", 1, None, "Only @ symbol"),
|
||
+ ("@ hello", 2, None, "@ followed by space"),
|
||
+ ("test @ world", 6, None, "@ with spaces around"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for case: {} - input: '{}', cursor: {}",
|
||
+ description, input, cursor_pos
|
||
+ );
|
||
+ }
|
||
+ }
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_cursor_positions() {
|
||
+ let test_cases = vec![
|
||
+ // Different cursor positions within a token
|
||
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
|
||
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
|
||
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
|
||
+ // Multiple tokens - cursor determines which token
|
||
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
|
||
+ (
|
||
+ "@file1 @file2",
|
||
+ 8,
|
||
+ Some("file2".to_string()),
|
||
+ "Second token",
|
||
+ ),
|
||
+ // Edge cases
|
||
+ ("@", 0, None, "Only @ symbol"),
|
||
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
|
||
+ ("", 0, None, "Empty input"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for cursor position case: {description} - input: '{input}', cursor: {cursor_pos}",
|
||
+ );
|
||
+ }
|
||
+ }
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_whitespace_boundaries() {
|
||
+ let test_cases = vec![
|
||
+ // Space boundaries
|
||
+ (
|
||
+ "aaa@aaa",
|
||
+ 4,
|
||
+ None,
|
||
+ "Connected @ token - no completion by design",
|
||
+ ),
|
||
+ (
|
||
+ "aaa @aaa",
|
||
+ 5,
|
||
+ Some("aaa".to_string()),
|
||
+ "@ token after space",
|
||
+ ),
|
||
+ (
|
||
+ "test @file.txt",
|
||
+ 7,
|
||
+ Some("file.txt".to_string()),
|
||
+ "@ token after space",
|
||
+ ),
|
||
+ // Full-width space boundaries
|
||
+ (
|
||
+ "test @İstanbul",
|
||
+ 6,
|
||
+ Some("İstanbul".to_string()),
|
||
+ "@ token after full-width space",
|
||
+ ),
|
||
+ (
|
||
+ "@ЙЦУ @诶",
|
||
+ 6,
|
||
+ Some("诶".to_string()),
|
||
+ "Full-width space between Unicode tokens",
|
||
+ ),
|
||
+ // Tab and newline boundaries
|
||
+ (
|
||
+ "test\t@file",
|
||
+ 6,
|
||
+ Some("file".to_string()),
|
||
+ "@ token after tab",
|
||
+ ),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for whitespace boundary case: {description} - input: '{input}', cursor: {cursor_pos}",
|
||
+ );
|
||
+ }
|
||
+ }
|
||
+}
|
||
```
|
||
|
||
## Review Comments
|
||
|
||
### codex-rs/tui/src/bottom_pane/chat_composer.rs
|
||
|
||
- Created: 2025-07-07 20:31:18 UTC | Link: https://github.com/openai/codex/pull/1467#discussion_r2190962611
|
||
|
||
```diff
|
||
@@ -618,3 +624,107 @@ impl WidgetRef for &ChatComposer<'_> {
|
||
}
|
||
}
|
||
}
|
||
+
|
||
+#[cfg(test)]
|
||
+mod tests {
|
||
+ use crate::bottom_pane::ChatComposer;
|
||
+ use tui_textarea::TextArea;
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_basic_cases() {
|
||
+ let test_cases = vec![
|
||
+ // Valid @ tokens
|
||
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
|
||
+ ("@file.txt", 4, Some("file.txt".to_string()), "ASCII with extension"),
|
||
+ ("hello @world test", 8, Some("world".to_string()), "ASCII token in middle"),
|
||
+ ("@test123", 5, Some("test123".to_string()), "ASCII with numbers"),
|
||
+
|
||
+ // Unicode examples
|
||
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
|
||
+ ("@testЙЦУ.rs", 8, Some("testЙЦУ.rs".to_string()), "Mixed ASCII and Cyrillic"),
|
||
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
|
||
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
|
||
+
|
||
+ // Invalid cases (should return None)
|
||
+ ("hello", 2, None, "No @ symbol"),
|
||
+ ("@", 1, None, "Only @ symbol"),
|
||
+ ("@ hello", 2, None, "@ followed by space"),
|
||
+ ("test @ world", 6, None, "@ with spaces around"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for case: {} - input: '{}', cursor: {}",
|
||
+ description, input, cursor_pos
|
||
+ );
|
||
+ }
|
||
+ }
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_cursor_positions() {
|
||
+ let test_cases = vec![
|
||
+ // Different cursor positions within a token
|
||
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
|
||
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
|
||
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
|
||
+
|
||
+ // Multiple tokens - cursor determines which token
|
||
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
|
||
+ ("@file1 @file2", 8, Some("file2".to_string()), "Second token"),
|
||
+
|
||
+ // Edge cases
|
||
+ ("@", 0, None, "Only @ symbol"),
|
||
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
|
||
+ ("", 0, None, "Empty input"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for cursor position case: {} - input: '{}', cursor: {}",
|
||
+ description, input, cursor_pos
|
||
+ );
|
||
+ }
|
||
+ }
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_whitespace_boundaries() {
|
||
+ let test_cases = vec![
|
||
+ // Space boundaries
|
||
+ ("aaa@aaa", 4, None, "Connected @ token - no completion by design"),
|
||
+ ("aaa @aaa", 5, Some("aaa".to_string()), "@ token after space"),
|
||
+ ("test @file.txt", 7, Some("file.txt".to_string()), "@ token after space"),
|
||
+
|
||
+ // Full-width space boundaries
|
||
+ ("test @İstanbul", 6, Some("İstanbul".to_string()), "@ token after full-width space"),
|
||
+ ("@ЙЦУ @诶", 6, Some("诶".to_string()), "Full-width space between Unicode tokens"),
|
||
+
|
||
+ // Tab and newline boundaries
|
||
+ ("test\t@file", 6, Some("file".to_string()), "@ token after tab"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for whitespace boundary case: {} - input: '{}', cursor: {}",
|
||
+ description, input, cursor_pos
|
||
+ );
|
||
```
|
||
|
||
> ```suggestion
|
||
> assert_eq!(
|
||
> result, expected,
|
||
> "Failed for whitespace boundary case: {description} - input: '{input}', cursor: {cursor_pos}",
|
||
> );
|
||
> ```
|
||
|
||
- Created: 2025-07-07 20:31:43 UTC | Link: https://github.com/openai/codex/pull/1467#discussion_r2190963121
|
||
|
||
```diff
|
||
@@ -618,3 +624,107 @@ impl WidgetRef for &ChatComposer<'_> {
|
||
}
|
||
}
|
||
}
|
||
+
|
||
+#[cfg(test)]
|
||
+mod tests {
|
||
+ use crate::bottom_pane::ChatComposer;
|
||
+ use tui_textarea::TextArea;
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_basic_cases() {
|
||
+ let test_cases = vec![
|
||
+ // Valid @ tokens
|
||
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
|
||
+ ("@file.txt", 4, Some("file.txt".to_string()), "ASCII with extension"),
|
||
+ ("hello @world test", 8, Some("world".to_string()), "ASCII token in middle"),
|
||
+ ("@test123", 5, Some("test123".to_string()), "ASCII with numbers"),
|
||
+
|
||
+ // Unicode examples
|
||
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
|
||
+ ("@testЙЦУ.rs", 8, Some("testЙЦУ.rs".to_string()), "Mixed ASCII and Cyrillic"),
|
||
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
|
||
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
|
||
+
|
||
+ // Invalid cases (should return None)
|
||
+ ("hello", 2, None, "No @ symbol"),
|
||
+ ("@", 1, None, "Only @ symbol"),
|
||
+ ("@ hello", 2, None, "@ followed by space"),
|
||
+ ("test @ world", 6, None, "@ with spaces around"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for case: {} - input: '{}', cursor: {}",
|
||
+ description, input, cursor_pos
|
||
+ );
|
||
+ }
|
||
+ }
|
||
+
|
||
+ #[test]
|
||
+ fn test_current_at_token_cursor_positions() {
|
||
+ let test_cases = vec![
|
||
+ // Different cursor positions within a token
|
||
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
|
||
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
|
||
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
|
||
+
|
||
+ // Multiple tokens - cursor determines which token
|
||
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
|
||
+ ("@file1 @file2", 8, Some("file2".to_string()), "Second token"),
|
||
+
|
||
+ // Edge cases
|
||
+ ("@", 0, None, "Only @ symbol"),
|
||
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
|
||
+ ("", 0, None, "Empty input"),
|
||
+ ];
|
||
+
|
||
+ for (input, cursor_pos, expected, description) in test_cases {
|
||
+ let mut textarea = TextArea::default();
|
||
+ textarea.insert_str(input);
|
||
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
|
||
+
|
||
+ let result = ChatComposer::current_at_token(&textarea);
|
||
+ assert_eq!(
|
||
+ result, expected,
|
||
+ "Failed for cursor position case: {} - input: '{}', cursor: {}",
|
||
+ description, input, cursor_pos
|
||
+ );
|
||
```
|
||
|
||
> ```suggestion
|
||
> assert_eq!(
|
||
> result, expected,
|
||
> "Failed for cursor position case: {description} - input: '{input}', cursor: {cursor_pos}",
|
||
> );
|
||
> ``` |