Files
codex/prs/bolinfest/PR-1467.md
2025-09-02 15:17:45 -07:00

460 lines
18 KiB
Markdown
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# PR #1467: Fix Unicode handling in chat_composer "@" token detection
- URL: https://github.com/openai/codex/pull/1467
- Author: ryozi-tn
- Created: 2025-07-06 18:00:10 UTC
- Updated: 2025-07-07 20:43:39 UTC
- Changes: +175/-18, Files changed: 1, Commits: 4
## Description
## Issues Fixed
- **Primary Issue (#1450)**: Unicode cursor positioning was incorrect due to mixing character positions with byte positions
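- **Additional Issue**: Full-width spaces (CJK whitespace such as U+3000 IDEOGRAPHIC SPACE, "　") weren't properly handled as token boundaries: the token start was computed as the whitespace's byte index + 1, which is only correct for single-byte whitespace and lands inside a multi-byte whitespace character
- ref: `char::is_whitespace` (true for any character with the Unicode `White_Space` property, including U+3000): https://doc.rust-lang.org/std/primitive.char.html#method.is_whitespace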
## Full Diff
```diff
diff --git a/codex-rs/tui/src/bottom_pane/chat_composer.rs b/codex-rs/tui/src/bottom_pane/chat_composer.rs
index 59d6e4579d..cd8e9fa17f 100644
--- a/codex-rs/tui/src/bottom_pane/chat_composer.rs
+++ b/codex-rs/tui/src/bottom_pane/chat_composer.rs
@@ -290,26 +290,28 @@ impl ChatComposer<'_> {
// Guard against out-of-bounds rows.
let line = textarea.lines().get(row)?.as_str();
- // Clamp the cursor column to the line length to avoid slicing panics
- // when the cursor is at the end of the line.
- let col = col.min(line.len());
+ // Calculate byte offset for cursor position
+ let cursor_byte_offset = line.chars().take(col).map(|c| c.len_utf8()).sum::<usize>();
// Split the line at the cursor position so we can search for word
// boundaries on both sides.
- let before_cursor = &line[..col];
- let after_cursor = &line[col..];
+ let before_cursor = &line[..cursor_byte_offset];
+ let after_cursor = &line[cursor_byte_offset..];
- // Find start index (first character **after** the previous whitespace).
+ // Find start index (first character **after** the previous multi-byte whitespace).
let start_idx = before_cursor
- .rfind(|c: char| c.is_whitespace())
- .map(|idx| idx + 1)
+ .char_indices()
+ .rfind(|(_, c)| c.is_whitespace())
+ .map(|(idx, c)| idx + c.len_utf8())
.unwrap_or(0);
- // Find end index (first whitespace **after** the cursor position).
+ // Find end index (first multi-byte whitespace **after** the cursor position).
let end_rel_idx = after_cursor
- .find(|c: char| c.is_whitespace())
+ .char_indices()
+ .find(|(_, c)| c.is_whitespace())
+ .map(|(idx, _)| idx)
.unwrap_or(after_cursor.len());
- let end_idx = col + end_rel_idx;
+ let end_idx = cursor_byte_offset + end_rel_idx;
if start_idx >= end_idx {
return None;
@@ -336,21 +338,25 @@ impl ChatComposer<'_> {
let mut lines: Vec<String> = self.textarea.lines().to_vec();
if let Some(line) = lines.get_mut(row) {
- let col = col.min(line.len());
+ // Calculate byte offset for cursor position
+ let cursor_byte_offset = line.chars().take(col).map(|c| c.len_utf8()).sum::<usize>();
- let before_cursor = &line[..col];
- let after_cursor = &line[col..];
+ let before_cursor = &line[..cursor_byte_offset];
+ let after_cursor = &line[cursor_byte_offset..];
// Determine token boundaries.
let start_idx = before_cursor
- .rfind(|c: char| c.is_whitespace())
- .map(|idx| idx + 1)
+ .char_indices()
+ .rfind(|(_, c)| c.is_whitespace())
+ .map(|(idx, c)| idx + c.len_utf8())
.unwrap_or(0);
let end_rel_idx = after_cursor
- .find(|c: char| c.is_whitespace())
+ .char_indices()
+ .find(|(_, c)| c.is_whitespace())
+ .map(|(idx, _)| idx)
.unwrap_or(after_cursor.len());
- let end_idx = col + end_rel_idx;
+ let end_idx = cursor_byte_offset + end_rel_idx;
// Replace the slice `[start_idx, end_idx)` with the chosen path and a trailing space.
let mut new_line =
@@ -618,3 +624,154 @@ impl WidgetRef for &ChatComposer<'_> {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::bottom_pane::ChatComposer;
+ use tui_textarea::TextArea;
+
+ #[test]
+ fn test_current_at_token_basic_cases() {
+ let test_cases = vec![
+ // Valid @ tokens
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
+ (
+ "@file.txt",
+ 4,
+ Some("file.txt".to_string()),
+ "ASCII with extension",
+ ),
+ (
+ "hello @world test",
+ 8,
+ Some("world".to_string()),
+ "ASCII token in middle",
+ ),
+ (
+ "@test123",
+ 5,
+ Some("test123".to_string()),
+ "ASCII with numbers",
+ ),
+ // Unicode examples
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
+ (
+ "@testЙЦУ.rs",
+ 8,
+ Some("testЙЦУ.rs".to_string()),
+ "Mixed ASCII and Cyrillic",
+ ),
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
+ // Invalid cases (should return None)
+ ("hello", 2, None, "No @ symbol"),
+ ("@", 1, None, "Only @ symbol"),
+ ("@ hello", 2, None, "@ followed by space"),
+ ("test @ world", 6, None, "@ with spaces around"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_cursor_positions() {
+ let test_cases = vec![
+ // Different cursor positions within a token
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
+ // Multiple tokens - cursor determines which token
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
+ (
+ "@file1 @file2",
+ 8,
+ Some("file2".to_string()),
+ "Second token",
+ ),
+ // Edge cases
+ ("@", 0, None, "Only @ symbol"),
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
+ ("", 0, None, "Empty input"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for cursor position case: {description} - input: '{input}', cursor: {cursor_pos}",
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_whitespace_boundaries() {
+ let test_cases = vec![
+ // Space boundaries
+ (
+ "aaa@aaa",
+ 4,
+ None,
+ "Connected @ token - no completion by design",
+ ),
+ (
+ "aaa @aaa",
+ 5,
+ Some("aaa".to_string()),
+ "@ token after space",
+ ),
+ (
+ "test @file.txt",
+ 7,
+ Some("file.txt".to_string()),
+ "@ token after space",
+ ),
+ // Full-width space boundaries
+ (
+ "test @İstanbul",
+ 6,
+ Some("İstanbul".to_string()),
+ "@ token after full-width space",
+ ),
+ (
+ "@ЙЦУ @诶",
+ 6,
+ Some("诶".to_string()),
+ "Full-width space between Unicode tokens",
+ ),
+ // Tab and newline boundaries
+ (
+ "test\t@file",
+ 6,
+ Some("file".to_string()),
+ "@ token after tab",
+ ),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for whitespace boundary case: {description} - input: '{input}', cursor: {cursor_pos}",
+ );
+ }
+ }
+}
```
## Review Comments
### codex-rs/tui/src/bottom_pane/chat_composer.rs
- Created: 2025-07-07 20:31:18 UTC | Link: https://github.com/openai/codex/pull/1467#discussion_r2190962611
```diff
@@ -618,3 +624,107 @@ impl WidgetRef for &ChatComposer<'_> {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::bottom_pane::ChatComposer;
+ use tui_textarea::TextArea;
+
+ #[test]
+ fn test_current_at_token_basic_cases() {
+ let test_cases = vec![
+ // Valid @ tokens
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
+ ("@file.txt", 4, Some("file.txt".to_string()), "ASCII with extension"),
+ ("hello @world test", 8, Some("world".to_string()), "ASCII token in middle"),
+ ("@test123", 5, Some("test123".to_string()), "ASCII with numbers"),
+
+ // Unicode examples
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
+ ("@testЙЦУ.rs", 8, Some("testЙЦУ.rs".to_string()), "Mixed ASCII and Cyrillic"),
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
+
+ // Invalid cases (should return None)
+ ("hello", 2, None, "No @ symbol"),
+ ("@", 1, None, "Only @ symbol"),
+ ("@ hello", 2, None, "@ followed by space"),
+ ("test @ world", 6, None, "@ with spaces around"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_cursor_positions() {
+ let test_cases = vec![
+ // Different cursor positions within a token
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
+
+ // Multiple tokens - cursor determines which token
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
+ ("@file1 @file2", 8, Some("file2".to_string()), "Second token"),
+
+ // Edge cases
+ ("@", 0, None, "Only @ symbol"),
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
+ ("", 0, None, "Empty input"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for cursor position case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_whitespace_boundaries() {
+ let test_cases = vec![
+ // Space boundaries
+ ("aaa@aaa", 4, None, "Connected @ token - no completion by design"),
+ ("aaa @aaa", 5, Some("aaa".to_string()), "@ token after space"),
+ ("test @file.txt", 7, Some("file.txt".to_string()), "@ token after space"),
+
+ // Full-width space boundaries
+ ("test @İstanbul", 6, Some("İstanbul".to_string()), "@ token after full-width space"),
+ ("@ЙЦУ @诶", 6, Some("诶".to_string()), "Full-width space between Unicode tokens"),
+
+ // Tab and newline boundaries
+ ("test\t@file", 6, Some("file".to_string()), "@ token after tab"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for whitespace boundary case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
```
> ```suggestion
> assert_eq!(
> result, expected,
> "Failed for whitespace boundary case: {description} - input: '{input}', cursor: {cursor_pos}",
> );
> ```
- Created: 2025-07-07 20:31:43 UTC | Link: https://github.com/openai/codex/pull/1467#discussion_r2190963121
```diff
@@ -618,3 +624,107 @@ impl WidgetRef for &ChatComposer<'_> {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::bottom_pane::ChatComposer;
+ use tui_textarea::TextArea;
+
+ #[test]
+ fn test_current_at_token_basic_cases() {
+ let test_cases = vec![
+ // Valid @ tokens
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
+ ("@file.txt", 4, Some("file.txt".to_string()), "ASCII with extension"),
+ ("hello @world test", 8, Some("world".to_string()), "ASCII token in middle"),
+ ("@test123", 5, Some("test123".to_string()), "ASCII with numbers"),
+
+ // Unicode examples
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
+ ("@testЙЦУ.rs", 8, Some("testЙЦУ.rs".to_string()), "Mixed ASCII and Cyrillic"),
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
+
+ // Invalid cases (should return None)
+ ("hello", 2, None, "No @ symbol"),
+ ("@", 1, None, "Only @ symbol"),
+ ("@ hello", 2, None, "@ followed by space"),
+ ("test @ world", 6, None, "@ with spaces around"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_cursor_positions() {
+ let test_cases = vec![
+ // Different cursor positions within a token
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
+
+ // Multiple tokens - cursor determines which token
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
+ ("@file1 @file2", 8, Some("file2".to_string()), "Second token"),
+
+ // Edge cases
+ ("@", 0, None, "Only @ symbol"),
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
+ ("", 0, None, "Empty input"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for cursor position case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
```
> ```suggestion
> assert_eq!(
> result, expected,
> "Failed for cursor position case: {description} - input: '{input}', cursor: {cursor_pos}",
> );
> ```