Files
codex/codex-rs/core/src/text_encoding.rs
Michael Bolin 0c8a36676a fix: move inline codex-rs/core unit tests into sibling files (#14444)
## Why
PR #13783 moved the `codex.rs` unit tests into `codex_tests.rs`. This
applies the same extraction pattern across the rest of `codex-rs/core`
so the production modules stay focused on runtime code instead of large
inline test blocks.

Keeping the tests in sibling files also makes follow-up edits easier to
review because product changes no longer have to share a file with
hundreds or thousands of lines of test scaffolding.

## What changed
- replaced each inline `mod tests { ... }` in `codex-rs/core/src/**`
with a path-based module declaration
- moved each extracted unit test module into a sibling `*_tests.rs`
file, using `mod_tests.rs` for `mod.rs` modules
- preserved the existing `cfg(...)` guards and module-local structure so
the refactor remains structural rather than behavioral

## Testing
- `cargo test -p codex-core --lib` (`1653 passed; 0 failed; 5 ignored`)
- `just fix -p codex-core`
- `cargo fmt --check`
- `cargo shear`
2026-03-12 08:16:36 -07:00

122 lines
5.3 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Text encoding detection and conversion utilities for shell output.
//!
//! Windows users frequently run into code pages such as CP1251 or CP866 when invoking commands
//! through VS Code. Those bytes show up as invalid UTF-8 and used to be replaced with the standard
//! Unicode replacement character. We now lean on `chardetng` and `encoding_rs` so we can
//! automatically detect and decode the vast majority of legacy encodings before falling back to
//! lossy UTF-8 decoding.
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
use encoding_rs::IBM866;
use encoding_rs::WINDOWS_1252;
/// Attempts to convert arbitrary bytes to UTF-8 with best-effort encoding detection.
///
/// Fast paths: empty input yields an empty `String`, and bytes that already form
/// valid UTF-8 are returned verbatim. Only invalid UTF-8 goes through the
/// (comparatively expensive) detection + transcoding pipeline.
pub fn bytes_to_string_smart(bytes: &[u8]) -> String {
    if bytes.is_empty() {
        return String::new();
    }
    match std::str::from_utf8(bytes) {
        Ok(valid_utf8) => valid_utf8.to_owned(),
        Err(_) => decode_bytes(bytes, detect_encoding(bytes)),
    }
}
// Windows-1252 reassigns a handful of 0x80-0x9F slots to smart punctuation (curly quotes, dashes,
// ™). CP866 uses those *same byte values* for uppercase Cyrillic letters. When chardetng sees shell
// snippets that mix these bytes with ASCII it sometimes guesses IBM866, so “smart quotes” render as
// Cyrillic garbage (“УФЦ”) in VS Code. However, CP866 uppercase tokens are perfectly valid output
// (e.g., `ПРИ test`) so we cannot flip every 0x80-0x9F byte to Windows-1252 either. The compromise
// is to only coerce IBM866 to Windows-1252 when (a) the high bytes are exclusively the punctuation
// values listed below and (b) we spot adjacent ASCII. This targets the real failure case without
// clobbering legitimate Cyrillic text. If another code page has a similar collision, introduce a
// dedicated allowlist (like this one) plus unit tests that capture the actual shell output we want
// to preserve.
//
// Windows-1252 byte values for smart punctuation.
const WINDOWS_1252_PUNCT_BYTES: [u8; 8] = [
    0x91, // ‘ (left single quotation mark)
    0x92, // ’ (right single quotation mark)
    0x93, // “ (left double quotation mark)
    0x94, // ” (right double quotation mark)
    0x95, // • (bullet)
    0x96, // – (en dash)
    0x97, // — (em dash)
    0x99, // ™ (trade mark sign)
];
/// Guesses the legacy encoding of `bytes` via chardetng, with one targeted override.
fn detect_encoding(bytes: &[u8]) -> &'static Encoding {
    let mut detector = EncodingDetector::new();
    detector.feed(bytes, true);
    let (guessed, _confident) = detector.guess_assess(None, true);
    // chardetng occasionally reports IBM866 for short strings that only contain Windows-1252 “smart
    // punctuation” bytes (0x80-0x9F) because that range maps to Cyrillic letters in IBM866. When
    // those bytes show up alongside an ASCII word (typical shell output: `"“`test), we know the
    // intent was likely CP1252 quotes/dashes. Prefer WINDOWS_1252 in that specific situation so we
    // render the characters users expect instead of Cyrillic junk. References:
    // - Windows-1252 reserving 0x80-0x9F for curly quotes/dashes:
    //   https://en.wikipedia.org/wiki/Windows-1252
    // - CP866 mapping 0x93/0x94/0x96 to Cyrillic letters, so the same bytes show up as “УФЦ” when
    //   mis-decoded: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
    if guessed == IBM866 && looks_like_windows_1252_punctuation(bytes) {
        WINDOWS_1252
    } else {
        guessed
    }
}
/// Decodes `bytes` with the given encoding, falling back to lossy UTF-8 (U+FFFD
/// replacement characters) if the decoder reports any malformed sequences.
fn decode_bytes(bytes: &[u8], encoding: &'static Encoding) -> String {
    let (decoded, _actual_encoding, had_errors) = encoding.decode(bytes);
    if had_errors {
        String::from_utf8_lossy(bytes).into_owned()
    } else {
        decoded.into_owned()
    }
}
/// Detect whether the byte stream looks like Windows-1252 “smart punctuation” wrapped around
/// otherwise-ASCII text.
///
/// Context: IBM866 and Windows-1252 share the 0x80-0x9F slot range. In IBM866 these bytes decode to
/// Cyrillic letters, whereas Windows-1252 maps them to curly quotes and dashes. chardetng can guess
/// IBM866 for short snippets that only contain those bytes, which turns shell output such as
/// `“test”` into unreadable Cyrillic. To avoid that, we treat inputs comprising a handful of bytes
/// from the problematic range plus ASCII letters as CP1252 punctuation. We deliberately do *not*
/// cap how many of those punctuation bytes we accept: VS Code frequently prints several quoted
/// phrases (e.g., `"foo" "bar"`), and truncating the count would once again mis-decode those as
/// Cyrillic. If we discover additional encodings with overlapping byte ranges, prefer adding
/// encoding-specific byte allowlists like `WINDOWS_1252_PUNCT_BYTES` and tests that exercise
/// real-world shell snippets.
///
/// Returns `true` only when *every* high byte (0x80-0x9F) is on the punctuation allowlist, no byte
/// is >= 0xA0, and both at least one punctuation byte and at least one ASCII letter are present.
fn looks_like_windows_1252_punctuation(bytes: &[u8]) -> bool {
    let mut has_punct = false;
    let mut has_ascii_letter = false;
    for &b in bytes {
        match b {
            // Anything at or above 0xA0 means this is not the CP1252-punctuation-plus-ASCII shape.
            0xA0..=0xFF => return false,
            // High bytes must all come from the smart-punctuation allowlist.
            0x80..=0x9F => {
                if !is_windows_1252_punct(b) {
                    return false;
                }
                has_punct = true;
            }
            _ => {
                if b.is_ascii_alphabetic() {
                    has_ascii_letter = true;
                }
            }
        }
    }
    has_punct && has_ascii_letter
}
/// True when `byte` is one of the Windows-1252 smart-punctuation values we allowlist.
fn is_windows_1252_punct(byte: u8) -> bool {
    WINDOWS_1252_PUNCT_BYTES.iter().any(|&punct| punct == byte)
}
#[cfg(test)]
#[path = "text_encoding_tests.rs"]
mod tests;