//! Utilities for truncating large chunks of output while preserving a prefix //! and suffix on UTF-8 boundaries. const APPROX_BYTES_PER_TOKEN: usize = 4; /// Truncate a string to `max_bytes` using a character-count marker. pub fn truncate_middle_chars(s: &str, max_bytes: usize) -> String { truncate_with_byte_estimate(s, max_bytes, /*use_tokens*/ false) } /// Truncate the middle of a UTF-8 string to at most `max_tokens` approximate /// tokens, preserving the beginning and the end. Returns the possibly /// truncated string and `Some(original_token_count)` if truncation occurred; /// otherwise returns the original string and `None`. pub fn truncate_middle_with_token_budget(s: &str, max_tokens: usize) -> (String, Option) { if s.is_empty() { return (String::new(), None); } if max_tokens > 0 && s.len() <= approx_bytes_for_tokens(max_tokens) { return (s.to_string(), None); } let truncated = truncate_with_byte_estimate( s, approx_bytes_for_tokens(max_tokens), /*use_tokens*/ true, ); let total_tokens = u64::try_from(approx_token_count(s)).unwrap_or(u64::MAX); if truncated == s { (truncated, None) } else { (truncated, Some(total_tokens)) } } fn truncate_with_byte_estimate(s: &str, max_bytes: usize, use_tokens: bool) -> String { if s.is_empty() { return String::new(); } let total_chars = s.chars().count(); if max_bytes == 0 { return format_truncation_marker( use_tokens, removed_units(use_tokens, s.len(), total_chars), ); } if s.len() <= max_bytes { return s.to_string(); } let total_bytes = s.len(); let (left_budget, right_budget) = split_budget(max_bytes); let (removed_chars, left, right) = split_string(s, left_budget, right_budget); let marker = format_truncation_marker( use_tokens, removed_units( use_tokens, total_bytes.saturating_sub(max_bytes), removed_chars, ), ); assemble_truncated_output(left, right, &marker) } pub fn approx_token_count(text: &str) -> usize { let len = text.len(); len.saturating_add(APPROX_BYTES_PER_TOKEN.saturating_sub(1)) / APPROX_BYTES_PER_TOKEN } pub fn approx_bytes_for_tokens(tokens: usize) -> usize { tokens.saturating_mul(APPROX_BYTES_PER_TOKEN) } pub fn approx_tokens_from_byte_count(bytes: usize) -> u64 { let bytes_u64 = bytes as u64; bytes_u64.saturating_add((APPROX_BYTES_PER_TOKEN as u64).saturating_sub(1)) / (APPROX_BYTES_PER_TOKEN as u64) } fn split_string(s: &str, beginning_bytes: usize, end_bytes: usize) -> (usize, &str, &str) { if s.is_empty() { return (0, "", ""); } let len = s.len(); let tail_start_target = len.saturating_sub(end_bytes); let mut prefix_end = 0usize; let mut suffix_start = len; let mut removed_chars = 0usize; let mut suffix_started = false; for (idx, ch) in s.char_indices() { let char_end = idx + ch.len_utf8(); if char_end <= beginning_bytes { prefix_end = char_end; continue; } if idx >= tail_start_target { if !suffix_started { suffix_start = idx; suffix_started = true; } continue; } removed_chars = removed_chars.saturating_add(1); } if suffix_start < prefix_end { suffix_start = prefix_end; } let before = &s[..prefix_end]; let after = &s[suffix_start..]; (removed_chars, before, after) } fn split_budget(budget: usize) -> (usize, usize) { let left = budget / 2; (left, budget - left) } fn format_truncation_marker(use_tokens: bool, removed_count: u64) -> String { if use_tokens { format!("…{removed_count} tokens truncated…") } else { format!("…{removed_count} chars truncated…") } } fn removed_units(use_tokens: bool, removed_bytes: usize, removed_chars: usize) -> u64 { if use_tokens { approx_tokens_from_byte_count(removed_bytes) } else { u64::try_from(removed_chars).unwrap_or(u64::MAX) } } fn assemble_truncated_output(prefix: &str, suffix: &str, marker: &str) -> String { let mut out = String::with_capacity(prefix.len() + marker.len() + suffix.len() + 1); out.push_str(prefix); out.push_str(marker); out.push_str(suffix); out } #[cfg(test)] mod tests;