mirror of
https://github.com/openai/codex.git
synced 2026-03-05 21:45:28 +03:00
Single pass truncation (#6914)
This commit is contained in:
@@ -185,6 +185,7 @@ fn truncate_with_byte_estimate(s: &str, policy: TruncationPolicy) -> String {
|
||||
if s.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
let total_chars = s.chars().count();
|
||||
let max_bytes = policy.byte_budget();
|
||||
|
||||
@@ -204,24 +205,55 @@ fn truncate_with_byte_estimate(s: &str, policy: TruncationPolicy) -> String {
|
||||
let total_bytes = s.len();
|
||||
|
||||
let (left_budget, right_budget) = split_budget(max_bytes);
|
||||
let prefix_end = pick_prefix_end(s, left_budget);
|
||||
let mut suffix_start = pick_suffix_start(s, right_budget);
|
||||
if suffix_start < prefix_end {
|
||||
suffix_start = prefix_end;
|
||||
}
|
||||
|
||||
let left_chars = s[..prefix_end].chars().count();
|
||||
let right_chars = s[suffix_start..].chars().count();
|
||||
let removed_chars = total_chars
|
||||
.saturating_sub(left_chars)
|
||||
.saturating_sub(right_chars);
|
||||
let (removed_chars, left, right) = split_string(s, left_budget, right_budget);
|
||||
|
||||
let marker = format_truncation_marker(
|
||||
policy,
|
||||
removed_units_for_source(policy, total_bytes.saturating_sub(max_bytes), removed_chars),
|
||||
);
|
||||
|
||||
assemble_truncated_output(&s[..prefix_end], &s[suffix_start..], &marker)
|
||||
assemble_truncated_output(left, right, &marker)
|
||||
}
|
||||
|
||||
fn split_string(s: &str, beginning_bytes: usize, end_bytes: usize) -> (usize, &str, &str) {
|
||||
if s.is_empty() {
|
||||
return (0, "", "");
|
||||
}
|
||||
|
||||
let len = s.len();
|
||||
let tail_start_target = len.saturating_sub(end_bytes);
|
||||
let mut prefix_end = 0usize;
|
||||
let mut suffix_start = len;
|
||||
let mut removed_chars = 0usize;
|
||||
let mut suffix_started = false;
|
||||
|
||||
for (idx, ch) in s.char_indices() {
|
||||
let char_end = idx + ch.len_utf8();
|
||||
if char_end <= beginning_bytes {
|
||||
prefix_end = char_end;
|
||||
continue;
|
||||
}
|
||||
|
||||
if idx >= tail_start_target {
|
||||
if !suffix_started {
|
||||
suffix_start = idx;
|
||||
suffix_started = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
removed_chars = removed_chars.saturating_add(1);
|
||||
}
|
||||
|
||||
if suffix_start < prefix_end {
|
||||
suffix_start = prefix_end;
|
||||
}
|
||||
|
||||
let before = &s[..prefix_end];
|
||||
let after = &s[suffix_start..];
|
||||
|
||||
(removed_chars, before, after)
|
||||
}
|
||||
|
||||
fn format_truncation_marker(policy: TruncationPolicy, removed_count: u64) -> String {
|
||||
@@ -270,42 +302,54 @@ fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
|
||||
/ (APPROX_BYTES_PER_TOKEN as u64)
|
||||
}
|
||||
|
||||
fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
|
||||
if input.len() <= max_len {
|
||||
return input;
|
||||
}
|
||||
let mut end = max_len;
|
||||
while end > 0 && !input.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
}
|
||||
&input[..end]
|
||||
}
|
||||
|
||||
fn pick_prefix_end(s: &str, left_budget: usize) -> usize {
|
||||
truncate_on_boundary(s, left_budget).len()
|
||||
}
|
||||
|
||||
fn pick_suffix_start(s: &str, right_budget: usize) -> usize {
|
||||
let start_tail = s.len().saturating_sub(right_budget);
|
||||
let mut idx = start_tail.min(s.len());
|
||||
while idx < s.len() && !s.is_char_boundary(idx) {
|
||||
idx += 1;
|
||||
}
|
||||
idx
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::TruncationPolicy;
|
||||
use super::approx_token_count;
|
||||
use super::formatted_truncate_text;
|
||||
use super::split_string;
|
||||
use super::truncate_function_output_items_with_policy;
|
||||
use super::truncate_text;
|
||||
use super::truncate_with_token_budget;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
#[test]
|
||||
fn split_string_works() {
|
||||
assert_eq!(split_string("hello world", 5, 5), (1, "hello", "world"));
|
||||
assert_eq!(split_string("abc", 0, 0), (3, "", ""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn split_string_handles_empty_string() {
|
||||
assert_eq!(split_string("", 4, 4), (0, "", ""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn split_string_only_keeps_prefix_when_tail_budget_is_zero() {
|
||||
assert_eq!(split_string("abcdef", 3, 0), (3, "abc", ""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn split_string_only_keeps_suffix_when_prefix_budget_is_zero() {
|
||||
assert_eq!(split_string("abcdef", 0, 3), (3, "", "def"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn split_string_handles_overlapping_budgets_without_removal() {
|
||||
assert_eq!(split_string("abcdef", 4, 4), (0, "abcd", "ef"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn split_string_respects_utf8_boundaries() {
|
||||
assert_eq!(split_string("😀abc😀", 5, 5), (1, "😀a", "c😀"));
|
||||
|
||||
assert_eq!(split_string("😀😀😀😀😀", 1, 1), (5, "", ""));
|
||||
assert_eq!(split_string("😀😀😀😀😀", 7, 7), (3, "😀", "😀"));
|
||||
assert_eq!(split_string("😀😀😀😀😀", 8, 8), (1, "😀😀", "😀😀"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_bytes_less_than_placeholder_returns_placeholder() {
|
||||
let content = "example output";
|
||||
|
||||
Reference in New Issue
Block a user