Move shell to use truncate_text (#6842)

Move shell to use the configurable `truncate_text` --------- Co-authored-by: pakrym-oai <pakrym@openai.com>
2026-04-30 19:32:04 +03:00 · 2025-11-19 01:56:08 -08:00
parent 75f38f16dd
commit efebc62fb7
16 changed files with 583 additions and 498 deletions
--- a/codex-rs/core/tests/suite/truncation.rs
+++ b/codex-rs/core/tests/suite/truncation.rs
@@ -101,17 +101,16 @@ async fn truncate_function_error_trims_respond_to_model() -> Result<()> {
 // Verifies that a standard tool call (shell) exceeding the model formatting
 // limits is truncated before being sent back to the model.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
+async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;

    // Use a model that exposes the generic shell tool.
-    let mut builder = test_codex().with_config(|config| {
-        config.model = "gpt-5.1-codex".to_string();
-        config.model_family =
-            find_family_for_model("gpt-5.1-codex").expect("gpt-5.1-codex is a model family");
+    let mut builder = test_codex().with_model("gpt-5.1").with_config(|config| {
+        config.tool_output_token_limit = Some(100_000);
    });
+
    let fixture = builder.build(&server).await?;

    let call_id = "shell-too-large";
@@ -120,13 +119,13 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
            "command": [
                "powershell",
                "-Command",
-                "for ($i=1; $i -le 400; $i++) { Write-Output $i }"
+                "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
            ],
            "timeout_ms": 5_000,
        })
    } else {
        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 400"],
+            "command": ["/bin/sh", "-c", "seq 1 100000"],
            "timeout_ms": 5_000,
        })
    };
@@ -162,14 +161,169 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
        .context("function_call_output present for shell call")?;
    let output = output.replace("\r\n", "\n");

-    // Expect plain text (not JSON) with truncation markers and line elision.
+    // Expect plain text (not JSON) containing the entire shell output.
+    assert!(
+        serde_json::from_str::<Value>(&output).is_err(),
+        "expected truncated shell output to be plain text"
+    );
+
+    assert_eq!(output.len(), 400094, "we should be almost 100k tokens");
+
+    assert!(
+        !output.contains("tokens truncated"),
+        "shell output should not contain tokens truncated marker: {output}"
+    );
+
+    Ok(())
+}
+
+// Verifies that a standard tool call (shell) exceeding the model formatting
+// limits is truncated before being sent back to the model.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+
+    // Use a model that exposes the generic shell tool.
+    let mut builder = test_codex().with_model("gpt-5.1");
+
+    let fixture = builder.build(&server).await?;
+
+    let call_id = "shell-too-large";
+    let args = if cfg!(windows) {
+        serde_json::json!({
+            "command": [
+                "powershell",
+                "-Command",
+                "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
+            ],
+            "timeout_ms": 5_000,
+        })
+    } else {
+        serde_json::json!({
+            "command": ["/bin/sh", "-c", "seq 1 100000"],
+            "timeout_ms": 5_000,
+        })
+    };
+
+    // First response: model tells us to run the tool; second: complete the turn.
+    mount_sse_once(
+        &server,
+        sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let mock2 = mount_sse_once(
+        &server,
+        sse(vec![
+            responses::ev_assistant_message("msg-1", "done"),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    fixture
+        .submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess)
+        .await?;
+
+    // Inspect what we sent back to the model; it should contain a truncated
+    // function_call_output for the shell call.
+    let output = mock2
+        .single_request()
+        .function_call_output_text(call_id)
+        .context("function_call_output present for shell call")?;
+    let output = output.replace("\r\n", "\n");
+
+    // Expect plain text (not JSON) containing the entire shell output.
+    assert!(
+        serde_json::from_str::<Value>(&output).is_err(),
+        "expected truncated shell output to be plain text"
+    );
+
+    assert_eq!(output.len(), 9976); // ~10k characters
+    let truncated_pattern = r#"(?s)^Exit code: 0\nWall time: 0 seconds\nTotal output lines: 100000\n.*?…578898 chars truncated….*$"#;
+
+    assert_regex_match(truncated_pattern, &output);
+
+    Ok(())
+}
+
+// Verifies that a standard tool call (shell) exceeding the model formatting
+// limits is truncated before being sent back to the model.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+
+    // Use a model that exposes the generic shell tool.
+    let mut builder = test_codex().with_config(|config| {
+        config.model = "gpt-5.1-codex".to_string();
+        config.model_family =
+            find_family_for_model("gpt-5.1-codex").expect("gpt-5.1-codex is a model family");
+    });
+    let fixture = builder.build(&server).await?;
+
+    let call_id = "shell-too-large";
+    let args = if cfg!(windows) {
+        serde_json::json!({
+            "command": [
+                "powershell",
+                "-Command",
+                "for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
+            ],
+            "timeout_ms": 5_000,
+        })
+    } else {
+        serde_json::json!({
+            "command": ["/bin/sh", "-c", "seq 1 100000"],
+            "timeout_ms": 5_000,
+        })
+    };
+
+    // First response: model tells us to run the tool; second: complete the turn.
+    mount_sse_once(
+        &server,
+        sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let mock2 = mount_sse_once(
+        &server,
+        sse(vec![
+            responses::ev_assistant_message("msg-1", "done"),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    fixture
+        .submit_turn_with_policy("trigger big shell output", SandboxPolicy::DangerFullAccess)
+        .await?;
+
+    // Inspect what we sent back to the model; it should contain a truncated
+    // function_call_output for the shell call.
+    let output = mock2
+        .single_request()
+        .function_call_output_text(call_id)
+        .context("function_call_output present for shell call")?;
+    let output = output.replace("\r\n", "\n");
+
+    // Expect plain text (not JSON) containing the entire shell output.
    assert!(
        serde_json::from_str::<Value>(&output).is_err(),
        "expected truncated shell output to be plain text"
    );
    let truncated_pattern = r#"(?s)^Exit code: 0
-Wall time: .* seconds
-Total output lines: 400
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
+Total output lines: 100000
 Output:
 1
 2
@@ -177,15 +331,9 @@ Output:
 4
 5
 6
-.*
-\[\.{3} omitted 144 of 400 lines \.{3}\]
-
-.*
-396
-397
-398
-399
-400
+.*…137224 tokens truncated.*
+99999
+100000
 $"#;
    assert_regex_match(truncated_pattern, &output);

@@ -211,13 +359,13 @@ async fn tool_call_output_truncated_only_once() -> Result<()> {
            "command": [
                "powershell",
                "-Command",
-                "for ($i=1; $i -le 2000; $i++) { Write-Output $i }"
+                "for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
            ],
            "timeout_ms": 5_000,
        })
    } else {
        serde_json::json!({
-            "command": ["/bin/sh", "-c", "seq 1 2000"],
+            "command": ["/bin/sh", "-c", "seq 1 10000"],
            "timeout_ms": 5_000,
        })
    };
@@ -249,11 +397,11 @@ async fn tool_call_output_truncated_only_once() -> Result<()> {
        .function_call_output_text(call_id)
        .context("function_call_output present for shell call")?;

-    let total_line_headers = output.matches("Total output lines:").count();
+    let truncation_markers = output.matches("tokens truncated").count();

    assert_eq!(
-        total_line_headers, 1,
-        "shell output should carry only one truncation header: {output}"
+        truncation_markers, 1,
+        "shell output should carry only one truncation marker: {output}"
    );

    Ok(())
@@ -321,6 +469,7 @@ async fn mcp_tool_call_output_exceeds_limit_truncated_for_model() -> Result<()>
                disabled_tools: None,
            },
        );
+        config.tool_output_token_limit = Some(500);
    });
    let fixture = builder.build(&server).await?;

@@ -337,12 +486,6 @@ async fn mcp_tool_call_output_exceeds_limit_truncated_for_model() -> Result<()>
        .function_call_output_text(call_id)
        .context("function_call_output present for rmcp call")?;

-    // Expect plain text with token-based truncation marker; the original JSON body
-    // is truncated in the middle of the echo string.
-    assert!(
-        serde_json::from_str::<Value>(&output).is_err(),
-        "expected truncated MCP output to be plain text"
-    );
    assert!(
        !output.contains("Total output lines:"),
        "MCP output should not include line-based truncation header: {output}"
@@ -350,6 +493,7 @@ async fn mcp_tool_call_output_exceeds_limit_truncated_for_model() -> Result<()>

    let truncated_pattern = r#"(?s)^\{"echo":\s*"ECHOING: long-message-with-newlines-.*tokens truncated.*long-message-with-newlines-.*$"#;
    assert_regex_match(truncated_pattern, &output);
+    assert!(output.len() < 2500, "{}", output.len());

    Ok(())
 }
@@ -501,7 +645,9 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
        .function_call_output_text(call_id)
        .context("shell output present")?;

-    assert_regex_match(r"\[\u{2026}127 tokens truncated\u{2026}]", &output);
+    let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5\\n.*?…\d+ tokens truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
+
+    assert_regex_match(pattern, &output);

    Ok(())
 }
@@ -552,14 +698,16 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
        .function_call_output_text(call_id)
        .context("shell output present")?;

-    assert_regex_match(r"\[\u{2026}505 bytes truncated\u{2026}]", &output);
+    let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5.*?…\d+ chars truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
+
+    assert_regex_match(pattern, &output);

    Ok(())
 }

-// Overriding config with a large token budget should avoid truncation.
+// Shell tool output should remain intact when the config opts into a large token budget.
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn large_budget_avoids_truncation() -> Result<()> {
+async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
    skip_if_no_network!(Ok(()));

    let server = start_mock_server().await;
@@ -576,6 +724,7 @@ async fn large_budget_avoids_truncation() -> Result<()> {
        "command": ["/bin/sh", "-c", "seq 1 1000"],
        "timeout_ms": 5_000,
    });
+    let expected_body: String = (1..=1000).map(|i| format!("{i}\n")).collect();

    mount_sse_once(
        &server,
@@ -607,6 +756,10 @@ async fn large_budget_avoids_truncation() -> Result<()> {
        .function_call_output_text(call_id)
        .context("shell output present")?;

+    assert!(
+        output.ends_with(&expected_body),
+        "expected entire shell output when budget increased: {output}"
+    );
    assert!(
        !output.contains("truncated"),
        "output should remain untruncated with ample budget"
@@ -614,3 +767,101 @@ async fn large_budget_avoids_truncation() -> Result<()> {

    Ok(())
 }
+
+// MCP server output should also remain intact when the config increases the token limit.
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+async fn mcp_tool_call_output_not_truncated_with_custom_limit() -> Result<()> {
+    skip_if_no_network!(Ok(()));
+
+    let server = start_mock_server().await;
+
+    let call_id = "rmcp-untruncated";
+    let server_name = "rmcp";
+    let tool_name = format!("mcp__{server_name}__echo");
+    let large_msg = "a".repeat(80_000);
+    let args_json = serde_json::json!({ "message": large_msg });
+
+    mount_sse_once(
+        &server,
+        sse(vec![
+            responses::ev_response_created("resp-1"),
+            responses::ev_function_call(call_id, &tool_name, &args_json.to_string()),
+            responses::ev_completed("resp-1"),
+        ]),
+    )
+    .await;
+    let mock2 = mount_sse_once(
+        &server,
+        sse(vec![
+            responses::ev_assistant_message("msg-1", "rmcp echo tool completed."),
+            responses::ev_completed("resp-2"),
+        ]),
+    )
+    .await;
+
+    let rmcp_test_server_bin = CargoBuild::new()
+        .package("codex-rmcp-client")
+        .bin("test_stdio_server")
+        .run()?
+        .path()
+        .to_string_lossy()
+        .into_owned();
+
+    let mut builder = test_codex().with_config(move |config| {
+        config.features.enable(Feature::RmcpClient);
+        config.tool_output_token_limit = Some(50_000);
+        config.mcp_servers.insert(
+            server_name.to_string(),
+            codex_core::config::types::McpServerConfig {
+                transport: codex_core::config::types::McpServerTransportConfig::Stdio {
+                    command: rmcp_test_server_bin,
+                    args: Vec::new(),
+                    env: None,
+                    env_vars: Vec::new(),
+                    cwd: None,
+                },
+                enabled: true,
+                startup_timeout_sec: Some(std::time::Duration::from_secs(10)),
+                tool_timeout_sec: None,
+                enabled_tools: None,
+                disabled_tools: None,
+            },
+        );
+    });
+    let fixture = builder.build(&server).await?;
+
+    fixture
+        .submit_turn_with_policy(
+            "call the rmcp echo tool with a very large message",
+            SandboxPolicy::ReadOnly,
+        )
+        .await?;
+
+    let output = mock2
+        .single_request()
+        .function_call_output_text(call_id)
+        .context("function_call_output present for rmcp call")?;
+
+    let parsed: Value = serde_json::from_str(&output)?;
+    assert_eq!(
+        output.len(),
+        80031,
+        "parsed MCP output should retain its serialized length"
+    );
+    let expected_echo = format!("ECHOING: {large_msg}");
+    let echo_str = parsed["echo"]
+        .as_str()
+        .context("echo field should be a string in rmcp echo output")?;
+    assert_eq!(
+        echo_str.len(),
+        expected_echo.len(),
+        "echo length should match"
+    );
+    assert_eq!(echo_str, expected_echo);
+    assert!(
+        !output.contains("truncated"),
+        "output should not include truncation markers when limit is raised: {output}"
+    );
+
+    Ok(())
+}