Migrate coverage to shell_command (#7042)

This commit is contained in:
pakrym-oai
2025-11-20 19:44:00 -08:00
committed by GitHub
parent 830ab4ce20
commit 767b66f407
25 changed files with 284 additions and 262 deletions

View File

@@ -98,7 +98,7 @@ async fn truncate_function_error_trims_respond_to_model() -> Result<()> {
Ok(())
}
// Verifies that a standard tool call (shell) exceeding the model formatting
// Verifies that a standard tool call (shell_command) exceeding the model formatting
// limits is truncated before being sent back to the model.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
@@ -106,7 +106,7 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
let server = start_mock_server().await;
// Use a model that exposes the generic shell tool.
// Use a model that exposes the shell_command tool.
let mut builder = test_codex().with_model("gpt-5.1").with_config(|config| {
config.tool_output_token_limit = Some(100_000);
});
@@ -114,28 +114,22 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
let fixture = builder.build(&server).await?;
let call_id = "shell-too-large";
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
let command = if cfg!(windows) {
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 100000"],
"timeout_ms": 5_000,
})
"seq 1 100000"
};
let args = serde_json::json!({
"command": command,
"timeout_ms": 5_000,
});
// First response: model tells us to run the tool; second: complete the turn.
mount_sse_once(
&server,
sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
responses::ev_completed("resp-1"),
]),
)
@@ -167,7 +161,10 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
"expected truncated shell output to be plain text"
);
assert_eq!(output.len(), 400097, "we should be almost 100k tokens");
assert!(
(400000..=401000).contains(&output.len()),
"we should be almost 100k tokens"
);
assert!(
!output.contains("tokens truncated"),
@@ -177,7 +174,7 @@ async fn tool_call_output_configured_limit_chars_type() -> Result<()> {
Ok(())
}
// Verifies that a standard tool call (shell) exceeding the model formatting
// Verifies that a standard tool call (shell_command) exceeding the model formatting
// limits is truncated before being sent back to the model.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
@@ -185,34 +182,28 @@ async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
let server = start_mock_server().await;
// Use a model that exposes the generic shell tool.
// Use a model that exposes the shell_command tool.
let mut builder = test_codex().with_model("gpt-5.1");
let fixture = builder.build(&server).await?;
let call_id = "shell-too-large";
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
let command = if cfg!(windows) {
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 100000"],
"timeout_ms": 5_000,
})
"seq 1 100000"
};
let args = serde_json::json!({
"command": command,
"timeout_ms": 5_000,
});
// First response: model tells us to run the tool; second: complete the turn.
mount_sse_once(
&server,
sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
responses::ev_completed("resp-1"),
]),
)
@@ -250,14 +241,14 @@ async fn tool_call_output_exceeds_limit_truncated_chars_limit() -> Result<()> {
let len = output.len();
assert!(
(9_900..=10_000).contains(&len),
(9_900..=10_100).contains(&len),
"expected ~10k chars after truncation, got {len}"
);
Ok(())
}
// Verifies that a standard tool call (shell) exceeding the model formatting
// Verifies that a standard tool call (shell_command) exceeding the model formatting
// limits is truncated before being sent back to the model.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
@@ -265,7 +256,7 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
let server = start_mock_server().await;
// Use a model that exposes the generic shell tool.
// Use a model that exposes the shell_command tool.
let mut builder = test_codex().with_config(|config| {
config.model = "gpt-5.1-codex".to_string();
config.model_family =
@@ -274,28 +265,22 @@ async fn tool_call_output_exceeds_limit_truncated_for_model() -> Result<()> {
let fixture = builder.build(&server).await?;
let call_id = "shell-too-large";
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
let command = if cfg!(windows) {
"for ($i=1; $i -le 100000; $i++) { Write-Output $i }"
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 100000"],
"timeout_ms": 5_000,
})
"seq 1 100000"
};
let args = serde_json::json!({
"command": command,
"timeout_ms": 5_000,
});
// First response: model tells us to run the tool; second: complete the turn.
mount_sse_once(
&server,
sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
responses::ev_completed("resp-1"),
]),
)
@@ -345,7 +330,7 @@ $"#;
Ok(())
}
// Ensures shell tool outputs that exceed the line limit are truncated only once.
// Ensures shell_command outputs that exceed the line limit are truncated only once.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn tool_call_output_truncated_only_once() -> Result<()> {
skip_if_no_network!(Ok(()));
@@ -359,27 +344,21 @@ async fn tool_call_output_truncated_only_once() -> Result<()> {
});
let fixture = builder.build(&server).await?;
let call_id = "shell-single-truncation";
let args = if cfg!(windows) {
serde_json::json!({
"command": [
"powershell",
"-Command",
"for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
],
"timeout_ms": 5_000,
})
let command = if cfg!(windows) {
"for ($i=1; $i -le 10000; $i++) { Write-Output $i }"
} else {
serde_json::json!({
"command": ["/bin/sh", "-c", "seq 1 10000"],
"timeout_ms": 5_000,
})
"seq 1 10000"
};
let args = serde_json::json!({
"command": command,
"timeout_ms": 5_000,
});
mount_sse_once(
&server,
sse(vec![
responses::ev_response_created("resp-1"),
responses::ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
responses::ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
responses::ev_completed("resp-1"),
]),
)
@@ -619,7 +598,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
let call_id = "shell-token-marker";
let args = json!({
"command": ["/bin/sh", "-c", "seq 1 150"],
"command": "seq 1 150",
"timeout_ms": 5_000,
});
@@ -627,7 +606,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
&server,
sse(vec![
ev_response_created("resp-1"),
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
ev_completed("resp-1"),
]),
)
@@ -650,7 +629,7 @@ async fn token_policy_marker_reports_tokens() -> Result<()> {
.function_call_output_text(call_id)
.context("shell output present")?;
let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5\\n.*?…\d+ tokens truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
let pattern = r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nTotal output lines: 150\nOutput:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19.*tokens truncated.*129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n$";
assert_regex_match(pattern, &output);
@@ -672,7 +651,7 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
let call_id = "shell-byte-marker";
let args = json!({
"command": ["/bin/sh", "-c", "seq 1 150"],
"command": "seq 1 150",
"timeout_ms": 5_000,
});
@@ -680,7 +659,7 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
&server,
sse(vec![
ev_response_created("resp-1"),
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
ev_completed("resp-1"),
]),
)
@@ -703,16 +682,16 @@ async fn byte_policy_marker_reports_bytes() -> Result<()> {
.function_call_output_text(call_id)
.context("shell output present")?;
let pattern = r#"(?s)^\{"output":"Total output lines: 150\\n\\n1\\n2\\n3\\n4\\n5.*?…\d+ chars truncated…7\\n138\\n139\\n140\\n141\\n142\\n143\\n144\\n145\\n146\\n147\\n148\\n149\\n150\\n","metadata":\{"exit_code":0,"duration_seconds":0\.0\}\}$"#;
let pattern = r"(?s)^Exit code: 0\nWall time: [0-9]+(?:\.[0-9]+)? seconds\nTotal output lines: 150\nOutput:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19.*chars truncated.*129\n130\n131\n132\n133\n134\n135\n136\n137\n138\n139\n140\n141\n142\n143\n144\n145\n146\n147\n148\n149\n150\n$";
assert_regex_match(pattern, &output);
Ok(())
}
// Shell tool output should remain intact when the config opts into a large token budget.
// shell_command output should remain intact when the config opts into a large token budget.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
async fn shell_command_output_not_truncated_with_custom_limit() -> Result<()> {
skip_if_no_network!(Ok(()));
let server = start_mock_server().await;
@@ -726,7 +705,7 @@ async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
let call_id = "shell-no-trunc";
let args = json!({
"command": ["/bin/sh", "-c", "seq 1 1000"],
"command": "seq 1 1000",
"timeout_ms": 5_000,
});
let expected_body: String = (1..=1000).map(|i| format!("{i}\n")).collect();
@@ -735,7 +714,7 @@ async fn shell_tool_output_not_truncated_with_custom_limit() -> Result<()> {
&server,
sse(vec![
ev_response_created("resp-1"),
ev_function_call(call_id, "shell", &serde_json::to_string(&args)?),
ev_function_call(call_id, "shell_command", &serde_json::to_string(&args)?),
ev_completed("resp-1"),
]),
)