feat(app-server, core): allow text + image content items for dynamic tool outputs (#10567)

Took over the work that @aaronl-openai started here:
https://github.com/openai/codex/pull/10397

Now that app-server clients are able to set up custom tools (called
`dynamic_tools` in app-server), we should expose a way for clients to
pass in not just text, but also image outputs. This is something the
Responses API already supports for function call outputs, where you can
pass in either a string or an array of content outputs (text, image,
file):
https://platform.openai.com/docs/api-reference/responses/create#responses_create-input-input_item_list-item-function_tool_call_output-output-array-input_image

So let's just plumb it through in Codex (with the caveat that we only
support text and image for now). This is implemented end-to-end across
app-server v2 protocol types and core tool handling.

## Breaking API change
NOTE: This introduces a breaking change with dynamic tools, but I think
it's ok since this concept was only recently introduced
(https://github.com/openai/codex/pull/9539) and it's better to get the
API contract correct. I don't think there are any real consumers of this
yet (not even the Codex App).

Old shape:
`{ "output": "dynamic-ok", "success": true }`

New shape:
```
{
    "contentItems": [
      { "type": "inputText", "text": "dynamic-ok" },
      { "type": "inputImage", "imageUrl": "data:image/png;base64,AAA" }
    ]
  "success": true
}
```
This commit is contained in:
Owen Lin
2026-02-04 16:12:47 -08:00
committed by GitHub
parent f9c38f531c
commit 5ea107a088
49 changed files with 1103 additions and 468 deletions

View File

@@ -5,7 +5,7 @@ use crate::tools::TELEMETRY_PREVIEW_MAX_LINES;
use crate::tools::TELEMETRY_PREVIEW_TRUNCATION_NOTICE;
use crate::turn_diff_tracker::TurnDiffTracker;
use codex_protocol::mcp::CallToolResult;
use codex_protocol::models::FunctionCallOutputContentItem;
use codex_protocol::models::FunctionCallOutputBody;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::models::ShellToolCallParams;
@@ -58,10 +58,9 @@ impl ToolPayload {
#[derive(Clone)]
pub enum ToolOutput {
Function {
// Plain text representation of the tool output.
content: String,
// Some tool calls such as MCP calls may return structured content that can get parsed into an array of polymorphic content items.
content_items: Option<Vec<FunctionCallOutputContentItem>>,
// Canonical output body for function-style tools. This may be plain text
// or structured content items.
body: FunctionCallOutputBody,
success: Option<bool>,
},
Mcp {
@@ -72,7 +71,9 @@ pub enum ToolOutput {
impl ToolOutput {
pub fn log_preview(&self) -> String {
match self {
ToolOutput::Function { content, .. } => telemetry_preview(content),
ToolOutput::Function { body, .. } => {
telemetry_preview(&body.to_text().unwrap_or_default())
}
ToolOutput::Mcp { result } => format!("{result:?}"),
}
}
@@ -86,27 +87,28 @@ impl ToolOutput {
pub fn into_response(self, call_id: &str, payload: &ToolPayload) -> ResponseInputItem {
match self {
ToolOutput::Function {
content,
content_items,
success,
} => {
ToolOutput::Function { body, success } => {
// `custom_tool_call` is the Responses API item type for freeform
// tools (`ToolSpec::Freeform`, e.g. freeform `apply_patch`).
// Those payloads must round-trip as `custom_tool_call_output`
// with plain string output.
if matches!(payload, ToolPayload::Custom { .. }) {
ResponseInputItem::CustomToolCallOutput {
// Freeform/custom tools (`custom_tool_call`) use the custom
// output wire shape and remain string-only.
return ResponseInputItem::CustomToolCallOutput {
call_id: call_id.to_string(),
output: content,
}
} else {
ResponseInputItem::FunctionCallOutput {
call_id: call_id.to_string(),
output: FunctionCallOutputPayload {
content,
content_items,
success,
},
}
output: body.to_text().unwrap_or_default(),
};
}
// Function-style outputs (JSON function tools, including dynamic
// tools and MCP adaptation) preserve the exact body shape.
ResponseInputItem::FunctionCallOutput {
call_id: call_id.to_string(),
output: FunctionCallOutputPayload { body, success },
}
}
// Direct MCP response path for MCP tool result envelopes.
ToolOutput::Mcp { result } => ResponseInputItem::McpToolCallOutput {
call_id: call_id.to_string(),
result,
@@ -158,6 +160,7 @@ fn telemetry_preview(content: &str) -> String {
#[cfg(test)]
mod tests {
use super::*;
use codex_protocol::models::FunctionCallOutputContentItem;
use pretty_assertions::assert_eq;
#[test]
@@ -166,8 +169,7 @@ mod tests {
input: "patch".to_string(),
};
let response = ToolOutput::Function {
content: "patched".to_string(),
content_items: None,
body: FunctionCallOutputBody::Text("patched".to_string()),
success: Some(true),
}
.into_response("call-42", &payload);
@@ -187,8 +189,7 @@ mod tests {
arguments: "{}".to_string(),
};
let response = ToolOutput::Function {
content: "ok".to_string(),
content_items: None,
body: FunctionCallOutputBody::Text("ok".to_string()),
success: Some(true),
}
.into_response("fn-1", &payload);
@@ -196,14 +197,58 @@ mod tests {
match response {
ResponseInputItem::FunctionCallOutput { call_id, output } => {
assert_eq!(call_id, "fn-1");
assert_eq!(output.content, "ok");
assert!(output.content_items.is_none());
assert_eq!(output.text_content(), Some("ok"));
assert!(output.content_items().is_none());
assert_eq!(output.success, Some(true));
}
other => panic!("expected FunctionCallOutput, got {other:?}"),
}
}
#[test]
fn custom_tool_calls_can_derive_text_from_content_items() {
let payload = ToolPayload::Custom {
input: "patch".to_string(),
};
let response = ToolOutput::Function {
body: FunctionCallOutputBody::ContentItems(vec![
FunctionCallOutputContentItem::InputText {
text: "line 1".to_string(),
},
FunctionCallOutputContentItem::InputImage {
image_url: "data:image/png;base64,AAA".to_string(),
},
FunctionCallOutputContentItem::InputText {
text: "line 2".to_string(),
},
]),
success: Some(true),
}
.into_response("call-99", &payload);
match response {
ResponseInputItem::CustomToolCallOutput { call_id, output } => {
assert_eq!(call_id, "call-99");
assert_eq!(output, "line 1\nline 2");
}
other => panic!("expected CustomToolCallOutput, got {other:?}"),
}
}
#[test]
fn log_preview_uses_content_items_when_plain_text_is_missing() {
let output = ToolOutput::Function {
body: FunctionCallOutputBody::ContentItems(vec![
FunctionCallOutputContentItem::InputText {
text: "preview".to_string(),
},
]),
success: Some(true),
};
assert_eq!(output.log_preview(), "preview");
}
#[test]
fn telemetry_preview_returns_original_within_limits() {
let content = "short output";