Support multimodal custom tool outputs (#12948)

## Summary

This changes `custom_tool_call_output` to use the same output payload shape as `function_call_output`, so freeform tools can return either plain text or structured content items.

The main goal is to let `js_repl` return image content from nested `view_image` calls in its own `custom_tool_call_output`, instead of relying on a separate injected message.

## What changed

- Changed `custom_tool_call_output.output` from `string` to `FunctionCallOutputPayload`
- Updated freeform tool plumbing to preserve structured output bodies
- Updated `js_repl` to aggregate nested tool content items and attach them to the outer `js_repl` result
- Removed the old `js_repl` special case that injected `view_image` results as a separate pending user image message
- Updated normalization/history/truncation paths to handle multimodal `custom_tool_call_output`
- Regenerated app-server protocol schema artifacts

## Behavior

Direct `view_image` calls still return a `function_call_output` with image content.

When `view_image` is called inside `js_repl`, the outer `js_repl` `custom_tool_call_output` now carries:

- an `input_text` item if the JS produced text output
- one or more `input_image` items from nested tool results

So the nested image result now stays inside the `js_repl` tool output instead of being injected as a separate message.

## Compatibility

This is intended to be backward-compatible for resumed conversations. Older histories that stored `custom_tool_call_output.output` as a plain string still deserialize correctly, and older histories that used the previous injected-image-message flow also continue to resume.

Added regression coverage for resuming a pre-change rollout containing:

- string-valued `custom_tool_call_output`
- legacy injected image message history

#### [git stack](https://github.com/magus/git-stack-cli)

- 👉 `1` https://github.com/openai/codex/pull/12948
2026-05-02 20:32:04 +03:00 · 2026-02-26 18:17:46 -08:00
parent f90e97e414
commit 7e980d7db6
20 changed files with 688 additions and 177 deletions
--- a/codex-rs/core/src/context_manager/history.rs
+++ b/codex-rs/core/src/context_manager/history.rs
@@ -344,32 +344,21 @@ impl ContextManager {
        let policy_with_serialization_budget = policy * 1.2;
        match item {
            ResponseItem::FunctionCallOutput { call_id, output } => {
-                let body = match &output.body {
-                    FunctionCallOutputBody::Text(content) => FunctionCallOutputBody::Text(
-                        truncate_text(content, policy_with_serialization_budget),
-                    ),
-                    FunctionCallOutputBody::ContentItems(items) => {
-                        FunctionCallOutputBody::ContentItems(
-                            truncate_function_output_items_with_policy(
-                                items,
-                                policy_with_serialization_budget,
-                            ),
-                        )
-                    }
-                };
                ResponseItem::FunctionCallOutput {
                    call_id: call_id.clone(),
-                    output: FunctionCallOutputPayload {
-                        body,
-                        success: output.success,
-                    },
+                    output: truncate_function_output_payload(
+                        output,
+                        policy_with_serialization_budget,
+                    ),
                }
            }
            ResponseItem::CustomToolCallOutput { call_id, output } => {
-                let truncated = truncate_text(output, policy_with_serialization_budget);
                ResponseItem::CustomToolCallOutput {
                    call_id: call_id.clone(),
-                    output: truncated,
+                    output: truncate_function_output_payload(
+                        output,
+                        policy_with_serialization_budget,
+                    ),
                }
            }
            ResponseItem::Message { .. }
@@ -385,6 +374,25 @@ impl ContextManager {
    }
 }

+fn truncate_function_output_payload(
+    output: &FunctionCallOutputPayload,
+    policy: TruncationPolicy,
+) -> FunctionCallOutputPayload {
+    let body = match &output.body {
+        FunctionCallOutputBody::Text(content) => {
+            FunctionCallOutputBody::Text(truncate_text(content, policy))
+        }
+        FunctionCallOutputBody::ContentItems(items) => FunctionCallOutputBody::ContentItems(
+            truncate_function_output_items_with_policy(items, policy),
+        ),
+    };
+
+    FunctionCallOutputPayload {
+        body,
+        success: output.success,
+    }
+}
+
 /// API messages include every non-system item (user/assistant messages, reasoning,
 /// tool calls, tool outputs, shell calls, and web-search calls).
 fn is_api_message(message: &ResponseItem) -> bool {
@@ -508,7 +516,8 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
                }
            }
        }
-        ResponseItem::FunctionCallOutput { output, .. } => {
+        ResponseItem::FunctionCallOutput { output, .. }
+        | ResponseItem::CustomToolCallOutput { output, .. } => {
            if let FunctionCallOutputBody::ContentItems(items) = &output.body {
                for content_item in items {
                    if let FunctionCallOutputContentItem::InputImage { image_url } = content_item {
--- a/codex-rs/core/src/context_manager/history_tests.rs
+++ b/codex-rs/core/src/context_manager/history_tests.rs
@@ -67,7 +67,7 @@ fn user_input_text_msg(text: &str) -> ResponseItem {
 fn custom_tool_call_output(call_id: &str, output: &str) -> ResponseItem {
    ResponseItem::CustomToolCallOutput {
        call_id: call_id.to_string(),
-        output: output.to_string(),
+        output: FunctionCallOutputPayload::from_text(output.to_string()),
    }
 }

@@ -279,6 +279,24 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
                },
            ]),
        },
+        ResponseItem::CustomToolCall {
+            id: None,
+            status: None,
+            call_id: "tool-1".to_string(),
+            name: "js_repl".to_string(),
+            input: "view_image".to_string(),
+        },
+        ResponseItem::CustomToolCallOutput {
+            call_id: "tool-1".to_string(),
+            output: FunctionCallOutputPayload::from_content_items(vec![
+                FunctionCallOutputContentItem::InputText {
+                    text: "js repl result".to_string(),
+                },
+                FunctionCallOutputContentItem::InputImage {
+                    image_url: "https://example.com/js-repl-result.png".to_string(),
+                },
+            ]),
+        },
    ];
    let history = create_history_with_items(items);
    let text_only_modalities = vec![InputModality::Text];
@@ -321,6 +339,25 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
                },
            ]),
        },
+        ResponseItem::CustomToolCall {
+            id: None,
+            status: None,
+            call_id: "tool-1".to_string(),
+            name: "js_repl".to_string(),
+            input: "view_image".to_string(),
+        },
+        ResponseItem::CustomToolCallOutput {
+            call_id: "tool-1".to_string(),
+            output: FunctionCallOutputPayload::from_content_items(vec![
+                FunctionCallOutputContentItem::InputText {
+                    text: "js repl result".to_string(),
+                },
+                FunctionCallOutputContentItem::InputText {
+                    text: "image content omitted because you do not support image input"
+                        .to_string(),
+                },
+            ]),
+        },
    ];
    assert_eq!(stripped, expected);

@@ -671,7 +708,7 @@ fn remove_first_item_handles_custom_tool_pair() {
        },
        ResponseItem::CustomToolCallOutput {
            call_id: "tool-1".to_string(),
-            output: "ok".to_string(),
+            output: FunctionCallOutputPayload::from_text("ok".to_string()),
        },
    ];
    let mut h = create_history_with_items(items);
@@ -750,7 +787,7 @@ fn record_items_truncates_custom_tool_call_output_content() {
    let long_output = line.repeat(2_500);
    let item = ResponseItem::CustomToolCallOutput {
        call_id: "tool-200".to_string(),
-        output: long_output.clone(),
+        output: FunctionCallOutputPayload::from_text(long_output.clone()),
    };

    history.record_items([&item], policy);
@@ -758,7 +795,8 @@ fn record_items_truncates_custom_tool_call_output_content() {
    assert_eq!(history.items.len(), 1);
    match &history.items[0] {
        ResponseItem::CustomToolCallOutput { output, .. } => {
-            assert_ne!(output, &long_output);
+            let output = output.text_content().unwrap_or_default();
+            assert_ne!(output, long_output);
            assert!(
                output.contains("tokens truncated"),
                "expected token-based truncation marker, got {output}"
@@ -949,7 +987,7 @@ fn normalize_adds_missing_output_for_custom_tool_call() {
            },
            ResponseItem::CustomToolCallOutput {
                call_id: "tool-x".to_string(),
-                output: "aborted".to_string(),
+                output: FunctionCallOutputPayload::from_text("aborted".to_string()),
            },
        ]
    );
@@ -1016,7 +1054,7 @@ fn normalize_removes_orphan_function_call_output() {
 fn normalize_removes_orphan_custom_tool_call_output() {
    let items = vec![ResponseItem::CustomToolCallOutput {
        call_id: "orphan-2".to_string(),
-        output: "ok".to_string(),
+        output: FunctionCallOutputPayload::from_text("ok".to_string()),
    }];
    let mut h = create_history_with_items(items);

@@ -1089,7 +1127,7 @@ fn normalize_mixed_inserts_and_removals() {
            },
            ResponseItem::CustomToolCallOutput {
                call_id: "t1".to_string(),
-                output: "aborted".to_string(),
+                output: FunctionCallOutputPayload::from_text("aborted".to_string()),
            },
            ResponseItem::LocalShellCall {
                id: None,
@@ -1191,7 +1229,7 @@ fn normalize_removes_orphan_function_call_output_panics_in_debug() {
 fn normalize_removes_orphan_custom_tool_call_output_panics_in_debug() {
    let items = vec![ResponseItem::CustomToolCallOutput {
        call_id: "orphan-2".to_string(),
-        output: "ok".to_string(),
+        output: FunctionCallOutputPayload::from_text("ok".to_string()),
    }];
    let mut h = create_history_with_items(items);
    h.normalize_history(&default_input_modalities());
@@ -1294,6 +1332,28 @@ fn image_data_url_payload_does_not_dominate_function_call_output_estimate() {
    assert!(estimated < raw_len);
 }

+#[test]
+fn image_data_url_payload_does_not_dominate_custom_tool_call_output_estimate() {
+    let payload = "C".repeat(50_000);
+    let image_url = format!("data:image/png;base64,{payload}");
+    let item = ResponseItem::CustomToolCallOutput {
+        call_id: "call-js-repl".to_string(),
+        output: FunctionCallOutputPayload::from_content_items(vec![
+            FunctionCallOutputContentItem::InputText {
+                text: "Screenshot captured".to_string(),
+            },
+            FunctionCallOutputContentItem::InputImage { image_url },
+        ]),
+    };
+
+    let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
+    let estimated = estimate_response_item_model_visible_bytes(&item);
+    let expected = raw_len - payload.len() as i64 + IMAGE_BYTES_ESTIMATE;
+
+    assert_eq!(estimated, expected);
+    assert!(estimated < raw_len);
+}
+
 #[test]
 fn non_base64_image_urls_are_unchanged() {
    let message_item = ResponseItem::Message {
--- a/codex-rs/core/src/context_manager/normalize.rs
+++ b/codex-rs/core/src/context_manager/normalize.rs
@@ -1,7 +1,6 @@
 use std::collections::HashSet;

 use codex_protocol::models::ContentItem;
-use codex_protocol::models::FunctionCallOutputBody;
 use codex_protocol::models::FunctionCallOutputContentItem;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ResponseItem;
@@ -35,10 +34,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
                        idx,
                        ResponseItem::FunctionCallOutput {
                            call_id: call_id.clone(),
-                            output: FunctionCallOutputPayload {
-                                body: FunctionCallOutputBody::Text("aborted".to_string()),
-                                ..Default::default()
-                            },
+                            output: FunctionCallOutputPayload::from_text("aborted".to_string()),
                        },
                    ));
                }
@@ -59,7 +55,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
                        idx,
                        ResponseItem::CustomToolCallOutput {
                            call_id: call_id.clone(),
-                            output: "aborted".to_string(),
+                            output: FunctionCallOutputPayload::from_text("aborted".to_string()),
                        },
                    ));
                }
@@ -82,10 +78,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
                            idx,
                            ResponseItem::FunctionCallOutput {
                                call_id: call_id.clone(),
-                                output: FunctionCallOutputPayload {
-                                    body: FunctionCallOutputBody::Text("aborted".to_string()),
-                                    ..Default::default()
-                                },
+                                output: FunctionCallOutputPayload::from_text("aborted".to_string()),
                            },
                        ));
                    }
@@ -245,7 +238,8 @@ pub(crate) fn strip_images_when_unsupported(
                }
                *content = normalized_content;
            }
-            ResponseItem::FunctionCallOutput { output, .. } => {
+            ResponseItem::FunctionCallOutput { output, .. }
+            | ResponseItem::CustomToolCallOutput { output, .. } => {
                if let Some(content_items) = output.content_items_mut() {
                    let mut normalized_content_items = Vec::with_capacity(content_items.len());
                    for content_item in content_items.iter() {