mirror of
https://github.com/openai/codex.git
synced 2026-05-02 20:32:04 +03:00
Support multimodal custom tool outputs (#12948)
## Summary

This changes `custom_tool_call_output` to use the same output payload shape as `function_call_output`, so freeform tools can return either plain text or structured content items.

The main goal is to let `js_repl` return image content from nested `view_image` calls in its own `custom_tool_call_output`, instead of relying on a separate injected message.

## What changed

- Changed `custom_tool_call_output.output` from `string` to `FunctionCallOutputPayload`
- Updated freeform tool plumbing to preserve structured output bodies
- Updated `js_repl` to aggregate nested tool content items and attach them to the outer `js_repl` result
- Removed the old `js_repl` special case that injected `view_image` results as a separate pending user image message
- Updated normalization/history/truncation paths to handle multimodal `custom_tool_call_output`
- Regenerated app-server protocol schema artifacts

## Behavior

Direct `view_image` calls still return a `function_call_output` with image content.

When `view_image` is called inside `js_repl`, the outer `js_repl` `custom_tool_call_output` now carries:

- an `input_text` item if the JS produced text output
- one or more `input_image` items from nested tool results

So the nested image result now stays inside the `js_repl` tool output instead of being injected as a separate message.

## Compatibility

This is intended to be backward-compatible for resumed conversations. Older histories that stored `custom_tool_call_output.output` as a plain string still deserialize correctly, and older histories that used the previous injected-image-message flow also continue to resume.

Added regression coverage for resuming a pre-change rollout containing:

- string-valued `custom_tool_call_output`
- legacy injected image message history

#### [git stack](https://github.com/magus/git-stack-cli)

- 👉 `1` https://github.com/openai/codex/pull/12948
This commit is contained in:
committed by
GitHub
parent
f90e97e414
commit
7e980d7db6
@@ -344,32 +344,21 @@ impl ContextManager {
|
||||
let policy_with_serialization_budget = policy * 1.2;
|
||||
match item {
|
||||
ResponseItem::FunctionCallOutput { call_id, output } => {
|
||||
let body = match &output.body {
|
||||
FunctionCallOutputBody::Text(content) => FunctionCallOutputBody::Text(
|
||||
truncate_text(content, policy_with_serialization_budget),
|
||||
),
|
||||
FunctionCallOutputBody::ContentItems(items) => {
|
||||
FunctionCallOutputBody::ContentItems(
|
||||
truncate_function_output_items_with_policy(
|
||||
items,
|
||||
policy_with_serialization_budget,
|
||||
),
|
||||
)
|
||||
}
|
||||
};
|
||||
ResponseItem::FunctionCallOutput {
|
||||
call_id: call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
body,
|
||||
success: output.success,
|
||||
},
|
||||
output: truncate_function_output_payload(
|
||||
output,
|
||||
policy_with_serialization_budget,
|
||||
),
|
||||
}
|
||||
}
|
||||
ResponseItem::CustomToolCallOutput { call_id, output } => {
|
||||
let truncated = truncate_text(output, policy_with_serialization_budget);
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: call_id.clone(),
|
||||
output: truncated,
|
||||
output: truncate_function_output_payload(
|
||||
output,
|
||||
policy_with_serialization_budget,
|
||||
),
|
||||
}
|
||||
}
|
||||
ResponseItem::Message { .. }
|
||||
@@ -385,6 +374,25 @@ impl ContextManager {
|
||||
}
|
||||
}
|
||||
|
||||
fn truncate_function_output_payload(
|
||||
output: &FunctionCallOutputPayload,
|
||||
policy: TruncationPolicy,
|
||||
) -> FunctionCallOutputPayload {
|
||||
let body = match &output.body {
|
||||
FunctionCallOutputBody::Text(content) => {
|
||||
FunctionCallOutputBody::Text(truncate_text(content, policy))
|
||||
}
|
||||
FunctionCallOutputBody::ContentItems(items) => FunctionCallOutputBody::ContentItems(
|
||||
truncate_function_output_items_with_policy(items, policy),
|
||||
),
|
||||
};
|
||||
|
||||
FunctionCallOutputPayload {
|
||||
body,
|
||||
success: output.success,
|
||||
}
|
||||
}
|
||||
|
||||
/// API messages include every non-system item (user/assistant messages, reasoning,
|
||||
/// tool calls, tool outputs, shell calls, and web-search calls).
|
||||
fn is_api_message(message: &ResponseItem) -> bool {
|
||||
@@ -508,7 +516,8 @@ fn image_data_url_estimate_adjustment(item: &ResponseItem) -> (i64, i64) {
|
||||
}
|
||||
}
|
||||
}
|
||||
ResponseItem::FunctionCallOutput { output, .. } => {
|
||||
ResponseItem::FunctionCallOutput { output, .. }
|
||||
| ResponseItem::CustomToolCallOutput { output, .. } => {
|
||||
if let FunctionCallOutputBody::ContentItems(items) = &output.body {
|
||||
for content_item in items {
|
||||
if let FunctionCallOutputContentItem::InputImage { image_url } = content_item {
|
||||
|
||||
@@ -67,7 +67,7 @@ fn user_input_text_msg(text: &str) -> ResponseItem {
|
||||
fn custom_tool_call_output(call_id: &str, output: &str) -> ResponseItem {
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: call_id.to_string(),
|
||||
output: output.to_string(),
|
||||
output: FunctionCallOutputPayload::from_text(output.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -279,6 +279,24 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
|
||||
},
|
||||
]),
|
||||
},
|
||||
ResponseItem::CustomToolCall {
|
||||
id: None,
|
||||
status: None,
|
||||
call_id: "tool-1".to_string(),
|
||||
name: "js_repl".to_string(),
|
||||
input: "view_image".to_string(),
|
||||
},
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: "tool-1".to_string(),
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "js repl result".to_string(),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputImage {
|
||||
image_url: "https://example.com/js-repl-result.png".to_string(),
|
||||
},
|
||||
]),
|
||||
},
|
||||
];
|
||||
let history = create_history_with_items(items);
|
||||
let text_only_modalities = vec![InputModality::Text];
|
||||
@@ -321,6 +339,25 @@ fn for_prompt_strips_images_when_model_does_not_support_images() {
|
||||
},
|
||||
]),
|
||||
},
|
||||
ResponseItem::CustomToolCall {
|
||||
id: None,
|
||||
status: None,
|
||||
call_id: "tool-1".to_string(),
|
||||
name: "js_repl".to_string(),
|
||||
input: "view_image".to_string(),
|
||||
},
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: "tool-1".to_string(),
|
||||
output: FunctionCallOutputPayload::from_content_items(vec![
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "js repl result".to_string(),
|
||||
},
|
||||
FunctionCallOutputContentItem::InputText {
|
||||
text: "image content omitted because you do not support image input"
|
||||
.to_string(),
|
||||
},
|
||||
]),
|
||||
},
|
||||
];
|
||||
assert_eq!(stripped, expected);
|
||||
|
||||
@@ -671,7 +708,7 @@ fn remove_first_item_handles_custom_tool_pair() {
|
||||
},
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: "tool-1".to_string(),
|
||||
output: "ok".to_string(),
|
||||
output: FunctionCallOutputPayload::from_text("ok".to_string()),
|
||||
},
|
||||
];
|
||||
let mut h = create_history_with_items(items);
|
||||
@@ -750,7 +787,7 @@ fn record_items_truncates_custom_tool_call_output_content() {
|
||||
let long_output = line.repeat(2_500);
|
||||
let item = ResponseItem::CustomToolCallOutput {
|
||||
call_id: "tool-200".to_string(),
|
||||
output: long_output.clone(),
|
||||
output: FunctionCallOutputPayload::from_text(long_output.clone()),
|
||||
};
|
||||
|
||||
history.record_items([&item], policy);
|
||||
@@ -758,7 +795,8 @@ fn record_items_truncates_custom_tool_call_output_content() {
|
||||
assert_eq!(history.items.len(), 1);
|
||||
match &history.items[0] {
|
||||
ResponseItem::CustomToolCallOutput { output, .. } => {
|
||||
assert_ne!(output, &long_output);
|
||||
let output = output.text_content().unwrap_or_default();
|
||||
assert_ne!(output, long_output);
|
||||
assert!(
|
||||
output.contains("tokens truncated"),
|
||||
"expected token-based truncation marker, got {output}"
|
||||
@@ -949,7 +987,7 @@ fn normalize_adds_missing_output_for_custom_tool_call() {
|
||||
},
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: "tool-x".to_string(),
|
||||
output: "aborted".to_string(),
|
||||
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
|
||||
},
|
||||
]
|
||||
);
|
||||
@@ -1016,7 +1054,7 @@ fn normalize_removes_orphan_function_call_output() {
|
||||
fn normalize_removes_orphan_custom_tool_call_output() {
|
||||
let items = vec![ResponseItem::CustomToolCallOutput {
|
||||
call_id: "orphan-2".to_string(),
|
||||
output: "ok".to_string(),
|
||||
output: FunctionCallOutputPayload::from_text("ok".to_string()),
|
||||
}];
|
||||
let mut h = create_history_with_items(items);
|
||||
|
||||
@@ -1089,7 +1127,7 @@ fn normalize_mixed_inserts_and_removals() {
|
||||
},
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: "t1".to_string(),
|
||||
output: "aborted".to_string(),
|
||||
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
|
||||
},
|
||||
ResponseItem::LocalShellCall {
|
||||
id: None,
|
||||
@@ -1191,7 +1229,7 @@ fn normalize_removes_orphan_function_call_output_panics_in_debug() {
|
||||
fn normalize_removes_orphan_custom_tool_call_output_panics_in_debug() {
|
||||
let items = vec![ResponseItem::CustomToolCallOutput {
|
||||
call_id: "orphan-2".to_string(),
|
||||
output: "ok".to_string(),
|
||||
output: FunctionCallOutputPayload::from_text("ok".to_string()),
|
||||
}];
|
||||
let mut h = create_history_with_items(items);
|
||||
h.normalize_history(&default_input_modalities());
|
||||
@@ -1294,6 +1332,28 @@ fn image_data_url_payload_does_not_dominate_function_call_output_estimate() {
|
||||
assert!(estimated < raw_len);
|
||||
}
|
||||
|
||||
#[test]
fn image_data_url_payload_does_not_dominate_custom_tool_call_output_estimate() {
    // A custom tool output holding one text item and one base64 data-URL image.
    let base64_payload = "C".repeat(50_000);
    let payload_len = base64_payload.len() as i64;
    let data_url = format!("data:image/png;base64,{base64_payload}");
    let content_items = vec![
        FunctionCallOutputContentItem::InputText {
            text: "Screenshot captured".to_string(),
        },
        FunctionCallOutputContentItem::InputImage {
            image_url: data_url,
        },
    ];
    let item = ResponseItem::CustomToolCallOutput {
        call_id: "call-js-repl".to_string(),
        output: FunctionCallOutputPayload::from_content_items(content_items),
    };

    let raw_len = serde_json::to_string(&item).unwrap().len() as i64;
    let estimated = estimate_response_item_model_visible_bytes(&item);

    // The estimate should equal the serialized length with the base64 payload
    // swapped for the fixed per-image constant.
    assert_eq!(estimated, raw_len - payload_len + IMAGE_BYTES_ESTIMATE);
    assert!(estimated < raw_len);
}
|
||||
|
||||
#[test]
|
||||
fn non_base64_image_urls_are_unchanged() {
|
||||
let message_item = ResponseItem::Message {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputBody;
|
||||
use codex_protocol::models::FunctionCallOutputContentItem;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
@@ -35,10 +34,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
|
||||
idx,
|
||||
ResponseItem::FunctionCallOutput {
|
||||
call_id: call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
body: FunctionCallOutputBody::Text("aborted".to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
|
||||
},
|
||||
));
|
||||
}
|
||||
@@ -59,7 +55,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
|
||||
idx,
|
||||
ResponseItem::CustomToolCallOutput {
|
||||
call_id: call_id.clone(),
|
||||
output: "aborted".to_string(),
|
||||
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
|
||||
},
|
||||
));
|
||||
}
|
||||
@@ -82,10 +78,7 @@ pub(crate) fn ensure_call_outputs_present(items: &mut Vec<ResponseItem>) {
|
||||
idx,
|
||||
ResponseItem::FunctionCallOutput {
|
||||
call_id: call_id.clone(),
|
||||
output: FunctionCallOutputPayload {
|
||||
body: FunctionCallOutputBody::Text("aborted".to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
output: FunctionCallOutputPayload::from_text("aborted".to_string()),
|
||||
},
|
||||
));
|
||||
}
|
||||
@@ -245,7 +238,8 @@ pub(crate) fn strip_images_when_unsupported(
|
||||
}
|
||||
*content = normalized_content;
|
||||
}
|
||||
ResponseItem::FunctionCallOutput { output, .. } => {
|
||||
ResponseItem::FunctionCallOutput { output, .. }
|
||||
| ResponseItem::CustomToolCallOutput { output, .. } => {
|
||||
if let Some(content_items) = output.content_items_mut() {
|
||||
let mut normalized_content_items = Vec::with_capacity(content_items.len());
|
||||
for content_item in content_items.iter() {
|
||||
|
||||
Reference in New Issue
Block a user