feat(app-server, core): allow text + image content items for dynamic tool outputs (#10567)

Took over the work that @aaronl-openai started here: https://github.com/openai/codex/pull/10397 Now that app-server clients are able to set up custom tools (called `dynamic_tools` in app-server), we should expose a way for clients to pass in not just text, but also image outputs. This is something the Responses API already supports for function call outputs, where you can pass in either a string or an array of content outputs (text, image, file): https://platform.openai.com/docs/api-reference/responses/create#responses_create-input-input_item_list-item-function_tool_call_output-output-array-input_image So let's just plumb it through in Codex (with the caveat that we only support text and image for now). This is implemented end-to-end across app-server v2 protocol types and core tool handling. ## Breaking API change NOTE: This introduces a breaking change with dynamic tools, but I think it's ok since this concept was only recently introduced (https://github.com/openai/codex/pull/9539) and it's better to get the API contract correct. I don't think there are any real consumers of this yet (not even the Codex App). Old shape: `{ "output": "dynamic-ok", "success": true }` New shape: ``` { "contentItems": [ { "type": "inputText", "text": "dynamic-ok" }, { "type": "inputImage", "imageUrl": "data:image/png;base64,AAA" } ], "success": true } ```
2026-05-01 20:02:05 +03:00 · 2026-02-04 16:12:47 -08:00
parent f9c38f531c
commit 5ea107a088
49 changed files with 1103 additions and 468 deletions
--- a/codex-rs/protocol/src/models.rs
+++ b/codex-rs/protocol/src/models.rs
@@ -129,11 +129,11 @@ pub enum ResponseItem {
        arguments: String,
        call_id: String,
    },
-    // NOTE: The input schema for `function_call_output` objects that clients send to the
-    // OpenAI /v1/responses endpoint is NOT the same shape as the objects the server returns on the
-    // SSE stream. When *sending* we must wrap the string output inside an object that includes a
-    // required `success` boolean. To ensure we serialize exactly the expected shape we introduce
-    // a dedicated payload struct and flatten it here.
+    // NOTE: The `output` field for `function_call_output` uses a dedicated payload type with
+    // custom serialization. On the wire it is either:
+    //   - a plain string (`content`)
+    //   - an array of structured content items (`content_items`)
+    // We keep this behavior centralized in `FunctionCallOutputPayload`.
    FunctionCallOutput {
        call_id: String,
        output: FunctionCallOutputPayload,
@@ -617,9 +617,8 @@ impl From<ResponseInputItem> for ResponseItem {
                let output = match result {
                    Ok(result) => FunctionCallOutputPayload::from(&result),
                    Err(tool_call_err) => FunctionCallOutputPayload {
-                        content: format!("err: {tool_call_err:?}"),
+                        body: FunctionCallOutputBody::Text(format!("err: {tool_call_err:?}")),
                        success: Some(false),
-                        ..Default::default()
                    },
                };
                Self::FunctionCallOutput { call_id, output }
@@ -780,39 +779,146 @@ pub enum FunctionCallOutputContentItem {
    InputImage { image_url: String },
 }

+/// Converts structured function-call output content into plain text for
+/// human-readable surfaces.
+///
+/// This conversion is intentionally lossy:
+/// - only `input_text` items whose text is not whitespace-only are included
+/// - image items are ignored
+///
+/// Kept segments are joined verbatim with `\n`; `None` is returned when no
+/// segment remains. Use this helper where callers still need a string (for
+/// example telemetry previews or legacy string-only output paths); the original
+/// multimodal `content_items` remain the authoritative payload sent to the model.
+pub fn function_call_output_content_items_to_text(
+    content_items: &[FunctionCallOutputContentItem],
+) -> Option<String> {
+    let text_segments = content_items
+        .iter()
+        .filter_map(|item| match item {
+            FunctionCallOutputContentItem::InputText { text } if !text.trim().is_empty() => {
+                Some(text.as_str())
+            }
+            FunctionCallOutputContentItem::InputText { .. }
+            | FunctionCallOutputContentItem::InputImage { .. } => None,
+        })
+        .collect::<Vec<_>>();
+
+    if text_segments.is_empty() {
+        None
+    } else {
+        Some(text_segments.join("\n"))
+    }
+}
+
+impl From<crate::dynamic_tools::DynamicToolCallOutputContentItem>
+    for FunctionCallOutputContentItem
+{
+    fn from(item: crate::dynamic_tools::DynamicToolCallOutputContentItem) -> Self {
+        match item { // one-to-one variant mapping; payload fields are moved unchanged
+            crate::dynamic_tools::DynamicToolCallOutputContentItem::InputText { text } => {
+                Self::InputText { text }
+            }
+            crate::dynamic_tools::DynamicToolCallOutputContentItem::InputImage { image_url } => {
+                Self::InputImage { image_url }
+            }
+        }
+    }
+}
+
 /// The payload we send back to OpenAI when reporting a tool call result.
 ///
-/// `content` preserves the historical plain-string payload so downstream
-/// integrations (tests, logging, etc.) can keep treating tool output as
-/// `String`. When an MCP server returns richer data we additionally populate
-/// `content_items` with the structured form that the Responses API understands.
+/// `body` serializes directly as the wire value for `function_call_output.output`.
+/// `success` is internal metadata: it is not serialized and is `None` after deserialization.
 #[derive(Debug, Default, Clone, PartialEq, JsonSchema, TS)]
 pub struct FunctionCallOutputPayload {
-    pub content: String,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub content_items: Option<Vec<FunctionCallOutputContentItem>>,
+    pub body: FunctionCallOutputBody,
     pub success: Option<bool>,
 }

-#[derive(Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)]
 #[serde(untagged)]
-enum FunctionCallOutputPayloadSerde {
+pub enum FunctionCallOutputBody { // untagged: either a plain string or a list of content items
     Text(String),
-    Items(Vec<FunctionCallOutputContentItem>),
+    ContentItems(Vec<FunctionCallOutputContentItem>),
 }

-// The Responses API expects two *different* shapes depending on success vs failure:
-//   • success → output is a plain string (no nested object)
-//   • failure → output is an object { content, success:false }
+impl FunctionCallOutputBody {
+    /// Best-effort conversion of a function-call output body to plain text for
+    /// human-readable surfaces.
+    ///
+    /// `Text` bodies are returned as a clone. `ContentItems` bodies are lossy:
+    /// image entries are dropped, non-blank text entries are joined with
+    /// newlines, and `None` is returned when no non-blank text entry exists.
+    pub fn to_text(&self) -> Option<String> {
+        match self {
+            Self::Text(content) => Some(content.clone()),
+            Self::ContentItems(items) => function_call_output_content_items_to_text(items),
+        }
+    }
+}
+
+impl Default for FunctionCallOutputBody {
+    fn default() -> Self {
+        Self::Text(String::new()) // the default body is an empty plain-text string
+    }
+}
+
+impl FunctionCallOutputPayload {
+    pub fn from_text(content: String) -> Self { // plain-text body; `success` is `None`
+        Self {
+            body: FunctionCallOutputBody::Text(content),
+            success: None,
+        }
+    }
+    /// Builds a payload whose body is the given content items; `success` is `None`.
+    pub fn from_content_items(content_items: Vec<FunctionCallOutputContentItem>) -> Self {
+        Self {
+            body: FunctionCallOutputBody::ContentItems(content_items),
+            success: None,
+        }
+    }
+    /// Borrows the body as text; `None` when the body holds content items.
+    pub fn text_content(&self) -> Option<&str> {
+        match &self.body {
+            FunctionCallOutputBody::Text(content) => Some(content),
+            FunctionCallOutputBody::ContentItems(_) => None,
+        }
+    }
+    /// Mutable counterpart of `text_content`.
+    pub fn text_content_mut(&mut self) -> Option<&mut String> {
+        match &mut self.body {
+            FunctionCallOutputBody::Text(content) => Some(content),
+            FunctionCallOutputBody::ContentItems(_) => None,
+        }
+    }
+    /// Borrows the content items; `None` when the body is plain text.
+    pub fn content_items(&self) -> Option<&[FunctionCallOutputContentItem]> {
+        match &self.body {
+            FunctionCallOutputBody::Text(_) => None,
+            FunctionCallOutputBody::ContentItems(items) => Some(items),
+        }
+    }
+    /// Mutable counterpart of `content_items`.
+    pub fn content_items_mut(&mut self) -> Option<&mut Vec<FunctionCallOutputContentItem>> {
+        match &mut self.body {
+            FunctionCallOutputBody::Text(_) => None,
+            FunctionCallOutputBody::ContentItems(items) => Some(items),
+        }
+    }
+}
+
+// `function_call_output.output` is encoded from `body` alone (`success` is not serialized) as:
+//   - an array of structured content items
+//   - a plain string
 impl Serialize for FunctionCallOutputPayload {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
         S: Serializer,
     {
-        if let Some(items) = &self.content_items {
-            items.serialize(serializer)
-        } else {
-            serializer.serialize_str(&self.content)
+        match &self.body {
+            FunctionCallOutputBody::Text(content) => serializer.serialize_str(content),
+            FunctionCallOutputBody::ContentItems(items) => items.serialize(serializer),
         }
     }
 }
@@ -822,20 +928,11 @@ impl<'de> Deserialize<'de> for FunctionCallOutputPayload {
    where
        D: Deserializer<'de>,
    {
-        match FunctionCallOutputPayloadSerde::deserialize(deserializer)? {
-            FunctionCallOutputPayloadSerde::Text(content) => Ok(FunctionCallOutputPayload {
-                content,
-                ..Default::default()
-            }),
-            FunctionCallOutputPayloadSerde::Items(items) => {
-                let content = serde_json::to_string(&items).map_err(serde::de::Error::custom)?;
-                Ok(FunctionCallOutputPayload {
-                    content,
-                    content_items: Some(items),
-                    success: None,
-                })
-            }
-        }
+        let body = FunctionCallOutputBody::deserialize(deserializer)?;
+        Ok(FunctionCallOutputPayload {
+            body,
+            success: None,
+        })
    }
 }

@@ -856,16 +953,14 @@ impl From<&CallToolResult> for FunctionCallOutputPayload {
            match serde_json::to_string(structured_content) {
                Ok(serialized_structured_content) => {
                    return FunctionCallOutputPayload {
-                        content: serialized_structured_content,
+                        body: FunctionCallOutputBody::Text(serialized_structured_content),
                        success: Some(is_success),
-                        ..Default::default()
                    };
                }
                Err(err) => {
                    return FunctionCallOutputPayload {
-                        content: err.to_string(),
+                        body: FunctionCallOutputBody::Text(err.to_string()),
                        success: Some(false),
-                        ..Default::default()
                    };
                }
            }
@@ -875,18 +970,21 @@ impl From<&CallToolResult> for FunctionCallOutputPayload {
            Ok(serialized_content) => serialized_content,
            Err(err) => {
                return FunctionCallOutputPayload {
-                    content: err.to_string(),
+                    body: FunctionCallOutputBody::Text(err.to_string()),
                    success: Some(false),
-                    ..Default::default()
                };
            }
        };

        let content_items = convert_mcp_content_to_items(content);

+        let body = match content_items {
+            Some(content_items) => FunctionCallOutputBody::ContentItems(content_items),
+            None => FunctionCallOutputBody::Text(serialized_content),
+        };
+
        FunctionCallOutputPayload {
-            content: serialized_content,
-            content_items,
+            body,
            success: Some(is_success),
        }
    }
@@ -937,19 +1035,18 @@ fn convert_mcp_content_to_items(
 }

 // Implement Display so callers can treat the payload like a plain string when logging or doing
-// trivial substring checks in tests (existing tests call `.contains()` on the output). Display
-// returns the raw `content` field.
+// trivial substring checks in tests (existing tests call `.contains()` on the output). For
+// `ContentItems`, Display emits a JSON representation (an empty string if serialization fails).

 impl std::fmt::Display for FunctionCallOutputPayload {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(&self.content)
-    }
-}
-
-impl std::ops::Deref for FunctionCallOutputPayload {
-    type Target = str;
-    fn deref(&self) -> &Self::Target {
-        &self.content
+        match &self.body {
+            FunctionCallOutputBody::Text(content) => f.write_str(content),
+            FunctionCallOutputBody::ContentItems(items) => {
+                let content = serde_json::to_string(items).unwrap_or_default();
+                f.write_str(content.as_str())
+            }
+        }
     }
 }

@@ -1010,6 +1107,61 @@ mod tests {
        assert_eq!(convert_mcp_content_to_items(&contents), None);
    }

+    #[test]
+    fn function_call_output_content_items_to_text_joins_text_segments() {
+        let content_items = vec![
+            FunctionCallOutputContentItem::InputText {
+                text: "line 1".to_string(),
+            },
+            FunctionCallOutputContentItem::InputImage {
+                image_url: "data:image/png;base64,AAA".to_string(),
+            },
+            FunctionCallOutputContentItem::InputText {
+                text: "line 2".to_string(),
+            },
+        ];
+        // The image item is skipped and the two text items are joined with "\n".
+        let text = function_call_output_content_items_to_text(&content_items);
+        assert_eq!(text, Some("line 1\nline 2".to_string()));
+    }
+
+    #[test]
+    fn function_call_output_content_items_to_text_ignores_blank_text_and_images() {
+        let content_items = vec![
+            FunctionCallOutputContentItem::InputText {
+                text: "   ".to_string(),
+            },
+            FunctionCallOutputContentItem::InputImage {
+                image_url: "data:image/png;base64,AAA".to_string(),
+            },
+        ];
+        // Whitespace-only text and images contribute no segments, so the result is `None`.
+        let text = function_call_output_content_items_to_text(&content_items);
+        assert_eq!(text, None);
+    }
+
+    #[test]
+    fn function_call_output_body_to_text_returns_plain_text_content() {
+        let body = FunctionCallOutputBody::Text("ok".to_string());
+        let text = body.to_text(); // a `Text` body comes back unchanged
+        assert_eq!(text, Some("ok".to_string()));
+    }
+
+    #[test]
+    fn function_call_output_body_to_text_uses_content_item_fallback() {
+        let body = FunctionCallOutputBody::ContentItems(vec![
+            FunctionCallOutputContentItem::InputText {
+                text: "line 1".to_string(),
+            },
+            FunctionCallOutputContentItem::InputImage {
+                image_url: "data:image/png;base64,AAA".to_string(),
+            },
+        ]);
+        // For `ContentItems`, `to_text` keeps only the text item; the image is dropped.
+        let text = body.to_text();
+        assert_eq!(text, Some("line 1".to_string()));
+    }
+
    #[test]
    fn converts_sandbox_mode_into_developer_instructions() {
        let workspace_write: DeveloperInstructions = SandboxMode::WorkspaceWrite.into();
@@ -1156,10 +1308,7 @@ mod tests {
    fn serializes_success_as_plain_string() -> Result<()> {
        let item = ResponseInputItem::FunctionCallOutput {
            call_id: "call1".into(),
-            output: FunctionCallOutputPayload {
-                content: "ok".into(),
-                ..Default::default()
-            },
+            output: FunctionCallOutputPayload::from_text("ok".into()),
        };

        let json = serde_json::to_string(&item)?;
@@ -1175,9 +1324,8 @@ mod tests {
        let item = ResponseInputItem::FunctionCallOutput {
            call_id: "call1".into(),
            output: FunctionCallOutputPayload {
-                content: "bad".into(),
+                body: FunctionCallOutputBody::Text("bad".into()),
                success: Some(false),
-                ..Default::default()
            },
        };

@@ -1202,7 +1350,10 @@ mod tests {

        let payload = FunctionCallOutputPayload::from(&call_tool_result);
        assert_eq!(payload.success, Some(true));
-        let items = payload.content_items.clone().expect("content items");
+        let Some(items) = payload.content_items() else {
+            panic!("expected content items");
+        };
+        let items = items.to_vec();
        assert_eq!(
            items,
            vec![
@@ -1243,9 +1394,10 @@ mod tests {
        };

        let payload = FunctionCallOutputPayload::from(&call_tool_result);
-        let Some(items) = payload.content_items else {
+        let Some(items) = payload.content_items() else {
            panic!("expected content items");
        };
+        let items = items.to_vec();
        assert_eq!(
            items,
            vec![FunctionCallOutputContentItem::InputImage {
@@ -1274,10 +1426,14 @@ mod tests {
                image_url: "data:image/png;base64,XYZ".into(),
            },
        ];
-        assert_eq!(payload.content_items, Some(expected_items.clone()));
-
-        let expected_content = serde_json::to_string(&expected_items)?;
-        assert_eq!(payload.content, expected_content);
+        assert_eq!(
+            payload.body,
+            FunctionCallOutputBody::ContentItems(expected_items.clone())
+        );
+        assert_eq!(
+            serde_json::to_string(&payload)?,
+            serde_json::to_string(&expected_items)?
+        );

        Ok(())
    }