Fix: Render MCP image outputs regardless of ordering (#9815)

## What? - Render an MCP image output cell whenever a decodable image block exists in `CallToolResult.content` (including text-before-image or malformed image before valid image). ## Why? - Tool results that include caption text before the image currently drop the image output cell. - A malformed image block can also suppress later valid image output. ## How? - Iterate `content` and return the first successfully decoded image instead of only checking the first block. - Add unit tests that cover text-before-image ordering and invalid-image-before-valid. ## Before ```rust let image = match result { Ok(mcp_types::CallToolResult { content, .. }) => { if let Some(mcp_types::ContentBlock::ImageContent(image)) = content.first() { // decode image (fails -> None) } else { None } } _ => None, }?; ``` ## After ```rust let image = result .as_ref() .ok()? .content .iter() .find_map(decode_mcp_image)?; ``` ## Risk / Impact - Low: only affects image cell creation for MCP tool results; no change for non-image outputs. ## Tests - [x] `just fmt` - [x] `cargo test -p codex-tui` - [x] Rerun after branch update (2026-01-27): `just fmt`, `cargo test -p codex-tui` Manual testing # Manual testing: MCP image tool result rendering (Codex TUI) # Build the rmcp stdio test server binary: cd codex-rs cargo build -p codex-rmcp-client --bin test_stdio_server # Register the server as an MCP server (absolute path to the built binary): codex mcp add mcpimg -- /Users/joshka/code/codex-pr-review/codex-rs/target/debug/test_stdio_server # Then in Codex TUI, ask it to call: - mcpimg.image_scenario({"scenario":"image_only"}) - mcpimg.image_scenario({"scenario":"text_then_image","caption":"Here is the image:"}) - mcpimg.image_scenario({"scenario":"invalid_base64_then_image"}) - mcpimg.image_scenario({"scenario":"invalid_image_bytes_then_image"}) - mcpimg.image_scenario({"scenario":"multiple_valid_images"}) - mcpimg.image_scenario({"scenario":"image_then_text","caption":"Here is the image:"}) - mcpimg.image_scenario({"scenario":"text_only","caption":"Here is the image:"}) # Expected: # - You should see an extra history cell: "tool result (image output)" when the # tool result contains at least one decodable image block (even if earlier # blocks are text or invalid images). Fixes #9814 --------- Co-authored-by: Josh McKinney <joshka@openai.com>
2026-05-02 04:11:39 +03:00 · 2026-01-28 08:14:08 +11:00
parent 28051d18c6
commit 337643b00a
2 changed files with 296 additions and 42 deletions
--- a/codex-rs/tui/src/history_cell.rs
+++ b/codex-rs/tui/src/history_cell.rs
@@ -1429,44 +1429,62 @@ pub(crate) fn new_web_search_call(
    cell
 }

-/// If the first content is an image, return a new cell with the image.
-/// TODO(rgwood-dd): Handle images properly even if they're not the first result.
+/// Returns an additional history cell if an MCP tool result includes a decodable image.
+///
+/// This intentionally returns at most one cell: the first image in `CallToolResult.content` that
+/// successfully base64-decodes and parses as an image. This is used as a lightweight “image output
+/// exists” affordance separate from the main MCP tool call cell.
+///
+/// Manual testing tip:
+/// - Run the rmcp stdio test server (`codex-rs/rmcp-client/src/bin/test_stdio_server.rs`) and
+///   register it as an MCP server via `codex mcp add`.
+/// - Use its `image_scenario` tool with cases like `text_then_image`,
+///   `invalid_base64_then_image`, or `invalid_image_bytes_then_image` to ensure this path triggers
+///   even when the first block is not a valid image.
 fn try_new_completed_mcp_tool_call_with_image_output(
    result: &Result<mcp_types::CallToolResult, String>,
 ) -> Option<CompletedMcpToolCallWithImageOutput> {
-    match result {
-        Ok(mcp_types::CallToolResult { content, .. }) => {
-            if let Some(mcp_types::ContentBlock::ImageContent(image)) = content.first() {
-                let raw_data = match base64::engine::general_purpose::STANDARD.decode(&image.data) {
-                    Ok(data) => data,
-                    Err(e) => {
-                        error!("Failed to decode image data: {e}");
-                        return None;
-                    }
-                };
-                let reader = match ImageReader::new(Cursor::new(raw_data)).with_guessed_format() {
-                    Ok(reader) => reader,
-                    Err(e) => {
-                        error!("Failed to guess image format: {e}");
-                        return None;
-                    }
-                };
+    let image = result
+        .as_ref()
+        .ok()?
+        .content
+        .iter()
+        .find_map(decode_mcp_image)?;

-                let image = match reader.decode() {
-                    Ok(image) => image,
-                    Err(e) => {
-                        error!("Image decoding failed: {e}");
-                        return None;
-                    }
-                };
+    Some(CompletedMcpToolCallWithImageOutput { _image: image })
+}

-                Some(CompletedMcpToolCallWithImageOutput { _image: image })
-            } else {
-                None
-            }
-        }
-        _ => None,
-    }
+/// Decodes an MCP `ImageContent` block into an in-memory image.
+///
+/// Returns `None` when the block is not an image, when base64 decoding fails, when the format
+/// cannot be inferred, or when the image decoder rejects the bytes.
+fn decode_mcp_image(block: &mcp_types::ContentBlock) -> Option<DynamicImage> {
+    let image = match block {
+        mcp_types::ContentBlock::ImageContent(image) => image,
+        _ => return None,
+    };
+    let raw_data = base64::engine::general_purpose::STANDARD
+        .decode(&image.data)
+        .map_err(|e| {
+            error!("Failed to decode image data: {e}");
+            e
+        })
+        .ok()?;
+    let reader = ImageReader::new(Cursor::new(raw_data))
+        .with_guessed_format()
+        .map_err(|e| {
+            error!("Failed to guess image format: {e}");
+            e
+        })
+        .ok()?;
+
+    reader
+        .decode()
+        .map_err(|e| {
+            error!("Image decoding failed: {e}");
+            e
+        })
+        .ok()
 }

 #[allow(clippy::disallowed_methods)]
@@ -1929,9 +1947,12 @@ mod tests {
    use codex_core::protocol::ExecCommandSource;
    use mcp_types::CallToolResult;
    use mcp_types::ContentBlock;
+    use mcp_types::ImageContent;
    use mcp_types::TextContent;
    use mcp_types::Tool;
    use mcp_types::ToolInputSchema;
+
+    const SMALL_PNG_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGP4z8DwHwAFAAH/iZk9HQAAAABJRU5ErkJggg==";
    async fn test_config() -> Config {
        let codex_home = std::env::temp_dir();
        ConfigBuilder::default()
@@ -1957,6 +1978,15 @@ mod tests {
        render_lines(&cell.transcript_lines(u16::MAX))
    }

+    fn image_block(data: &str) -> ContentBlock {
+        ContentBlock::ImageContent(ImageContent {
+            annotations: None,
+            data: data.to_string(),
+            mime_type: "image/png".into(),
+            r#type: "image".into(),
+        })
+    }
+
    #[test]
    fn unified_exec_interaction_cell_renders_input() {
        let cell =
@@ -2251,6 +2281,63 @@ mod tests {
        insta::assert_snapshot!(rendered);
    }

+    #[test]
+    fn completed_mcp_tool_call_image_after_text_returns_extra_cell() {
+        let invocation = McpInvocation {
+            server: "image".into(),
+            tool: "generate".into(),
+            arguments: Some(json!({
+                "prompt": "tiny image",
+            })),
+        };
+
+        let result = CallToolResult {
+            content: vec![
+                ContentBlock::TextContent(TextContent {
+                    annotations: None,
+                    text: "Here is the image:".into(),
+                    r#type: "text".into(),
+                }),
+                image_block(SMALL_PNG_BASE64),
+            ],
+            is_error: None,
+            structured_content: None,
+        };
+
+        let mut cell = new_active_mcp_tool_call("call-image".into(), invocation, true);
+        let extra_cell = cell
+            .complete(Duration::from_millis(25), Ok(result))
+            .expect("expected image cell");
+
+        let rendered = render_lines(&extra_cell.display_lines(80));
+        assert_eq!(rendered, vec!["tool result (image output)"]);
+    }
+
+    #[test]
+    fn completed_mcp_tool_call_skips_invalid_image_blocks() {
+        let invocation = McpInvocation {
+            server: "image".into(),
+            tool: "generate".into(),
+            arguments: Some(json!({
+                "prompt": "tiny image",
+            })),
+        };
+
+        let result = CallToolResult {
+            content: vec![image_block("not-base64"), image_block(SMALL_PNG_BASE64)],
+            is_error: None,
+            structured_content: None,
+        };
+
+        let mut cell = new_active_mcp_tool_call("call-image-2".into(), invocation, true);
+        let extra_cell = cell
+            .complete(Duration::from_millis(25), Ok(result))
+            .expect("expected image cell");
+
+        let rendered = render_lines(&extra_cell.display_lines(80));
+        assert_eq!(rendered, vec!["tool result (image output)"]);
+    }
+
    #[test]
    fn completed_mcp_tool_call_error_snapshot() {
        let invocation = McpInvocation {