Add text element metadata to protocol, app server, and core (#9331)

The second part of breaking up PR
https://github.com/openai/codex/pull/9116

Summary:

- Add `TextElement` / `ByteRange` to protocol user inputs and user
message events with defaults.
- Thread `text_elements` through app-server v1/v2 request handling and
history rebuild.
- Preserve UI metadata only in user input/events (not `ContentItem`)
while keeping local image attachments in user events for rehydration.

Details:

- Protocol: `UserInput::Text` carries `text_elements`;
`UserMessageEvent` carries `text_elements` + `local_images`.
Serialization includes empty vectors for backward compatibility.
- app-server-protocol: v1 defines `V1TextElement` / `V1ByteRange` in
camelCase with conversions; v2 uses its own camelCase wrapper.
- app-server: v1/v2 input mapping includes `text_elements`; thread
history rebuilds include them.
- Core: user event emission preserves UI metadata while model history
stays clean; history replay round-trips the metadata.
This commit is contained in:
charley-oai
2026-01-15 17:26:41 -08:00
committed by GitHub
parent 004a74940a
commit 1fa8350ae7
18 changed files with 416 additions and 46 deletions

View File

@@ -30,6 +30,8 @@ use codex_protocol::protocol::SkillMetadata as CoreSkillMetadata;
use codex_protocol::protocol::SkillScope as CoreSkillScope;
use codex_protocol::protocol::TokenUsage as CoreTokenUsage;
use codex_protocol::protocol::TokenUsageInfo as CoreTokenUsageInfo;
use codex_protocol::user_input::ByteRange as CoreByteRange;
use codex_protocol::user_input::TextElement as CoreTextElement;
use codex_protocol::user_input::UserInput as CoreUserInput;
use codex_utils_absolute_path::AbsolutePathBuf;
use mcp_types::ContentBlock as McpContentBlock;
@@ -1589,6 +1591,24 @@ pub struct ByteRange {
pub end: usize,
}
impl From<CoreByteRange> for ByteRange {
fn from(value: CoreByteRange) -> Self {
Self {
start: value.start,
end: value.end,
}
}
}
impl From<ByteRange> for CoreByteRange {
fn from(value: ByteRange) -> Self {
Self {
start: value.start,
end: value.end,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
@@ -1599,6 +1619,24 @@ pub struct TextElement {
pub placeholder: Option<String>,
}
impl From<CoreTextElement> for TextElement {
fn from(value: CoreTextElement) -> Self {
Self {
byte_range: value.byte_range.into(),
placeholder: value.placeholder,
}
}
}
impl From<TextElement> for CoreTextElement {
fn from(value: TextElement) -> Self {
Self {
byte_range: value.byte_range.into(),
placeholder: value.placeholder,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(tag = "type", rename_all = "camelCase")]
#[ts(tag = "type")]
@@ -1625,10 +1663,12 @@ pub enum UserInput {
impl UserInput {
pub fn into_core(self) -> CoreUserInput {
match self {
UserInput::Text { text, .. } => CoreUserInput::Text {
UserInput::Text {
text,
// TODO: Thread text element ranges into v2 inputs. Empty keeps old behavior.
text_elements: Vec::new(),
text_elements,
} => CoreUserInput::Text {
text,
text_elements: text_elements.into_iter().map(Into::into).collect(),
},
UserInput::Image { url } => CoreUserInput::Image { image_url: url },
UserInput::LocalImage { path } => CoreUserInput::LocalImage { path },
@@ -1640,10 +1680,12 @@ impl UserInput {
impl From<CoreUserInput> for UserInput {
fn from(value: CoreUserInput) -> Self {
match value {
CoreUserInput::Text { text, .. } => UserInput::Text {
CoreUserInput::Text {
text,
// TODO: Thread text element ranges from core into v2 inputs.
text_elements: Vec::new(),
text_elements,
} => UserInput::Text {
text,
text_elements: text_elements.into_iter().map(Into::into).collect(),
},
CoreUserInput::Image { image_url } => UserInput::Image { url: image_url },
CoreUserInput::LocalImage { path } => UserInput::LocalImage { path },