Wire realtime api to core (#12268)

- Introduce `RealtimeConversationManager` for realtime API management 
- Add `op::conversation` to start conversation, insert audio, insert
text, and close conversation.
- Emit conversation lifecycle and realtime events.
- Move shared realtime payload types into codex-protocol and add core
e2e websocket tests for start/replace/transport-close paths.

Things to consider:
- Should we use the same `op::` and `Events` channel to carry audio? I
think we should try this simple approach first; later we can create a
separate one if the channels get congested.
- Sending text updates to the client: we can start simple and later
restrict that.
- Provider auth is intentionally not wired up for now.
This commit is contained in:
Ahmed Ibrahim
2026-02-20 19:06:35 -08:00
committed by GitHub
parent 936e744c93
commit 6817f0be8a
28 changed files with 2102 additions and 42 deletions

View File

@@ -85,6 +85,41 @@ pub struct McpServerRefreshConfig {
pub mcp_oauth_credentials_store_mode: Value,
}
/// Parameters carried by [`Op::RealtimeConversationStart`], the op that starts
/// a realtime conversation stream.
///
/// Because `Op` is internally tagged (`tag = "type"`), these fields are
/// serialized alongside the `type` key rather than under a nested object.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct ConversationStartParams {
/// Prompt text supplied when starting the conversation.
/// NOTE(review): presumably the instructions given to the realtime backend — confirm.
pub prompt: String,
/// Optional session identifier; the key is left out of the serialized
/// output entirely when this is `None`.
/// NOTE(review): presumably used to attach to an existing session — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub session_id: Option<String>,
}
/// A single frame of audio exchanged over a realtime conversation.
///
/// Used both as input (wrapped in [`ConversationAudioParams`]) and as output
/// (the [`RealtimeEvent::AudioOut`] variant).
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
pub struct RealtimeAudioFrame {
/// Audio payload as a string.
/// NOTE(review): presumably base64-encoded sample bytes — confirm the encoding.
pub data: String,
/// Sample rate of `data`.
/// NOTE(review): presumably in Hz (the tests use 24_000) — confirm.
pub sample_rate: u32,
/// Number of audio channels in `data`.
pub num_channels: u16,
/// Optional count of samples per channel; the key is left out of the
/// serialized output when this is `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub samples_per_channel: Option<u32>,
}
/// Payload of a realtime conversation event, carried by
/// [`RealtimeConversationRealtimeEvent`].
///
/// No `#[serde(tag = ...)]` or `rename_all` is set here, so serde's default
/// externally tagged representation with the variant names as written applies.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
pub enum RealtimeEvent {
/// A session was created; carries its identifier.
SessionCreated { session_id: String },
/// The session was updated; `backend_prompt` is optional.
/// NOTE(review): presumably the prompt the backend is now using — confirm.
SessionUpdated { backend_prompt: Option<String> },
/// An audio frame produced for output.
AudioOut(RealtimeAudioFrame),
/// A conversation item was added; the item is kept as an untyped JSON value.
ConversationItemAdded(Value),
/// An error reported as a plain message string.
Error(String),
}
/// Parameters carried by [`Op::RealtimeConversationAudio`], the op that sends
/// audio input to the running realtime conversation stream.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct ConversationAudioParams {
/// The audio frame to send; serialized under the `frame` key.
pub frame: RealtimeAudioFrame,
}
/// Parameters carried by [`Op::RealtimeConversationText`], the op that sends
/// text input to the running realtime conversation stream.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct ConversationTextParams {
/// The text to send.
pub text: String,
}
/// Submission operation
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema)]
#[serde(tag = "type", rename_all = "snake_case")]
@@ -98,6 +133,18 @@ pub enum Op {
/// Terminate all running background terminal processes for this thread.
CleanBackgroundTerminals,
/// Start a realtime conversation stream.
RealtimeConversationStart(ConversationStartParams),
/// Send audio input to the running realtime conversation stream.
RealtimeConversationAudio(ConversationAudioParams),
/// Send text input to the running realtime conversation stream.
RealtimeConversationText(ConversationTextParams),
/// Close the running realtime conversation stream.
RealtimeConversationClose,
/// Legacy user input.
///
/// Prefer [`Op::UserTurn`] so the caller provides full turn context
@@ -899,6 +946,15 @@ pub enum EventMsg {
/// indicates the turn continued but the user should still be notified.
Warning(WarningEvent),
/// Realtime conversation lifecycle start event.
RealtimeConversationStarted(RealtimeConversationStartedEvent),
/// Realtime conversation streaming payload event.
RealtimeConversationRealtime(RealtimeConversationRealtimeEvent),
/// Realtime conversation lifecycle close event.
RealtimeConversationClosed(RealtimeConversationClosedEvent),
/// Model routing changed from the requested model to a different model.
ModelReroute(ModelRerouteEvent),
@@ -1078,6 +1134,22 @@ pub enum EventMsg {
CollabResumeEnd(CollabResumeEndEvent),
}
/// Payload of [`EventMsg::RealtimeConversationStarted`], the lifecycle event
/// emitted when a realtime conversation starts.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct RealtimeConversationStartedEvent {
/// Optional session identifier. There is no `skip_serializing_if` on this
/// field, so `None` is serialized as an explicit `null`.
pub session_id: Option<String>,
}
/// Payload of [`EventMsg::RealtimeConversationRealtime`], the streaming event
/// emitted while a realtime conversation is running.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct RealtimeConversationRealtimeEvent {
/// The realtime event being forwarded.
pub payload: RealtimeEvent,
}
/// Payload of [`EventMsg::RealtimeConversationClosed`], the lifecycle event
/// emitted when a realtime conversation closes.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
pub struct RealtimeConversationClosedEvent {
/// Optional reason for the close; the key is left out of the serialized
/// output when this is `None`.
/// NOTE(review): presumably free-form, human-readable text — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub reason: Option<String>,
}
impl From<CollabAgentSpawnBeginEvent> for EventMsg {
fn from(event: CollabAgentSpawnBeginEvent) -> Self {
EventMsg::CollabAgentSpawnBegin(event)
@@ -3047,6 +3119,61 @@ mod tests {
assert!(event.affects_turn_status());
}
/// Pins the JSON wire shape of the realtime conversation ops: each op is
/// tagged by a snake_case `type` key with its parameters placed alongside the
/// tag (not under a nested object), and the text and close ops survive a
/// serialize/deserialize round trip.
#[test]
fn conversation_op_serializes_as_unnested_variants() {
    // One value per realtime conversation op.
    let audio_frame = RealtimeAudioFrame {
        data: String::from("AQID"),
        sample_rate: 24_000,
        num_channels: 1,
        samples_per_channel: Some(480),
    };
    let audio_op = Op::RealtimeConversationAudio(ConversationAudioParams { frame: audio_frame });
    let start_op = Op::RealtimeConversationStart(ConversationStartParams {
        prompt: String::from("be helpful"),
        session_id: Some(String::from("conv_1")),
    });
    let text_op = Op::RealtimeConversationText(ConversationTextParams {
        text: String::from("hello"),
    });
    let close_op = Op::RealtimeConversationClose;

    // Start: the params sit next to the `type` tag.
    let start_json = serde_json::to_value(&start_op).unwrap();
    let expected_start = json!({
        "type": "realtime_conversation_start",
        "prompt": "be helpful",
        "session_id": "conv_1"
    });
    assert_eq!(start_json, expected_start);

    // Audio: the frame is the only nested object.
    let audio_json = serde_json::to_value(&audio_op).unwrap();
    let expected_audio = json!({
        "type": "realtime_conversation_audio",
        "frame": {
            "data": "AQID",
            "sample_rate": 24000,
            "num_channels": 1,
            "samples_per_channel": 480
        }
    });
    assert_eq!(audio_json, expected_audio);

    // Text: round-trips through JSON unchanged.
    let text_json = serde_json::to_value(&text_op).unwrap();
    let text_roundtrip = serde_json::from_value::<Op>(text_json).unwrap();
    assert_eq!(text_roundtrip, text_op);

    // Close: a unit variant serializes to just the tag, and round-trips.
    let close_json = serde_json::to_value(&close_op).unwrap();
    let expected_close = json!({
        "type": "realtime_conversation_close"
    });
    assert_eq!(close_json, expected_close);

    let close_roundtrip =
        serde_json::from_value::<Op>(serde_json::to_value(&close_op).unwrap()).unwrap();
    assert_eq!(close_roundtrip, close_op);
}
#[test]
fn user_input_serialization_omits_final_output_json_schema_when_none() -> Result<()> {
let op = Op::UserInput {