Add realtime output modality and transcript events (#17701)

- Add outputModality to thread/realtime/start and wire text/audio output
  selection through app-server, core, API, and TUI.
- Rename the realtime transcript delta notification and add a separate
  transcript done notification that forwards final text from item done
  without correlating it with deltas.
This commit is contained in:
Ahmed Ibrahim
2026-04-14 00:13:13 -07:00
committed by GitHub
parent a6b03a22cc
commit 2f6fc7c137
38 changed files with 711 additions and 77 deletions

View File

@@ -13,6 +13,7 @@ pub use models::ModelsClient;
pub use realtime_call::RealtimeCallClient;
pub use realtime_call::RealtimeCallResponse;
pub use realtime_websocket::RealtimeEventParser;
pub use realtime_websocket::RealtimeOutputModality;
pub use realtime_websocket::RealtimeSessionConfig;
pub use realtime_websocket::RealtimeSessionMode;
pub use realtime_websocket::RealtimeWebsocketClient;

View File

@@ -221,6 +221,7 @@ fn decode_call_id_from_location(headers: &HeaderMap) -> Result<String, ApiError>
mod tests {
use super::*;
use crate::endpoint::realtime_websocket::RealtimeEventParser;
use crate::endpoint::realtime_websocket::RealtimeOutputModality;
use crate::endpoint::realtime_websocket::RealtimeSessionMode;
use crate::provider::RetryConfig;
use async_trait::async_trait;
@@ -311,6 +312,7 @@ mod tests {
session_id: Some(session_id.to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Conversational,
output_modality: RealtimeOutputModality::Audio,
voice: RealtimeVoice::Marin,
}
}

View File

@@ -7,9 +7,9 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame;
use crate::endpoint::realtime_websocket::protocol::RealtimeEvent;
use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::parse_realtime_event;
@@ -17,6 +17,7 @@ use crate::error::ApiError;
use crate::provider::Provider;
use codex_client::backoff;
use codex_client::maybe_build_rustls_client_config_with_custom_ca;
use codex_protocol::protocol::RealtimeTranscriptDelta;
use codex_utils_rustls_provider::ensure_rustls_crypto_provider;
use futures::SinkExt;
use futures::StreamExt;
@@ -307,10 +308,17 @@ impl RealtimeWebsocketWriter {
&self,
instructions: String,
session_mode: RealtimeSessionMode,
output_modality: RealtimeOutputModality,
voice: RealtimeVoice,
) -> Result<(), ApiError> {
let session_mode = normalized_session_mode(self.event_parser, session_mode);
let session = session_update_session(self.event_parser, instructions, session_mode, voice);
let session = session_update_session(
self.event_parser,
instructions,
session_mode,
output_modality,
voice,
);
self.send_json(&RealtimeOutboundMessage::SessionUpdate { session })
.await
}
@@ -406,10 +414,10 @@ impl RealtimeWebsocketEvents {
let mut active_transcript = self.active_transcript.lock().await;
match event {
RealtimeEvent::InputAudioSpeechStarted(_) => {}
RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta }) => {
RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta, .. }) => {
append_transcript_delta(&mut active_transcript.entries, "user", delta);
}
RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta }) => {
RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta, .. }) => {
append_transcript_delta(&mut active_transcript.entries, "assistant", delta);
}
RealtimeEvent::HandoffRequested(handoff) => {
@@ -418,6 +426,8 @@ impl RealtimeWebsocketEvents {
}
}
RealtimeEvent::SessionUpdated { .. }
| RealtimeEvent::InputTranscriptDone(_)
| RealtimeEvent::OutputTranscriptDone(_)
| RealtimeEvent::AudioOut(_)
| RealtimeEvent::ResponseCreated(_)
| RealtimeEvent::ResponseCancelled(_)
@@ -581,7 +591,12 @@ impl RealtimeWebsocketClient {
);
connection
.writer
.send_session_update(config.instructions, config.session_mode, config.voice)
.send_session_update(
config.instructions,
config.session_mode,
config.output_modality,
config.voice,
)
.await?;
Ok(connection)
}
@@ -721,13 +736,14 @@ fn normalize_realtime_path(url: &mut Url) {
#[cfg(test)]
mod tests {
use super::*;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta;
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry;
use codex_protocol::protocol::RealtimeHandoffRequested;
use codex_protocol::protocol::RealtimeInputAudioSpeechStarted;
use codex_protocol::protocol::RealtimeResponseCancelled;
use codex_protocol::protocol::RealtimeResponseCreated;
use codex_protocol::protocol::RealtimeResponseDone;
use codex_protocol::protocol::RealtimeTranscriptDelta;
use codex_protocol::protocol::RealtimeTranscriptDone;
use codex_protocol::protocol::RealtimeVoice;
use http::HeaderValue;
use pretty_assertions::assert_eq;
@@ -894,6 +910,8 @@ mod tests {
fn parse_realtime_v2_input_audio_transcription_delta_event() {
let payload = json!({
"type": "conversation.item.input_audio_transcription.delta",
"item_id": "item_input_1",
"content_index": 0,
"delta": "hello"
})
.to_string();
@@ -908,6 +926,32 @@ mod tests {
);
}
#[test]
fn parse_realtime_v2_item_done_output_text_event() {
    // An assistant message item carrying two text parts; the parser is
    // expected to report them as one concatenated final transcript.
    let payload = json!({
        "type": "conversation.item.done",
        "item": {
            "id": "item_output_1",
            "type": "message",
            "role": "assistant",
            "content": [
                {"type": "output_text", "text": "hello"},
                {"type": "output_text", "text": " world"}
            ]
        }
    })
    .to_string();

    let parsed = parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2);
    let expected = RealtimeEvent::OutputTranscriptDone(RealtimeTranscriptDone {
        text: "hello world".to_string(),
    });

    assert_eq!(parsed, Some(expected));
}
#[test]
fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() {
let payload = json!({
@@ -1374,6 +1418,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
output_modality: RealtimeOutputModality::Audio,
voice: RealtimeVoice::Breeze,
},
HeaderMap::new(),
@@ -1648,6 +1693,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Conversational,
output_modality: RealtimeOutputModality::Audio,
voice: RealtimeVoice::Cedar,
},
HeaderMap::new(),
@@ -1753,6 +1799,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::RealtimeV2,
session_mode: RealtimeSessionMode::Transcription,
output_modality: RealtimeOutputModality::Audio,
voice: RealtimeVoice::Marin,
},
HeaderMap::new(),
@@ -1856,6 +1903,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Transcription,
output_modality: RealtimeOutputModality::Audio,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),
@@ -1945,6 +1993,7 @@ mod tests {
session_id: Some("conv_1".to_string()),
event_parser: RealtimeEventParser::V1,
session_mode: RealtimeSessionMode::Conversational,
output_modality: RealtimeOutputModality::Audio,
voice: RealtimeVoice::Cove,
},
HeaderMap::new(),

View File

@@ -8,6 +8,7 @@ use crate::endpoint::realtime_websocket::methods_v2::session_update_session as v
use crate::endpoint::realtime_websocket::methods_v2::websocket_intent as v2_websocket_intent;
use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
@@ -57,13 +58,14 @@ pub(super) fn session_update_session(
event_parser: RealtimeEventParser,
instructions: String,
session_mode: RealtimeSessionMode,
output_modality: RealtimeOutputModality,
voice: RealtimeVoice,
) -> SessionUpdateSession {
let session_mode = normalized_session_mode(event_parser, session_mode);
match event_parser {
RealtimeEventParser::V1 => v1_session_update_session(instructions, voice),
RealtimeEventParser::RealtimeV2 => {
v2_session_update_session(instructions, session_mode, voice)
v2_session_update_session(instructions, session_mode, output_modality, voice)
}
}
}
@@ -73,6 +75,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult<
config.event_parser,
config.instructions,
config.session_mode,
config.output_modality,
config.voice,
);
session.id = config.session_id;

View File

@@ -9,6 +9,7 @@ use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem;
use crate::endpoint::realtime_websocket::protocol::ConversationRole;
use crate::endpoint::realtime_websocket::protocol::NoiseReductionType;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality;
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode;
use crate::endpoint::realtime_websocket::protocol::RealtimeVoice;
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
@@ -26,6 +27,7 @@ use crate::endpoint::realtime_websocket::protocol::TurnDetectionType;
use serde_json::json;
/// String sent in the session's `output_modalities` list when audio output is selected.
const REALTIME_V2_OUTPUT_MODALITY_AUDIO: &str = "audio";
/// String sent in the session's `output_modalities` list when text output is selected.
const REALTIME_V2_OUTPUT_MODALITY_TEXT: &str = "text";
const REALTIME_V2_TOOL_CHOICE: &str = "auto";
const REALTIME_V2_BACKGROUND_AGENT_TOOL_NAME: &str = "background_agent";
const REALTIME_V2_BACKGROUND_AGENT_TOOL_DESCRIPTION: &str = "Send a user request to the background agent. Use this as the default action. Do not rephrase the user's ask or rewrite it in your own words; pass along the user's own words. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later.";
@@ -59,6 +61,7 @@ pub(super) fn conversation_handoff_append_message(
pub(super) fn session_update_session(
instructions: String,
session_mode: RealtimeSessionMode,
output_modality: RealtimeOutputModality,
voice: RealtimeVoice,
) -> SessionUpdateSession {
match session_mode {
@@ -67,7 +70,7 @@ pub(super) fn session_update_session(
r#type: SessionType::Realtime,
model: None,
instructions: Some(instructions),
output_modalities: Some(vec![REALTIME_V2_OUTPUT_MODALITY_AUDIO.to_string()]),
output_modalities: Some(vec![output_modality_value(output_modality).to_string()]),
audio: SessionAudio {
input: SessionAudioInput {
format: SessionAudioFormat {
@@ -132,6 +135,13 @@ pub(super) fn session_update_session(
}
}
/// Maps the requested output modality to the string placed in the session
/// update's `output_modalities` list.
fn output_modality_value(output_modality: RealtimeOutputModality) -> &'static str {
    // Exhaustive on purpose: a new modality variant must fail to compile here
    // instead of silently falling back to one of the existing strings.
    match output_modality {
        RealtimeOutputModality::Audio => REALTIME_V2_OUTPUT_MODALITY_AUDIO,
        RealtimeOutputModality::Text => REALTIME_V2_OUTPUT_MODALITY_TEXT,
    }
}
/// Returns the websocket intent for this protocol variant.
///
/// Always `None` here.
/// NOTE(review): how callers treat a `None` intent is not visible in this
/// module — confirm against the common dispatch code.
pub(super) fn websocket_intent() -> Option<&'static str> {
    None
}

View File

@@ -13,5 +13,6 @@ pub use methods::RealtimeWebsocketEvents;
pub use methods::RealtimeWebsocketWriter;
pub use methods_common::session_update_session_json;
pub use protocol::RealtimeEventParser;
pub use protocol::RealtimeOutputModality;
pub use protocol::RealtimeSessionConfig;
pub use protocol::RealtimeSessionMode;

View File

@@ -2,7 +2,7 @@ use crate::endpoint::realtime_websocket::protocol_v1::parse_realtime_event_v1;
use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2;
pub use codex_protocol::protocol::RealtimeAudioFrame;
pub use codex_protocol::protocol::RealtimeEvent;
pub use codex_protocol::protocol::RealtimeTranscriptDelta;
pub use codex_protocol::protocol::RealtimeOutputModality;
pub use codex_protocol::protocol::RealtimeTranscriptEntry;
pub use codex_protocol::protocol::RealtimeVoice;
use serde::Serialize;
@@ -27,6 +27,7 @@ pub struct RealtimeSessionConfig {
pub session_id: Option<String>,
pub event_parser: RealtimeEventParser,
pub session_mode: RealtimeSessionMode,
pub output_modality: RealtimeOutputModality,
pub voice: RealtimeVoice,
}

View File

@@ -1,5 +1,6 @@
use codex_protocol::protocol::RealtimeEvent;
use codex_protocol::protocol::RealtimeTranscriptDelta;
use codex_protocol::protocol::RealtimeTranscriptDone;
use serde_json::Value;
use tracing::debug;
@@ -53,6 +54,17 @@ pub(super) fn parse_transcript_delta_event(
.map(|delta| RealtimeTranscriptDelta { delta })
}
/// Builds a [`RealtimeTranscriptDone`] from the string stored under `field`
/// in `parsed`.
///
/// Returns `None` when `field` is missing or its value is not a JSON string.
pub(super) fn parse_transcript_done_event(
    parsed: &Value,
    field: &str,
) -> Option<RealtimeTranscriptDone> {
    let text = parsed.get(field)?.as_str()?;
    Some(RealtimeTranscriptDone {
        text: text.to_string(),
    })
}
pub(super) fn parse_error_event(parsed: &Value) -> Option<RealtimeEvent> {
parsed
.get("message")

View File

@@ -2,6 +2,7 @@ use crate::endpoint::realtime_websocket::protocol_common::parse_error_event;
use crate::endpoint::realtime_websocket::protocol_common::parse_realtime_payload;
use crate::endpoint::realtime_websocket::protocol_common::parse_session_updated_event;
use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_delta_event;
use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_done_event;
use codex_protocol::protocol::RealtimeAudioFrame;
use codex_protocol::protocol::RealtimeEvent;
use codex_protocol::protocol::RealtimeHandoffRequested;
@@ -9,6 +10,7 @@ use codex_protocol::protocol::RealtimeInputAudioSpeechStarted;
use codex_protocol::protocol::RealtimeResponseCancelled;
use codex_protocol::protocol::RealtimeResponseCreated;
use codex_protocol::protocol::RealtimeResponseDone;
use codex_protocol::protocol::RealtimeTranscriptDone;
use serde_json::Map as JsonMap;
use serde_json::Value;
use tracing::debug;
@@ -30,8 +32,8 @@ pub(super) fn parse_realtime_event_v2(payload: &str) -> Option<RealtimeEvent> {
parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::InputTranscriptDelta)
}
"conversation.item.input_audio_transcription.completed" => {
parse_transcript_delta_event(&parsed, "transcript")
.map(RealtimeEvent::InputTranscriptDelta)
parse_transcript_done_event(&parsed, "transcript")
.map(RealtimeEvent::InputTranscriptDone)
}
"response.output_text.delta" | "response.output_audio_transcript.delta" => {
parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::OutputTranscriptDelta)
@@ -120,12 +122,43 @@ fn parse_conversation_item_done_event(parsed: &Value) -> Option<RealtimeEvent> {
return Some(handoff);
}
if let Some(transcript_done) = parse_item_done_transcript(item) {
return Some(transcript_done);
}
item.get("id")
.and_then(Value::as_str)
.map(str::to_string)
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id })
}
fn parse_item_done_transcript(item: &JsonMap<String, Value>) -> Option<RealtimeEvent> {
let role = item.get("role").and_then(Value::as_str)?;
let text = item
.get("content")
.and_then(Value::as_array)?
.iter()
.filter_map(item_content_text)
.collect::<String>();
if text.is_empty() {
return None;
}
let done = RealtimeTranscriptDone { text };
match role {
"user" => Some(RealtimeEvent::InputTranscriptDone(done)),
"assistant" => Some(RealtimeEvent::OutputTranscriptDone(done)),
_ => None,
}
}
/// Extracts the textual payload of a single conversation item content part.
///
/// Prefers the part's `"text"` field and falls back to `"transcript"`.
/// Returns `None` when neither field holds a JSON string.
fn item_content_text(content: &Value) -> Option<&str> {
    // The string check is applied to each key separately: a part that carries
    // `"text": null` (or any non-string) next to a valid `"transcript"` must
    // still yield the transcript instead of being dropped.
    content
        .get("text")
        .and_then(Value::as_str)
        .or_else(|| content.get("transcript").and_then(Value::as_str))
}
fn parse_handoff_requested_event(item: &JsonMap<String, Value>) -> Option<RealtimeEvent> {
let item_type = item.get("type").and_then(Value::as_str);
let item_name = item.get("name").and_then(Value::as_str);

View File

@@ -41,6 +41,7 @@ pub use crate::endpoint::ModelsClient;
pub use crate::endpoint::RealtimeCallClient;
pub use crate::endpoint::RealtimeCallResponse;
pub use crate::endpoint::RealtimeEventParser;
pub use crate::endpoint::RealtimeOutputModality;
pub use crate::endpoint::RealtimeSessionConfig;
pub use crate::endpoint::RealtimeSessionMode;
pub use crate::endpoint::RealtimeWebsocketClient;