diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index 4ca884cb55..c5dc32fb94 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -1469,6 +1469,30 @@ } ] }, + "RealtimeVoice": { + "enum": [ + "alloy", + "arbor", + "ash", + "ballad", + "breeze", + "cedar", + "coral", + "cove", + "echo", + "ember", + "juniper", + "maple", + "marin", + "sage", + "shimmer", + "sol", + "spruce", + "vale", + "verse" + ], + "type": "string" + }, "ReasoningEffort": { "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning", "enum": [ diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index fb441ca303..d3447132ab 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -10501,6 +10501,59 @@ ], "type": "string" }, + "RealtimeVoice": { + "enum": [ + "alloy", + "arbor", + "ash", + "ballad", + "breeze", + "cedar", + "coral", + "cove", + "echo", + "ember", + "juniper", + "maple", + "marin", + "sage", + "shimmer", + "sol", + "spruce", + "vale", + "verse" + ], + "type": "string" + }, + "RealtimeVoicesList": { + "properties": { + "defaultV1": { + "$ref": "#/definitions/v2/RealtimeVoice" + }, + "defaultV2": { + "$ref": "#/definitions/v2/RealtimeVoice" + }, + "v1": { + "items": { + "$ref": "#/definitions/v2/RealtimeVoice" + }, + "type": "array" + }, + "v2": { + "items": { + "$ref": "#/definitions/v2/RealtimeVoice" + }, + "type": "array" + } + }, + "required": [ + "defaultV1", + "defaultV2", + "v1", + "v2" + ], + "type": "object" + }, "ReasoningEffort": { "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning", "enum": [ diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index 1b93e77f72..6f2efa4e12 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -7304,6 +7304,59 @@ ], "type": "string" }, + "RealtimeVoice": { + "enum": [ + "alloy", + "arbor", + "ash", + "ballad", + "breeze", + "cedar", + "coral", + "cove", + "echo", + "ember", + "juniper", + "maple", + "marin", + "sage", + "shimmer", + "sol", + "spruce", + "vale", + "verse" + ], + "type": "string" + }, + "RealtimeVoicesList": { + "properties": { + "defaultV1": { + "$ref": "#/definitions/RealtimeVoice" + }, + "defaultV2": { + "$ref": "#/definitions/RealtimeVoice" + }, + "v1": { + "items": { + "$ref": "#/definitions/RealtimeVoice" + }, + "type": "array" + }, + "v2": { + "items": { + "$ref": "#/definitions/RealtimeVoice" + }, + "type": "array" + } + }, + "required": [ + "defaultV1", + "defaultV2", + "v1", + "v2" + ], + "type": "object" + }, "ReasoningEffort": { "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning", "enum": [ diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeVoice.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeVoice.ts new file mode 100644 index 0000000000..c3a434e944 --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/RealtimeVoice.ts @@ -0,0 +1,5 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +export type RealtimeVoice = "alloy" | "arbor" | "ash" | "ballad" | "breeze" | "cedar" | "coral" | "cove" | "echo" | "ember" | "juniper" | "maple" | "marin" | "sage" | "shimmer" | "sol" | "spruce" | "vale" | "verse"; diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeVoicesList.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeVoicesList.ts new file mode 100644 index 0000000000..b81cbc0a0c --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/RealtimeVoicesList.ts @@ -0,0 +1,6 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { RealtimeVoice } from "./RealtimeVoice"; + +export type RealtimeVoicesList = { v1: Array, v2: Array, defaultV1: RealtimeVoice, defaultV2: RealtimeVoice, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index 7ffc15e83d..2a35207896 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -49,6 +49,8 @@ export type { ParsedCommand } from "./ParsedCommand"; export type { Personality } from "./Personality"; export type { PlanType } from "./PlanType"; export type { RealtimeConversationVersion } from "./RealtimeConversationVersion"; +export type { RealtimeVoice } from "./RealtimeVoice"; +export type { RealtimeVoicesList } from "./RealtimeVoicesList"; export type { ReasoningEffort } from "./ReasoningEffort"; export type { ReasoningItemContent } from "./ReasoningItemContent"; export type { ReasoningItemReasoningSummary } from "./ReasoningItemReasoningSummary"; diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index 4610024770..1f3a935a7e 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -414,6 +414,11 @@ client_request_definitions! { params: v2::ThreadRealtimeStopParams, response: v2::ThreadRealtimeStopResponse, }, + #[experimental("thread/realtime/listVoices")] + ThreadRealtimeListVoices => "thread/realtime/listVoices" { + params: v2::ThreadRealtimeListVoicesParams, + response: v2::ThreadRealtimeListVoicesResponse, + }, ReviewStart => "review/start" { params: v2::ReviewStartParams, response: v2::ReviewStartResponse, @@ -1764,6 +1769,7 @@ mod tests { prompt: Some(Some("You are on a call".to_string())), session_id: Some("sess_456".to_string()), transport: None, + voice: Some(codex_protocol::protocol::RealtimeVoice::Marin), }, }; assert_eq!( @@ -1774,7 +1780,8 @@ mod tests { "threadId": "thr_123", "prompt": "You are on a call", "sessionId": "sess_456", - "transport": null + "transport": null, + "voice": "marin" } }), serde_json::to_value(&request)?, @@ -1791,6 +1798,7 @@ mod tests { prompt: None, session_id: None, transport: None, + voice: None, }, }; assert_eq!( @@ -1800,7 +1808,8 @@ mod tests { "params": { "threadId": "thr_123", "sessionId": null, - "transport": null + "transport": null, + "voice": null } }), serde_json::to_value(&default_prompt_request)?, @@ -1813,6 +1822,7 @@ mod tests { prompt: Some(None), session_id: None, transport: None, + voice: None, }, }; assert_eq!( @@ -1823,7 +1833,8 @@ mod tests { "threadId": "thr_123", "prompt": null, "sessionId": null, - "transport": null + "transport": null, + "voice": null } }), serde_json::to_value(&null_prompt_request)?, @@ -1835,7 +1846,8 @@ mod tests { "params": { "threadId": "thr_123", "sessionId": null, - "transport": null + "transport": null, + "voice": null } }); assert_eq!( @@ -1850,7 +1862,8 @@ mod tests { "threadId": "thr_123", "prompt": null, "sessionId": null, - "transport": null + "transport": null, + "voice": null } }); assert_eq!( @@ -1934,6 +1947,7 @@ mod tests { prompt: Some(Some("You are on a call".to_string())), session_id: None, transport: None, + voice: None, }, }; let reason = crate::experimental_api::ExperimentalApi::experimental_reason(&request); diff --git a/codex-rs/app-server-protocol/src/protocol/v2.rs b/codex-rs/app-server-protocol/src/protocol/v2.rs index 8d485ff829..4e4d6264ec 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2.rs @@ -72,6 +72,8 @@ use codex_protocol::protocol::RateLimitWindow as CoreRateLimitWindow; use codex_protocol::protocol::ReadOnlyAccess as CoreReadOnlyAccess; use codex_protocol::protocol::RealtimeAudioFrame as CoreRealtimeAudioFrame; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeVoice; +use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ReviewDecision as CoreReviewDecision; use codex_protocol::protocol::SessionSource as CoreSessionSource; use codex_protocol::protocol::SkillDependencies as CoreSkillDependencies; @@ -3866,6 +3868,8 @@ pub struct ThreadRealtimeStartParams { pub session_id: Option, #[ts(optional = nullable)] pub transport: Option, + #[ts(optional = nullable)] + pub voice: Option, } /// EXPERIMENTAL - transport used by thread realtime. @@ -3931,6 +3935,20 @@ pub struct ThreadRealtimeStopParams { #[ts(export_to = "v2/")] pub struct ThreadRealtimeStopResponse {} +/// EXPERIMENTAL - list voices supported by thread realtime. +#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct ThreadRealtimeListVoicesParams {} + +/// EXPERIMENTAL - response for listing supported realtime voices. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct ThreadRealtimeListVoicesResponse { + pub voices: RealtimeVoicesList, +} + /// EXPERIMENTAL - emitted when thread realtime startup is accepted. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] diff --git a/codex-rs/app-server/src/codex_message_processor.rs b/codex-rs/app-server/src/codex_message_processor.rs index dafaffca3c..4e3bc70cb3 100644 --- a/codex-rs/app-server/src/codex_message_processor.rs +++ b/codex-rs/app-server/src/codex_message_processor.rs @@ -144,6 +144,8 @@ use codex_app_server_protocol::ThreadRealtimeAppendAudioParams; use codex_app_server_protocol::ThreadRealtimeAppendAudioResponse; use codex_app_server_protocol::ThreadRealtimeAppendTextParams; use codex_app_server_protocol::ThreadRealtimeAppendTextResponse; +use codex_app_server_protocol::ThreadRealtimeListVoicesParams; +use codex_app_server_protocol::ThreadRealtimeListVoicesResponse; use codex_app_server_protocol::ThreadRealtimeStartParams; use codex_app_server_protocol::ThreadRealtimeStartResponse; use codex_app_server_protocol::ThreadRealtimeStartTransport; @@ -279,6 +281,7 @@ use codex_protocol::protocol::McpAuthStatus as CoreMcpAuthStatus; use codex_protocol::protocol::McpServerRefreshConfig; use codex_protocol::protocol::Op; use codex_protocol::protocol::RateLimitSnapshot as CoreRateLimitSnapshot; +use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ReviewDelivery as CoreReviewDelivery; use codex_protocol::protocol::ReviewRequest; use codex_protocol::protocol::ReviewTarget as CoreReviewTarget; @@ -839,6 +842,10 @@ impl CodexMessageProcessor { self.thread_realtime_stop(to_connection_request_id(request_id), params) .await; } + ClientRequest::ThreadRealtimeListVoices { request_id, params } => { + self.thread_realtime_list_voices(to_connection_request_id(request_id), params) + .await; + } ClientRequest::ReviewStart { request_id, params } => { self.review_start(to_connection_request_id(request_id), params) .await; @@ -6861,6 +6868,7 @@ impl CodexMessageProcessor { ConversationStartTransport::Webrtc { sdp } } }), + voice: params.voice, }), ) .await; @@ -6987,6 +6995,21 @@ impl CodexMessageProcessor { } } + async fn thread_realtime_list_voices( + &mut self, + request_id: ConnectionRequestId, + _params: ThreadRealtimeListVoicesParams, + ) { + self.outgoing + .send_response( + request_id, + ThreadRealtimeListVoicesResponse { + voices: RealtimeVoicesList::builtin(), + }, + ) + .await; + } + fn build_review_turn(turn_id: String, display_text: &str) -> Turn { let items = if display_text.is_empty() { Vec::new() diff --git a/codex-rs/app-server/tests/common/mcp_process.rs b/codex-rs/app-server/tests/common/mcp_process.rs index e660b82646..03c2284b84 100644 --- a/codex-rs/app-server/tests/common/mcp_process.rs +++ b/codex-rs/app-server/tests/common/mcp_process.rs @@ -67,6 +67,7 @@ use codex_app_server_protocol::ThreadMetadataUpdateParams; use codex_app_server_protocol::ThreadReadParams; use codex_app_server_protocol::ThreadRealtimeAppendAudioParams; use codex_app_server_protocol::ThreadRealtimeAppendTextParams; +use codex_app_server_protocol::ThreadRealtimeListVoicesParams; use codex_app_server_protocol::ThreadRealtimeStartParams; use codex_app_server_protocol::ThreadRealtimeStopParams; use codex_app_server_protocol::ThreadResumeParams; @@ -664,6 +665,15 @@ impl McpProcess { self.send_request("thread/realtime/stop", params).await } + pub async fn send_thread_realtime_list_voices_request( + &mut self, + params: ThreadRealtimeListVoicesParams, + ) -> anyhow::Result { + let params = Some(serde_json::to_value(params)?); + self.send_request("thread/realtime/listVoices", params) + .await + } + /// Deterministically clean up an intentionally in-flight turn. /// /// Some tests assert behavior while a turn is still running. Returning from those tests diff --git a/codex-rs/app-server/tests/suite/v2/experimental_api.rs b/codex-rs/app-server/tests/suite/v2/experimental_api.rs index ea18b17544..4a532aebc0 100644 --- a/codex-rs/app-server/tests/suite/v2/experimental_api.rs +++ b/codex-rs/app-server/tests/suite/v2/experimental_api.rs @@ -77,6 +77,7 @@ async fn realtime_conversation_start_requires_experimental_api_capability() -> R prompt: Some(Some("hello".to_string())), session_id: None, transport: None, + voice: None, }) .await?; let error = timeout( @@ -114,6 +115,7 @@ async fn realtime_webrtc_start_requires_experimental_api_capability() -> Result< transport: Some(ThreadRealtimeStartTransport::Webrtc { sdp: "v=offer\r\n".to_string(), }), + voice: None, }) .await?; let error = timeout( diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index 3ffeed03d5..2ca4e39ab9 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -21,6 +21,8 @@ use codex_app_server_protocol::ThreadRealtimeAudioChunk; use codex_app_server_protocol::ThreadRealtimeClosedNotification; use codex_app_server_protocol::ThreadRealtimeErrorNotification; use codex_app_server_protocol::ThreadRealtimeItemAddedNotification; +use codex_app_server_protocol::ThreadRealtimeListVoicesParams; +use codex_app_server_protocol::ThreadRealtimeListVoicesResponse; use codex_app_server_protocol::ThreadRealtimeOutputAudioDeltaNotification; use codex_app_server_protocol::ThreadRealtimeSdpNotification; use codex_app_server_protocol::ThreadRealtimeStartParams; @@ -37,6 +39,8 @@ use codex_app_server_protocol::TurnStartedNotification; use codex_features::FEATURES; use codex_features::Feature; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeVoice; +use codex_protocol::protocol::RealtimeVoicesList; use core_test_support::responses; use core_test_support::responses::WebSocketConnectionConfig; use core_test_support::responses::WebSocketRequest; @@ -294,11 +298,12 @@ impl RealtimeE2eHarness { .mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: self.thread_id.clone(), - prompt: "backend prompt".to_string(), + prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { sdp: offer_sdp.to_string(), }), + voice: None, }) .await?; let start_response: JSONRPCResponse = timeout( @@ -516,6 +521,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { prompt: None, session_id: None, transport: None, + voice: Some(RealtimeVoice::Cedar), }) .await?; let start_response: JSONRPCResponse = timeout( @@ -539,6 +545,10 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { startup_context_request.body_json()["type"].as_str(), Some("session.update") ); + assert_eq!( + startup_context_request.body_json()["session"]["audio"]["output"]["voice"], + "cedar" + ); let startup_context_instructions = startup_context_request.body_json()["session"]["instructions"] .as_str() @@ -683,6 +693,66 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { Ok(()) } +#[tokio::test] +async fn realtime_list_voices_returns_supported_names() -> Result<()> { + let codex_home = TempDir::new()?; + create_config_toml( + codex_home.path(), + "http://127.0.0.1:1", + "ws://127.0.0.1:1", + /*realtime_enabled*/ true, + StartupContextConfig::Generated, + )?; + + let mut mcp = McpProcess::new(codex_home.path()).await?; + mcp.initialize().await?; + + let request_id = mcp + .send_thread_realtime_list_voices_request(ThreadRealtimeListVoicesParams {}) + .await?; + let response: JSONRPCResponse = timeout( + DEFAULT_TIMEOUT, + mcp.read_stream_until_response_message(RequestId::Integer(request_id)), + ) + .await??; + let response: ThreadRealtimeListVoicesResponse = to_response(response)?; + + assert_eq!( + response, + ThreadRealtimeListVoicesResponse { + voices: RealtimeVoicesList { + v1: vec![ + RealtimeVoice::Juniper, + RealtimeVoice::Maple, + RealtimeVoice::Spruce, + RealtimeVoice::Ember, + RealtimeVoice::Vale, + RealtimeVoice::Breeze, + RealtimeVoice::Arbor, + RealtimeVoice::Sol, + RealtimeVoice::Cove, + ], + v2: vec![ + RealtimeVoice::Alloy, + RealtimeVoice::Ash, + RealtimeVoice::Ballad, + RealtimeVoice::Coral, + RealtimeVoice::Echo, + RealtimeVoice::Sage, + RealtimeVoice::Shimmer, + RealtimeVoice::Verse, + RealtimeVoice::Marin, + RealtimeVoice::Cedar, + ], + default_v1: RealtimeVoice::Cove, + default_v2: RealtimeVoice::Marin, + }, + } + ); + + Ok(()) +} + #[tokio::test] async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> { skip_if_no_network!(Ok(())); @@ -726,6 +796,7 @@ async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, }) .await?; let start_response: JSONRPCResponse = timeout( @@ -823,6 +894,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { transport: Some(ThreadRealtimeStartTransport::Webrtc { sdp: "v=offer\r\n".to_string(), }), + voice: None, }) .await?; let start_response: JSONRPCResponse = timeout( @@ -1383,6 +1455,7 @@ async fn realtime_webrtc_start_surfaces_backend_error() -> Result<()> { transport: Some(ThreadRealtimeStartTransport::Webrtc { sdp: "v=offer\r\n".to_string(), }), + voice: None, }) .await?; let start_response: JSONRPCResponse = timeout( @@ -1438,6 +1511,7 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, }) .await?; let error = timeout( @@ -1578,7 +1652,7 @@ fn assert_v1_session_update(request: &Value) -> Result<()> { ); assert_eq!( request["session"]["audio"]["output"]["voice"].as_str(), - Some("fathom") + Some("cove") ); assert_eq!(request["session"]["tools"], Value::Null); Ok(()) @@ -1635,7 +1709,7 @@ fn assert_call_create_multipart( } fn v1_session_create_json() -> &'static str { - r#"{"audio":{"input":{"format":{"type":"audio/pcm","rate":24000}},"output":{"voice":"fathom"}},"type":"quicksilver","instructions":"backend prompt\n\nstartup context"}"# + r#"{"audio":{"input":{"format":{"type":"audio/pcm","rate":24000}},"output":{"voice":"cove"}},"type":"quicksilver","instructions":"backend prompt\n\nstartup context"}"# } fn create_config_toml( diff --git a/codex-rs/codex-api/src/endpoint/realtime_call.rs b/codex-rs/codex-api/src/endpoint/realtime_call.rs index df69a25e45..8a68d088c7 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_call.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_call.rs @@ -226,6 +226,7 @@ mod tests { use codex_client::Response; use codex_client::StreamResponse; use codex_client::TransportError; + use codex_protocol::protocol::RealtimeVoice; use http::StatusCode; use pretty_assertions::assert_eq; use std::sync::Mutex; @@ -308,6 +309,7 @@ mod tests { session_id: Some(session_id.to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Marin, } } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 0147cdefef..71c931d9cc 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -11,6 +11,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry; +use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::parse_realtime_event; use crate::error::ApiError; use crate::provider::Provider; @@ -306,9 +307,10 @@ impl RealtimeWebsocketWriter { &self, instructions: String, session_mode: RealtimeSessionMode, + voice: RealtimeVoice, ) -> Result<(), ApiError> { let session_mode = normalized_session_mode(self.event_parser, session_mode); - let session = session_update_session(self.event_parser, instructions, session_mode); + let session = session_update_session(self.event_parser, instructions, session_mode, voice); self.send_json(&RealtimeOutboundMessage::SessionUpdate { session }) .await } @@ -577,7 +579,7 @@ impl RealtimeWebsocketClient { ); connection .writer - .send_session_update(config.instructions, config.session_mode) + .send_session_update(config.instructions, config.session_mode, config.voice) .await?; Ok(connection) } @@ -722,6 +724,7 @@ mod tests { use codex_protocol::protocol::RealtimeHandoffRequested; use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; + use codex_protocol::protocol::RealtimeVoice; use http::HeaderValue; use pretty_assertions::assert_eq; use serde_json::Value; @@ -1238,7 +1241,7 @@ mod tests { ); assert_eq!( first_json["session"]["audio"]["output"]["voice"], - Value::String("fathom".to_string()) + Value::String("breeze".to_string()) ); ws.send(Message::Text( @@ -1371,6 +1374,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Breeze, }, HeaderMap::new(), HeaderMap::new(), @@ -1546,7 +1550,7 @@ mod tests { ); assert_eq!( first_json["session"]["audio"]["output"]["voice"], - Value::String("marin".to_string()) + Value::String("cedar".to_string()) ); assert_eq!( first_json["session"]["tools"][0]["type"], @@ -1644,6 +1648,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Cedar, }, HeaderMap::new(), HeaderMap::new(), @@ -1748,6 +1753,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Transcription, + voice: RealtimeVoice::Marin, }, HeaderMap::new(), HeaderMap::new(), @@ -1811,7 +1817,7 @@ mod tests { ); assert_eq!( first_json["session"]["audio"]["output"]["voice"], - Value::String("fathom".to_string()) + Value::String("cove".to_string()) ); assert!(first_json["session"].get("tools").is_none()); @@ -1850,6 +1856,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Transcription, + voice: RealtimeVoice::Cove, }, HeaderMap::new(), HeaderMap::new(), @@ -1938,6 +1945,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Cove, }, HeaderMap::new(), HeaderMap::new(), diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs index 15e55ff1b7..8eb079fe83 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs @@ -10,6 +10,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; +use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession; use serde_json::Result as JsonResult; use serde_json::Value; @@ -56,11 +57,14 @@ pub(super) fn session_update_session( event_parser: RealtimeEventParser, instructions: String, session_mode: RealtimeSessionMode, + voice: RealtimeVoice, ) -> SessionUpdateSession { let session_mode = normalized_session_mode(event_parser, session_mode); match event_parser { - RealtimeEventParser::V1 => v1_session_update_session(instructions), - RealtimeEventParser::RealtimeV2 => v2_session_update_session(instructions, session_mode), + RealtimeEventParser::V1 => v1_session_update_session(instructions, voice), + RealtimeEventParser::RealtimeV2 => { + v2_session_update_session(instructions, session_mode, voice) + } } } @@ -69,6 +73,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult< config.event_parser, config.instructions, config.session_mode, + config.voice, ); session.id = config.session_id; session.model = config.model; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs index 2a729e9390..22e728dcff 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v1.rs @@ -7,11 +7,11 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemType; use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem; use crate::endpoint::realtime_websocket::protocol::ConversationRole; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::SessionAudio; use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat; use crate::endpoint::realtime_websocket::protocol::SessionAudioInput; use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput; -use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice; use crate::endpoint::realtime_websocket::protocol::SessionType; use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession; @@ -38,7 +38,10 @@ pub(super) fn conversation_handoff_append_message( } } -pub(super) fn session_update_session(instructions: String) -> SessionUpdateSession { +pub(super) fn session_update_session( + instructions: String, + voice: RealtimeVoice, +) -> SessionUpdateSession { SessionUpdateSession { id: None, r#type: SessionType::Quicksilver, @@ -56,7 +59,7 @@ pub(super) fn session_update_session(instructions: String) -> SessionUpdateSessi }, output: Some(SessionAudioOutput { format: None, - voice: SessionAudioVoice::Fathom, + voice, }), }, tools: None, diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs index 95476594bb..fd512a902d 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs @@ -10,12 +10,12 @@ use crate::endpoint::realtime_websocket::protocol::ConversationRole; use crate::endpoint::realtime_websocket::protocol::NoiseReductionType; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; +use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::SessionAudio; use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat; use crate::endpoint::realtime_websocket::protocol::SessionAudioInput; use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput; use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputFormat; -use crate::endpoint::realtime_websocket::protocol::SessionAudioVoice; use crate::endpoint::realtime_websocket::protocol::SessionFunctionTool; use crate::endpoint::realtime_websocket::protocol::SessionNoiseReduction; use crate::endpoint::realtime_websocket::protocol::SessionToolType; @@ -59,6 +59,7 @@ pub(super) fn conversation_handoff_append_message( pub(super) fn session_update_session( instructions: String, session_mode: RealtimeSessionMode, + voice: RealtimeVoice, ) -> SessionUpdateSession { match session_mode { RealtimeSessionMode::Conversational => SessionUpdateSession { @@ -87,7 +88,7 @@ pub(super) fn session_update_session( r#type: AudioFormatType::AudioPcm, rate: REALTIME_AUDIO_SAMPLE_RATE, }), - voice: SessionAudioVoice::Marin, + voice, }), }, tools: Some(vec![SessionFunctionTool { diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index e6f0f0df11..0185984c61 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -4,6 +4,7 @@ pub use codex_protocol::protocol::RealtimeAudioFrame; pub use codex_protocol::protocol::RealtimeEvent; pub use codex_protocol::protocol::RealtimeTranscriptDelta; pub use codex_protocol::protocol::RealtimeTranscriptEntry; +pub use codex_protocol::protocol::RealtimeVoice; use serde::Serialize; use serde_json::Value; @@ -26,6 +27,7 @@ pub struct RealtimeSessionConfig { pub session_id: Option, pub event_parser: RealtimeEventParser, pub session_mode: RealtimeSessionMode, + pub voice: RealtimeVoice, } #[derive(Debug, Clone, Serialize)] @@ -106,15 +108,7 @@ pub(super) enum AudioFormatType { pub(super) struct SessionAudioOutput { #[serde(skip_serializing_if = "Option::is_none")] pub(super) format: Option, - pub(super) voice: SessionAudioVoice, -} - -#[derive(Debug, Clone, Copy, Serialize)] -pub(super) enum SessionAudioVoice { - #[serde(rename = "fathom")] - Fathom, - #[serde(rename = "marin")] - Marin, + pub(super) voice: RealtimeVoice, } #[derive(Debug, Clone, Serialize)] diff --git a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs index 5f402c31a0..9be1733d49 100644 --- a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs +++ b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs @@ -11,6 +11,7 @@ use codex_api::RealtimeSessionMode; use codex_api::RealtimeWebsocketClient; use codex_api::RetryConfig; use codex_protocol::protocol::RealtimeHandoffRequested; +use codex_protocol::protocol::RealtimeVoice; use futures::SinkExt; use futures::StreamExt; use http::HeaderMap; @@ -144,6 +145,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Cove, }, HeaderMap::new(), HeaderMap::new(), @@ -246,6 +248,7 @@ async fn realtime_ws_connect_webrtc_sideband_retries_join_until_server_is_availa session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Marin, }, "rtc_test", HeaderMap::new(), @@ -316,6 +319,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Cove, }, HeaderMap::new(), HeaderMap::new(), @@ -382,6 +386,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Cove, }, HeaderMap::new(), HeaderMap::new(), @@ -444,6 +449,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Cove, }, HeaderMap::new(), HeaderMap::new(), @@ -509,6 +515,7 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + voice: RealtimeVoice::Marin, }, HeaderMap::new(), HeaderMap::new(), diff --git a/codex-rs/config/src/config_toml.rs b/codex-rs/config/src/config_toml.rs index b51bcc7b0f..4d1f3da2ad 100644 --- a/codex-rs/config/src/config_toml.rs +++ b/codex-rs/config/src/config_toml.rs @@ -455,6 +455,7 @@ pub enum RealtimeTransport { } pub use codex_protocol::protocol::RealtimeConversationVersion as RealtimeWsVersion; +pub use codex_protocol::protocol::RealtimeVoice; #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)] #[schemars(deny_unknown_fields)] @@ -463,6 +464,7 @@ pub struct RealtimeConfig { #[serde(rename = "type")] pub session_type: RealtimeWsMode, pub transport: RealtimeTransport, + pub voice: Option, } #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)] @@ -472,6 +474,7 @@ pub struct RealtimeToml { #[serde(rename = "type")] pub session_type: Option, pub transport: Option, + pub voice: Option, } #[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq, JsonSchema)] diff --git a/codex-rs/core/config.schema.json b/codex-rs/core/config.schema.json index 7a18a99004..ce43d3ceca 100644 --- a/codex-rs/core/config.schema.json +++ b/codex-rs/core/config.schema.json @@ -1505,6 +1505,9 @@ }, "version": { "$ref": "#/definitions/RealtimeConversationVersion" + }, + "voice": { + "$ref": "#/definitions/RealtimeVoice" } }, "type": "object" @@ -1516,6 +1519,30 @@ ], "type": "string" }, + "RealtimeVoice": { + "enum": [ + "alloy", + "arbor", + "ash", + "ballad", + "breeze", + "cedar", + "coral", + "cove", + "echo", + "ember", + "juniper", + "maple", + "marin", + "sage", + "shimmer", + "sol", + "spruce", + "vale", + "verse" + ], + "type": "string" + }, "RealtimeWsMode": { "enum": [ "conversational", diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index aeda1614cf..6e5529800d 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -4626,6 +4626,10 @@ async fn submission_loop(sess: Arc, config: Arc, rx_sub: Receiv handle_realtime_conversation_close(&sess, sub.id.clone()).await; false } + Op::RealtimeConversationListVoices => { + handlers::realtime_conversation_list_voices(&sess, sub.id.clone()).await; + false + } Op::OverrideTurnContext { cwd, approval_policy, @@ -4860,6 +4864,8 @@ mod handlers { use codex_protocol::protocol::ListSkillsResponseEvent; use codex_protocol::protocol::McpServerRefreshConfig; use codex_protocol::protocol::Op; + use codex_protocol::protocol::RealtimeConversationListVoicesResponseEvent; + use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ReviewDecision; use codex_protocol::protocol::ReviewRequest; use codex_protocol::protocol::RolloutItem; @@ -4894,6 +4900,18 @@ mod handlers { sess.close_unified_exec_processes().await; } + pub async fn realtime_conversation_list_voices(sess: &Session, sub_id: String) { + sess.send_event_raw(Event { + id: sub_id, + msg: EventMsg::RealtimeConversationListVoicesResponse( + RealtimeConversationListVoicesResponseEvent { + voices: RealtimeVoicesList::builtin(), + }, + ), + }) + .await; + } + pub async fn override_turn_context( sess: &Session, sub_id: String, @@ -7189,6 +7207,7 @@ fn realtime_text_for_event(msg: &EventMsg) -> Option { | EventMsg::GetHistoryEntryResponse(_) | EventMsg::McpListToolsResponse(_) | EventMsg::ListSkillsResponse(_) + | EventMsg::RealtimeConversationListVoicesResponse(_) | EventMsg::SkillsUpdateAvailable | EventMsg::PlanUpdate(_) | EventMsg::TurnAborted(_) diff --git a/codex-rs/core/src/codex_tests.rs b/codex-rs/core/src/codex_tests.rs index caf67dc52b..fdc83f31b2 100644 --- a/codex-rs/core/src/codex_tests.rs +++ b/codex-rs/core/src/codex_tests.rs @@ -75,6 +75,9 @@ use codex_protocol::protocol::NetworkApprovalProtocol; use codex_protocol::protocol::RateLimitSnapshot; use codex_protocol::protocol::RateLimitWindow; use codex_protocol::protocol::RealtimeAudioFrame; +use codex_protocol::protocol::RealtimeConversationListVoicesResponseEvent; +use codex_protocol::protocol::RealtimeVoice; +use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ResumedHistory; use codex_protocol::protocol::RolloutItem; use codex_protocol::protocol::Submission; @@ -4588,6 +4591,51 @@ async fn run_user_shell_command_does_not_set_reference_context_item() { ); } +#[tokio::test] +async fn realtime_conversation_list_voices_emits_builtin_list() { + let (session, _turn_context, rx) = make_session_and_context_with_rx().await; + + handlers::realtime_conversation_list_voices(&session, "sub-id".to_string()).await; + + let event = rx.recv().await.expect("event"); + let voices = match event.msg { + EventMsg::RealtimeConversationListVoicesResponse( + RealtimeConversationListVoicesResponseEvent { voices }, + ) => voices, + msg => panic!("expected list voices response, got {msg:?}"), + }; + assert_eq!( + voices, + RealtimeVoicesList { + v1: vec![ + RealtimeVoice::Juniper, + RealtimeVoice::Maple, + RealtimeVoice::Spruce, + RealtimeVoice::Ember, + RealtimeVoice::Vale, + RealtimeVoice::Breeze, + RealtimeVoice::Arbor, + RealtimeVoice::Sol, + RealtimeVoice::Cove, + ], + v2: vec![ + RealtimeVoice::Alloy, + RealtimeVoice::Ash, + RealtimeVoice::Ballad, + RealtimeVoice::Coral, + RealtimeVoice::Echo, + RealtimeVoice::Sage, + RealtimeVoice::Shimmer, + RealtimeVoice::Verse, + RealtimeVoice::Marin, + RealtimeVoice::Cedar, + ], + default_v1: RealtimeVoice::Cove, + default_v2: RealtimeVoice::Marin, + }, + ); +} + #[derive(Clone, Copy)] struct NeverEndingTask { kind: TaskKind, diff --git a/codex-rs/core/src/config/config_tests.rs b/codex-rs/core/src/config/config_tests.rs index b6507c0091..5bc34b4574 100644 --- a/codex-rs/core/src/config/config_tests.rs +++ b/codex-rs/core/src/config/config_tests.rs @@ -53,6 +53,7 @@ use codex_protocol::permissions::FileSystemSandboxPolicy; use codex_protocol::permissions::FileSystemSpecialPath; use codex_protocol::permissions::NetworkSandboxPolicy; use codex_protocol::protocol::ReadOnlyAccess; +use codex_protocol::protocol::RealtimeVoice; use serde::Deserialize; use tempfile::tempdir; @@ -6489,6 +6490,7 @@ fn realtime_loads_from_config_toml() -> std::io::Result<()> { version = "v2" type = "transcription" transport = "webrtc" +voice = "cedar" "#, ) .expect("TOML deserialization should succeed"); @@ -6499,6 +6501,7 @@ transport = "webrtc" version: Some(RealtimeWsVersion::V2), session_type: Some(RealtimeWsMode::Transcription), transport: Some(RealtimeTransport::WebRtc), + voice: Some(RealtimeVoice::Cedar), }) ); @@ -6515,6 +6518,7 @@ transport = "webrtc" version: RealtimeWsVersion::V2, session_type: RealtimeWsMode::Transcription, transport: RealtimeTransport::WebRtc, + voice: Some(RealtimeVoice::Cedar), } ); Ok(()) diff --git a/codex-rs/core/src/config/edit.rs b/codex-rs/core/src/config/edit.rs index 1b3d700ab7..0f1189b22d 100644 --- a/codex-rs/core/src/config/edit.rs +++ b/codex-rs/core/src/config/edit.rs @@ -1019,6 +1019,18 @@ impl ConfigEditsBuilder { self } + pub fn set_realtime_voice(mut self, voice: Option<&str>) -> Self { + let segments = vec!["realtime".to_string(), "voice".to_string()]; + match voice { + Some(voice) => self.edits.push(ConfigEdit::SetPath { + segments, + value: value(voice), + }), + None => self.edits.push(ConfigEdit::ClearPath { segments }), + } + self + } + pub fn clear_legacy_windows_sandbox_keys(mut self) -> Self { for key in [ "experimental_windows_sandbox", diff --git a/codex-rs/core/src/config/edit_tests.rs b/codex-rs/core/src/config/edit_tests.rs index 314d8badb3..af1251b34f 100644 --- a/codex-rs/core/src/config/edit_tests.rs +++ b/codex-rs/core/src/config/edit_tests.rs @@ -1066,6 +1066,41 @@ fn blocking_builder_set_realtime_audio_persists_and_clears() { ); } +#[test] +fn blocking_builder_set_realtime_voice_persists_and_clears() { + let tmp = tempdir().expect("tmpdir"); + let codex_home = tmp.path(); + + ConfigEditsBuilder::new(codex_home) + .set_realtime_voice(Some("cedar")) + .apply_blocking() + .expect("persist realtime voice"); + + let raw = std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config"); + let config: TomlValue = toml::from_str(&raw).expect("parse config"); + let realtime = config + .get("realtime") + .and_then(TomlValue::as_table) + .expect("realtime table should exist"); + assert_eq!( + realtime.get("voice").and_then(TomlValue::as_str), + Some("cedar") + ); + + ConfigEditsBuilder::new(codex_home) + .set_realtime_voice(/*voice*/ None) + .apply_blocking() + .expect("clear realtime voice"); + + let raw = std::fs::read_to_string(codex_home.join(CONFIG_TOML_FILE)).expect("read config"); + let config: TomlValue = toml::from_str(&raw).expect("parse config"); + let realtime = config + .get("realtime") + .and_then(TomlValue::as_table) + .expect("realtime table should exist"); + assert_eq!(realtime.get("voice"), None); +} + #[test] fn replace_mcp_servers_blocking_clears_table_when_empty() { let tmp = tempdir().expect("tmpdir"); diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index 29269cd85e..904862eaf7 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -2098,6 +2098,7 @@ impl Config { version: realtime.version.unwrap_or_default(), session_type: realtime.session_type.unwrap_or_default(), transport: realtime.transport.unwrap_or_default(), + voice: realtime.voice, }), experimental_realtime_ws_backend_prompt: cfg.experimental_realtime_ws_backend_prompt, experimental_realtime_ws_startup_context: cfg.experimental_realtime_ws_startup_context, diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index 17b2440e8d..38cb0c273a 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -39,6 +39,8 @@ use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationSdpEvent; use codex_protocol::protocol::RealtimeConversationStartedEvent; use codex_protocol::protocol::RealtimeHandoffRequested; +use codex_protocol::protocol::RealtimeVoice; +use codex_protocol::protocol::RealtimeVoicesList; use http::HeaderMap; use http::HeaderValue; use http::header::AUTHORIZATION; @@ -521,7 +523,7 @@ async fn prepare_realtime_start( } let version = config.realtime.version; let session_config = - build_realtime_session_config(sess, params.prompt, params.session_id).await?; + build_realtime_session_config(sess, params.prompt, params.session_id, params.voice).await?; let requested_session_id = session_config.session_id.clone(); let extra_headers = match transport { ConversationStartTransport::Websocket => { @@ -549,6 +551,7 @@ pub(crate) async fn build_realtime_session_config( sess: &Arc, prompt: Option>, session_id: Option, + voice: Option, ) -> CodexResult { let config = sess.get_config().await; let prompt = prepare_realtime_backend_prompt( @@ -578,15 +581,53 @@ pub(crate) async fn build_realtime_session_config( RealtimeWsMode::Conversational => RealtimeSessionMode::Conversational, RealtimeWsMode::Transcription => RealtimeSessionMode::Transcription, }; + let voice = voice + .or(config.realtime.voice) + .unwrap_or_else(|| default_realtime_voice(config.realtime.version)); + validate_realtime_voice(config.realtime.version, voice)?; Ok(RealtimeSessionConfig { instructions: prompt, model, session_id: Some(session_id.unwrap_or_else(|| sess.conversation_id.to_string())), event_parser, session_mode, + voice, }) } +fn default_realtime_voice(version: RealtimeWsVersion) -> RealtimeVoice { + let voices = RealtimeVoicesList::builtin(); + match version { + RealtimeWsVersion::V1 => voices.default_v1, + RealtimeWsVersion::V2 => voices.default_v2, + } +} + +fn validate_realtime_voice(version: RealtimeWsVersion, voice: RealtimeVoice) -> CodexResult<()> { + let voices = RealtimeVoicesList::builtin(); + let allowed = match version { + RealtimeWsVersion::V1 => &voices.v1, + RealtimeWsVersion::V2 => &voices.v2, + }; + if allowed.contains(&voice) { + return Ok(()); + } + + let version = match version { + RealtimeWsVersion::V1 => "v1", + RealtimeWsVersion::V2 => "v2", + }; + let allowed = allowed + .iter() + .map(|voice| voice.wire_name()) + .collect::>() + .join(", "); + Err(CodexErr::InvalidRequest(format!( + "realtime voice `{}` is not supported for {version}; supported voices: {allowed}", + voice.wire_name() + ))) +} + async fn handle_start_inner( sess: &Arc, sub_id: &str, diff --git a/codex-rs/core/tests/suite/compact_remote.rs b/codex-rs/core/tests/suite/compact_remote.rs index 8786046025..d8015812ab 100644 --- a/codex-rs/core/tests/suite/compact_remote.rs +++ b/codex-rs/core/tests/suite/compact_remote.rs @@ -119,6 +119,7 @@ async fn start_realtime_conversation(codex: &codex_core::CodexThread) -> Result< prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index 447657ecfe..fb4f871551 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -1,6 +1,7 @@ use anyhow::Context; use anyhow::Result; use chrono::Utc; +use codex_config::config_toml::RealtimeWsVersion; use codex_login::CodexAuth; use codex_login::OPENAI_API_KEY_ENV_VAR; use codex_protocol::ThreadId; @@ -16,6 +17,7 @@ use codex_protocol::protocol::RealtimeAudioFrame; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::SessionSource; use codex_protocol::user_input::UserInput; use core_test_support::responses; @@ -243,6 +245,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -299,6 +302,10 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> { connection[0].body_json()["type"].as_str(), Some("session.update") ); + assert_eq!( + connection[0].body_json()["session"]["audio"]["output"]["voice"], + "cove" + ); let initial_instructions = websocket_request_instructions(&connection[0]) .expect("initial session update instructions"); assert!(initial_instructions.starts_with("backend prompt")); @@ -396,6 +403,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { transport: Some(ConversationStartTransport::Webrtc { sdp: "v=offer\r\n".to_string(), }), + voice: None, })) .await?; @@ -439,7 +447,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { Some("multipart/form-data; boundary=codex-realtime-call-boundary") ); let body = String::from_utf8(request.body).context("multipart body should be utf-8")?; - let session = r#"{"audio":{"input":{"format":{"type":"audio/pcm","rate":24000}},"output":{"voice":"fathom"}},"type":"quicksilver","model":"realtime-test-model","instructions":"backend prompt\n\nstartup context"}"#; + let session = r#"{"audio":{"input":{"format":{"type":"audio/pcm","rate":24000}},"output":{"voice":"cove"}},"type":"quicksilver","model":"realtime-test-model","instructions":"backend prompt\n\nstartup context"}"#; assert_eq!( body, format!( @@ -530,6 +538,7 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -590,6 +599,7 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -674,6 +684,7 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -715,6 +726,7 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -804,6 +816,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { prompt: Some(Some("old".to_string())), session_id: Some("conv_old".to_string()), transport: None, + voice: None, })) .await?; wait_for_event_match(&test.codex, |msg| match msg { @@ -821,6 +834,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { prompt: Some(Some("new".to_string())), session_id: Some("conv_new".to_string()), transport: None, + voice: None, })) .await?; wait_for_event_match(&test.codex, |msg| match msg { @@ -908,6 +922,7 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -964,6 +979,7 @@ async fn conversation_uses_default_realtime_backend_prompt() -> Result<()> { prompt: None, session_id: None, transport: None, + voice: None, })) .await?; @@ -1028,6 +1044,7 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu prompt, session_id: None, transport: None, + voice: None, })) .await?; @@ -1061,6 +1078,141 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn conversation_uses_explicit_start_voice() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_websocket_server(vec![ + vec![], + vec![vec![json!({ + "type": "session.updated", + "session": { "id": "sess_voice", "instructions": "backend prompt" } + })]], + ]) + .await; + let test = test_codex().build_with_websocket_server(&server).await?; + assert!( + server + .wait_for_handshakes(/*expected*/ 1, Duration::from_secs(2)) + .await + ); + + test.codex + .submit(Op::RealtimeConversationStart(ConversationStartParams { + prompt: Some(Some("backend prompt".to_string())), + session_id: None, + transport: None, + voice: Some(RealtimeVoice::Breeze), + })) + .await?; + + let session_updated = wait_for_event_match(&test.codex, |msg| match msg { + EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent { + payload: RealtimeEvent::SessionUpdated { session_id, .. }, + }) => Some(session_id.clone()), + _ => None, + }) + .await; + assert_eq!(session_updated, "sess_voice"); + + let connections = server.connections(); + assert_eq!( + connections[1][0].body_json()["session"]["audio"]["output"]["voice"], + "breeze" + ); + + server.shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn conversation_uses_configured_realtime_voice() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_websocket_server(vec![ + vec![], + vec![vec![json!({ + "type": "session.updated", + "session": { "id": "sess_config_voice", "instructions": "backend prompt" } + })]], + ]) + .await; + let mut builder = test_codex().with_config(|config| { + config.realtime.voice = Some(RealtimeVoice::Cove); + }); + let test = builder.build_with_websocket_server(&server).await?; + assert!( + server + .wait_for_handshakes(/*expected*/ 1, Duration::from_secs(2)) + .await + ); + + test.codex + .submit(Op::RealtimeConversationStart(ConversationStartParams { + prompt: Some(Some("backend prompt".to_string())), + session_id: None, + transport: None, + voice: None, + })) + .await?; + + let session_updated = wait_for_event_match(&test.codex, |msg| match msg { + EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent { + payload: RealtimeEvent::SessionUpdated { session_id, .. }, + }) => Some(session_id.clone()), + _ => None, + }) + .await; + assert_eq!(session_updated, "sess_config_voice"); + + let connections = server.connections(); + assert_eq!( + connections[1][0].body_json()["session"]["audio"]["output"]["voice"], + "cove" + ); + + server.shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn conversation_rejects_voice_for_wrong_realtime_version() -> Result<()> { + skip_if_no_network!(Ok(())); + + let server = start_websocket_server(vec![vec![]]).await; + let mut builder = test_codex().with_config(|config| { + config.realtime.version = RealtimeWsVersion::V2; + }); + let test = builder.build_with_websocket_server(&server).await?; + assert!( + server + .wait_for_handshakes(/*expected*/ 1, Duration::from_secs(2)) + .await + ); + + test.codex + .submit(Op::RealtimeConversationStart(ConversationStartParams { + prompt: Some(Some("backend prompt".to_string())), + session_id: None, + transport: None, + voice: Some(RealtimeVoice::Cove), + })) + .await?; + + let error = wait_for_event_match(&test.codex, |msg| match msg { + EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent { + payload: RealtimeEvent::Error(message), + }) => Some(message.clone()), + _ => None, + }) + .await; + assert!(error.contains("realtime voice `cove` is not supported for v2")); + + assert_eq!(server.connections().len(), 1); + server.shutdown().await; + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() -> Result<()> { skip_if_no_network!(Ok(())); @@ -1089,6 +1241,7 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() -> prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1152,6 +1305,7 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() - prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1213,6 +1367,7 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() -> prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1267,6 +1422,7 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1321,6 +1477,7 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<() prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1373,6 +1530,7 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() -> prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1458,6 +1616,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1585,6 +1744,7 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() -> prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1727,6 +1887,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1822,6 +1983,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -1915,6 +2077,7 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() - prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -2015,6 +2178,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio( prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -2128,6 +2292,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -2271,6 +2436,7 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; @@ -2398,6 +2564,7 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> { prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; let _ = wait_for_event_match(&test.codex, |msg| match msg { @@ -2540,6 +2707,7 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio() prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, + voice: None, })) .await?; diff --git a/codex-rs/mcp-server/src/codex_tool_runner.rs b/codex-rs/mcp-server/src/codex_tool_runner.rs index a2ef6b6201..2eb0ea4966 100644 --- a/codex-rs/mcp-server/src/codex_tool_runner.rs +++ b/codex-rs/mcp-server/src/codex_tool_runner.rs @@ -337,6 +337,7 @@ async fn run_codex_tool_session_inner( | EventMsg::McpToolCallEnd(_) | EventMsg::McpListToolsResponse(_) | EventMsg::ListSkillsResponse(_) + | EventMsg::RealtimeConversationListVoicesResponse(_) | EventMsg::ExecCommandBegin(_) | EventMsg::TerminalInteraction(_) | EventMsg::ExecCommandOutputDelta(_) diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 8f50df24dc..dd3d1bfca9 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -144,6 +144,8 @@ pub struct ConversationStartParams { pub session_id: Option, #[serde(skip_serializing_if = "Option::is_none")] pub transport: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub voice: Option, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)] @@ -176,6 +178,101 @@ mod conversation_start_prompt_serde { } } +#[derive( + Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Hash, JsonSchema, TS, Ord, PartialOrd, +)] +#[serde(rename_all = "snake_case")] +#[ts(rename_all = "snake_case")] +pub enum RealtimeVoice { + Alloy, + Arbor, + Ash, + Ballad, + Breeze, + Cedar, + Coral, + Cove, + Echo, + Ember, + Juniper, + Maple, + Marin, + Sage, + Shimmer, + Sol, + Spruce, + Vale, + Verse, +} + +impl RealtimeVoice { + pub fn wire_name(self) -> &'static str { + match self { + Self::Alloy => "alloy", + Self::Arbor => "arbor", + Self::Ash => "ash", + Self::Ballad => "ballad", + Self::Breeze => "breeze", + Self::Cedar => "cedar", + Self::Coral => "coral", + Self::Cove => "cove", + Self::Echo => "echo", + Self::Ember => "ember", + Self::Juniper => "juniper", + Self::Maple => "maple", + Self::Marin => "marin", + Self::Sage => "sage", + Self::Shimmer => "shimmer", + Self::Sol => "sol", + Self::Spruce => "spruce", + Self::Vale => "vale", + Self::Verse => "verse", + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(rename_all = "camelCase")] +pub struct RealtimeVoicesList { + pub v1: Vec, + pub v2: Vec, + pub default_v1: RealtimeVoice, + pub default_v2: RealtimeVoice, +} + +impl RealtimeVoicesList { + pub fn builtin() -> Self { + Self { + v1: vec![ + RealtimeVoice::Juniper, + RealtimeVoice::Maple, + RealtimeVoice::Spruce, + RealtimeVoice::Ember, + RealtimeVoice::Vale, + RealtimeVoice::Breeze, + RealtimeVoice::Arbor, + RealtimeVoice::Sol, + RealtimeVoice::Cove, + ], + v2: vec![ + RealtimeVoice::Alloy, + RealtimeVoice::Ash, + RealtimeVoice::Ballad, + RealtimeVoice::Coral, + RealtimeVoice::Echo, + RealtimeVoice::Sage, + RealtimeVoice::Shimmer, + RealtimeVoice::Verse, + RealtimeVoice::Marin, + RealtimeVoice::Cedar, + ], + default_v1: RealtimeVoice::Cove, + default_v2: RealtimeVoice::Marin, + } + } +} + #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeAudioFrame { pub data: String, @@ -271,6 +368,9 @@ pub enum Op { /// Close the running realtime conversation stream. RealtimeConversationClose, + /// Request the list of voices supported by realtime conversation streams. + RealtimeConversationListVoices, + /// Legacy user input. /// /// Prefer [`Op::UserTurn`] so the caller provides full turn context @@ -617,6 +717,7 @@ impl Op { Self::RealtimeConversationAudio(_) => "realtime_conversation_audio", Self::RealtimeConversationText(_) => "realtime_conversation_text", Self::RealtimeConversationClose => "realtime_conversation_close", + Self::RealtimeConversationListVoices => "realtime_conversation_list_voices", Self::UserInput { .. } => "user_input", Self::UserTurn { .. } => "user_turn", Self::InterAgentCommunication { .. } => "inter_agent_communication", @@ -1398,6 +1499,9 @@ pub enum EventMsg { /// List of skills available to the agent. ListSkillsResponse(ListSkillsResponseEvent), + /// List of voices supported by realtime conversation streams. + RealtimeConversationListVoicesResponse(RealtimeConversationListVoicesResponseEvent), + /// Notification that skill data may have been updated and clients may want to reload. SkillsUpdateAvailable, @@ -3147,6 +3251,11 @@ pub struct ListSkillsResponseEvent { pub skills: Vec, } +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +pub struct RealtimeConversationListVoicesResponseEvent { + pub voices: RealtimeVoicesList, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, TS)] #[serde(rename_all = "lowercase")] #[ts(rename_all = "lowercase")] @@ -4441,6 +4550,7 @@ mod tests { prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: None, + voice: None, }); let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams { prompt: Some(Some("be helpful".to_string())), @@ -4448,6 +4558,7 @@ mod tests { transport: Some(ConversationStartTransport::Webrtc { sdp: "v=offer\r\n".to_string(), }), + voice: Some(RealtimeVoice::Cove), }); let text = Op::RealtimeConversationText(ConversationTextParams { text: "hello".to_string(), @@ -4457,12 +4568,15 @@ mod tests { prompt: None, session_id: None, transport: None, + voice: None, }); let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { prompt: Some(None), session_id: None, transport: None, + voice: None, }); + let list_voices = Op::RealtimeConversationListVoices; assert_eq!( serde_json::to_value(&start).unwrap(), @@ -4526,6 +4640,16 @@ mod tests { serde_json::from_value::(serde_json::to_value(&close).unwrap()).unwrap(), close ); + assert_eq!( + serde_json::to_value(&list_voices).unwrap(), + json!({ + "type": "realtime_conversation_list_voices" + }) + ); + assert_eq!( + serde_json::from_value::(serde_json::to_value(&list_voices).unwrap()).unwrap(), + list_voices + ); assert_eq!( serde_json::to_value(&webrtc_start).unwrap(), json!({ @@ -4535,11 +4659,46 @@ mod tests { "transport": { "type": "webrtc", "sdp": "v=offer\r\n" - } + }, + "voice": "cove" }) ); } + #[test] + fn realtime_voice_list_is_stable() { + assert_eq!( + RealtimeVoicesList::builtin(), + RealtimeVoicesList { + v1: vec![ + RealtimeVoice::Juniper, + RealtimeVoice::Maple, + RealtimeVoice::Spruce, + RealtimeVoice::Ember, + RealtimeVoice::Vale, + RealtimeVoice::Breeze, + RealtimeVoice::Arbor, + RealtimeVoice::Sol, + RealtimeVoice::Cove, + ], + v2: vec![ + RealtimeVoice::Alloy, + RealtimeVoice::Ash, + RealtimeVoice::Ballad, + RealtimeVoice::Coral, + RealtimeVoice::Echo, + RealtimeVoice::Sage, + RealtimeVoice::Shimmer, + RealtimeVoice::Verse, + RealtimeVoice::Marin, + RealtimeVoice::Cedar, + ], + default_v1: RealtimeVoice::Cove, + default_v2: RealtimeVoice::Marin, + } + ); + } + #[test] fn user_input_serialization_omits_final_output_json_schema_when_none() -> Result<()> { let op = Op::UserInput { diff --git a/codex-rs/rollout/src/policy.rs b/codex-rs/rollout/src/policy.rs index 55167e25b1..a4c8913a46 100644 --- a/codex-rs/rollout/src/policy.rs +++ b/codex-rs/rollout/src/policy.rs @@ -160,6 +160,7 @@ fn event_msg_persistence_mode(ev: &EventMsg) -> Option { | EventMsg::GetHistoryEntryResponse(_) | EventMsg::UndoStarted(_) | EventMsg::McpListToolsResponse(_) + | EventMsg::RealtimeConversationListVoicesResponse(_) | EventMsg::McpStartupUpdate(_) | EventMsg::McpStartupComplete(_) | EventMsg::ListSkillsResponse(_) diff --git a/codex-rs/tui/src/app_server_session.rs b/codex-rs/tui/src/app_server_session.rs index 20175ba347..501dbeb010 100644 --- a/codex-rs/tui/src/app_server_session.rs +++ b/codex-rs/tui/src/app_server_session.rs @@ -645,6 +645,7 @@ impl AppServerSession { thread_id: thread_id.to_string(), prompt: params.prompt, session_id: params.session_id, + voice: params.voice, transport: params.transport.map(|transport| match transport { ConversationStartTransport::Websocket => { ThreadRealtimeStartTransport::Websocket diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs index 421d9fd127..07c8f0b759 100644 --- a/codex-rs/tui/src/chatwidget.rs +++ b/codex-rs/tui/src/chatwidget.rs @@ -7034,7 +7034,8 @@ impl ChatWidget { | EventMsg::ReasoningContentDelta(_) | EventMsg::ReasoningRawContentDelta(_) | EventMsg::DynamicToolCallRequest(_) - | EventMsg::DynamicToolCallResponse(_) => {} + | EventMsg::DynamicToolCallResponse(_) + | EventMsg::RealtimeConversationListVoicesResponse(_) => {} EventMsg::HookStarted(event) => self.on_hook_started(event), EventMsg::HookCompleted(event) => self.on_hook_completed(event), EventMsg::RealtimeConversationStarted(ev) => { diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index 8906048d45..167c769bd0 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -263,6 +263,7 @@ impl ChatWidget { prompt: None, session_id: None, transport, + voice: self.config.realtime.voice, }, )); }