mirror of
https://github.com/openai/codex.git
synced 2026-04-28 18:32:04 +03:00
Wire realtime api to core (#12268)
- Introduce `RealtimeConversationManager` for realtime API management. - Add `op::conversation` to start a conversation, insert audio, insert text, and close the conversation. - Emit conversation lifecycle and realtime events. - Move shared realtime payload types into codex-protocol and add core e2e websocket tests for the start/replace/transport-close paths. Things to consider: - Should we use the same `op::` and `Events` channel to carry audio? I think we should try this simple approach first; later we can create a separate one if the channels get congested. - Sending text updates to the client: we can start simple and restrict that later. - Provider auth is intentionally not wired up for now.
This commit is contained in:
@@ -85,6 +85,41 @@ pub struct McpServerRefreshConfig {
|
||||
pub mcp_oauth_credentials_store_mode: Value,
|
||||
}
|
||||
|
||||
/// Parameters carried by [`Op::RealtimeConversationStart`].
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct ConversationStartParams {
|
||||
/// Prompt string supplied when the conversation is started.
pub prompt: String,
|
||||
/// Optional session identifier; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub session_id: Option<String>,
|
||||
}
|
||||
|
||||
/// A single frame of realtime audio, used both as input
/// ([`ConversationAudioParams::frame`]) and as output ([`RealtimeEvent::AudioOut`]).
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
pub struct RealtimeAudioFrame {
|
||||
/// Audio payload as a string.
/// NOTE(review): presumably base64-encoded sample data — confirm the encoding.
pub data: String,
|
||||
/// Sample rate of the audio. NOTE(review): presumably in Hz — confirm.
pub sample_rate: u32,
|
||||
/// Number of audio channels in the frame.
pub num_channels: u16,
|
||||
/// Optional sample count per channel; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub samples_per_channel: Option<u32>,
|
||||
}
|
||||
|
||||
/// Realtime payload carried by [`RealtimeConversationRealtimeEvent::payload`].
///
/// No serde tagging attribute is declared here, so serde's default enum
/// representation applies.
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)]
|
||||
pub enum RealtimeEvent {
|
||||
/// Carries a session id. NOTE(review): presumably reported once the realtime
/// session has been created — confirm against the producer.
SessionCreated { session_id: String },
|
||||
/// Carries an optional backend prompt. NOTE(review): presumably reported when
/// the session configuration changes — confirm against the producer.
SessionUpdated { backend_prompt: Option<String> },
|
||||
/// An audio frame travelling in the output direction.
AudioOut(RealtimeAudioFrame),
|
||||
/// An untyped value describing an added conversation item; its schema is not
/// defined in this file.
ConversationItemAdded(Value),
|
||||
/// An error described by a plain message string.
Error(String),
|
||||
}
|
||||
|
||||
/// Parameters carried by [`Op::RealtimeConversationAudio`].
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct ConversationAudioParams {
|
||||
/// The audio frame to send as input.
pub frame: RealtimeAudioFrame,
|
||||
}
|
||||
|
||||
/// Parameters carried by [`Op::RealtimeConversationText`].
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct ConversationTextParams {
|
||||
/// The text to send as input.
pub text: String,
|
||||
}
|
||||
|
||||
/// Submission operation
|
||||
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
@@ -98,6 +133,18 @@ pub enum Op {
|
||||
/// Terminate all running background terminal processes for this thread.
|
||||
CleanBackgroundTerminals,
|
||||
|
||||
/// Start a realtime conversation stream.
|
||||
RealtimeConversationStart(ConversationStartParams),
|
||||
|
||||
/// Send audio input to the running realtime conversation stream.
|
||||
RealtimeConversationAudio(ConversationAudioParams),
|
||||
|
||||
/// Send text input to the running realtime conversation stream.
|
||||
RealtimeConversationText(ConversationTextParams),
|
||||
|
||||
/// Close the running realtime conversation stream.
|
||||
RealtimeConversationClose,
|
||||
|
||||
/// Legacy user input.
|
||||
///
|
||||
/// Prefer [`Op::UserTurn`] so the caller provides full turn context
|
||||
@@ -899,6 +946,15 @@ pub enum EventMsg {
|
||||
/// indicates the turn continued but the user should still be notified.
|
||||
Warning(WarningEvent),
|
||||
|
||||
/// Realtime conversation lifecycle start event.
|
||||
RealtimeConversationStarted(RealtimeConversationStartedEvent),
|
||||
|
||||
/// Realtime conversation streaming payload event.
|
||||
RealtimeConversationRealtime(RealtimeConversationRealtimeEvent),
|
||||
|
||||
/// Realtime conversation lifecycle close event.
|
||||
RealtimeConversationClosed(RealtimeConversationClosedEvent),
|
||||
|
||||
/// Model routing changed from the requested model to a different model.
|
||||
ModelReroute(ModelRerouteEvent),
|
||||
|
||||
@@ -1078,6 +1134,22 @@ pub enum EventMsg {
|
||||
CollabResumeEnd(CollabResumeEndEvent),
|
||||
}
|
||||
|
||||
/// Payload of [`EventMsg::RealtimeConversationStarted`].
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct RealtimeConversationStartedEvent {
|
||||
/// Optional session identifier. Unlike the other optional fields in this
/// file, this one has no `skip_serializing_if`, so `None` is still serialized.
pub session_id: Option<String>,
|
||||
}
|
||||
|
||||
/// Payload of [`EventMsg::RealtimeConversationRealtime`].
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct RealtimeConversationRealtimeEvent {
|
||||
/// The realtime event being delivered.
pub payload: RealtimeEvent,
|
||||
}
|
||||
|
||||
/// Payload of [`EventMsg::RealtimeConversationClosed`].
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)]
|
||||
pub struct RealtimeConversationClosedEvent {
|
||||
/// Optional close reason; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub reason: Option<String>,
|
||||
}
|
||||
|
||||
impl From<CollabAgentSpawnBeginEvent> for EventMsg {
|
||||
fn from(event: CollabAgentSpawnBeginEvent) -> Self {
|
||||
EventMsg::CollabAgentSpawnBegin(event)
|
||||
@@ -3047,6 +3119,61 @@ mod tests {
|
||||
assert!(event.affects_turn_status());
|
||||
}
|
||||
|
||||
// Pins the JSON wire format of the realtime conversation `Op` variants: the
// `type` tag sits next to the params' own fields instead of wrapping them.
#[test]
|
||||
fn conversation_op_serializes_as_unnested_variants() {
|
||||
// One value per realtime conversation variant.
let audio = Op::RealtimeConversationAudio(ConversationAudioParams {
|
||||
frame: RealtimeAudioFrame {
|
||||
data: "AQID".to_string(),
|
||||
sample_rate: 24_000,
|
||||
num_channels: 1,
|
||||
samples_per_channel: Some(480),
|
||||
},
|
||||
});
|
||||
let start = Op::RealtimeConversationStart(ConversationStartParams {
|
||||
prompt: "be helpful".to_string(),
|
||||
session_id: Some("conv_1".to_string()),
|
||||
});
|
||||
let text = Op::RealtimeConversationText(ConversationTextParams {
|
||||
text: "hello".to_string(),
|
||||
});
|
||||
let close = Op::RealtimeConversationClose;
|
||||
|
||||
// Start: the params' fields appear at the top level beside the `type` tag.
assert_eq!(
|
||||
serde_json::to_value(&start).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_start",
|
||||
"prompt": "be helpful",
|
||||
"session_id": "conv_1"
|
||||
})
|
||||
);
|
||||
// Audio: the frame stays nested under its own `frame` key.
assert_eq!(
|
||||
serde_json::to_value(&audio).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_audio",
|
||||
"frame": {
|
||||
"data": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"num_channels": 1,
|
||||
"samples_per_channel": 480
|
||||
}
|
||||
})
|
||||
);
|
||||
// Text: serialize-then-deserialize yields the original value.
assert_eq!(
|
||||
serde_json::from_value::<Op>(serde_json::to_value(&text).unwrap()).unwrap(),
|
||||
text
|
||||
);
|
||||
// Close: the payload-less variant serializes to just the `type` tag...
assert_eq!(
|
||||
serde_json::to_value(&close).unwrap(),
|
||||
json!({
|
||||
"type": "realtime_conversation_close"
|
||||
})
|
||||
);
|
||||
// ...and also survives a round trip.
assert_eq!(
|
||||
serde_json::from_value::<Op>(serde_json::to_value(&close).unwrap()).unwrap(),
|
||||
close
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn user_input_serialization_omits_final_output_json_schema_when_none() -> Result<()> {
|
||||
let op = Op::UserInput {
|
||||
|
||||
Reference in New Issue
Block a user