mirror of
https://github.com/openai/codex.git
synced 2026-05-05 22:01:37 +03:00
Split realtime v1/v2 logic into isolated modules
This commit is contained in:
@@ -1,27 +1,12 @@
|
||||
use crate::endpoint::realtime_websocket::protocol::ConversationItem;
|
||||
use crate::endpoint::realtime_websocket::protocol::ConversationItemContent;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeApiMode;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeEvent;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInputV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInputV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionTool;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolParameters;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolProperties;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolProperty;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionTurnDetection;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSessionV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSessionV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::parse_realtime_event;
|
||||
use crate::endpoint::realtime_websocket::mode_v1;
|
||||
use crate::endpoint::realtime_websocket::mode_v2;
|
||||
use crate::endpoint::realtime_websocket::protocol_v1;
|
||||
use crate::endpoint::realtime_websocket::protocol_v2;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeApiMode;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeAudioFrame;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeEvent;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeSessionConfig;
|
||||
use crate::error::ApiError;
|
||||
use crate::provider::Provider;
|
||||
use codex_utils_rustls_provider::ensure_rustls_crypto_provider;
|
||||
@@ -29,7 +14,7 @@ use futures::SinkExt;
|
||||
use futures::StreamExt;
|
||||
use http::HeaderMap;
|
||||
use http::HeaderValue;
|
||||
use serde_json::json;
|
||||
use http::header::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
@@ -294,17 +279,11 @@ impl RealtimeWebsocketWriter {
|
||||
}
|
||||
|
||||
pub async fn send_conversation_item_create(&self, text: String) -> Result<(), ApiError> {
|
||||
let kind = match self.mode {
|
||||
RealtimeApiMode::V1 => "text".to_string(),
|
||||
RealtimeApiMode::V2 => "input_text".to_string(),
|
||||
let message = match self.mode {
|
||||
RealtimeApiMode::V1 => mode_v1::conversation_item_create(text),
|
||||
RealtimeApiMode::V2 => mode_v2::conversation_item_create(text),
|
||||
};
|
||||
self.send_json(RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ConversationItemContent { kind, text }],
|
||||
},
|
||||
})
|
||||
.await
|
||||
self.send_json(message).await
|
||||
}
|
||||
|
||||
pub async fn send_conversation_handoff_append(
|
||||
@@ -312,27 +291,11 @@ impl RealtimeWebsocketWriter {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
) -> Result<(), ApiError> {
|
||||
match self.mode {
|
||||
RealtimeApiMode::V1 => {
|
||||
self.send_json(RealtimeOutboundMessage::ConversationHandoffAppend {
|
||||
handoff_id,
|
||||
output_text,
|
||||
})
|
||||
.await
|
||||
}
|
||||
RealtimeApiMode::V2 => {
|
||||
self.send_json(RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::Message {
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ConversationItemContent {
|
||||
kind: "output_text".to_string(),
|
||||
text: output_text,
|
||||
}],
|
||||
},
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
let message = match self.mode {
|
||||
RealtimeApiMode::V1 => mode_v1::handoff_append(handoff_id, output_text),
|
||||
RealtimeApiMode::V2 => mode_v2::handoff_append(output_text),
|
||||
};
|
||||
self.send_json(message).await
|
||||
}
|
||||
|
||||
pub async fn send_function_call_output(
|
||||
@@ -343,14 +306,8 @@ impl RealtimeWebsocketWriter {
|
||||
match self.mode {
|
||||
RealtimeApiMode::V1 => Ok(()),
|
||||
RealtimeApiMode::V2 => {
|
||||
let output = json!({
|
||||
"content": output_text,
|
||||
})
|
||||
.to_string();
|
||||
self.send_json(RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::FunctionCallOutput { call_id, output },
|
||||
})
|
||||
.await
|
||||
self.send_json(mode_v2::function_call_output(call_id, output_text))
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -366,69 +323,11 @@ impl RealtimeWebsocketWriter {
|
||||
}
|
||||
|
||||
pub async fn send_session_update(&self, instructions: String) -> Result<(), ApiError> {
|
||||
let session = match self.mode {
|
||||
RealtimeApiMode::V1 => SessionUpdateSession::V1(SessionUpdateSessionV1 {
|
||||
kind: "quicksilver".to_string(),
|
||||
instructions,
|
||||
audio: SessionAudioV1 {
|
||||
input: SessionAudioInputV1 {
|
||||
format: SessionAudioFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
},
|
||||
output: SessionAudioOutputV1 {
|
||||
voice: "mundo".to_string(),
|
||||
},
|
||||
},
|
||||
}),
|
||||
RealtimeApiMode::V2 => SessionUpdateSession::V2(SessionUpdateSessionV2 {
|
||||
kind: "realtime".to_string(),
|
||||
instructions,
|
||||
output_modalities: vec!["audio".to_string()],
|
||||
audio: SessionAudioV2 {
|
||||
input: SessionAudioInputV2 {
|
||||
format: SessionAudioFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
turn_detection: SessionTurnDetection {
|
||||
kind: "semantic_vad".to_string(),
|
||||
interrupt_response: false,
|
||||
create_response: true,
|
||||
},
|
||||
},
|
||||
output: SessionAudioOutputV2 {
|
||||
format: SessionAudioOutputFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
voice: "marin".to_string(),
|
||||
},
|
||||
},
|
||||
tools: vec![SessionTool {
|
||||
kind: "function".to_string(),
|
||||
name: "codex".to_string(),
|
||||
description:
|
||||
"Delegate a request to Codex and return the final result to the user."
|
||||
.to_string(),
|
||||
parameters: SessionToolParameters {
|
||||
kind: "object".to_string(),
|
||||
properties: SessionToolProperties {
|
||||
prompt: SessionToolProperty {
|
||||
kind: "string".to_string(),
|
||||
description: "The user request to delegate to Codex.".to_string(),
|
||||
},
|
||||
},
|
||||
required: vec!["prompt".to_string()],
|
||||
},
|
||||
}],
|
||||
tool_choice: "auto".to_string(),
|
||||
}),
|
||||
let message = match self.mode {
|
||||
RealtimeApiMode::V1 => mode_v1::session_update(instructions),
|
||||
RealtimeApiMode::V2 => mode_v2::session_update(instructions),
|
||||
};
|
||||
|
||||
self.send_json(RealtimeOutboundMessage::SessionUpdate { session })
|
||||
.await
|
||||
self.send_json(message).await
|
||||
}
|
||||
|
||||
pub async fn close(&self) -> Result<(), ApiError> {
|
||||
@@ -519,6 +418,13 @@ impl RealtimeWebsocketEvents {
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_realtime_event(payload: &str, mode: RealtimeApiMode) -> Option<RealtimeEvent> {
|
||||
match mode {
|
||||
RealtimeApiMode::V1 => protocol_v1::parse_realtime_event(payload),
|
||||
RealtimeApiMode::V2 => protocol_v2::parse_realtime_event(payload),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RealtimeWebsocketClient {
|
||||
provider: Provider,
|
||||
}
|
||||
@@ -588,7 +494,7 @@ fn merge_request_headers(
|
||||
let mut headers = provider_headers.clone();
|
||||
headers.extend(extra_headers);
|
||||
for (name, value) in &default_headers {
|
||||
if let http::header::Entry::Vacant(entry) = headers.entry(name) {
|
||||
if let Entry::Vacant(entry) = headers.entry(name) {
|
||||
entry.insert(value.clone());
|
||||
}
|
||||
}
|
||||
@@ -639,24 +545,9 @@ fn websocket_url_from_api_url(
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let mut query = url.query_pairs_mut();
|
||||
if mode == RealtimeApiMode::V1 {
|
||||
query.append_pair("intent", "quicksilver");
|
||||
}
|
||||
if let Some(model) = model {
|
||||
query.append_pair("model", model);
|
||||
}
|
||||
if let Some(query_params) = query_params {
|
||||
for (key, value) in query_params {
|
||||
if (key == "model" && model.is_some())
|
||||
|| (key == "intent" && mode == RealtimeApiMode::V1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
query.append_pair(key, value);
|
||||
}
|
||||
}
|
||||
match mode {
|
||||
RealtimeApiMode::V1 => mode_v1::append_query_params(&mut url, query_params, model),
|
||||
RealtimeApiMode::V2 => mode_v2::append_query_params(&mut url, query_params, model),
|
||||
}
|
||||
|
||||
Ok(url)
|
||||
@@ -691,8 +582,8 @@ fn normalize_realtime_path(url: &mut Url) {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeHandoffMessage;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeHandoffRequested;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeHandoffMessage;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeHandoffRequested;
|
||||
use http::HeaderValue;
|
||||
use pretty_assertions::assert_eq;
|
||||
use serde_json::Value;
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
pub mod methods;
|
||||
pub mod protocol;
|
||||
mod mode_v1;
|
||||
mod mode_v2;
|
||||
mod protocol_v1;
|
||||
mod protocol_v2;
|
||||
mod types;
|
||||
|
||||
pub use codex_protocol::protocol::RealtimeAudioFrame;
|
||||
pub use codex_protocol::protocol::RealtimeEvent;
|
||||
@@ -7,5 +11,5 @@ pub use methods::RealtimeWebsocketClient;
|
||||
pub use methods::RealtimeWebsocketConnection;
|
||||
pub use methods::RealtimeWebsocketEvents;
|
||||
pub use methods::RealtimeWebsocketWriter;
|
||||
pub use protocol::RealtimeApiMode;
|
||||
pub use protocol::RealtimeSessionConfig;
|
||||
pub use types::RealtimeApiMode;
|
||||
pub use types::RealtimeSessionConfig;
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
use crate::endpoint::realtime_websocket::types::ConversationItem;
|
||||
use crate::endpoint::realtime_websocket::types::ConversationItemContent;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioFormat;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioInputV1;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioOutputV1;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioV1;
|
||||
use crate::endpoint::realtime_websocket::types::SessionUpdateSession;
|
||||
use crate::endpoint::realtime_websocket::types::SessionUpdateSessionV1;
|
||||
use std::collections::HashMap;
|
||||
use url::Url;
|
||||
|
||||
pub(super) fn conversation_item_create(text: String) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ConversationItemContent {
|
||||
kind: "text".to_string(),
|
||||
text,
|
||||
}],
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn handoff_append(handoff_id: String, output_text: String) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::ConversationHandoffAppend {
|
||||
handoff_id,
|
||||
output_text,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn session_update(instructions: String) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::SessionUpdate {
|
||||
session: SessionUpdateSession::V1(SessionUpdateSessionV1 {
|
||||
kind: "quicksilver".to_string(),
|
||||
instructions,
|
||||
audio: SessionAudioV1 {
|
||||
input: SessionAudioInputV1 {
|
||||
format: SessionAudioFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
},
|
||||
output: SessionAudioOutputV1 {
|
||||
voice: "mundo".to_string(),
|
||||
},
|
||||
},
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn append_query_params(
|
||||
url: &mut Url,
|
||||
query_params: Option<&HashMap<String, String>>,
|
||||
model: Option<&str>,
|
||||
) {
|
||||
let mut query = url.query_pairs_mut();
|
||||
query.append_pair("intent", "quicksilver");
|
||||
if let Some(model) = model {
|
||||
query.append_pair("model", model);
|
||||
}
|
||||
if let Some(query_params) = query_params {
|
||||
for (key, value) in query_params {
|
||||
if (key == "model" && model.is_some()) || key == "intent" {
|
||||
continue;
|
||||
}
|
||||
query.append_pair(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
121
codex-rs/codex-api/src/endpoint/realtime_websocket/mode_v2.rs
Normal file
121
codex-rs/codex-api/src/endpoint/realtime_websocket/mode_v2.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
use crate::endpoint::realtime_websocket::types::ConversationItem;
|
||||
use crate::endpoint::realtime_websocket::types::ConversationItemContent;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioFormat;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioInputV2;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioOutputFormat;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioOutputV2;
|
||||
use crate::endpoint::realtime_websocket::types::SessionAudioV2;
|
||||
use crate::endpoint::realtime_websocket::types::SessionTool;
|
||||
use crate::endpoint::realtime_websocket::types::SessionToolParameters;
|
||||
use crate::endpoint::realtime_websocket::types::SessionToolProperties;
|
||||
use crate::endpoint::realtime_websocket::types::SessionToolProperty;
|
||||
use crate::endpoint::realtime_websocket::types::SessionTurnDetection;
|
||||
use crate::endpoint::realtime_websocket::types::SessionUpdateSession;
|
||||
use crate::endpoint::realtime_websocket::types::SessionUpdateSessionV2;
|
||||
use serde_json::json;
|
||||
use std::collections::HashMap;
|
||||
use url::Url;
|
||||
|
||||
pub(super) fn conversation_item_create(text: String) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ConversationItemContent {
|
||||
kind: "input_text".to_string(),
|
||||
text,
|
||||
}],
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn handoff_append(output_text: String) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::Message {
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ConversationItemContent {
|
||||
kind: "output_text".to_string(),
|
||||
text: output_text,
|
||||
}],
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn function_call_output(
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> RealtimeOutboundMessage {
|
||||
let output = json!({
|
||||
"content": output_text,
|
||||
})
|
||||
.to_string();
|
||||
RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::FunctionCallOutput { call_id, output },
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn session_update(instructions: String) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::SessionUpdate {
|
||||
session: SessionUpdateSession::V2(SessionUpdateSessionV2 {
|
||||
kind: "realtime".to_string(),
|
||||
instructions,
|
||||
output_modalities: vec!["audio".to_string()],
|
||||
audio: SessionAudioV2 {
|
||||
input: SessionAudioInputV2 {
|
||||
format: SessionAudioFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
turn_detection: SessionTurnDetection {
|
||||
kind: "semantic_vad".to_string(),
|
||||
interrupt_response: false,
|
||||
create_response: true,
|
||||
},
|
||||
},
|
||||
output: SessionAudioOutputV2 {
|
||||
format: SessionAudioOutputFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
voice: "marin".to_string(),
|
||||
},
|
||||
},
|
||||
tools: vec![SessionTool {
|
||||
kind: "function".to_string(),
|
||||
name: "codex".to_string(),
|
||||
description: "Delegate a request to Codex and return the final result to the user."
|
||||
.to_string(),
|
||||
parameters: SessionToolParameters {
|
||||
kind: "object".to_string(),
|
||||
properties: SessionToolProperties {
|
||||
prompt: SessionToolProperty {
|
||||
kind: "string".to_string(),
|
||||
description: "The user request to delegate to Codex.".to_string(),
|
||||
},
|
||||
},
|
||||
required: vec!["prompt".to_string()],
|
||||
},
|
||||
}],
|
||||
tool_choice: "auto".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn append_query_params(
|
||||
url: &mut Url,
|
||||
query_params: Option<&HashMap<String, String>>,
|
||||
model: Option<&str>,
|
||||
) {
|
||||
let mut query = url.query_pairs_mut();
|
||||
if let Some(model) = model {
|
||||
query.append_pair("model", model);
|
||||
}
|
||||
if let Some(query_params) = query_params {
|
||||
for (key, value) in query_params {
|
||||
if key == "model" && model.is_some() {
|
||||
continue;
|
||||
}
|
||||
query.append_pair(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,432 +0,0 @@
|
||||
pub use codex_protocol::protocol::RealtimeAudioFrame;
|
||||
pub use codex_protocol::protocol::RealtimeEvent;
|
||||
pub use codex_protocol::protocol::RealtimeHandoffMessage;
|
||||
pub use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
use std::string::ToString;
|
||||
use tracing::debug;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RealtimeApiMode {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RealtimeSessionConfig {
|
||||
pub instructions: String,
|
||||
pub model: Option<String>,
|
||||
pub session_id: Option<String>,
|
||||
pub mode: RealtimeApiMode,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub(super) enum RealtimeOutboundMessage {
|
||||
#[serde(rename = "input_audio_buffer.append")]
|
||||
InputAudioBufferAppend { audio: String },
|
||||
#[serde(rename = "conversation.handoff.append")]
|
||||
ConversationHandoffAppend {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
},
|
||||
#[serde(rename = "response.create")]
|
||||
ResponseCreate,
|
||||
#[serde(rename = "session.update")]
|
||||
SessionUpdate { session: SessionUpdateSession },
|
||||
#[serde(rename = "conversation.item.create")]
|
||||
ConversationItemCreate { item: ConversationItem },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(untagged)]
|
||||
pub(super) enum SessionUpdateSession {
|
||||
V1(SessionUpdateSessionV1),
|
||||
V2(SessionUpdateSessionV2),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionUpdateSessionV1 {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) instructions: String,
|
||||
pub(super) audio: SessionAudioV1,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionUpdateSessionV2 {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) instructions: String,
|
||||
pub(super) output_modalities: Vec<String>,
|
||||
pub(super) audio: SessionAudioV2,
|
||||
pub(super) tools: Vec<SessionTool>,
|
||||
pub(super) tool_choice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioV1 {
|
||||
pub(super) input: SessionAudioInputV1,
|
||||
pub(super) output: SessionAudioOutputV1,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioV2 {
|
||||
pub(super) input: SessionAudioInputV2,
|
||||
pub(super) output: SessionAudioOutputV2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInputV1 {
|
||||
pub(super) format: SessionAudioFormat,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInputV2 {
|
||||
pub(super) format: SessionAudioFormat,
|
||||
pub(super) turn_detection: SessionTurnDetection,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioFormat {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) rate: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionTurnDetection {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) interrupt_response: bool,
|
||||
pub(super) create_response: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputV1 {
|
||||
pub(super) voice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputV2 {
|
||||
pub(super) format: SessionAudioOutputFormat,
|
||||
pub(super) voice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputFormat {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) rate: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionTool {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) name: String,
|
||||
pub(super) description: String,
|
||||
pub(super) parameters: SessionToolParameters,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolParameters {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) properties: SessionToolProperties,
|
||||
pub(super) required: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolProperties {
|
||||
pub(super) prompt: SessionToolProperty,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolProperty {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) description: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub(super) enum ConversationItem {
|
||||
#[serde(rename = "message")]
|
||||
Message {
|
||||
role: String,
|
||||
content: Vec<ConversationItemContent>,
|
||||
},
|
||||
#[serde(rename = "function_call_output")]
|
||||
FunctionCallOutput { call_id: String, output: String },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct ConversationItemContent {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) text: String,
|
||||
}
|
||||
|
||||
pub(super) fn parse_realtime_event(payload: &str, mode: RealtimeApiMode) -> Option<RealtimeEvent> {
|
||||
let parsed: Value = match serde_json::from_str(payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(err) => {
|
||||
debug!("failed to parse realtime event: {err}, data: {payload}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let message_type = match parsed.get("type").and_then(Value::as_str) {
|
||||
Some(message_type) => message_type,
|
||||
None => {
|
||||
debug!("received realtime event without type field: {payload}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
match mode {
|
||||
RealtimeApiMode::V1 => parse_realtime_event_v1(&parsed, message_type, payload),
|
||||
RealtimeApiMode::V2 => parse_realtime_event_v2(parsed, message_type),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_realtime_event_v1(
|
||||
parsed: &Value,
|
||||
message_type: &str,
|
||||
payload: &str,
|
||||
) -> Option<RealtimeEvent> {
|
||||
match message_type {
|
||||
"session.updated" => parse_session_updated(parsed),
|
||||
"conversation.output_audio.delta" => parse_audio_delta(parsed, false),
|
||||
"conversation.item.added" => parsed
|
||||
.get("item")
|
||||
.cloned()
|
||||
.map(RealtimeEvent::ConversationItemAdded),
|
||||
"conversation.item.done" => parsed
|
||||
.get("item")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|item| item.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id }),
|
||||
"conversation.handoff.requested" => parse_handoff_requested_v1(parsed),
|
||||
"error" => parse_realtime_error(parsed),
|
||||
_ => {
|
||||
debug!("received unsupported realtime event type: {message_type}, data: {payload}");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_realtime_event_v2(parsed: Value, message_type: &str) -> Option<RealtimeEvent> {
|
||||
match message_type {
|
||||
"session.created" | "session.updated" => parse_session_updated(&parsed),
|
||||
"response.output_audio.delta" => parse_audio_delta(&parsed, true),
|
||||
"conversation.item.added" => parsed
|
||||
.get("item")
|
||||
.cloned()
|
||||
.map(RealtimeEvent::ConversationItemAdded),
|
||||
"conversation.item.done" => parsed
|
||||
.get("item")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|item| item.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id }),
|
||||
"response.done" => {
|
||||
if let Some(handoff) = parse_handoff_requested_v2(&parsed) {
|
||||
return Some(RealtimeEvent::HandoffRequested(handoff));
|
||||
}
|
||||
Some(RealtimeEvent::ConversationItemAdded(parsed))
|
||||
}
|
||||
"error" => parse_realtime_error(&parsed),
|
||||
_ => Some(RealtimeEvent::ConversationItemAdded(parsed)),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_session_updated(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let session_id = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
let instructions = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("instructions"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
session_id.map(|session_id| RealtimeEvent::SessionUpdated {
|
||||
session_id,
|
||||
instructions,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_audio_delta(parsed: &Value, default_shape: bool) -> Option<RealtimeEvent> {
|
||||
let data = parsed
|
||||
.get("delta")
|
||||
.and_then(Value::as_str)
|
||||
.or_else(|| parsed.get("data").and_then(Value::as_str))
|
||||
.map(str::to_string)?;
|
||||
let sample_rate = parsed
|
||||
.get("sample_rate")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok());
|
||||
let num_channels = parsed
|
||||
.get("channels")
|
||||
.or_else(|| parsed.get("num_channels"))
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u16::try_from(v).ok());
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data,
|
||||
sample_rate: sample_rate.or_else(|| default_shape.then_some(24_000))?,
|
||||
num_channels: num_channels.or_else(|| default_shape.then_some(1))?,
|
||||
samples_per_channel: parsed
|
||||
.get("samples_per_channel")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok()),
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_realtime_error(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
parsed
|
||||
.get("message")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.or_else(|| {
|
||||
parsed
|
||||
.get("error")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|error| error.get("message"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
})
|
||||
.or_else(|| parsed.get("error").map(ToString::to_string))
|
||||
.map(RealtimeEvent::Error)
|
||||
}
|
||||
|
||||
fn parse_handoff_requested_v1(parsed: &Value) -> Option<RealtimeHandoffRequested> {
|
||||
let handoff_id = parsed
|
||||
.get("handoff_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = parsed
|
||||
.get("item_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let input_transcript = parsed
|
||||
.get("input_transcript")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let messages = parsed
|
||||
.get("messages")
|
||||
.and_then(Value::as_array)?
|
||||
.iter()
|
||||
.filter_map(|message| {
|
||||
let role = message.get("role").and_then(Value::as_str)?.to_string();
|
||||
let text = message.get("text").and_then(Value::as_str)?.to_string();
|
||||
Some(RealtimeHandoffMessage { role, text })
|
||||
})
|
||||
.collect();
|
||||
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_handoff_requested_v2(parsed: &Value) -> Option<RealtimeHandoffRequested> {
|
||||
let outputs = parsed
|
||||
.get("response")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|response| response.get("output"))
|
||||
.and_then(Value::as_array)?;
|
||||
let function_call = outputs.iter().find(|item| {
|
||||
item.get("type").and_then(Value::as_str) == Some("function_call")
|
||||
&& item.get("name").and_then(Value::as_str) == Some("codex")
|
||||
})?;
|
||||
let handoff_id = function_call
|
||||
.get("call_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = function_call
|
||||
.get("id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.unwrap_or_else(|| handoff_id.clone());
|
||||
let arguments = function_call
|
||||
.get("arguments")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default();
|
||||
let (input_transcript, messages) = parse_handoff_arguments(arguments);
|
||||
Some(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_handoff_arguments(arguments: &str) -> (String, Vec<RealtimeHandoffMessage>) {
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct HandoffArguments {
|
||||
#[serde(default)]
|
||||
prompt: Option<String>,
|
||||
#[serde(default)]
|
||||
text: Option<String>,
|
||||
#[serde(default)]
|
||||
input: Option<String>,
|
||||
#[serde(default)]
|
||||
message: Option<String>,
|
||||
#[serde(default)]
|
||||
input_transcript: Option<String>,
|
||||
#[serde(default)]
|
||||
messages: Vec<RealtimeHandoffMessage>,
|
||||
}
|
||||
|
||||
let Some(parsed) = serde_json::from_str::<HandoffArguments>(arguments).ok() else {
|
||||
return (
|
||||
arguments.to_string(),
|
||||
vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: arguments.to_string(),
|
||||
}],
|
||||
);
|
||||
};
|
||||
let messages = parsed
|
||||
.messages
|
||||
.into_iter()
|
||||
.filter(|message| !message.text.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
for value in [
|
||||
parsed.prompt,
|
||||
parsed.text,
|
||||
parsed.input,
|
||||
parsed.message,
|
||||
parsed.input_transcript,
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
{
|
||||
if !value.is_empty() {
|
||||
if messages.is_empty() {
|
||||
return (
|
||||
value.clone(),
|
||||
vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: value,
|
||||
}],
|
||||
);
|
||||
}
|
||||
return (value, messages);
|
||||
}
|
||||
}
|
||||
if let Some(first_message) = messages.first() {
|
||||
return (first_message.text.clone(), messages);
|
||||
}
|
||||
(String::new(), messages)
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeAudioFrame;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeEvent;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeHandoffMessage;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeHandoffRequested;
|
||||
use serde_json::Value;
|
||||
use std::string::ToString;
|
||||
use tracing::debug;
|
||||
|
||||
pub(super) fn parse_realtime_event(payload: &str) -> Option<RealtimeEvent> {
|
||||
let parsed: Value = match serde_json::from_str(payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(err) => {
|
||||
debug!("failed to parse realtime event: {err}, data: {payload}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let message_type = match parsed.get("type").and_then(Value::as_str) {
|
||||
Some(message_type) => message_type,
|
||||
None => {
|
||||
debug!("received realtime event without type field: {payload}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
match message_type {
|
||||
"session.updated" => parse_session_updated(&parsed),
|
||||
"conversation.output_audio.delta" => parse_audio_delta(&parsed),
|
||||
"conversation.item.added" => parsed
|
||||
.get("item")
|
||||
.cloned()
|
||||
.map(RealtimeEvent::ConversationItemAdded),
|
||||
"conversation.item.done" => parsed
|
||||
.get("item")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|item| item.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id }),
|
||||
"conversation.handoff.requested" => parse_handoff_requested(&parsed),
|
||||
"error" => parse_realtime_error(&parsed),
|
||||
_ => {
|
||||
debug!("received unsupported realtime event type: {message_type}, data: {payload}");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_session_updated(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let session_id = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
let instructions = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("instructions"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
session_id.map(|session_id| RealtimeEvent::SessionUpdated {
|
||||
session_id,
|
||||
instructions,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_audio_delta(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let data = parsed
|
||||
.get("delta")
|
||||
.and_then(Value::as_str)
|
||||
.or_else(|| parsed.get("data").and_then(Value::as_str))
|
||||
.map(str::to_string)?;
|
||||
let sample_rate = parsed
|
||||
.get("sample_rate")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok())?;
|
||||
let num_channels = parsed
|
||||
.get("channels")
|
||||
.or_else(|| parsed.get("num_channels"))
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u16::try_from(v).ok())?;
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data,
|
||||
sample_rate,
|
||||
num_channels,
|
||||
samples_per_channel: parsed
|
||||
.get("samples_per_channel")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok()),
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_realtime_error(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
parsed
|
||||
.get("message")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.or_else(|| {
|
||||
parsed
|
||||
.get("error")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|error| error.get("message"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
})
|
||||
.or_else(|| parsed.get("error").map(ToString::to_string))
|
||||
.map(RealtimeEvent::Error)
|
||||
}
|
||||
|
||||
fn parse_handoff_requested(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let handoff_id = parsed
|
||||
.get("handoff_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = parsed
|
||||
.get("item_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let input_transcript = parsed
|
||||
.get("input_transcript")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let messages = parsed
|
||||
.get("messages")
|
||||
.and_then(Value::as_array)?
|
||||
.iter()
|
||||
.filter_map(|message| {
|
||||
let role = message.get("role").and_then(Value::as_str)?.to_string();
|
||||
let text = message.get("text").and_then(Value::as_str)?.to_string();
|
||||
Some(RealtimeHandoffMessage { role, text })
|
||||
})
|
||||
.collect();
|
||||
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
}))
|
||||
}
|
||||
@@ -0,0 +1,206 @@
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeAudioFrame;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeEvent;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeHandoffMessage;
|
||||
use crate::endpoint::realtime_websocket::types::RealtimeHandoffRequested;
|
||||
use serde::Deserialize;
|
||||
use serde_json::Value;
|
||||
use std::string::ToString;
|
||||
use tracing::debug;
|
||||
|
||||
pub(super) fn parse_realtime_event(payload: &str) -> Option<RealtimeEvent> {
|
||||
let parsed: Value = match serde_json::from_str(payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(err) => {
|
||||
debug!("failed to parse realtime event: {err}, data: {payload}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let message_type = match parsed.get("type").and_then(Value::as_str) {
|
||||
Some(message_type) => message_type,
|
||||
None => {
|
||||
debug!("received realtime event without type field: {payload}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
match message_type {
|
||||
"session.created" | "session.updated" => parse_session_updated(&parsed),
|
||||
"response.output_audio.delta" => parse_audio_delta(&parsed),
|
||||
"conversation.item.added" => parsed
|
||||
.get("item")
|
||||
.cloned()
|
||||
.map(RealtimeEvent::ConversationItemAdded),
|
||||
"conversation.item.done" => parsed
|
||||
.get("item")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|item| item.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id }),
|
||||
"response.done" => {
|
||||
if let Some(handoff) = parse_handoff_requested(&parsed) {
|
||||
return Some(RealtimeEvent::HandoffRequested(handoff));
|
||||
}
|
||||
Some(RealtimeEvent::ConversationItemAdded(parsed))
|
||||
}
|
||||
"error" => parse_realtime_error(&parsed),
|
||||
_ => Some(RealtimeEvent::ConversationItemAdded(parsed)),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_session_updated(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let session_id = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
let instructions = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("instructions"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
session_id.map(|session_id| RealtimeEvent::SessionUpdated {
|
||||
session_id,
|
||||
instructions,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_audio_delta(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let data = parsed
|
||||
.get("delta")
|
||||
.and_then(Value::as_str)
|
||||
.or_else(|| parsed.get("data").and_then(Value::as_str))
|
||||
.map(str::to_string)?;
|
||||
let sample_rate = parsed
|
||||
.get("sample_rate")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok())
|
||||
.unwrap_or(24_000);
|
||||
let num_channels = parsed
|
||||
.get("channels")
|
||||
.or_else(|| parsed.get("num_channels"))
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u16::try_from(v).ok())
|
||||
.unwrap_or(1);
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data,
|
||||
sample_rate,
|
||||
num_channels,
|
||||
samples_per_channel: parsed
|
||||
.get("samples_per_channel")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok()),
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_realtime_error(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
parsed
|
||||
.get("message")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.or_else(|| {
|
||||
parsed
|
||||
.get("error")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|error| error.get("message"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
})
|
||||
.or_else(|| parsed.get("error").map(ToString::to_string))
|
||||
.map(RealtimeEvent::Error)
|
||||
}
|
||||
|
||||
fn parse_handoff_requested(parsed: &Value) -> Option<RealtimeHandoffRequested> {
|
||||
let outputs = parsed
|
||||
.get("response")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|response| response.get("output"))
|
||||
.and_then(Value::as_array)?;
|
||||
let function_call = outputs.iter().find(|item| {
|
||||
item.get("type").and_then(Value::as_str) == Some("function_call")
|
||||
&& item.get("name").and_then(Value::as_str) == Some("codex")
|
||||
})?;
|
||||
let handoff_id = function_call
|
||||
.get("call_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = function_call
|
||||
.get("id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.unwrap_or_else(|| handoff_id.clone());
|
||||
let arguments = function_call
|
||||
.get("arguments")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default();
|
||||
let (input_transcript, messages) = parse_handoff_arguments(arguments);
|
||||
Some(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_handoff_arguments(arguments: &str) -> (String, Vec<RealtimeHandoffMessage>) {
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct HandoffArguments {
|
||||
#[serde(default)]
|
||||
prompt: Option<String>,
|
||||
#[serde(default)]
|
||||
text: Option<String>,
|
||||
#[serde(default)]
|
||||
input: Option<String>,
|
||||
#[serde(default)]
|
||||
message: Option<String>,
|
||||
#[serde(default)]
|
||||
input_transcript: Option<String>,
|
||||
#[serde(default)]
|
||||
messages: Vec<RealtimeHandoffMessage>,
|
||||
}
|
||||
|
||||
let Some(parsed) = serde_json::from_str::<HandoffArguments>(arguments).ok() else {
|
||||
return (
|
||||
arguments.to_string(),
|
||||
vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: arguments.to_string(),
|
||||
}],
|
||||
);
|
||||
};
|
||||
let messages = parsed
|
||||
.messages
|
||||
.into_iter()
|
||||
.filter(|message| !message.text.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
for value in [
|
||||
parsed.prompt,
|
||||
parsed.text,
|
||||
parsed.input,
|
||||
parsed.message,
|
||||
parsed.input_transcript,
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
{
|
||||
if !value.is_empty() {
|
||||
if messages.is_empty() {
|
||||
return (
|
||||
value.clone(),
|
||||
vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: value,
|
||||
}],
|
||||
);
|
||||
}
|
||||
return (value, messages);
|
||||
}
|
||||
}
|
||||
if let Some(first_message) = messages.first() {
|
||||
return (first_message.text.clone(), messages);
|
||||
}
|
||||
(String::new(), messages)
|
||||
}
|
||||
167
codex-rs/codex-api/src/endpoint/realtime_websocket/types.rs
Normal file
167
codex-rs/codex-api/src/endpoint/realtime_websocket/types.rs
Normal file
@@ -0,0 +1,167 @@
|
||||
pub use codex_protocol::protocol::RealtimeAudioFrame;
|
||||
pub use codex_protocol::protocol::RealtimeEvent;
|
||||
pub use codex_protocol::protocol::RealtimeHandoffMessage;
|
||||
pub use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RealtimeApiMode {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RealtimeSessionConfig {
|
||||
pub instructions: String,
|
||||
pub model: Option<String>,
|
||||
pub session_id: Option<String>,
|
||||
pub mode: RealtimeApiMode,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub(super) enum RealtimeOutboundMessage {
|
||||
#[serde(rename = "input_audio_buffer.append")]
|
||||
InputAudioBufferAppend { audio: String },
|
||||
#[serde(rename = "conversation.handoff.append")]
|
||||
ConversationHandoffAppend {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
},
|
||||
#[serde(rename = "response.create")]
|
||||
ResponseCreate,
|
||||
#[serde(rename = "session.update")]
|
||||
SessionUpdate { session: SessionUpdateSession },
|
||||
#[serde(rename = "conversation.item.create")]
|
||||
ConversationItemCreate { item: ConversationItem },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(untagged)]
|
||||
pub(super) enum SessionUpdateSession {
|
||||
V1(SessionUpdateSessionV1),
|
||||
V2(SessionUpdateSessionV2),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionUpdateSessionV1 {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) instructions: String,
|
||||
pub(super) audio: SessionAudioV1,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionUpdateSessionV2 {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) instructions: String,
|
||||
pub(super) output_modalities: Vec<String>,
|
||||
pub(super) audio: SessionAudioV2,
|
||||
pub(super) tools: Vec<SessionTool>,
|
||||
pub(super) tool_choice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioV1 {
|
||||
pub(super) input: SessionAudioInputV1,
|
||||
pub(super) output: SessionAudioOutputV1,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioV2 {
|
||||
pub(super) input: SessionAudioInputV2,
|
||||
pub(super) output: SessionAudioOutputV2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInputV1 {
|
||||
pub(super) format: SessionAudioFormat,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInputV2 {
|
||||
pub(super) format: SessionAudioFormat,
|
||||
pub(super) turn_detection: SessionTurnDetection,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioFormat {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) rate: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionTurnDetection {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) interrupt_response: bool,
|
||||
pub(super) create_response: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputV1 {
|
||||
pub(super) voice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputV2 {
|
||||
pub(super) format: SessionAudioOutputFormat,
|
||||
pub(super) voice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputFormat {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) rate: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionTool {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) name: String,
|
||||
pub(super) description: String,
|
||||
pub(super) parameters: SessionToolParameters,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolParameters {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) properties: SessionToolProperties,
|
||||
pub(super) required: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolProperties {
|
||||
pub(super) prompt: SessionToolProperty,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolProperty {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) description: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub(super) enum ConversationItem {
|
||||
#[serde(rename = "message")]
|
||||
Message {
|
||||
role: String,
|
||||
content: Vec<ConversationItemContent>,
|
||||
},
|
||||
#[serde(rename = "function_call_output")]
|
||||
FunctionCallOutput { call_id: String, output: String },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct ConversationItemContent {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) text: String,
|
||||
}
|
||||
Reference in New Issue
Block a user