mirror of
https://github.com/openai/codex.git
synced 2026-05-04 21:32:21 +03:00
Add realtime silence tool (#18635)
## Summary

Adds a second realtime v2 function tool, `remain_silent`, so the realtime model has an explicit non-speaking action when the collaboration mode or latest context says it should not answer aloud.

This is stacked on #18597.

## Design

- Advertise `remain_silent` alongside `background_agent` in realtime v2 conversational sessions.
- Parse `remain_silent` function calls into a typed `RealtimeEvent::NoopRequested` event.
- Have core answer that function call with an empty `function_call_output` and deliberately avoid `response.create`, so no follow-up realtime response is requested.
- Keep the event hidden from app-server/TUI surfaces; it is operational plumbing, not user-visible conversation content.
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
use crate::endpoint::realtime_websocket::methods_common::conversation_handoff_append_message;
|
||||
use crate::endpoint::realtime_websocket::methods_common::conversation_function_call_output_message;
|
||||
use crate::endpoint::realtime_websocket::methods_common::conversation_item_create_message;
|
||||
use crate::endpoint::realtime_websocket::methods_common::normalized_session_mode;
|
||||
use crate::endpoint::realtime_websocket::methods_common::session_update_session;
|
||||
@@ -230,13 +230,13 @@ impl RealtimeWebsocketConnection {
|
||||
self.writer.send_conversation_item_create(text).await
|
||||
}
|
||||
|
||||
pub async fn send_conversation_handoff_append(
|
||||
pub async fn send_conversation_function_call_output(
|
||||
&self,
|
||||
handoff_id: String,
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> Result<(), ApiError> {
|
||||
self.writer
|
||||
.send_conversation_handoff_append(handoff_id, output_text)
|
||||
.send_conversation_function_call_output(call_id, output_text)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -290,14 +290,14 @@ impl RealtimeWebsocketWriter {
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn send_conversation_handoff_append(
|
||||
pub async fn send_conversation_function_call_output(
|
||||
&self,
|
||||
handoff_id: String,
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> Result<(), ApiError> {
|
||||
self.send_json(&conversation_handoff_append_message(
|
||||
self.send_json(&conversation_function_call_output_message(
|
||||
self.event_parser,
|
||||
handoff_id,
|
||||
call_id,
|
||||
output_text,
|
||||
))
|
||||
.await
|
||||
@@ -471,6 +471,7 @@ impl RealtimeWebsocketEvents {
|
||||
| RealtimeEvent::ResponseCancelled(_)
|
||||
| RealtimeEvent::ResponseDone(_)
|
||||
| RealtimeEvent::ConversationItemDone { .. }
|
||||
| RealtimeEvent::NoopRequested(_)
|
||||
| RealtimeEvent::ConversationItemAdded(_)
|
||||
| RealtimeEvent::Error(_) => {}
|
||||
}
|
||||
@@ -825,6 +826,7 @@ mod tests {
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry;
|
||||
use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use codex_protocol::protocol::RealtimeInputAudioSpeechStarted;
|
||||
use codex_protocol::protocol::RealtimeNoopRequested;
|
||||
use codex_protocol::protocol::RealtimeResponseCancelled;
|
||||
use codex_protocol::protocol::RealtimeResponseCreated;
|
||||
use codex_protocol::protocol::RealtimeResponseDone;
|
||||
@@ -1090,6 +1092,29 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn parse_realtime_v2_noop_tool_call_event() {
    // A finished `remain_silent` function-call item carrying both an item id and a call id.
    let payload = json!({
        "type": "conversation.item.done",
        "item": {
            "id": "item_silent",
            "type": "function_call",
            "name": "remain_silent",
            "call_id": "call_silent",
            "arguments": "{}"
        }
    })
    .to_string();

    // The v2 parser should surface the call as a typed no-op request with both ids preserved.
    let expected = RealtimeEvent::NoopRequested(RealtimeNoopRequested {
        call_id: "call_silent".to_string(),
        item_id: "item_silent".to_string(),
    });
    let parsed = parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2);

    assert_eq!(parsed, Some(expected));
}
|
||||
|
||||
#[test]
|
||||
fn parse_realtime_v2_input_audio_transcription_delta_event() {
|
||||
let payload = json!({
|
||||
@@ -1689,7 +1714,7 @@ mod tests {
|
||||
.await
|
||||
.expect("send item");
|
||||
connection
|
||||
.send_conversation_handoff_append(
|
||||
.send_conversation_function_call_output(
|
||||
"handoff_1".to_string(),
|
||||
"hello from background agent".to_string(),
|
||||
)
|
||||
@@ -1850,6 +1875,18 @@ mod tests {
|
||||
first_json["session"]["tools"][0]["parameters"]["required"],
|
||||
json!(["prompt"])
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][1]["type"],
|
||||
Value::String("function".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][1]["name"],
|
||||
Value::String("remain_silent".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][1]["parameters"]["properties"],
|
||||
json!({})
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tool_choice"],
|
||||
Value::String("auto".to_string())
|
||||
@@ -1961,7 +1998,10 @@ mod tests {
|
||||
.await
|
||||
.expect("send text item");
|
||||
connection
|
||||
.send_conversation_handoff_append("call_1".to_string(), "delegated result".to_string())
|
||||
.send_conversation_function_call_output(
|
||||
"call_1".to_string(),
|
||||
"delegated result".to_string(),
|
||||
)
|
||||
.await
|
||||
.expect("send handoff output");
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::endpoint::realtime_websocket::methods_v1::conversation_handoff_append
|
||||
use crate::endpoint::realtime_websocket::methods_v1::conversation_item_create_message as v1_conversation_item_create_message;
|
||||
use crate::endpoint::realtime_websocket::methods_v1::session_update_session as v1_session_update_session;
|
||||
use crate::endpoint::realtime_websocket::methods_v1::websocket_intent as v1_websocket_intent;
|
||||
use crate::endpoint::realtime_websocket::methods_v2::conversation_handoff_append_message as v2_conversation_handoff_append_message;
|
||||
use crate::endpoint::realtime_websocket::methods_v2::conversation_function_call_output_message as v2_conversation_function_call_output_message;
|
||||
use crate::endpoint::realtime_websocket::methods_v2::conversation_item_create_message as v2_conversation_item_create_message;
|
||||
use crate::endpoint::realtime_websocket::methods_v2::session_update_session as v2_session_update_session;
|
||||
use crate::endpoint::realtime_websocket::methods_v2::websocket_intent as v2_websocket_intent;
|
||||
@@ -40,18 +40,18 @@ pub(super) fn conversation_item_create_message(
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn conversation_handoff_append_message(
|
||||
pub(super) fn conversation_function_call_output_message(
|
||||
event_parser: RealtimeEventParser,
|
||||
handoff_id: String,
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> RealtimeOutboundMessage {
|
||||
match event_parser {
|
||||
RealtimeEventParser::V1 => v1_conversation_handoff_append_message(
|
||||
handoff_id,
|
||||
call_id,
|
||||
format!("{AGENT_FINAL_MESSAGE_PREFIX}{output_text}"),
|
||||
),
|
||||
RealtimeEventParser::RealtimeV2 => {
|
||||
v2_conversation_handoff_append_message(handoff_id, output_text)
|
||||
v2_conversation_function_call_output_message(call_id, output_text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,6 +32,8 @@ const REALTIME_V2_OUTPUT_MODALITY_TEXT: &str = "text";
|
||||
const REALTIME_V2_TOOL_CHOICE: &str = "auto";
|
||||
const REALTIME_V2_BACKGROUND_AGENT_TOOL_NAME: &str = "background_agent";
|
||||
const REALTIME_V2_BACKGROUND_AGENT_TOOL_DESCRIPTION: &str = "Send a user request to the background agent. Use this as the default action. Do not rephrase the user's ask or rewrite it in your own words; pass along the user's own words. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later.";
|
||||
const REALTIME_V2_SILENCE_TOOL_NAME: &str = "remain_silent";
|
||||
const REALTIME_V2_SILENCE_TOOL_DESCRIPTION: &str = "Call this when the best response is to say nothing. Use it instead of speaking after hidden system/control messages, after background agent updates in silent modes, or whenever acknowledging aloud would be distracting. This tool has no user-visible effect.";
|
||||
const REALTIME_V2_INPUT_TRANSCRIPTION_MODEL: &str = "gpt-4o-mini-transcribe";
|
||||
|
||||
pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutboundMessage {
|
||||
@@ -47,14 +49,14 @@ pub(super) fn conversation_item_create_message(text: String) -> RealtimeOutbound
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn conversation_handoff_append_message(
|
||||
handoff_id: String,
|
||||
pub(super) fn conversation_function_call_output_message(
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> RealtimeOutboundMessage {
|
||||
RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItemPayload::FunctionCallOutput(ConversationFunctionCallOutputItem {
|
||||
r#type: ConversationItemType::FunctionCallOutput,
|
||||
call_id: handoff_id,
|
||||
call_id,
|
||||
output: output_text,
|
||||
}),
|
||||
}
|
||||
@@ -100,22 +102,34 @@ pub(super) fn session_update_session(
|
||||
voice,
|
||||
}),
|
||||
},
|
||||
tools: Some(vec![SessionFunctionTool {
|
||||
r#type: SessionToolType::Function,
|
||||
name: REALTIME_V2_BACKGROUND_AGENT_TOOL_NAME.to_string(),
|
||||
description: REALTIME_V2_BACKGROUND_AGENT_TOOL_DESCRIPTION.to_string(),
|
||||
parameters: json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": "The user request to delegate to the background agent."
|
||||
}
|
||||
},
|
||||
"required": ["prompt"],
|
||||
"additionalProperties": false
|
||||
}),
|
||||
}]),
|
||||
tools: Some(vec![
|
||||
SessionFunctionTool {
|
||||
r#type: SessionToolType::Function,
|
||||
name: REALTIME_V2_BACKGROUND_AGENT_TOOL_NAME.to_string(),
|
||||
description: REALTIME_V2_BACKGROUND_AGENT_TOOL_DESCRIPTION.to_string(),
|
||||
parameters: json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": "The user request to delegate to the background agent."
|
||||
}
|
||||
},
|
||||
"required": ["prompt"],
|
||||
"additionalProperties": false
|
||||
}),
|
||||
},
|
||||
SessionFunctionTool {
|
||||
r#type: SessionToolType::Function,
|
||||
name: REALTIME_V2_SILENCE_TOOL_NAME.to_string(),
|
||||
description: REALTIME_V2_SILENCE_TOOL_DESCRIPTION.to_string(),
|
||||
parameters: json!({
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"additionalProperties": false
|
||||
}),
|
||||
},
|
||||
]),
|
||||
tool_choice: Some(REALTIME_V2_TOOL_CHOICE.to_string()),
|
||||
},
|
||||
RealtimeSessionMode::Transcription => SessionUpdateSession {
|
||||
|
||||
@@ -7,6 +7,7 @@ use codex_protocol::protocol::RealtimeAudioFrame;
|
||||
use codex_protocol::protocol::RealtimeEvent;
|
||||
use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use codex_protocol::protocol::RealtimeInputAudioSpeechStarted;
|
||||
use codex_protocol::protocol::RealtimeNoopRequested;
|
||||
use codex_protocol::protocol::RealtimeResponseCancelled;
|
||||
use codex_protocol::protocol::RealtimeResponseCreated;
|
||||
use codex_protocol::protocol::RealtimeResponseDone;
|
||||
@@ -15,6 +16,7 @@ use serde_json::Value;
|
||||
use tracing::debug;
|
||||
|
||||
const BACKGROUND_AGENT_TOOL_NAME: &str = "background_agent";
|
||||
const SILENCE_TOOL_NAME: &str = "remain_silent";
|
||||
const DEFAULT_AUDIO_SAMPLE_RATE: u32 = 24_000;
|
||||
const DEFAULT_AUDIO_CHANNELS: u16 = 1;
|
||||
const TOOL_ARGUMENT_KEYS: [&str; 5] = ["input_transcript", "input", "text", "prompt", "query"];
|
||||
@@ -127,6 +129,9 @@ fn parse_conversation_item_done_event(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
if let Some(handoff) = parse_handoff_requested_event(item) {
|
||||
return Some(handoff);
|
||||
}
|
||||
if let Some(noop) = parse_noop_requested_event(item) {
|
||||
return Some(noop);
|
||||
}
|
||||
|
||||
item.get("id")
|
||||
.and_then(Value::as_str)
|
||||
@@ -160,6 +165,29 @@ fn parse_handoff_requested_event(item: &JsonMap<String, Value>) -> Option<Realti
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_noop_requested_event(item: &JsonMap<String, Value>) -> Option<RealtimeEvent> {
|
||||
let item_type = item.get("type").and_then(Value::as_str);
|
||||
let item_name = item.get("name").and_then(Value::as_str);
|
||||
if item_type != Some("function_call") || item_name != Some(SILENCE_TOOL_NAME) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let call_id = item
|
||||
.get("call_id")
|
||||
.and_then(Value::as_str)
|
||||
.or_else(|| item.get("id").and_then(Value::as_str))?;
|
||||
let item_id = item
|
||||
.get("id")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or(call_id)
|
||||
.to_string();
|
||||
|
||||
Some(RealtimeEvent::NoopRequested(RealtimeNoopRequested {
|
||||
call_id: call_id.to_string(),
|
||||
item_id,
|
||||
}))
|
||||
}
|
||||
|
||||
fn extract_input_transcript(arguments: &str) -> String {
|
||||
if arguments.is_empty() {
|
||||
return String::new();
|
||||
|
||||
Reference in New Issue
Block a user