fix(core): prevent hanging turn/start due to websocket warming issues (#14838)

## Description

This PR fixes a bad first-turn failure mode in app-server when the
startup websocket prewarm hangs. Before this change, `initialize ->
thread/start -> turn/start` could sit behind the prewarm for up to five
minutes, so the client would not see `turn/started`, and even
`turn/interrupt` would block because the turn had not actually started
yet.

Now, we:
- set a (configurable) timeout of 15s for websocket startup time,
exposed as `websocket_connect_timeout_ms` in config.toml
- send `turn/started` immediately on `turn/start`, even if the
websocket is still connecting
- allow `turn/interrupt` to cancel a turn that is still waiting on
the websocket warmup
- have the turn task wait for the full websocket warming timeout
(15s by default) before falling back

## Why

The old behavior made app-server feel stuck at exactly the moment the
client expects turn lifecycle events to start flowing. That was
especially painful for external clients, because from their point of
view the server had accepted the request but then went silent for
minutes.

## Configuring the websocket startup timeout
You can set it in `config.toml` like this:
```toml
[model_providers.openai]
supports_websockets = true
websocket_connect_timeout_ms = 15000
```
This commit is contained in:
Owen Lin
2026-03-17 10:07:46 -07:00
committed by GitHub
parent e8add54e5d
commit 6ea041032b
20 changed files with 548 additions and 176 deletions

View File

@@ -384,7 +384,6 @@ impl Session {
turn.add_task(task);
*active = Some(turn);
}
async fn take_active_turn(&self) -> Option<ActiveTurn> {
let mut active = self.active_turn.lock().await;
active.take()

View File

@@ -1,64 +1,27 @@
use std::sync::Arc;
use std::sync::Mutex;
use crate::client::ModelClient;
use crate::client::ModelClientSession;
use crate::client_common::Prompt;
use async_trait::async_trait;
use tokio_util::sync::CancellationToken;
use crate::codex::TurnContext;
use crate::codex::run_turn;
use crate::error::Result as CodexResult;
use crate::protocol::EventMsg;
use crate::protocol::TurnStartedEvent;
use crate::session_startup_prewarm::SessionStartupPrewarmResolution;
use crate::state::TaskKind;
use async_trait::async_trait;
use codex_protocol::user_input::UserInput;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::trace_span;
use super::SessionTask;
use super::SessionTaskContext;
pub(crate) struct RegularTask {
prewarmed_session: Mutex<Option<ModelClientSession>>,
}
impl Default for RegularTask {
fn default() -> Self {
Self {
prewarmed_session: Mutex::new(None),
}
}
}
#[derive(Default)]
pub(crate) struct RegularTask;
impl RegularTask {
pub(crate) async fn with_startup_prewarm(
model_client: ModelClient,
prompt: Prompt,
turn_context: Arc<TurnContext>,
turn_metadata_header: Option<String>,
) -> CodexResult<Self> {
let mut client_session = model_client.new_session();
client_session
.prewarm_websocket(
&prompt,
&turn_context.model_info,
&turn_context.session_telemetry,
turn_context.reasoning_effort,
turn_context.reasoning_summary,
turn_context.config.service_tier,
turn_metadata_header.as_deref(),
)
.await?;
Ok(Self {
prewarmed_session: Mutex::new(Some(client_session)),
})
}
async fn take_prewarmed_session(&self) -> Option<ModelClientSession> {
self.prewarmed_session
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner)
.take()
pub(crate) fn new() -> Self {
Self
}
}
@@ -81,8 +44,25 @@ impl SessionTask for RegularTask {
) -> Option<String> {
let sess = session.clone_session();
let run_turn_span = trace_span!("run_turn");
// Regular turns emit `TurnStarted` inline so first-turn lifecycle does
// not wait on startup prewarm resolution.
let event = EventMsg::TurnStarted(TurnStartedEvent {
turn_id: ctx.sub_id.clone(),
model_context_window: ctx.model_context_window(),
collaboration_mode_kind: ctx.collaboration_mode.mode,
});
sess.send_event(ctx.as_ref(), event).await;
sess.set_server_reasoning_included(/*included*/ false).await;
let prewarmed_client_session = self.take_prewarmed_session().await;
let prewarmed_client_session = match sess
.consume_startup_prewarm_for_regular_turn(&cancellation_token)
.await
{
SessionStartupPrewarmResolution::Cancelled => return None,
SessionStartupPrewarmResolution::Unavailable { .. } => None,
SessionStartupPrewarmResolution::Ready(prewarmed_client_session) => {
Some(*prewarmed_client_session)
}
};
run_turn(
sess,
ctx,