feat(app-server): propagate traces across tasks and core ops (#14387)

## Summary

This PR keeps app-server RPC request trace context alive for the full
lifetime of the work that request kicks off (e.g. for `thread/start`,
this is `app-server rpc handler -> tokio background task -> core op
submissions`). Previously we lost trace lineage once the request handler
returns or hands work off to background tasks.

This approach is especially relevant for `thread/start` and other RPC
handlers that run in a non-blocking way. In the near future we'll most
likely want to make all app-server handlers run in a non-blocking way by
default, and only queue operations that must operate in order (e.g.
thread RPCs per thread?), so we want to make sure tracing in app-server
just generally works.

Depends on https://github.com/openai/codex/pull/14300

**Before**
<img width="155" height="207" alt="image"
src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af"
/>


**After**
<img width="299" height="337" alt="image"
src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea"
/>

## What changed

- Keep request-scoped trace context around until we send the final
response or error, or the connection closes.
- Thread that trace context through detached `thread/start` work so
background startup stays attached to the originating request.
- Pass request trace context through to downstream core operations,
including:
  - thread creation
  - resume/fork flows
  - turn submission
  - review
  - interrupt
  - realtime conversation operations
- Add tracing tests that verify:
  - remote W3C trace context is preserved for `thread/start`
  - remote W3C trace context is preserved for `turn/start`
  - downstream core spans stay under the originating request span
  - request-scoped tracing state is cleaned up correctly
- Clean up shutdown behavior so detached background tasks and spawned
threads are drained before process exit.
This commit is contained in:
Owen Lin
2026-03-11 20:18:31 -07:00
committed by GitHub
parent bf5e997b31
commit 5bc82c5b93
24 changed files with 1524 additions and 308 deletions

View File

@@ -9,11 +9,15 @@ use codex_app_server_protocol::Result;
use codex_app_server_protocol::ServerNotification;
use codex_app_server_protocol::ServerRequest;
use codex_app_server_protocol::ServerRequestPayload;
use codex_otel::span_w3c_trace_context;
use codex_protocol::ThreadId;
use codex_protocol::protocol::W3cTraceContext;
use serde::Serialize;
use tokio::sync::Mutex;
use tokio::sync::mpsc;
use tokio::sync::oneshot;
use tracing::Instrument;
use tracing::Span;
use tracing::warn;
use crate::error_code::INTERNAL_ERROR_CODE;
@@ -35,6 +39,37 @@ pub(crate) struct ConnectionRequestId {
pub(crate) request_id: RequestId,
}
/// Trace data we keep for an incoming request until its final response or
/// error is sent (or the connection goes away).
#[derive(Clone)]
pub(crate) struct RequestContext {
    request_id: ConnectionRequestId,
    span: Span,
    parent_trace: Option<W3cTraceContext>,
}

impl RequestContext {
    /// Bundles the request id with its local tracing span and the remote
    /// W3C parent context, if the client supplied one.
    pub(crate) fn new(
        request_id: ConnectionRequestId,
        span: Span,
        parent_trace: Option<W3cTraceContext>,
    ) -> Self {
        Self {
            request_id,
            span,
            parent_trace,
        }
    }

    /// Trace context to propagate downstream: the live span's W3C context
    /// when available, otherwise the remote parent supplied by the client.
    pub(crate) fn request_trace(&self) -> Option<W3cTraceContext> {
        match span_w3c_trace_context(&self.span) {
            Some(trace) => Some(trace),
            None => self.parent_trace.clone(),
        }
    }

    /// A clone of the request-scoped span, for instrumenting detached work.
    pub(crate) fn span(&self) -> Span {
        self.span.clone()
    }
}
#[derive(Debug, Clone)]
pub(crate) enum OutgoingEnvelope {
ToConnection {
@@ -51,6 +86,10 @@ pub(crate) struct OutgoingMessageSender {
next_server_request_id: AtomicI64,
sender: mpsc::Sender<OutgoingEnvelope>,
request_id_to_callback: Mutex<HashMap<RequestId, PendingCallbackEntry>>,
/// Incoming requests that are still waiting on a final response or error.
/// We keep them here because this is where responses, errors, and
/// disconnect cleanup all get handled.
request_contexts: Mutex<HashMap<ConnectionRequestId, RequestContext>>,
}
#[derive(Clone)]
@@ -142,9 +181,48 @@ impl OutgoingMessageSender {
next_server_request_id: AtomicI64::new(0),
sender,
request_id_to_callback: Mutex::new(HashMap::new()),
request_contexts: Mutex::new(HashMap::new()),
}
}
/// Tracks `request_context` until a final response/error (or a disconnect)
/// removes it. Seeing the same id twice means the earlier request was never
/// resolved, which we surface as a warning.
pub(crate) async fn register_request_context(&self, request_context: RequestContext) {
    let key = request_context.request_id.clone();
    let previous = self
        .request_contexts
        .lock()
        .await
        .insert(key, request_context);
    if previous.is_some() {
        warn!("replaced unresolved request context");
    }
}
/// Drops every pending request context that belongs to the closed
/// connection so trace state does not leak past the connection's lifetime.
pub(crate) async fn connection_closed(&self, connection_id: ConnectionId) {
    self.request_contexts
        .lock()
        .await
        .retain(|pending_id, _| pending_id.connection_id != connection_id);
}
pub(crate) async fn request_trace_context(
&self,
request_id: &ConnectionRequestId,
) -> Option<W3cTraceContext> {
let request_contexts = self.request_contexts.lock().await;
request_contexts
.get(request_id)
.and_then(RequestContext::request_trace)
}
/// Removes and returns the pending context for `request_id`, marking the
/// request as resolved.
async fn take_request_context(
    &self,
    request_id: &ConnectionRequestId,
) -> Option<RequestContext> {
    self.request_contexts.lock().await.remove(request_id)
}
/// Test-only view of how many requests are still awaiting resolution.
#[cfg(test)]
async fn request_context_count(&self) -> usize {
    let request_contexts = self.request_contexts.lock().await;
    request_contexts.len()
}
pub(crate) async fn send_request(
&self,
request: ServerRequestPayload,
@@ -353,25 +431,24 @@ impl OutgoingMessageSender {
request_id: ConnectionRequestId,
response: T,
) {
let request_context = self.take_request_context(&request_id).await;
match serde_json::to_value(response) {
Ok(result) => {
let outgoing_message = OutgoingMessage::Response(OutgoingResponse {
id: request_id.request_id,
id: request_id.request_id.clone(),
result,
});
if let Err(err) = self
.sender
.send(OutgoingEnvelope::ToConnection {
connection_id: request_id.connection_id,
message: outgoing_message,
})
.await
{
warn!("failed to send response to client: {err:?}");
}
self.send_outgoing_message_to_connection(
request_context,
request_id.connection_id,
outgoing_message,
"response",
)
.await;
}
Err(err) => {
self.send_error(
self.send_error_inner(
request_context,
request_id,
JSONRPCErrorError {
code: INTERNAL_ERROR_CODE,
@@ -461,20 +538,50 @@ impl OutgoingMessageSender {
&self,
request_id: ConnectionRequestId,
error: JSONRPCErrorError,
) {
let request_context = self.take_request_context(&request_id).await;
self.send_error_inner(request_context, request_id, error)
.await;
}
/// Builds the JSON-RPC error message for `request_id` and forwards it to the
/// owning connection, threading `request_context` through so the send stays
/// attached to the originating request span.
async fn send_error_inner(
&self,
request_context: Option<RequestContext>,
request_id: ConnectionRequestId,
error: JSONRPCErrorError,
) {
let outgoing_message = OutgoingMessage::Error(OutgoingError {
id: request_id.request_id,
error,
});
// NOTE(review): the lines below look like a diff-extraction artifact —
// the pre-change direct `sender.send` path appears interleaved with the
// new `send_outgoing_message_to_connection` call; confirm against the
// applied file, which presumably contains only the latter.
if let Err(err) = self
.sender
.send(OutgoingEnvelope::ToConnection {
connection_id: request_id.connection_id,
message: outgoing_message,
})
.await
{
warn!("failed to send error to client: {err:?}");
// New path: delegate the send (and span instrumentation) to the shared
// helper, tagging the message kind as "error" for logging.
self.send_outgoing_message_to_connection(
request_context,
request_id.connection_id,
outgoing_message,
"error",
)
.await;
}
/// Hands `message` to the outgoing channel for `connection_id`.
///
/// When a `request_context` is present, the send future is instrumented with
/// the request's span so it shows up under the originating request; send
/// failures are logged using `message_kind` ("response" / "error").
async fn send_outgoing_message_to_connection(
    &self,
    request_context: Option<RequestContext>,
    connection_id: ConnectionId,
    message: OutgoingMessage,
    message_kind: &'static str,
) {
    let envelope = OutgoingEnvelope::ToConnection {
        connection_id,
        message,
    };
    let send_result = match request_context {
        Some(context) => self.sender.send(envelope).instrument(context.span()).await,
        None => self.sender.send(envelope).await,
    };
    if let Err(err) = send_result {
        warn!("failed to send {message_kind} to client: {err:?}");
    }
}
}
@@ -738,6 +845,31 @@ mod tests {
}
}
#[tokio::test]
async fn send_response_clears_registered_request_context() {
    let (sender, _receiver) = mpsc::channel::<OutgoingEnvelope>(4);
    let outgoing = OutgoingMessageSender::new(sender);
    let request_id = ConnectionRequestId {
        connection_id: ConnectionId(42),
        request_id: RequestId::Integer(7),
    };

    // Register a context for the request; it should be held until resolution.
    let span = tracing::info_span!("app_server.request", rpc.method = "thread/start");
    outgoing
        .register_request_context(RequestContext::new(request_id.clone(), span, None))
        .await;
    assert_eq!(outgoing.request_context_count().await, 1);

    // Sending the final response must release the stored context.
    outgoing
        .send_response(request_id, json!({ "ok": true }))
        .await;
    assert_eq!(outgoing.request_context_count().await, 0);
}
#[tokio::test]
async fn send_error_routes_to_target_connection() {
let (tx, mut rx) = mpsc::channel::<OutgoingEnvelope>(4);
@@ -775,6 +907,40 @@ mod tests {
}
}
#[tokio::test]
async fn connection_closed_clears_registered_request_contexts() {
    let (sender, _receiver) = mpsc::channel::<OutgoingEnvelope>(4);
    let outgoing = OutgoingMessageSender::new(sender);

    let request_on_closed_connection = ConnectionRequestId {
        connection_id: ConnectionId(9),
        request_id: RequestId::Integer(3),
    };
    let request_on_open_connection = ConnectionRequestId {
        connection_id: ConnectionId(10),
        request_id: RequestId::Integer(4),
    };

    let interrupt_span = tracing::info_span!("app_server.request", rpc.method = "turn/interrupt");
    outgoing
        .register_request_context(RequestContext::new(
            request_on_closed_connection,
            interrupt_span,
            None,
        ))
        .await;
    let start_span = tracing::info_span!("app_server.request", rpc.method = "turn/start");
    outgoing
        .register_request_context(RequestContext::new(
            request_on_open_connection,
            start_span,
            None,
        ))
        .await;
    assert_eq!(outgoing.request_context_count().await, 2);

    // Closing connection 9 must drop only that connection's pending context.
    outgoing.connection_closed(ConnectionId(9)).await;
    assert_eq!(outgoing.request_context_count().await, 1);
}
#[tokio::test]
async fn notify_client_error_forwards_error_to_waiter() {
let (tx, _rx) = mpsc::channel::<OutgoingEnvelope>(4);