mirror of
https://github.com/openai/codex.git
synced 2026-05-02 12:21:26 +03:00
feat(app-server): propagate traces across tasks and core ops (#14387)
## Summary This PR keeps app-server RPC request trace context alive for the full lifetime of the work that request kicks off (e.g. for `thread/start`, this is `app-server rpc handler -> tokio background task -> core op submissions`). Previously we lost trace lineage once the request handler returned or handed work off to background tasks. This approach is especially relevant for `thread/start` and other RPC handlers that run in a non-blocking way. In the near future we'll most likely want to make all app-server handlers run in a non-blocking way by default, and only queue operations that must operate in order (e.g. thread RPCs per thread?), so we want to make sure tracing in app-server just generally works. Depends on https://github.com/openai/codex/pull/14300 **Before** <img width="155" height="207" alt="image" src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af" /> **After** <img width="299" height="337" alt="image" src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea" /> ## What changed - Keep request-scoped trace context around until we send the final response or error, or the connection closes. - Thread that trace context through detached `thread/start` work so background startup stays attached to the originating request. - Pass request trace context through to downstream core operations, including: - thread creation - resume/fork flows - turn submission - review - interrupt - realtime conversation operations - Add tracing tests that verify: - remote W3C trace context is preserved for `thread/start` - remote W3C trace context is preserved for `turn/start` - downstream core spans stay under the originating request span - request-scoped tracing state is cleaned up correctly - Clean up shutdown behavior so detached background tasks and spawned threads are drained before process exit.
This commit is contained in:
@@ -9,11 +9,15 @@ use codex_app_server_protocol::Result;
|
||||
use codex_app_server_protocol::ServerNotification;
|
||||
use codex_app_server_protocol::ServerRequest;
|
||||
use codex_app_server_protocol::ServerRequestPayload;
|
||||
use codex_otel::span_w3c_trace_context;
|
||||
use codex_protocol::ThreadId;
|
||||
use codex_protocol::protocol::W3cTraceContext;
|
||||
use serde::Serialize;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::sync::oneshot;
|
||||
use tracing::Instrument;
|
||||
use tracing::Span;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::error_code::INTERNAL_ERROR_CODE;
|
||||
@@ -35,6 +39,37 @@ pub(crate) struct ConnectionRequestId {
|
||||
pub(crate) request_id: RequestId,
|
||||
}
|
||||
|
||||
/// Trace data we keep for an incoming request until we send its final
|
||||
/// response or error.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct RequestContext {
|
||||
request_id: ConnectionRequestId,
|
||||
span: Span,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
}
|
||||
|
||||
impl RequestContext {
|
||||
pub(crate) fn new(
|
||||
request_id: ConnectionRequestId,
|
||||
span: Span,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> Self {
|
||||
Self {
|
||||
request_id,
|
||||
span,
|
||||
parent_trace,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn request_trace(&self) -> Option<W3cTraceContext> {
|
||||
span_w3c_trace_context(&self.span).or_else(|| self.parent_trace.clone())
|
||||
}
|
||||
|
||||
pub(crate) fn span(&self) -> Span {
|
||||
self.span.clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum OutgoingEnvelope {
|
||||
ToConnection {
|
||||
@@ -51,6 +86,10 @@ pub(crate) struct OutgoingMessageSender {
|
||||
next_server_request_id: AtomicI64,
|
||||
sender: mpsc::Sender<OutgoingEnvelope>,
|
||||
request_id_to_callback: Mutex<HashMap<RequestId, PendingCallbackEntry>>,
|
||||
/// Incoming requests that are still waiting on a final response or error.
|
||||
/// We keep them here because this is where responses, errors, and
|
||||
/// disconnect cleanup all get handled.
|
||||
request_contexts: Mutex<HashMap<ConnectionRequestId, RequestContext>>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -142,9 +181,48 @@ impl OutgoingMessageSender {
|
||||
next_server_request_id: AtomicI64::new(0),
|
||||
sender,
|
||||
request_id_to_callback: Mutex::new(HashMap::new()),
|
||||
request_contexts: Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn register_request_context(&self, request_context: RequestContext) {
|
||||
let mut request_contexts = self.request_contexts.lock().await;
|
||||
if request_contexts
|
||||
.insert(request_context.request_id.clone(), request_context)
|
||||
.is_some()
|
||||
{
|
||||
warn!("replaced unresolved request context");
|
||||
}
|
||||
}
|
||||
|
||||
/// Drops every pending request context belonging to a closed connection so
/// request-scoped tracing state cannot outlive the connection itself.
pub(crate) async fn connection_closed(&self, connection_id: ConnectionId) {
    self.request_contexts
        .lock()
        .await
        .retain(|key, _| key.connection_id != connection_id);
}
|
||||
|
||||
pub(crate) async fn request_trace_context(
|
||||
&self,
|
||||
request_id: &ConnectionRequestId,
|
||||
) -> Option<W3cTraceContext> {
|
||||
let request_contexts = self.request_contexts.lock().await;
|
||||
request_contexts
|
||||
.get(request_id)
|
||||
.and_then(RequestContext::request_trace)
|
||||
}
|
||||
|
||||
async fn take_request_context(
|
||||
&self,
|
||||
request_id: &ConnectionRequestId,
|
||||
) -> Option<RequestContext> {
|
||||
let mut request_contexts = self.request_contexts.lock().await;
|
||||
request_contexts.remove(request_id)
|
||||
}
|
||||
|
||||
/// Test-only helper: number of requests still awaiting resolution.
#[cfg(test)]
async fn request_context_count(&self) -> usize {
    let contexts = self.request_contexts.lock().await;
    contexts.len()
}
|
||||
|
||||
pub(crate) async fn send_request(
|
||||
&self,
|
||||
request: ServerRequestPayload,
|
||||
@@ -353,25 +431,24 @@ impl OutgoingMessageSender {
|
||||
request_id: ConnectionRequestId,
|
||||
response: T,
|
||||
) {
|
||||
let request_context = self.take_request_context(&request_id).await;
|
||||
match serde_json::to_value(response) {
|
||||
Ok(result) => {
|
||||
let outgoing_message = OutgoingMessage::Response(OutgoingResponse {
|
||||
id: request_id.request_id,
|
||||
id: request_id.request_id.clone(),
|
||||
result,
|
||||
});
|
||||
if let Err(err) = self
|
||||
.sender
|
||||
.send(OutgoingEnvelope::ToConnection {
|
||||
connection_id: request_id.connection_id,
|
||||
message: outgoing_message,
|
||||
})
|
||||
.await
|
||||
{
|
||||
warn!("failed to send response to client: {err:?}");
|
||||
}
|
||||
self.send_outgoing_message_to_connection(
|
||||
request_context,
|
||||
request_id.connection_id,
|
||||
outgoing_message,
|
||||
"response",
|
||||
)
|
||||
.await;
|
||||
}
|
||||
Err(err) => {
|
||||
self.send_error(
|
||||
self.send_error_inner(
|
||||
request_context,
|
||||
request_id,
|
||||
JSONRPCErrorError {
|
||||
code: INTERNAL_ERROR_CODE,
|
||||
@@ -461,20 +538,50 @@ impl OutgoingMessageSender {
|
||||
&self,
|
||||
request_id: ConnectionRequestId,
|
||||
error: JSONRPCErrorError,
|
||||
) {
|
||||
let request_context = self.take_request_context(&request_id).await;
|
||||
self.send_error_inner(request_context, request_id, error)
|
||||
.await;
|
||||
}
|
||||
|
||||
async fn send_error_inner(
|
||||
&self,
|
||||
request_context: Option<RequestContext>,
|
||||
request_id: ConnectionRequestId,
|
||||
error: JSONRPCErrorError,
|
||||
) {
|
||||
let outgoing_message = OutgoingMessage::Error(OutgoingError {
|
||||
id: request_id.request_id,
|
||||
error,
|
||||
});
|
||||
if let Err(err) = self
|
||||
.sender
|
||||
.send(OutgoingEnvelope::ToConnection {
|
||||
connection_id: request_id.connection_id,
|
||||
message: outgoing_message,
|
||||
})
|
||||
.await
|
||||
{
|
||||
warn!("failed to send error to client: {err:?}");
|
||||
self.send_outgoing_message_to_connection(
|
||||
request_context,
|
||||
request_id.connection_id,
|
||||
outgoing_message,
|
||||
"error",
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
async fn send_outgoing_message_to_connection(
|
||||
&self,
|
||||
request_context: Option<RequestContext>,
|
||||
connection_id: ConnectionId,
|
||||
message: OutgoingMessage,
|
||||
message_kind: &'static str,
|
||||
) {
|
||||
let send_fut = self.sender.send(OutgoingEnvelope::ToConnection {
|
||||
connection_id,
|
||||
message,
|
||||
});
|
||||
let send_result = if let Some(request_context) = request_context {
|
||||
send_fut.instrument(request_context.span()).await
|
||||
} else {
|
||||
send_fut.await
|
||||
};
|
||||
|
||||
if let Err(err) = send_result {
|
||||
warn!("failed to send {message_kind} to client: {err:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -738,6 +845,31 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn send_response_clears_registered_request_context() {
|
||||
let (tx, _rx) = mpsc::channel::<OutgoingEnvelope>(4);
|
||||
let outgoing = OutgoingMessageSender::new(tx);
|
||||
let request_id = ConnectionRequestId {
|
||||
connection_id: ConnectionId(42),
|
||||
request_id: RequestId::Integer(7),
|
||||
};
|
||||
|
||||
outgoing
|
||||
.register_request_context(RequestContext::new(
|
||||
request_id.clone(),
|
||||
tracing::info_span!("app_server.request", rpc.method = "thread/start"),
|
||||
None,
|
||||
))
|
||||
.await;
|
||||
assert_eq!(outgoing.request_context_count().await, 1);
|
||||
|
||||
outgoing
|
||||
.send_response(request_id, json!({ "ok": true }))
|
||||
.await;
|
||||
|
||||
assert_eq!(outgoing.request_context_count().await, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn send_error_routes_to_target_connection() {
|
||||
let (tx, mut rx) = mpsc::channel::<OutgoingEnvelope>(4);
|
||||
@@ -775,6 +907,40 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connection_closed_clears_registered_request_contexts() {
|
||||
let (tx, _rx) = mpsc::channel::<OutgoingEnvelope>(4);
|
||||
let outgoing = OutgoingMessageSender::new(tx);
|
||||
let closed_connection_request = ConnectionRequestId {
|
||||
connection_id: ConnectionId(9),
|
||||
request_id: RequestId::Integer(3),
|
||||
};
|
||||
let open_connection_request = ConnectionRequestId {
|
||||
connection_id: ConnectionId(10),
|
||||
request_id: RequestId::Integer(4),
|
||||
};
|
||||
|
||||
outgoing
|
||||
.register_request_context(RequestContext::new(
|
||||
closed_connection_request,
|
||||
tracing::info_span!("app_server.request", rpc.method = "turn/interrupt"),
|
||||
None,
|
||||
))
|
||||
.await;
|
||||
outgoing
|
||||
.register_request_context(RequestContext::new(
|
||||
open_connection_request,
|
||||
tracing::info_span!("app_server.request", rpc.method = "turn/start"),
|
||||
None,
|
||||
))
|
||||
.await;
|
||||
assert_eq!(outgoing.request_context_count().await, 2);
|
||||
|
||||
outgoing.connection_closed(ConnectionId(9)).await;
|
||||
|
||||
assert_eq!(outgoing.request_context_count().await, 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn notify_client_error_forwards_error_to_waiter() {
|
||||
let (tx, _rx) = mpsc::channel::<OutgoingEnvelope>(4);
|
||||
|
||||
Reference in New Issue
Block a user