mirror of
https://github.com/openai/codex.git
synced 2026-05-01 03:42:05 +03:00
feat(app-server): propagate traces across tasks and core ops (#14387)
## Summary This PR keeps app-server RPC request trace context alive for the full lifetime of the work that request kicks off (e.g. for `thread/start`, this is `app-server rpc handler -> tokio background task -> core op submissions`). Previously, trace lineage was lost once the request handler returned or handed work off to background tasks. This approach is especially relevant for `thread/start` and other RPC handlers that run in a non-blocking way. In the near future we'll most likely want to make all app-server handlers run in a non-blocking way by default, and only queue operations that must operate in order (e.g. thread RPCs per thread?), so we want to make sure tracing in app-server just generally works. Depends on https://github.com/openai/codex/pull/14300 **Before** <img width="155" height="207" alt="image" src="https://github.com/user-attachments/assets/c9487459-36f1-436c-beb7-fafeb40737af" /> **After** <img width="299" height="337" alt="image" src="https://github.com/user-attachments/assets/727392b2-d072-4427-9dc4-0502d8652dea" /> ## What changed - Keep request-scoped trace context around until we send the final response or error, or the connection closes. - Thread that trace context through detached `thread/start` work so background startup stays attached to the originating request. - Pass request trace context through to downstream core operations, including: - thread creation - resume/fork flows - turn submission - review - interrupt - realtime conversation operations - Add tracing tests that verify: - remote W3C trace context is preserved for `thread/start` - remote W3C trace context is preserved for `turn/start` - downstream core spans stay under the originating request span - request-scoped tracing state is cleaned up correctly - Clean up shutdown behavior so detached background tasks and spawned threads are drained before process exit.
This commit is contained in:
@@ -3,6 +3,7 @@ use crate::CodexAuth;
|
||||
use crate::ModelProviderInfo;
|
||||
use crate::agent::AgentControl;
|
||||
use crate::codex::Codex;
|
||||
use crate::codex::CodexSpawnArgs;
|
||||
use crate::codex::CodexSpawnOk;
|
||||
use crate::codex::INITIAL_SUBMIT_ID;
|
||||
use crate::codex_thread::CodexThread;
|
||||
@@ -30,11 +31,15 @@ use codex_protocol::protocol::McpServerRefreshConfig;
|
||||
use codex_protocol::protocol::Op;
|
||||
use codex_protocol::protocol::RolloutItem;
|
||||
use codex_protocol::protocol::SessionSource;
|
||||
use codex_protocol::protocol::W3cTraceContext;
|
||||
use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::Duration;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::runtime::RuntimeFlavor;
|
||||
use tokio::sync::RwLock;
|
||||
@@ -118,6 +123,19 @@ pub struct NewThread {
|
||||
pub session_configured: SessionConfiguredEvent,
|
||||
}
|
||||
|
||||
/// Per-thread result summary returned by `ThreadManager::shutdown_all_threads_bounded`.
///
/// Each thread id that was tracked when the shutdown started ends up in exactly
/// one of the three lists. Each list is sorted by the id's string form before
/// the report is returned.
#[derive(Debug, Default, PartialEq, Eq)]
|
||||
pub struct ThreadShutdownReport {
|
||||
/// Threads whose `shutdown_and_wait()` returned `Ok(())` before the timeout
/// elapsed; these ids are removed from the manager's tracked threads.
pub completed: Vec<ThreadId>,
|
||||
/// Threads whose `shutdown_and_wait()` returned an error before the timeout
/// elapsed; they remain tracked by the manager.
pub submit_failed: Vec<ThreadId>,
|
||||
/// Threads whose `shutdown_and_wait()` did not finish within the timeout;
/// they remain tracked by the manager.
pub timed_out: Vec<ThreadId>,
|
||||
}
|
||||
|
||||
/// Result of one thread's bounded shutdown attempt, used to sort thread ids
/// into the matching `ThreadShutdownReport` list.
enum ShutdownOutcome {
|
||||
/// `shutdown_and_wait()` returned `Ok(())` within the timeout.
Complete,
|
||||
/// `shutdown_and_wait()` returned an error within the timeout.
SubmitFailed,
|
||||
/// The timeout elapsed before `shutdown_and_wait()` finished.
TimedOut,
|
||||
}
|
||||
|
||||
/// [`ThreadManager`] is responsible for creating threads and maintaining
|
||||
/// them in memory.
|
||||
pub struct ThreadManager {
|
||||
@@ -329,6 +347,7 @@ impl ThreadManager {
|
||||
dynamic_tools,
|
||||
persist_extended_history,
|
||||
None,
|
||||
None,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -339,6 +358,7 @@ impl ThreadManager {
|
||||
dynamic_tools: Vec<codex_protocol::dynamic_tools::DynamicToolSpec>,
|
||||
persist_extended_history: bool,
|
||||
metrics_service_name: Option<String>,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<NewThread> {
|
||||
Box::pin(self.state.spawn_thread(
|
||||
config,
|
||||
@@ -348,6 +368,7 @@ impl ThreadManager {
|
||||
dynamic_tools,
|
||||
persist_extended_history,
|
||||
metrics_service_name,
|
||||
parent_trace,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -357,10 +378,17 @@ impl ThreadManager {
|
||||
config: Config,
|
||||
rollout_path: PathBuf,
|
||||
auth_manager: Arc<AuthManager>,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<NewThread> {
|
||||
let initial_history = RolloutRecorder::get_rollout_history(&rollout_path).await?;
|
||||
Box::pin(self.resume_thread_with_history(config, initial_history, auth_manager, false))
|
||||
.await
|
||||
Box::pin(self.resume_thread_with_history(
|
||||
config,
|
||||
initial_history,
|
||||
auth_manager,
|
||||
false,
|
||||
parent_trace,
|
||||
))
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn resume_thread_with_history(
|
||||
@@ -369,6 +397,7 @@ impl ThreadManager {
|
||||
initial_history: InitialHistory,
|
||||
auth_manager: Arc<AuthManager>,
|
||||
persist_extended_history: bool,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<NewThread> {
|
||||
Box::pin(self.state.spawn_thread(
|
||||
config,
|
||||
@@ -378,6 +407,7 @@ impl ThreadManager {
|
||||
Vec::new(),
|
||||
persist_extended_history,
|
||||
None,
|
||||
parent_trace,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -389,13 +419,55 @@ impl ThreadManager {
|
||||
self.state.threads.write().await.remove(thread_id)
|
||||
}
|
||||
|
||||
/// Closes all threads open in this ThreadManager
|
||||
pub async fn remove_and_close_all_threads(&self) -> CodexResult<()> {
|
||||
for thread in self.state.threads.read().await.values() {
|
||||
thread.submit(Op::Shutdown).await?;
|
||||
/// Tries to shut down all tracked threads concurrently within the provided timeout.
|
||||
/// Threads that complete shutdown are removed from the manager; incomplete shutdowns
|
||||
/// remain tracked so callers can retry or inspect them later.
|
||||
pub async fn shutdown_all_threads_bounded(&self, timeout: Duration) -> ThreadShutdownReport {
|
||||
let threads = {
|
||||
let threads = self.state.threads.read().await;
|
||||
threads
|
||||
.iter()
|
||||
.map(|(thread_id, thread)| (*thread_id, Arc::clone(thread)))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let mut shutdowns = threads
|
||||
.into_iter()
|
||||
.map(|(thread_id, thread)| async move {
|
||||
let outcome = match tokio::time::timeout(timeout, thread.shutdown_and_wait()).await
|
||||
{
|
||||
Ok(Ok(())) => ShutdownOutcome::Complete,
|
||||
Ok(Err(_)) => ShutdownOutcome::SubmitFailed,
|
||||
Err(_) => ShutdownOutcome::TimedOut,
|
||||
};
|
||||
(thread_id, outcome)
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
let mut report = ThreadShutdownReport::default();
|
||||
|
||||
while let Some((thread_id, outcome)) = shutdowns.next().await {
|
||||
match outcome {
|
||||
ShutdownOutcome::Complete => report.completed.push(thread_id),
|
||||
ShutdownOutcome::SubmitFailed => report.submit_failed.push(thread_id),
|
||||
ShutdownOutcome::TimedOut => report.timed_out.push(thread_id),
|
||||
}
|
||||
}
|
||||
self.state.threads.write().await.clear();
|
||||
Ok(())
|
||||
|
||||
let mut tracked_threads = self.state.threads.write().await;
|
||||
for thread_id in &report.completed {
|
||||
tracked_threads.remove(thread_id);
|
||||
}
|
||||
|
||||
report
|
||||
.completed
|
||||
.sort_by_key(std::string::ToString::to_string);
|
||||
report
|
||||
.submit_failed
|
||||
.sort_by_key(std::string::ToString::to_string);
|
||||
report
|
||||
.timed_out
|
||||
.sort_by_key(std::string::ToString::to_string);
|
||||
report
|
||||
}
|
||||
|
||||
/// Fork an existing thread by taking messages up to the given position (not including
|
||||
@@ -408,6 +480,7 @@ impl ThreadManager {
|
||||
config: Config,
|
||||
path: PathBuf,
|
||||
persist_extended_history: bool,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<NewThread> {
|
||||
let history = RolloutRecorder::get_rollout_history(&path).await?;
|
||||
let history = truncate_before_nth_user_message(history, nth_user_message);
|
||||
@@ -419,6 +492,7 @@ impl ThreadManager {
|
||||
Vec::new(),
|
||||
persist_extended_history,
|
||||
None,
|
||||
parent_trace,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -503,6 +577,7 @@ impl ThreadManagerState {
|
||||
persist_extended_history,
|
||||
metrics_service_name,
|
||||
inherited_shell_snapshot,
|
||||
None,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -526,6 +601,7 @@ impl ThreadManagerState {
|
||||
false,
|
||||
None,
|
||||
inherited_shell_snapshot,
|
||||
None,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -549,6 +625,7 @@ impl ThreadManagerState {
|
||||
persist_extended_history,
|
||||
None,
|
||||
inherited_shell_snapshot,
|
||||
None,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -564,6 +641,7 @@ impl ThreadManagerState {
|
||||
dynamic_tools: Vec<codex_protocol::dynamic_tools::DynamicToolSpec>,
|
||||
persist_extended_history: bool,
|
||||
metrics_service_name: Option<String>,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<NewThread> {
|
||||
Box::pin(self.spawn_thread_with_source(
|
||||
config,
|
||||
@@ -575,6 +653,7 @@ impl ThreadManagerState {
|
||||
persist_extended_history,
|
||||
metrics_service_name,
|
||||
None,
|
||||
parent_trace,
|
||||
))
|
||||
.await
|
||||
}
|
||||
@@ -591,28 +670,30 @@ impl ThreadManagerState {
|
||||
persist_extended_history: bool,
|
||||
metrics_service_name: Option<String>,
|
||||
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
|
||||
parent_trace: Option<W3cTraceContext>,
|
||||
) -> CodexResult<NewThread> {
|
||||
let watch_registration = self
|
||||
.file_watcher
|
||||
.register_config(&config, self.skills_manager.as_ref());
|
||||
let CodexSpawnOk {
|
||||
codex, thread_id, ..
|
||||
} = Codex::spawn(
|
||||
} = Codex::spawn(CodexSpawnArgs {
|
||||
config,
|
||||
auth_manager,
|
||||
Arc::clone(&self.models_manager),
|
||||
Arc::clone(&self.skills_manager),
|
||||
Arc::clone(&self.plugins_manager),
|
||||
Arc::clone(&self.mcp_manager),
|
||||
Arc::clone(&self.file_watcher),
|
||||
initial_history,
|
||||
models_manager: Arc::clone(&self.models_manager),
|
||||
skills_manager: Arc::clone(&self.skills_manager),
|
||||
plugins_manager: Arc::clone(&self.plugins_manager),
|
||||
mcp_manager: Arc::clone(&self.mcp_manager),
|
||||
file_watcher: Arc::clone(&self.file_watcher),
|
||||
conversation_history: initial_history,
|
||||
session_source,
|
||||
agent_control,
|
||||
dynamic_tools,
|
||||
persist_extended_history,
|
||||
metrics_service_name,
|
||||
inherited_shell_snapshot,
|
||||
)
|
||||
parent_trace,
|
||||
})
|
||||
.await?;
|
||||
self.finalize_thread_spawn(codex, thread_id, watch_registration)
|
||||
.await
|
||||
@@ -672,11 +753,14 @@ fn truncate_before_nth_user_message(history: InitialHistory, n: usize) -> Initia
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::codex::make_session_and_context;
|
||||
use crate::config::test_config;
|
||||
use assert_matches::assert_matches;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::ReasoningItemReasoningSummary;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use pretty_assertions::assert_eq;
|
||||
use std::time::Duration;
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn user_msg(text: &str) -> ResponseItem {
|
||||
ResponseItem::Message {
|
||||
@@ -783,4 +867,40 @@ mod tests {
|
||||
serde_json::to_value(&expected).unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn shutdown_all_threads_bounded_submits_shutdown_to_every_thread() {
|
||||
let temp_dir = tempdir().expect("tempdir");
|
||||
let mut config = test_config();
|
||||
config.codex_home = temp_dir.path().join("codex-home");
|
||||
config.cwd = config.codex_home.clone();
|
||||
std::fs::create_dir_all(&config.codex_home).expect("create codex home");
|
||||
|
||||
let manager = ThreadManager::with_models_provider_and_home_for_tests(
|
||||
CodexAuth::from_api_key("dummy"),
|
||||
config.model_provider.clone(),
|
||||
config.codex_home.clone(),
|
||||
);
|
||||
let thread_1 = manager
|
||||
.start_thread(config.clone())
|
||||
.await
|
||||
.expect("start first thread")
|
||||
.thread_id;
|
||||
let thread_2 = manager
|
||||
.start_thread(config)
|
||||
.await
|
||||
.expect("start second thread")
|
||||
.thread_id;
|
||||
|
||||
let report = manager
|
||||
.shutdown_all_threads_bounded(Duration::from_secs(10))
|
||||
.await;
|
||||
|
||||
let mut expected_completed = vec![thread_1, thread_2];
|
||||
expected_completed.sort_by_key(std::string::ToString::to_string);
|
||||
assert_eq!(report.completed, expected_completed);
|
||||
assert!(report.submit_failed.is_empty());
|
||||
assert!(report.timed_out.is_empty());
|
||||
assert!(manager.list_thread_ids().await.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user