mirror of
https://github.com/openai/codex.git
synced 2026-04-28 02:11:08 +03:00
Fix js_repl in-flight tool-call waiter race (#11800)
## Summary This PR fixes a race in `js_repl` tool-call draining that could leave an exec waiting indefinitely for in-flight tool calls to finish. The fix is in: - `/Users/fjord/code/codex-jsrepl-seq/codex-rs/core/src/tools/js_repl/mod.rs` ## Problem `js_repl` tracks in-flight tool calls per exec and waits for them to drain on completion/timeout/cancel paths. The previous wait logic used a check-then-wait pattern with `Notify` that could miss a wakeup: 1. Observe `in_flight > 0` 2. Drop lock 3. Register wait (`notified().await`) If `notify_waiters()` happened between (2) and (3), the waiter could sleep until another notification that never comes. ## What changed - Updated all exec-tool-call wait loops to create an owned notification future while holding the lock: - use `Arc<Notify>::notified_owned()` instead of cloning notify and awaiting later. - Applied this consistently to: - `wait_for_exec_tool_calls` - `wait_for_all_exec_tool_calls` - `wait_for_exec_tool_calls_map` This preserves existing behavior while eliminating the lost-wakeup window. ## Test coverage Added a regression test: - `wait_for_exec_tool_calls_map_drains_inflight_calls_without_hanging` The test repeatedly races waiter/finisher tasks and asserts bounded completion to catch hangs. ## Impact - No API changes. - No user-facing behavior changes intended. - Improves reliability of exec lifecycle boundaries when tool calls are still in flight. #### [git stack](https://github.com/magus/git-stack-cli) - ✅ `1` https://github.com/openai/codex/pull/11796 - 👉 `2` https://github.com/openai/codex/pull/11800 - ⏳ `3` https://github.com/openai/codex/pull/10673 - ⏳ `4` https://github.com/openai/codex/pull/10670
This commit is contained in:
committed by
GitHub
parent
6cbb489e6e
commit
0d76d029b7
@@ -306,15 +306,15 @@ impl JsReplManager {
|
||||
|
||||
async fn wait_for_exec_tool_calls(&self, exec_id: &str) {
|
||||
loop {
|
||||
let notify = {
|
||||
let notified = {
|
||||
let calls = self.exec_tool_calls.lock().await;
|
||||
calls
|
||||
.get(exec_id)
|
||||
.filter(|state| state.in_flight > 0)
|
||||
.map(|state| Arc::clone(&state.notify))
|
||||
.map(|state| Arc::clone(&state.notify).notified_owned())
|
||||
};
|
||||
match notify {
|
||||
Some(notify) => notify.notified().await,
|
||||
match notified {
|
||||
Some(notified) => notified.await,
|
||||
None => return,
|
||||
}
|
||||
}
|
||||
@@ -322,15 +322,15 @@ impl JsReplManager {
|
||||
|
||||
async fn wait_for_all_exec_tool_calls(&self) {
|
||||
loop {
|
||||
let notify = {
|
||||
let notified = {
|
||||
let calls = self.exec_tool_calls.lock().await;
|
||||
calls
|
||||
.values()
|
||||
.find(|state| state.in_flight > 0)
|
||||
.map(|state| Arc::clone(&state.notify))
|
||||
.map(|state| Arc::clone(&state.notify).notified_owned())
|
||||
};
|
||||
match notify {
|
||||
Some(notify) => notify.notified().await,
|
||||
match notified {
|
||||
Some(notified) => notified.await,
|
||||
None => return,
|
||||
}
|
||||
}
|
||||
@@ -377,15 +377,15 @@ impl JsReplManager {
|
||||
exec_id: &str,
|
||||
) {
|
||||
loop {
|
||||
let notify = {
|
||||
let notified = {
|
||||
let calls = exec_tool_calls.lock().await;
|
||||
calls
|
||||
.get(exec_id)
|
||||
.filter(|state| state.in_flight > 0)
|
||||
.map(|state| Arc::clone(&state.notify))
|
||||
.map(|state| Arc::clone(&state.notify).notified_owned())
|
||||
};
|
||||
match notify {
|
||||
Some(notify) => notify.notified().await,
|
||||
match notified {
|
||||
Some(notified) => notified.await,
|
||||
None => return,
|
||||
}
|
||||
}
|
||||
@@ -1407,6 +1407,40 @@ mod tests {
|
||||
assert!(!is_js_repl_internal_tool("list_mcp_resources"));
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn wait_for_exec_tool_calls_map_drains_inflight_calls_without_hanging() {
|
||||
let exec_tool_calls = Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
for _ in 0..128 {
|
||||
let exec_id = Uuid::new_v4().to_string();
|
||||
exec_tool_calls
|
||||
.lock()
|
||||
.await
|
||||
.insert(exec_id.clone(), ExecToolCalls::default());
|
||||
assert!(JsReplManager::begin_exec_tool_call(&exec_tool_calls, &exec_id).await);
|
||||
|
||||
let wait_map = Arc::clone(&exec_tool_calls);
|
||||
let wait_exec_id = exec_id.clone();
|
||||
let waiter = tokio::spawn(async move {
|
||||
JsReplManager::wait_for_exec_tool_calls_map(&wait_map, &wait_exec_id).await;
|
||||
});
|
||||
|
||||
let finish_map = Arc::clone(&exec_tool_calls);
|
||||
let finish_exec_id = exec_id.clone();
|
||||
let finisher = tokio::spawn(async move {
|
||||
tokio::task::yield_now().await;
|
||||
JsReplManager::finish_exec_tool_call(&finish_map, &finish_exec_id).await;
|
||||
});
|
||||
|
||||
tokio::time::timeout(Duration::from_secs(1), waiter)
|
||||
.await
|
||||
.expect("wait_for_exec_tool_calls_map should not hang")
|
||||
.expect("wait task should not panic");
|
||||
finisher.await.expect("finish task should not panic");
|
||||
|
||||
JsReplManager::clear_exec_tool_calls_map(&exec_tool_calls, &exec_id).await;
|
||||
}
|
||||
}
|
||||
async fn can_run_js_repl_runtime_tests() -> bool {
|
||||
if std::env::var_os("CODEX_SANDBOX").is_some() {
|
||||
return false;
|
||||
|
||||
Reference in New Issue
Block a user