Fix js_repl in-flight tool-call waiter race (#11800)

## Summary

This PR fixes a race in `js_repl` tool-call draining that could leave an
exec waiting indefinitely for in-flight tool calls to finish.

The fix is in:

-
`/Users/fjord/code/codex-jsrepl-seq/codex-rs/core/src/tools/js_repl/mod.rs`

## Problem

`js_repl` tracks in-flight tool calls per exec and waits for them to
drain on completion/timeout/cancel paths.
The previous wait logic used a check-then-wait pattern with `Notify`
that could miss a wakeup:

1. Observe `in_flight > 0`
2. Drop lock
3. Register wait (`notified().await`)

If `notify_waiters()` happened between (2) and (3), the waiter could
sleep until another notification that never comes.

## What changed

- Updated all exec-tool-call wait loops to create an owned notification
future while holding the lock:
- use `Arc<Notify>::notified_owned()` instead of cloning notify and
awaiting later.
- Applied this consistently to:
  - `wait_for_exec_tool_calls`
  - `wait_for_all_exec_tool_calls`
  - `wait_for_exec_tool_calls_map`

This preserves existing behavior while eliminating the lost-wakeup
window.

## Test coverage

Added a regression test:

- `wait_for_exec_tool_calls_map_drains_inflight_calls_without_hanging`

The test repeatedly races waiter/finisher tasks and asserts bounded
completion to catch hangs.

## Impact

- No API changes.
- No user-facing behavior changes intended.
- Improves reliability of exec lifecycle boundaries when tool calls are
still in flight.


#### [git stack](https://github.com/magus/git-stack-cli)
-  `1` https://github.com/openai/codex/pull/11796
- 👉 `2` https://github.com/openai/codex/pull/11800
-  `3` https://github.com/openai/codex/pull/10673
-  `4` https://github.com/openai/codex/pull/10670
This commit is contained in:
Curtis 'Fjord' Hawthorne
2026-02-13 17:24:52 -08:00
committed by GitHub
parent 6cbb489e6e
commit 0d76d029b7

View File

@@ -306,15 +306,15 @@ impl JsReplManager {
async fn wait_for_exec_tool_calls(&self, exec_id: &str) {
loop {
let notify = {
let notified = {
let calls = self.exec_tool_calls.lock().await;
calls
.get(exec_id)
.filter(|state| state.in_flight > 0)
.map(|state| Arc::clone(&state.notify))
.map(|state| Arc::clone(&state.notify).notified_owned())
};
match notify {
Some(notify) => notify.notified().await,
match notified {
Some(notified) => notified.await,
None => return,
}
}
@@ -322,15 +322,15 @@ impl JsReplManager {
async fn wait_for_all_exec_tool_calls(&self) {
loop {
let notify = {
let notified = {
let calls = self.exec_tool_calls.lock().await;
calls
.values()
.find(|state| state.in_flight > 0)
.map(|state| Arc::clone(&state.notify))
.map(|state| Arc::clone(&state.notify).notified_owned())
};
match notify {
Some(notify) => notify.notified().await,
match notified {
Some(notified) => notified.await,
None => return,
}
}
@@ -377,15 +377,15 @@ impl JsReplManager {
exec_id: &str,
) {
loop {
let notify = {
let notified = {
let calls = exec_tool_calls.lock().await;
calls
.get(exec_id)
.filter(|state| state.in_flight > 0)
.map(|state| Arc::clone(&state.notify))
.map(|state| Arc::clone(&state.notify).notified_owned())
};
match notify {
Some(notify) => notify.notified().await,
match notified {
Some(notified) => notified.await,
None => return,
}
}
@@ -1407,6 +1407,40 @@ mod tests {
assert!(!is_js_repl_internal_tool("list_mcp_resources"));
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn wait_for_exec_tool_calls_map_drains_inflight_calls_without_hanging() {
let exec_tool_calls = Arc::new(Mutex::new(HashMap::new()));
for _ in 0..128 {
let exec_id = Uuid::new_v4().to_string();
exec_tool_calls
.lock()
.await
.insert(exec_id.clone(), ExecToolCalls::default());
assert!(JsReplManager::begin_exec_tool_call(&exec_tool_calls, &exec_id).await);
let wait_map = Arc::clone(&exec_tool_calls);
let wait_exec_id = exec_id.clone();
let waiter = tokio::spawn(async move {
JsReplManager::wait_for_exec_tool_calls_map(&wait_map, &wait_exec_id).await;
});
let finish_map = Arc::clone(&exec_tool_calls);
let finish_exec_id = exec_id.clone();
let finisher = tokio::spawn(async move {
tokio::task::yield_now().await;
JsReplManager::finish_exec_tool_call(&finish_map, &finish_exec_id).await;
});
tokio::time::timeout(Duration::from_secs(1), waiter)
.await
.expect("wait_for_exec_tool_calls_map should not hang")
.expect("wait task should not panic");
finisher.await.expect("finish task should not panic");
JsReplManager::clear_exec_tool_calls_map(&exec_tool_calls, &exec_id).await;
}
}
async fn can_run_js_repl_runtime_tests() -> bool {
if std::env::var_os("CODEX_SANDBOX").is_some() {
return false;