Files
codex/prs/bolinfest/PR-1852.md
2025-09-02 15:17:45 -07:00

4.1 KiB

PR #1852: exec: timeout on grandchildren

Description

We were enforcing the 10 s wall-clock limit only on the child process. If that child (bash) spawns grandchildren and we kill it on timeout, those grandchildren still have the original stdout/err pipe open, so the background tasks that are draining the pipes block forever

Full Diff

diff --git a/codex-rs/core/src/exec.rs b/codex-rs/core/src/exec.rs
index dce02cc5e2..a8bbd0f52c 100644
--- a/codex-rs/core/src/exec.rs
+++ b/codex-rs/core/src/exec.rs
@@ -344,8 +344,49 @@ pub(crate) async fn consume_truncated_output(
         }
     };
 
-    let stdout = stdout_handle.await??;
-    let stderr = stderr_handle.await??;
+    // Wait for the stdout/stderr collection tasks but guard against them
+    // hanging forever. In the normal case both pipes are closed once the child
+    // terminates so the tasks exit quickly.  However, if the child process
+    // spawned grandchildren that inherited its stdout/stderr file descriptors
+    // those pipes may stay open after we `kill` the direct child on timeout.
+    // That would cause the `read_capped` tasks to block on `read()`
+    // indefinitely, effectively hanging the whole agent.
+
+    const IO_DRAIN_TIMEOUT_MS: u64 = 2_000; // 2 s should be plenty for local pipes
+
+    // We need mutable bindings so we can `abort()` them on timeout.
+    use tokio::task::JoinHandle;
+
+    async fn await_with_timeout(
+        handle: &mut JoinHandle<std::io::Result<Vec<u8>>>,
+        timeout: Duration,
+    ) -> std::io::Result<Vec<u8>> {
+        match tokio::time::timeout(timeout, &mut *handle).await {
+            Ok(join_res) => match join_res {
+                Ok(io_res) => io_res,
+                Err(join_err) => Err(std::io::Error::other(join_err)),
+            },
+            Err(_elapsed) => {
+                // Timeout: abort the task to avoid hanging on open pipes.
+                handle.abort();
+                Ok(Vec::new())
+            }
+        }
+    }
+
+    let mut stdout_handle = stdout_handle;
+    let mut stderr_handle = stderr_handle;
+
+    let stdout = await_with_timeout(
+        &mut stdout_handle,
+        Duration::from_millis(IO_DRAIN_TIMEOUT_MS),
+    )
+    .await?;
+    let stderr = await_with_timeout(
+        &mut stderr_handle,
+        Duration::from_millis(IO_DRAIN_TIMEOUT_MS),
+    )
+    .await?;
 
     Ok(RawExecToolCallOutput {
         exit_status,

Review Comments

codex-rs/core/src/exec.rs

@@ -344,8 +344,50 @@ pub(crate) async fn consume_truncated_output(
         }
     };
 
-    let stdout = stdout_handle.await??;
-    let stderr = stderr_handle.await??;
+    // Wait for the stdout/stderr collection tasks but guard against them
+    // hanging forever. In the normal case both pipes are closed once the child
+    // terminates so the tasks exit quickly.  However, if the child process
+    // spawned grandchildren that inherited its stdout/stderr file descriptors
+    // those pipes may stay open after we `kill` the direct child on timeout.
+    // That would cause the `read_capped` tasks to block on `read()`
+    // indefinitely, effectively hanging the whole agent.
+
+    const IO_DRAIN_TIMEOUT_MS: u64 = 2_000; // 2 s should be plenty for local pipes
+
+    // We need mutable bindings so we can `abort()` them on timeout.
+    use tokio::task::JoinHandle;
+
+    async fn await_with_timeout(
+        handle: &mut JoinHandle<std::io::Result<Vec<u8>>>,
+        timeout: Duration,
+    ) -> std::io::Result<Vec<u8>> {
+        tokio::select! {
+            join_res = &mut *handle => {
+                match join_res {
+                    Ok(io_res) => io_res,
+                    Err(join_err) => Err(std::io::Error::other(join_err)),
+                }
+            },
+            _ = tokio::time::sleep(timeout) => {

Can/should we use use tokio::time::timeout instead?