Stabilize Windows cmd-based shell test harnesses (#14958)

## What is flaky
The Windows shell-driven integration tests in `codex-rs/core` were
intermittently unstable, especially:

- `apply_patch_cli_can_use_shell_command_output_as_patch_input`
- `websocket_test_codex_shell_chain`
- `websocket_v2_test_codex_shell_chain`

## Why it was flaky
These tests were exercising real shell-tool flows through whichever
shell Codex selected on Windows, and the `apply_patch` test also nested
a PowerShell read inside `cmd /c`.

There were multiple independent sources of nondeterminism in that setup:

- The test harness depended on the model-selected Windows shell instead
of pinning the shell it actually meant to exercise.
- `cmd.exe /c powershell.exe -Command "..."` is quoting-sensitive; on CI
that could leave the read command wrapped as a literal string instead of
executing it.
- Even after getting the quoting right, PowerShell could emit CLIXML
progress records like module-initialization output onto stdout.
- The `apply_patch` test was building a patch directly from shell
stdout, so any quoting artifact or progress noise corrupted the patch
input.

So the failures were driven by shell startup and output-shape variance,
not by the `apply_patch` or websocket logic themselves.

## How this PR fixes it
- Add a test-only `user_shell_override` path so Windows integration
tests can pin `cmd.exe` explicitly.
- Use that override in the websocket shell-chain tests and in the
`apply_patch` harness.
- Change the nested Windows file read in
`apply_patch_cli_can_use_shell_command_output_as_patch_input` to a UTF-8
PowerShell `-EncodedCommand` script.
- Run that nested PowerShell process with `-NonInteractive`, set
`$ProgressPreference = 'SilentlyContinue'`, and read the file with
`[System.IO.File]::ReadAllText(...)`.

## Why this fix fixes the flakiness
The outer harness now runs under a deterministic shell, and the inner
PowerShell read no longer depends on fragile `cmd` quoting or on
progress output staying quiet by accident. The shell tool returns only
the file contents, so patch construction and websocket assertions depend
on stable test inputs instead of on runner-specific shell behavior.

---------

Co-authored-by: Ahmed Ibrahim <219906144+aibrahim-oai@users.noreply.github.com>
Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Ahmed Ibrahim
2026-03-17 13:21:46 -07:00
committed by GitHub
parent 683c37ce75
commit b02388672f
9 changed files with 160 additions and 13 deletions

View File

@@ -369,6 +369,7 @@ pub(crate) struct CodexSpawnArgs {
pub(crate) persist_extended_history: bool,
pub(crate) metrics_service_name: Option<String>,
pub(crate) inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
pub(crate) user_shell_override: Option<shell::Shell>,
pub(crate) parent_trace: Option<W3cTraceContext>,
}
@@ -420,6 +421,7 @@ impl Codex {
persist_extended_history,
metrics_service_name,
inherited_shell_snapshot,
user_shell_override,
parent_trace: _,
} = args;
let (tx_sub, rx_sub) = async_channel::bounded(SUBMISSION_CHANNEL_CAPACITY);
@@ -574,6 +576,7 @@ impl Codex {
dynamic_tools,
persist_extended_history,
inherited_shell_snapshot,
user_shell_override,
};
// Generate a unique ID for the lifetime of this Codex session.
@@ -1036,6 +1039,7 @@ pub(crate) struct SessionConfiguration {
dynamic_tools: Vec<DynamicToolSpec>,
persist_extended_history: bool,
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
user_shell_override: Option<shell::Shell>,
}
impl SessionConfiguration {
@@ -1616,7 +1620,11 @@ impl Session {
);
let use_zsh_fork_shell = config.features.enabled(Feature::ShellZshFork);
let mut default_shell = if use_zsh_fork_shell {
let mut default_shell = if let Some(user_shell_override) =
session_configuration.user_shell_override.clone()
{
user_shell_override
} else if use_zsh_fork_shell {
let zsh_path = config.zsh_path.as_ref().ok_or_else(|| {
anyhow::anyhow!(
"zsh fork feature enabled, but `zsh_path` is not configured; set `zsh_path` in config.toml"

View File

@@ -88,6 +88,7 @@ pub(crate) async fn run_codex_thread_interactive(
persist_extended_history: false,
metrics_service_name: None,
inherited_shell_snapshot: None,
user_shell_override: None,
parent_trace: None,
})
.await?;

View File

@@ -1663,6 +1663,7 @@ async fn set_rate_limits_retains_previous_credits() {
dynamic_tools: Vec::new(),
persist_extended_history: false,
inherited_shell_snapshot: None,
user_shell_override: None,
};
let mut state = SessionState::new(session_configuration);
@@ -1760,6 +1761,7 @@ async fn set_rate_limits_updates_plan_type_when_present() {
dynamic_tools: Vec::new(),
persist_extended_history: false,
inherited_shell_snapshot: None,
user_shell_override: None,
};
let mut state = SessionState::new(session_configuration);
@@ -2115,6 +2117,7 @@ pub(crate) async fn make_session_configuration_for_tests() -> SessionConfigurati
dynamic_tools: Vec::new(),
persist_extended_history: false,
inherited_shell_snapshot: None,
user_shell_override: None,
}
}
@@ -2345,6 +2348,7 @@ async fn session_new_fails_when_zsh_fork_enabled_without_zsh_path() {
dynamic_tools: Vec::new(),
persist_extended_history: false,
inherited_shell_snapshot: None,
user_shell_override: None,
};
let (tx_event, _rx_event) = async_channel::unbounded();
@@ -2439,6 +2443,7 @@ pub(crate) async fn make_session_and_context() -> (Session, TurnContext) {
dynamic_tools: Vec::new(),
persist_extended_history: false,
inherited_shell_snapshot: None,
user_shell_override: None,
};
let per_turn_config = Session::build_per_turn_config(&session_configuration);
let model_info = ModelsManager::construct_model_info_offline_for_tests(
@@ -3230,6 +3235,7 @@ pub(crate) async fn make_session_and_context_with_dynamic_tools_and_rx(
dynamic_tools,
persist_extended_history: false,
inherited_shell_snapshot: None,
user_shell_override: None,
};
let per_turn_config = Session::build_per_turn_config(&session_configuration);
let model_info = ModelsManager::construct_model_info_offline_for_tests(

View File

@@ -452,6 +452,7 @@ async fn guardian_subagent_does_not_inherit_parent_exec_policy_rules() {
persist_extended_history: false,
metrics_service_name: None,
inherited_shell_snapshot: None,
user_shell_override: None,
parent_trace: None,
})
.await

View File

@@ -64,6 +64,33 @@ pub fn thread_manager_with_models_provider_and_home(
ThreadManager::with_models_provider_and_home_for_tests(auth, provider, codex_home)
}
pub async fn start_thread_with_user_shell_override(
thread_manager: &ThreadManager,
config: Config,
user_shell_override: crate::shell::Shell,
) -> crate::error::Result<crate::NewThread> {
thread_manager
.start_thread_with_user_shell_override_for_tests(config, user_shell_override)
.await
}
pub async fn resume_thread_from_rollout_with_user_shell_override(
thread_manager: &ThreadManager,
config: Config,
rollout_path: PathBuf,
auth_manager: Arc<AuthManager>,
user_shell_override: crate::shell::Shell,
) -> crate::error::Result<crate::NewThread> {
thread_manager
.resume_thread_from_rollout_with_user_shell_override_for_tests(
config,
rollout_path,
auth_manager,
user_shell_override,
)
.await
}
pub fn models_manager_with_provider(
codex_home: PathBuf,
auth_manager: Arc<AuthManager>,

View File

@@ -381,6 +381,7 @@ impl ThreadManager {
persist_extended_history,
metrics_service_name,
parent_trace,
/*user_shell_override*/ None,
))
.await
}
@@ -420,6 +421,48 @@ impl ThreadManager {
persist_extended_history,
/*metrics_service_name*/ None,
parent_trace,
/*user_shell_override*/ None,
))
.await
}
pub(crate) async fn start_thread_with_user_shell_override_for_tests(
&self,
config: Config,
user_shell_override: crate::shell::Shell,
) -> CodexResult<NewThread> {
Box::pin(self.state.spawn_thread(
config,
InitialHistory::New,
Arc::clone(&self.state.auth_manager),
self.agent_control(),
Vec::new(),
/*persist_extended_history*/ false,
/*metrics_service_name*/ None,
/*parent_trace*/ None,
/*user_shell_override*/ Some(user_shell_override),
))
.await
}
pub(crate) async fn resume_thread_from_rollout_with_user_shell_override_for_tests(
&self,
config: Config,
rollout_path: PathBuf,
auth_manager: Arc<AuthManager>,
user_shell_override: crate::shell::Shell,
) -> CodexResult<NewThread> {
let initial_history = RolloutRecorder::get_rollout_history(&rollout_path).await?;
Box::pin(self.state.spawn_thread(
config,
initial_history,
auth_manager,
self.agent_control(),
Vec::new(),
/*persist_extended_history*/ false,
/*metrics_service_name*/ None,
/*parent_trace*/ None,
/*user_shell_override*/ Some(user_shell_override),
))
.await
}
@@ -505,6 +548,7 @@ impl ThreadManager {
persist_extended_history,
/*metrics_service_name*/ None,
parent_trace,
/*user_shell_override*/ None,
))
.await
}
@@ -590,6 +634,7 @@ impl ThreadManagerState {
metrics_service_name,
inherited_shell_snapshot,
/*parent_trace*/ None,
/*user_shell_override*/ None,
))
.await
}
@@ -614,6 +659,7 @@ impl ThreadManagerState {
/*metrics_service_name*/ None,
inherited_shell_snapshot,
/*parent_trace*/ None,
/*user_shell_override*/ None,
))
.await
}
@@ -638,6 +684,7 @@ impl ThreadManagerState {
/*metrics_service_name*/ None,
inherited_shell_snapshot,
/*parent_trace*/ None,
/*user_shell_override*/ None,
))
.await
}
@@ -654,6 +701,7 @@ impl ThreadManagerState {
persist_extended_history: bool,
metrics_service_name: Option<String>,
parent_trace: Option<W3cTraceContext>,
user_shell_override: Option<crate::shell::Shell>,
) -> CodexResult<NewThread> {
Box::pin(self.spawn_thread_with_source(
config,
@@ -666,6 +714,7 @@ impl ThreadManagerState {
metrics_service_name,
/*inherited_shell_snapshot*/ None,
parent_trace,
user_shell_override,
))
.await
}
@@ -683,6 +732,7 @@ impl ThreadManagerState {
metrics_service_name: Option<String>,
inherited_shell_snapshot: Option<Arc<ShellSnapshot>>,
parent_trace: Option<W3cTraceContext>,
user_shell_override: Option<crate::shell::Shell>,
) -> CodexResult<NewThread> {
let watch_registration = self
.file_watcher
@@ -704,6 +754,7 @@ impl ThreadManagerState {
persist_extended_history,
metrics_service_name,
inherited_shell_snapshot,
user_shell_override,
parent_trace,
})
.await?;

View File

@@ -13,6 +13,8 @@ use codex_core::built_in_model_providers;
use codex_core::config::Config;
use codex_core::features::Feature;
use codex_core::models_manager::collaboration_mode_presets::CollaborationModesConfig;
use codex_core::shell::Shell;
use codex_core::shell::get_shell_by_model_provided_path;
use codex_protocol::config_types::ServiceTier;
use codex_protocol::openai_models::ModelsResponse;
use codex_protocol::protocol::AskForApproval;
@@ -64,6 +66,7 @@ pub struct TestCodexBuilder {
auth: CodexAuth,
pre_build_hooks: Vec<Box<PreBuildHook>>,
home: Option<Arc<TempDir>>,
user_shell_override: Option<Shell>,
}
impl TestCodexBuilder {
@@ -100,6 +103,19 @@ impl TestCodexBuilder {
self
}
pub fn with_user_shell(mut self, user_shell: Shell) -> Self {
self.user_shell_override = Some(user_shell);
self
}
pub fn with_windows_cmd_shell(self) -> Self {
if cfg!(windows) {
self.with_user_shell(get_shell_by_model_provided_path(&PathBuf::from("cmd.exe")))
} else {
self
}
}
pub async fn build(&mut self, server: &wiremock::MockServer) -> anyhow::Result<TestCodex> {
let home = match self.home.clone() {
Some(home) => home,
@@ -199,9 +215,23 @@ impl TestCodexBuilder {
)
};
let thread_manager = Arc::new(thread_manager);
let user_shell_override = self.user_shell_override.clone();
let new_conversation = match resume_from {
Some(path) => {
let new_conversation = match (resume_from, user_shell_override) {
(Some(path), Some(user_shell_override)) => {
let auth_manager = codex_core::test_support::auth_manager_from_auth(auth);
Box::pin(
codex_core::test_support::resume_thread_from_rollout_with_user_shell_override(
thread_manager.as_ref(),
config.clone(),
path,
auth_manager,
user_shell_override,
),
)
.await?
}
(Some(path), None) => {
let auth_manager = codex_core::test_support::auth_manager_from_auth(auth);
Box::pin(thread_manager.resume_thread_from_rollout(
config.clone(),
@@ -211,7 +241,17 @@ impl TestCodexBuilder {
))
.await?
}
None => Box::pin(thread_manager.start_thread(config.clone())).await?,
(None, Some(user_shell_override)) => {
Box::pin(
codex_core::test_support::start_thread_with_user_shell_override(
thread_manager.as_ref(),
config.clone(),
user_shell_override,
),
)
.await?
}
(None, None) => Box::pin(thread_manager.start_thread(config.clone())).await?,
};
Ok(TestCodex {
@@ -562,6 +602,7 @@ pub fn test_codex() -> TestCodexBuilder {
auth: CodexAuth::from_api_key("dummy"),
pre_build_hooks: vec![],
home: None,
user_shell_override: None,
}
}

View File

@@ -35,7 +35,7 @@ async fn websocket_test_codex_shell_chain() -> Result<()> {
]])
.await;
let mut builder = test_codex();
let mut builder = test_codex().with_windows_cmd_shell();
let test = builder.build_with_websocket_server(&server).await?;
test.submit_turn_with_policy(
@@ -183,7 +183,7 @@ async fn websocket_v2_test_codex_shell_chain() -> Result<()> {
]])
.await;
let mut builder = test_codex().with_config(|config| {
let mut builder = test_codex().with_windows_cmd_shell().with_config(|config| {
config
.features
.enable(Feature::ResponsesWebsocketsV2)

View File

@@ -1,6 +1,8 @@
#![allow(clippy::expect_used)]
use anyhow::Result;
use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use codex_test_macros::large_stack_test;
use core_test_support::responses::ev_apply_patch_call;
use core_test_support::responses::ev_apply_patch_custom_tool_call;
@@ -740,7 +742,9 @@ async fn apply_patch_shell_command_heredoc_with_cd_updates_relative_workdir() ->
async fn apply_patch_cli_can_use_shell_command_output_as_patch_input() -> Result<()> {
skip_if_no_network!(Ok(()));
let harness = apply_patch_harness_with(|builder| builder.with_model("gpt-5.1")).await?;
let harness =
apply_patch_harness_with(|builder| builder.with_model("gpt-5.1").with_windows_cmd_shell())
.await?;
let source_contents = "line1\nnaïve café\nline3\n";
let source_path = harness.path("source.txt");
@@ -786,9 +790,21 @@ async fn apply_patch_cli_can_use_shell_command_output_as_patch_input() -> Result
match call_num {
0 => {
let command = if cfg!(windows) {
"Get-Content -Encoding utf8 source.txt"
// Encode the nested PowerShell script so `cmd.exe /c` does not leave the
// read command wrapped in quotes, and suppress progress records so the
// shell tool only returns the file contents back to apply_patch.
let script = "$ProgressPreference = 'SilentlyContinue'; [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false); [System.IO.File]::ReadAllText('source.txt', [System.Text.UTF8Encoding]::new($false))";
let encoded = BASE64_STANDARD.encode(
script
.encode_utf16()
.flat_map(u16::to_le_bytes)
.collect::<Vec<u8>>(),
);
format!(
"powershell.exe -NoLogo -NoProfile -NonInteractive -EncodedCommand {encoded}"
)
} else {
"cat source.txt"
"cat source.txt".to_string()
};
let args = json!({
"command": command,
@@ -807,9 +823,7 @@ async fn apply_patch_cli_can_use_shell_command_output_as_patch_input() -> Result
let body_json: serde_json::Value =
request.body_json().expect("request body should be json");
let read_output = function_call_output_text(&body_json, &self.read_call_id);
eprintln!("read_output: \n{read_output}");
let stdout = stdout_from_shell_output(&read_output);
eprintln!("stdout: \n{stdout}");
let patch_lines = stdout
.lines()
.map(|line| format!("+{line}"))
@@ -819,8 +833,6 @@ async fn apply_patch_cli_can_use_shell_command_output_as_patch_input() -> Result
"*** Begin Patch\n*** Add File: target.txt\n{patch_lines}\n*** End Patch"
);
eprintln!("patch: \n{patch}");
let body = sse(vec![
ev_response_created("resp-2"),
ev_apply_patch_custom_tool_call(&self.apply_call_id, &patch),