Files
codex/codex-rs/scripts/bench_codex_exec_non_login.py
2025-12-04 14:07:15 +07:00

278 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Run codex exec end-to-end with and without the non_login_shell_heuristic feature.
This script repeatedly runs:
- codex exec with the heuristic disabled
- codex exec with the heuristic enabled
for one or more prompts and reports per-run wall times, per-prompt summaries,
and combined summary statistics.
Runs can be parallelized (concurrency > 1) to speed up sampling, but note that
parallel rollouts may contend for local/remote resources and slightly skew
latency compared to strictly serial runs.
Usage:
python scripts/bench_codex_exec_non_login.py \
--iterations 5 \
--prompt "Read and summarize codex_berry" \
--prompt "Explain the architecture" \
--workdir /path/to/repo \
--model gpt-5.1-codex-max \
--reasoning-effort high \
--concurrency 2 \
--codex-bin /path/to/codex \
--skip-feature-toggle # when benchmarking an older codex binary
Notes:
- Runs will incur network/LLM variance; prefer N >= 5-10 iterations.
- Requires the `codex` binary (local path supported) and valid credentials.
- Results are printed in seconds; failures are logged and included in the output.
"""
import argparse
import asyncio
import statistics
import sys
import time
from dataclasses import dataclass
from typing import List
@dataclass
class RunResult:
    """Outcome of a single timed ``codex exec`` subprocess run."""

    # Wall-clock seconds measured around the subprocess.
    duration: float
    # Exit status of the process; callers treat 0 as success.
    exit_code: int
    # Captured stderr, decoded to text with surrounding whitespace removed.
    stderr: str
async def run_once(
    base_cmd: List[str],
    prompt: str,
    feature_enabled: bool,
    toggle_feature: bool,
    sem: asyncio.Semaphore,
) -> RunResult:
    """Run one ``codex exec`` invocation and measure its wall time.

    Args:
        base_cmd: Command prefix (binary, subcommand and shared flags).
        prompt: Prompt text, appended as the final argument.
        feature_enabled: Value written to ``features.non_login_shell_heuristic``.
        toggle_feature: When false, no feature override is injected at all.
        sem: Semaphore bounding how many subprocesses run at once.

    Returns:
        A ``RunResult`` holding the elapsed seconds, the process exit code and
        the stripped stderr text. Stdout is captured and discarded.
    """
    cmd = list(base_cmd)
    if toggle_feature:
        cmd.extend(
            [
                "-c",
                f"features.non_login_shell_heuristic={'true' if feature_enabled else 'false'}",
            ]
        )
    cmd.append(prompt)

    async with sem:
        # The clock starts only after a slot is acquired, so time spent
        # waiting on the semaphore is not counted.
        start = time.perf_counter()
        proc = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            _, stderr = await proc.communicate()
        finally:
            if proc.returncode is None:
                # communicate() was interrupted (e.g. the task was cancelled);
                # kill and reap the child so it is not left running.
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The child exited between the check and the kill.
                    pass
                await proc.wait()
        duration = time.perf_counter() - start

    stderr_text = stderr.decode(errors="replace").strip()
    return RunResult(duration=duration, exit_code=proc.returncode, stderr=stderr_text)
async def run_case(
    label: str,
    base_cmd: List[str],
    prompt: str,
    iterations: int,
    feature_enabled: bool,
    toggle_feature: bool,
    concurrency: int,
) -> tuple[list[float], int]:
    """Run one configuration ``iterations`` times and collect the timings.

    Args:
        label: Tag printed in square brackets on every per-run line.
        base_cmd: Command prefix forwarded to ``run_once``.
        prompt: Prompt forwarded to ``run_once``.
        iterations: Number of runs to launch.
        feature_enabled: Feature value forwarded to ``run_once``.
        toggle_feature: Whether ``run_once`` injects the feature override.
        concurrency: Maximum number of runs in flight at once; must be >= 1.

    Returns:
        ``(durations, failures)``: the wall time of every run (failed runs
        included) in completion order, and the count of non-zero exit codes.

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    if concurrency < 1:
        # Semaphore(0) would never admit a run and the script would hang.
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    sem = asyncio.Semaphore(concurrency)
    tasks = [
        asyncio.create_task(
            run_once(base_cmd, prompt, feature_enabled, toggle_feature, sem)
        )
        for _ in range(iterations)
    ]

    durations: List[float] = []
    failures = 0
    # idx counts completions, not launch order.
    for idx, task in enumerate(asyncio.as_completed(tasks), start=1):
        result = await task
        durations.append(result.duration)
        status = "ok" if result.exit_code == 0 else f"fail ({result.exit_code})"
        print(f"[{label}] run {idx}/{iterations}: {result.duration:.3f}s [{status}]")
        if result.exit_code != 0:
            failures += 1
            if result.stderr:
                print(f" stderr: {result.stderr}", file=sys.stderr)
    return durations, failures
def summarize(label: str, durations: List[float]) -> None:
    """Print the count, mean, median and p95 of a set of run durations.

    Args:
        label: Tag printed in square brackets at the start of the line.
        durations: Wall times in seconds; may be empty.
    """
    if not durations:
        print(f"[{label}] no runs recorded")
        return

    mean = statistics.mean(durations)
    median = statistics.median(durations)
    if len(durations) < 2:
        # A single sample is its own percentile; this also avoids calling
        # quantiles() with fewer than two data points.
        p95 = durations[0]
    else:
        # The default "exclusive" method extrapolates for small samples and
        # can report a p95 above the slowest run; "inclusive" interpolates
        # between observed values only.
        p95 = statistics.quantiles(durations, n=100, method="inclusive")[94]
    print(
        f"[{label}] n={len(durations)} "
        f"mean={mean:.3f}s median={median:.3f}s p95={p95:.3f}s"
    )
def format_prompt_label(prompt: str, idx: int) -> str:
    """Build a short single-line label for a prompt.

    Args:
        prompt: Full prompt text.
        idx: Zero-based position of the prompt; shown one-based in the label.

    Returns:
        ``"prompt <n>: <snippet>"`` where the snippet is at most 60 characters,
        ending in ``...`` when the prompt was truncated.
    """
    # Collapse all whitespace runs (including newlines) so a multi-line prompt
    # cannot break the one-line log format the label is embedded in.
    snippet = " ".join(prompt.split())
    if len(snippet) > 60:
        snippet = snippet[:57] + "..."
    return f"prompt {idx + 1}: {snippet}"
def _positive_int(raw: str) -> int:
    """argparse ``type=`` callback that accepts only integers >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the benchmark's command-line options from ``sys.argv``.

    Returns:
        The parsed namespace. ``prompts`` is a non-empty list (the option is
        required and repeatable); ``extra_config`` defaults to an empty list.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark codex exec with/without non_login_shell_heuristic."
    )
    parser.add_argument(
        "--codex-bin",
        default="codex",
        help="Path to the codex binary (default: codex on PATH).",
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=5,
        help="Number of runs per configuration (default: 5)",
    )
    parser.add_argument(
        "--prompt",
        dest="prompts",
        action="append",
        required=True,
        help="Prompt to send to codex exec (repeat for multiple prompts).",
    )
    parser.add_argument(
        "--workdir",
        default=None,
        help="Working directory for codex exec (--cd); defaults to current dir.",
    )
    parser.add_argument(
        "--model",
        default=None,
        help="Optional model override passed to codex exec (--model).",
    )
    parser.add_argument(
        "--reasoning-effort",
        choices=["none", "minimal", "low", "medium", "high", "xhigh"],
        default=None,
        help="Optional reasoning effort override passed via -c model_reasoning_effort=...",
    )
    parser.add_argument(
        "--extra-config",
        action="append",
        default=[],
        help="Additional -c overrides passed to codex exec (repeatable).",
    )
    parser.add_argument(
        "--concurrency",
        # The value sizes a semaphore; 0 would block every run forever, so
        # reject anything below 1 at parse time with a normal usage error.
        type=_positive_int,
        default=1,
        help="Number of concurrent runs to launch per configuration (default: 1).",
    )
    parser.add_argument(
        "--skip-feature-toggle",
        action="store_true",
        help=(
            "Do not inject the non_login_shell_heuristic flag; "
            "use when benchmarking an older codex binary."
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print codex exec stdout/stderr for debugging.",
    )
    return parser.parse_args()
def main() -> int:
    """Benchmark every prompt with the heuristic off and on, then report.

    Returns:
        0 when every run exited with status 0, otherwise 1.
    """
    args = parse_args()

    # Assemble the flags shared by every invocation.
    base_cmd: List[str] = [args.codex_bin, "exec"]
    if args.workdir:
        base_cmd += ["--cd", args.workdir]
    if args.model:
        base_cmd += ["--model", args.model]
    if args.reasoning_effort:
        base_cmd += ["-c", f"model_reasoning_effort={args.reasoning_effort}"]
    for override in args.extra_config:
        base_cmd += ["-c", override]
    if not args.verbose:
        base_cmd.append("--json")

    inject_toggle = not args.skip_feature_toggle
    prompts = args.prompts
    total_prompts = len(prompts)

    def bench(case_label: str, text: str, enabled: bool) -> tuple[list[float], int]:
        # Each configuration gets its own event loop run.
        return asyncio.run(
            run_case(
                case_label,
                base_cmd,
                text,
                args.iterations,
                feature_enabled=enabled,
                toggle_feature=inject_toggle,
                concurrency=args.concurrency,
            )
        )

    print(
        f"Running {args.iterations} iterations per config for {total_prompts} prompt(s)..."
    )

    per_prompt_results: list[dict] = []
    for number, prompt in enumerate(prompts, start=1):
        label = format_prompt_label(prompt, number - 1)
        print(f"\nPrompt {number}/{total_prompts}: {label}")
        # Heuristic disabled first, then enabled, for every prompt.
        off_durations, off_failures = bench(f"login-shell | {label}", prompt, False)
        on_durations, on_failures = bench(f"non-login-shell | {label}", prompt, True)
        per_prompt_results.append(
            {
                "label": label,
                "off_durations": off_durations,
                "on_durations": on_durations,
                "off_failures": off_failures,
                "on_failures": on_failures,
            }
        )

    print("\nPer-prompt summary:")
    for entry in per_prompt_results:
        tag = entry["label"]
        summarize(f"login-shell | {tag}", entry["off_durations"])
        summarize(f"non-login-shell | {tag}", entry["on_durations"])
        print(f"login-shell failures ({tag}): {entry['off_failures']}")
        print(f"non-login-shell failures ({tag}): {entry['on_failures']}")

    # Combined figures are derived from the per-prompt records, in prompt order.
    all_off_durations: List[float] = [
        d for entry in per_prompt_results for d in entry["off_durations"]
    ]
    all_on_durations: List[float] = [
        d for entry in per_prompt_results for d in entry["on_durations"]
    ]
    total_off_failures = sum(entry["off_failures"] for entry in per_prompt_results)
    total_on_failures = sum(entry["on_failures"] for entry in per_prompt_results)

    print("\nCombined summary across prompts:")
    summarize("login-shell (all prompts)", all_off_durations)
    summarize("non-login-shell (all prompts)", all_on_durations)
    print(f"login-shell failures (all prompts): {total_off_failures}")
    print(f"non-login-shell failures (all prompts): {total_on_failures}")

    if total_off_failures or total_on_failures:
        return 1
    return 0
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())