Something that seems to work at least.

This commit is contained in:
Albin Cassirer
2025-12-04 14:07:15 +07:00
parent ccdeb9d9c4
commit 6a88b0f465
12 changed files with 601 additions and 14 deletions

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
Run codex exec end-to-end with and without the non_login_shell_heuristic feature.
This script repeatedly runs:
- codex exec with the heuristic disabled
- codex exec with the heuristic enabled
for one or more prompts and reports per-run wall times, per-prompt summaries,
and combined summary statistics.
Runs can be parallelized (concurrency > 1) to speed up sampling, but note that
parallel rollouts may contend for local/remote resources and slightly skew
latency compared to strictly serial runs.
Usage:
python scripts/bench_codex_exec_non_login.py \
--iterations 5 \
--prompt "Read and summarize codex_berry" \
--prompt "Explain the architecture" \
--workdir /path/to/repo \
--model gpt-5.1-codex-max \
--reasoning-effort high \
--concurrency 2 \
--codex-bin /path/to/codex \
--skip-feature-toggle # when benchmarking an older codex binary
Notes:
- Runs will incur network/LLM variance; prefer N >= 510.
- Requires the `codex` binary (local path supported) and valid credentials.
- Results are printed in seconds; failures are logged and included in the output.
"""
import argparse
import asyncio
import statistics
import sys
import time
from dataclasses import dataclass
from typing import List
@dataclass
class RunResult:
duration: float
exit_code: int
stderr: str
async def run_once(
base_cmd: List[str],
prompt: str,
feature_enabled: bool,
toggle_feature: bool,
sem: asyncio.Semaphore,
) -> RunResult:
cmd = list(base_cmd)
if toggle_feature:
cmd.extend(
[
"-c",
f"features.non_login_shell_heuristic={'true' if feature_enabled else 'false'}",
]
)
cmd.append(prompt)
async with sem:
start = time.perf_counter()
proc = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
)
_, stderr = await proc.communicate()
duration = time.perf_counter() - start
stderr_text = stderr.decode(errors="replace").strip()
return RunResult(duration=duration, exit_code=proc.returncode, stderr=stderr_text)
async def run_case(
label: str,
base_cmd: List[str],
prompt: str,
iterations: int,
feature_enabled: bool,
toggle_feature: bool,
concurrency: int,
) -> tuple[list[float], int]:
sem = asyncio.Semaphore(concurrency)
tasks = [
asyncio.create_task(
run_once(base_cmd, prompt, feature_enabled, toggle_feature, sem)
)
for _ in range(iterations)
]
durations: List[float] = []
failures = 0
for idx, task in enumerate(asyncio.as_completed(tasks), start=1):
result = await task
durations.append(result.duration)
status = "ok" if result.exit_code == 0 else f"fail ({result.exit_code})"
print(f"[{label}] run {idx}/{iterations}: {result.duration:.3f}s [{status}]")
if result.exit_code != 0:
failures += 1
if result.stderr:
print(f" stderr: {result.stderr}", file=sys.stderr)
return durations, failures
def summarize(label: str, durations: List[float]) -> None:
if not durations:
print(f"[{label}] no runs recorded")
return
mean = statistics.mean(durations)
median = statistics.median(durations)
if len(durations) < 2:
p95 = durations[0]
else:
p95 = statistics.quantiles(durations, n=100)[94]
print(
f"[{label}] n={len(durations)} "
f"mean={mean:.3f}s median={median:.3f}s p95={p95:.3f}s"
)
def format_prompt_label(prompt: str, idx: int) -> str:
snippet = prompt.strip()
if len(snippet) > 60:
snippet = snippet[:57] + "..."
return f"prompt {idx + 1}: {snippet}"
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Benchmark codex exec with/without non_login_shell_heuristic."
)
parser.add_argument(
"--codex-bin",
default="codex",
help="Path to the codex binary (default: codex on PATH).",
)
parser.add_argument(
"--iterations",
type=int,
default=5,
help="Number of runs per configuration (default: 5)",
)
parser.add_argument(
"--prompt",
dest="prompts",
action="append",
required=True,
help="Prompt to send to codex exec (repeat for multiple prompts).",
)
parser.add_argument(
"--workdir",
default=None,
help="Working directory for codex exec (--cd); defaults to current dir.",
)
parser.add_argument(
"--model",
default=None,
help="Optional model override passed to codex exec (--model).",
)
parser.add_argument(
"--reasoning-effort",
choices=["none", "minimal", "low", "medium", "high", "xhigh"],
default=None,
help="Optional reasoning effort override passed via -c model_reasoning_effort=...",
)
parser.add_argument(
"--extra-config",
action="append",
default=[],
help="Additional -c overrides passed to codex exec (repeatable).",
)
parser.add_argument(
"--concurrency",
type=int,
default=1,
help="Number of concurrent runs to launch per configuration (default: 1).",
)
parser.add_argument(
"--skip-feature-toggle",
action="store_true",
help=(
"Do not inject the non_login_shell_heuristic flag; "
"use when benchmarking an older codex binary."
),
)
parser.add_argument(
"--verbose",
action="store_true",
help="Print codex exec stdout/stderr for debugging.",
)
return parser.parse_args()
def main() -> int:
args = parse_args()
base_cmd: List[str] = [args.codex_bin, "exec"]
if args.workdir:
base_cmd.extend(["--cd", args.workdir])
if args.model:
base_cmd.extend(["--model", args.model])
if args.reasoning_effort:
base_cmd.extend(["-c", f"model_reasoning_effort={args.reasoning_effort}"])
for override in args.extra_config:
base_cmd.extend(["-c", override])
if not args.verbose:
base_cmd.append("--json")
toggle_feature = not args.skip_feature_toggle
prompts = args.prompts
print(
f"Running {args.iterations} iterations per config for {len(prompts)} prompt(s)..."
)
all_off_durations: List[float] = []
all_on_durations: List[float] = []
total_off_failures = 0
total_on_failures = 0
per_prompt_results: list[dict] = []
for idx, prompt in enumerate(prompts):
label = format_prompt_label(prompt, idx)
print(f"\nPrompt {idx + 1}/{len(prompts)}: {label}")
off_durations, off_failures = asyncio.run(
run_case(
f"login-shell | {label}",
base_cmd,
prompt,
args.iterations,
feature_enabled=False,
toggle_feature=toggle_feature,
concurrency=args.concurrency,
)
)
on_durations, on_failures = asyncio.run(
run_case(
f"non-login-shell | {label}",
base_cmd,
prompt,
args.iterations,
feature_enabled=True,
toggle_feature=toggle_feature,
concurrency=args.concurrency,
)
)
per_prompt_results.append(
{
"label": label,
"off_durations": off_durations,
"on_durations": on_durations,
"off_failures": off_failures,
"on_failures": on_failures,
}
)
all_off_durations.extend(off_durations)
all_on_durations.extend(on_durations)
total_off_failures += off_failures
total_on_failures += on_failures
print("\nPer-prompt summary:")
for stats in per_prompt_results:
summarize(f"login-shell | {stats['label']}", stats["off_durations"])
summarize(f"non-login-shell | {stats['label']}", stats["on_durations"])
print(f"login-shell failures ({stats['label']}): {stats['off_failures']}")
print(f"non-login-shell failures ({stats['label']}): {stats['on_failures']}")
print("\nCombined summary across prompts:")
summarize("login-shell (all prompts)", all_off_durations)
summarize("non-login-shell (all prompts)", all_on_durations)
print(f"login-shell failures (all prompts): {total_off_failures}")
print(f"non-login-shell failures (all prompts): {total_on_failures}")
return 0 if (total_off_failures + total_on_failures) == 0 else 1
if __name__ == "__main__":
raise SystemExit(main())