mirror of
https://github.com/openai/codex.git
synced 2026-05-04 05:11:37 +03:00
Something that seems to work at least.
This commit is contained in:
277
codex-rs/scripts/bench_codex_exec_non_login.py
Normal file
277
codex-rs/scripts/bench_codex_exec_non_login.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Run codex exec end-to-end with and without the non_login_shell_heuristic feature.
|
||||
|
||||
This script repeatedly runs:
|
||||
- codex exec with the heuristic disabled
|
||||
- codex exec with the heuristic enabled
|
||||
|
||||
for one or more prompts and reports per-run wall times, per-prompt summaries,
|
||||
and combined summary statistics.
|
||||
|
||||
Runs can be parallelized (concurrency > 1) to speed up sampling, but note that
|
||||
parallel rollouts may contend for local/remote resources and slightly skew
|
||||
latency compared to strictly serial runs.
|
||||
|
||||
Usage:
|
||||
python scripts/bench_codex_exec_non_login.py \
|
||||
--iterations 5 \
|
||||
--prompt "Read and summarize codex_berry" \
|
||||
--prompt "Explain the architecture" \
|
||||
--workdir /path/to/repo \
|
||||
--model gpt-5.1-codex-max \
|
||||
--reasoning-effort high \
|
||||
--concurrency 2 \
|
||||
--codex-bin /path/to/codex \
|
||||
--skip-feature-toggle # when benchmarking an older codex binary
|
||||
|
||||
Notes:
|
||||
- Runs will incur network/LLM variance; prefer N >= 5–10.
|
||||
- Requires the `codex` binary (local path supported) and valid credentials.
|
||||
- Results are printed in seconds; failures are logged and included in the output.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
|
||||
class RunResult:
|
||||
duration: float
|
||||
exit_code: int
|
||||
stderr: str
|
||||
|
||||
|
||||
async def run_once(
|
||||
base_cmd: List[str],
|
||||
prompt: str,
|
||||
feature_enabled: bool,
|
||||
toggle_feature: bool,
|
||||
sem: asyncio.Semaphore,
|
||||
) -> RunResult:
|
||||
cmd = list(base_cmd)
|
||||
if toggle_feature:
|
||||
cmd.extend(
|
||||
[
|
||||
"-c",
|
||||
f"features.non_login_shell_heuristic={'true' if feature_enabled else 'false'}",
|
||||
]
|
||||
)
|
||||
cmd.append(prompt)
|
||||
async with sem:
|
||||
start = time.perf_counter()
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
_, stderr = await proc.communicate()
|
||||
duration = time.perf_counter() - start
|
||||
stderr_text = stderr.decode(errors="replace").strip()
|
||||
return RunResult(duration=duration, exit_code=proc.returncode, stderr=stderr_text)
|
||||
|
||||
|
||||
async def run_case(
|
||||
label: str,
|
||||
base_cmd: List[str],
|
||||
prompt: str,
|
||||
iterations: int,
|
||||
feature_enabled: bool,
|
||||
toggle_feature: bool,
|
||||
concurrency: int,
|
||||
) -> tuple[list[float], int]:
|
||||
sem = asyncio.Semaphore(concurrency)
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
run_once(base_cmd, prompt, feature_enabled, toggle_feature, sem)
|
||||
)
|
||||
for _ in range(iterations)
|
||||
]
|
||||
durations: List[float] = []
|
||||
failures = 0
|
||||
for idx, task in enumerate(asyncio.as_completed(tasks), start=1):
|
||||
result = await task
|
||||
durations.append(result.duration)
|
||||
status = "ok" if result.exit_code == 0 else f"fail ({result.exit_code})"
|
||||
print(f"[{label}] run {idx}/{iterations}: {result.duration:.3f}s [{status}]")
|
||||
if result.exit_code != 0:
|
||||
failures += 1
|
||||
if result.stderr:
|
||||
print(f" stderr: {result.stderr}", file=sys.stderr)
|
||||
return durations, failures
|
||||
|
||||
|
||||
def summarize(label: str, durations: List[float]) -> None:
|
||||
if not durations:
|
||||
print(f"[{label}] no runs recorded")
|
||||
return
|
||||
mean = statistics.mean(durations)
|
||||
median = statistics.median(durations)
|
||||
if len(durations) < 2:
|
||||
p95 = durations[0]
|
||||
else:
|
||||
p95 = statistics.quantiles(durations, n=100)[94]
|
||||
print(
|
||||
f"[{label}] n={len(durations)} "
|
||||
f"mean={mean:.3f}s median={median:.3f}s p95={p95:.3f}s"
|
||||
)
|
||||
|
||||
|
||||
def format_prompt_label(prompt: str, idx: int) -> str:
|
||||
snippet = prompt.strip()
|
||||
if len(snippet) > 60:
|
||||
snippet = snippet[:57] + "..."
|
||||
return f"prompt {idx + 1}: {snippet}"
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Benchmark codex exec with/without non_login_shell_heuristic."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--codex-bin",
|
||||
default="codex",
|
||||
help="Path to the codex binary (default: codex on PATH).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--iterations",
|
||||
type=int,
|
||||
default=5,
|
||||
help="Number of runs per configuration (default: 5)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prompt",
|
||||
dest="prompts",
|
||||
action="append",
|
||||
required=True,
|
||||
help="Prompt to send to codex exec (repeat for multiple prompts).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workdir",
|
||||
default=None,
|
||||
help="Working directory for codex exec (--cd); defaults to current dir.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default=None,
|
||||
help="Optional model override passed to codex exec (--model).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reasoning-effort",
|
||||
choices=["none", "minimal", "low", "medium", "high", "xhigh"],
|
||||
default=None,
|
||||
help="Optional reasoning effort override passed via -c model_reasoning_effort=...",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extra-config",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Additional -c overrides passed to codex exec (repeatable).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--concurrency",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of concurrent runs to launch per configuration (default: 1).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-feature-toggle",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Do not inject the non_login_shell_heuristic flag; "
|
||||
"use when benchmarking an older codex binary."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="Print codex exec stdout/stderr for debugging.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
base_cmd: List[str] = [args.codex_bin, "exec"]
|
||||
if args.workdir:
|
||||
base_cmd.extend(["--cd", args.workdir])
|
||||
if args.model:
|
||||
base_cmd.extend(["--model", args.model])
|
||||
if args.reasoning_effort:
|
||||
base_cmd.extend(["-c", f"model_reasoning_effort={args.reasoning_effort}"])
|
||||
for override in args.extra_config:
|
||||
base_cmd.extend(["-c", override])
|
||||
if not args.verbose:
|
||||
base_cmd.append("--json")
|
||||
toggle_feature = not args.skip_feature_toggle
|
||||
prompts = args.prompts
|
||||
|
||||
print(
|
||||
f"Running {args.iterations} iterations per config for {len(prompts)} prompt(s)..."
|
||||
)
|
||||
all_off_durations: List[float] = []
|
||||
all_on_durations: List[float] = []
|
||||
total_off_failures = 0
|
||||
total_on_failures = 0
|
||||
per_prompt_results: list[dict] = []
|
||||
|
||||
for idx, prompt in enumerate(prompts):
|
||||
label = format_prompt_label(prompt, idx)
|
||||
print(f"\nPrompt {idx + 1}/{len(prompts)}: {label}")
|
||||
off_durations, off_failures = asyncio.run(
|
||||
run_case(
|
||||
f"login-shell | {label}",
|
||||
base_cmd,
|
||||
prompt,
|
||||
args.iterations,
|
||||
feature_enabled=False,
|
||||
toggle_feature=toggle_feature,
|
||||
concurrency=args.concurrency,
|
||||
)
|
||||
)
|
||||
on_durations, on_failures = asyncio.run(
|
||||
run_case(
|
||||
f"non-login-shell | {label}",
|
||||
base_cmd,
|
||||
prompt,
|
||||
args.iterations,
|
||||
feature_enabled=True,
|
||||
toggle_feature=toggle_feature,
|
||||
concurrency=args.concurrency,
|
||||
)
|
||||
)
|
||||
per_prompt_results.append(
|
||||
{
|
||||
"label": label,
|
||||
"off_durations": off_durations,
|
||||
"on_durations": on_durations,
|
||||
"off_failures": off_failures,
|
||||
"on_failures": on_failures,
|
||||
}
|
||||
)
|
||||
all_off_durations.extend(off_durations)
|
||||
all_on_durations.extend(on_durations)
|
||||
total_off_failures += off_failures
|
||||
total_on_failures += on_failures
|
||||
|
||||
print("\nPer-prompt summary:")
|
||||
for stats in per_prompt_results:
|
||||
summarize(f"login-shell | {stats['label']}", stats["off_durations"])
|
||||
summarize(f"non-login-shell | {stats['label']}", stats["on_durations"])
|
||||
print(f"login-shell failures ({stats['label']}): {stats['off_failures']}")
|
||||
print(f"non-login-shell failures ({stats['label']}): {stats['on_failures']}")
|
||||
|
||||
print("\nCombined summary across prompts:")
|
||||
summarize("login-shell (all prompts)", all_off_durations)
|
||||
summarize("non-login-shell (all prompts)", all_on_durations)
|
||||
print(f"login-shell failures (all prompts): {total_off_failures}")
|
||||
print(f"non-login-shell failures (all prompts): {total_on_failures}")
|
||||
|
||||
return 0 if (total_off_failures + total_on_failures) == 0 else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
Reference in New Issue
Block a user