guardian initial feedback / tweaks (#13897)

## Summary
- remove the remaining model-visible guardian-specific `on-request`
prompt additions so enabling the feature does not change the main
approval-policy instructions
- neutralize user-facing guardian wording so that it refers to automatic
approval review and approval requests, rather than to a second reviewer
or to sandbox escalations only
- tighten guardian retry-context handling so agent-authored
`justification` stays in the structured action JSON and is not also
injected as raw retry context
- simplify guardian review plumbing in core by deleting dead
prompt-append paths and trimming some request/transcript setup code

## Notable Changes
- delete the dead `permissions/approval_policy/guardian.md` append path
and stop threading `guardian_approval_enabled` through model-facing
developer-instruction builders
- rename the experimental feature copy to `Automatic approval review`
and update the `/experimental` snapshot text accordingly
- make approval-review status strings generic across shell, patch,
network, and MCP review types
- forward real sandbox/network retry reasons for shell and unified-exec
guardian review, but do not pass agent-authored justification as raw
retry context
- simplify `guardian.rs` by removing the one-field request wrapper,
deduplicating reasoning-effort selection, and cleaning up transcript
entry collection

## Testing
- `just fmt`
- full validation left to CI

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Charley Cunningham
2026-03-09 09:25:24 -07:00
committed by GitHub
parent 2bc3e52a91
commit f23fcd6ced
16 changed files with 421 additions and 352 deletions

View File

@@ -408,8 +408,6 @@ const APPROVAL_POLICY_ON_REQUEST_RULE: &str =
include_str!("prompts/permissions/approval_policy/on_request_rule.md");
const APPROVAL_POLICY_ON_REQUEST_RULE_REQUEST_PERMISSION: &str =
include_str!("prompts/permissions/approval_policy/on_request_rule_request_permission.md");
const GUARDIAN_APPROVAL_FEATURE: &str =
include_str!("prompts/permissions/approval_policy/guardian.md");
const SANDBOX_MODE_DANGER_FULL_ACCESS: &str =
include_str!("prompts/permissions/sandbox_mode/danger_full_access.md");
@@ -427,7 +425,6 @@ impl DeveloperInstructions {
pub fn from(
approval_policy: AskForApproval,
guardian_approval_enabled: bool,
exec_policy: &Policy,
request_permission_enabled: bool,
) -> DeveloperInstructions {
@@ -451,14 +448,7 @@ impl DeveloperInstructions {
AskForApproval::Never => APPROVAL_POLICY_NEVER.to_string(),
AskForApproval::UnlessTrusted => APPROVAL_POLICY_UNLESS_TRUSTED.to_string(),
AskForApproval::OnFailure => APPROVAL_POLICY_ON_FAILURE.to_string(),
AskForApproval::OnRequest => {
let mut instructions = on_request_instructions();
if guardian_approval_enabled {
instructions.push_str("\n\n");
instructions.push_str(GUARDIAN_APPROVAL_FEATURE);
}
instructions
}
AskForApproval::OnRequest => on_request_instructions(),
AskForApproval::Reject(reject_config) => {
let on_request_instructions = on_request_instructions();
let sandbox_approval = reject_config.sandbox_approval;
@@ -521,7 +511,6 @@ impl DeveloperInstructions {
pub fn from_policy(
sandbox_policy: &SandboxPolicy,
approval_policy: AskForApproval,
guardian_approval_enabled: bool,
exec_policy: &Policy,
cwd: &Path,
request_permission_enabled: bool,
@@ -546,7 +535,6 @@ impl DeveloperInstructions {
sandbox_mode,
network_access,
approval_policy,
guardian_approval_enabled,
exec_policy,
writable_roots,
request_permission_enabled,
@@ -571,7 +559,6 @@ impl DeveloperInstructions {
sandbox_mode: SandboxMode,
network_access: NetworkAccess,
approval_policy: AskForApproval,
guardian_approval_enabled: bool,
exec_policy: &Policy,
writable_roots: Option<Vec<WritableRoot>>,
request_permission_enabled: bool,
@@ -585,7 +572,6 @@ impl DeveloperInstructions {
))
.concat(DeveloperInstructions::from(
approval_policy,
guardian_approval_enabled,
exec_policy,
request_permission_enabled,
))
@@ -1667,7 +1653,6 @@ mod tests {
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
false,
&Policy::empty(),
None,
false,
@@ -1697,7 +1682,6 @@ mod tests {
let instructions = DeveloperInstructions::from_policy(
&policy,
AskForApproval::UnlessTrusted,
false,
&Policy::empty(),
&PathBuf::from("/tmp"),
false,
@@ -1720,7 +1704,6 @@ mod tests {
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
false,
&exec_policy,
None,
false,
@@ -1738,7 +1721,6 @@ mod tests {
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
false,
&Policy::empty(),
None,
true,
@@ -1749,23 +1731,6 @@ mod tests {
assert!(text.contains("additional_permissions"));
}
// Builds developer instructions for `AskForApproval::OnRequest` with the
// guardian flag set and checks that the rendered text carries the
// guardian-specific guidance.
#[test]
fn includes_guardian_feature_guidance_for_on_request_when_enabled() {
let instructions = DeveloperInstructions::from_permissions_with_network(
SandboxMode::WorkspaceWrite,
NetworkAccess::Enabled,
AskForApproval::OnRequest,
// NOTE(review): fourth positional argument; appears to be
// `guardian_approval_enabled` based on the builder signature — confirm.
true,
&Policy::empty(),
None,
false,
);
let text = instructions.into_text();
// Both substrings are expected to come from the guardian prompt text that
// is appended to the on-request instructions when the flag is enabled.
assert!(text.contains("guardian subagent"));
assert!(text.contains("approval prompts"));
}
#[test]
fn render_command_prefix_list_sorts_by_len_then_total_len_then_alphabetical() {
let prefixes = vec![

View File

@@ -1,3 +0,0 @@
Guardian approvals are enabled. While `approval_policy` is still `on-request`, approval prompts are routed to a guardian subagent instead of the user. Use `sandbox_permissions: "require_escalated"` with a concise `justification` when you need unsandboxed execution, and use `sandbox_permissions: "with_additional_permissions"` plus `additional_permissions` when you need broader sandboxed access. Codex will ask the guardian subagent to assess the risk automatically.
Do not message the user before requesting escalation. If the guardian rejects an action, do not attempt the same outcome via workaround, indirect execution, or policy circumvention. Either choose a materially safer alternative or stop and ask the user for guidance.