Add auth env observability (#14905)

CXC-410 Emit Env Var Status with `/feedback` report

Add more observability on top of #14611 

[Unset](https://openai.sentry.io/issues/7340419168/?project=4510195390611458&query=019cfa8d-c1ba-7002-96fa-e35fc340551d&referrer=issue-stream)

[Set](https://openai.sentry.io/issues/7340426331/?project=4510195390611458&query=019cfa91-aba1-7823-ab7e-762edfbc0ed4&referrer=issue-stream)
<img width="1063" height="610" alt="image"
src="https://github.com/user-attachments/assets/937ab026-1c2d-4757-81d5-5f31b853113e"
/>


###### Summary
- Adds auth-env telemetry that records whether key auth-related env
overrides were present on session start and request paths.
- Threads those auth-env fields through `/responses`, websocket, and
`/models` telemetry and feedback metadata.
- Buckets custom provider `env_key` configuration to a safe
`"configured"` value instead of emitting raw config text.
- Keeps the slice observability-only: no raw token values or raw URLs
are emitted.

###### Rationale (from spec findings)
- 401 and auth-path debugging needs a way to distinguish env-driven auth
paths from sessions with no auth env override.
- Startup and model-refresh failures need the same auth-env diagnostics
as normal request failures.
- Feedback and Sentry tags need the same auth-env signal as OTel events
so reports can be triaged consistently.
- Custom provider config is user-controlled text, so the telemetry
contract must stay presence-only / bucketed.

###### Scope
- Adds a small `AuthEnvTelemetry` bundle for env presence collection and
threads it through the main request/session telemetry paths.
- Does not add endpoint/base-url/provider-header/geo routing attribution
or broader telemetry API redesign.

###### Trade-offs
- `provider_env_key_name` is bucketed to `"configured"` instead of
preserving the literal configured env var name.
- `/models` is included because startup/model-refresh auth failures need
the same diagnostics, but broader parity work remains out of scope.
- This slice keeps the existing telemetry APIs and layers auth-env
fields onto them rather than redesigning the metadata model.

###### Client follow-up
- Add the separate endpoint/base-url attribution slice if routing-source
diagnosis is still needed.
- Add provider-header or residency attribution only if auth-env presence
proves insufficient in real reports.
- Revisit whether any additional auth-related env inputs need safe
bucketing after more 401 triage data.

###### Testing
- `cargo test -p codex-core emit_feedback_request_tags -- --nocapture`
- `cargo test -p codex-core
collect_auth_env_telemetry_buckets_provider_env_key_name -- --nocapture`
- `cargo test -p codex-core
models_request_telemetry_emits_auth_env_feedback_tags_on_failure --
--nocapture`
- `cargo test -p codex-otel
otel_export_routing_policy_routes_api_request_auth_observability --
--nocapture`
- `cargo test -p codex-otel
otel_export_routing_policy_routes_websocket_connect_auth_observability
-- --nocapture`
- `cargo test -p codex-otel
otel_export_routing_policy_routes_websocket_request_transport_observability
-- --nocapture`
- `cargo test -p codex-core --no-run --message-format short`
- `cargo test -p codex-otel --no-run --message-format short`

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Colin Young
2026-03-17 14:26:27 -07:00
committed by GitHub
parent ee756eb80f
commit 0d2ff40a58
12 changed files with 770 additions and 161 deletions

View File

@@ -7,6 +7,7 @@ use rand::Rng;
use tracing::debug;
use tracing::error;
use crate::auth_env_telemetry::AuthEnvTelemetry;
use crate::parse_command::shlex_join;
const INITIAL_DELAY_MS: u64 = 200;
@@ -54,6 +55,23 @@ pub(crate) struct FeedbackRequestTags<'a> {
pub auth_recovery_followup_status: Option<u16>,
}
/// Flattened, ready-to-emit view of a [`FeedbackRequestTags`] value.
///
/// Optional inputs are resolved up front so the emit functions can hand every
/// field straight to `feedback_tags!`: borrowed strings fall back to `""`, and
/// optional non-string values are stored as their `to_string()` rendering (or
/// an empty `String` when absent).
struct FeedbackRequestSnapshot<'a> {
    /// Endpoint label copied verbatim from the tags.
    endpoint: &'a str,
    /// Copied verbatim from the tags.
    auth_header_attached: bool,
    /// Empty when the tags carry no header name.
    auth_header_name: &'a str,
    /// Empty when the tags carry no auth mode.
    auth_mode: &'a str,
    /// Stringified optional value; empty when absent.
    auth_retry_after_unauthorized: String,
    /// Empty when the tags carry no recovery mode.
    auth_recovery_mode: &'a str,
    /// Empty when the tags carry no recovery phase.
    auth_recovery_phase: &'a str,
    /// Stringified optional value; empty when absent.
    auth_connection_reused: String,
    /// Empty when the tags carry no request id.
    auth_request_id: &'a str,
    /// Empty when the tags carry no `cf-ray` value.
    auth_cf_ray: &'a str,
    /// Empty when the tags carry no error text.
    auth_error: &'a str,
    /// Empty when the tags carry no error code.
    auth_error_code: &'a str,
    /// Stringified optional value; empty when absent.
    auth_recovery_followup_success: String,
    /// Stringified optional status; empty when absent.
    auth_recovery_followup_status: String,
}
struct Auth401FeedbackSnapshot<'a> {
request_id: &'a str,
cf_ray: &'a str,
@@ -77,42 +95,84 @@ impl<'a> Auth401FeedbackSnapshot<'a> {
}
}
impl<'a> FeedbackRequestSnapshot<'a> {
    /// Builds a snapshot from `tags`, resolving every optional field.
    ///
    /// Missing borrowed strings become `""`; missing non-string values become
    /// an empty `String`, and present ones are rendered with `to_string()`.
    fn from_tags(tags: &'a FeedbackRequestTags<'a>) -> Self {
        // Shared rendering for the optional non-string fields.
        fn render_or_empty<T: ToString>(value: Option<T>) -> String {
            value.map(|inner| inner.to_string()).unwrap_or_default()
        }

        let auth_retry_after_unauthorized = render_or_empty(tags.auth_retry_after_unauthorized);
        let auth_connection_reused = render_or_empty(tags.auth_connection_reused);
        let auth_recovery_followup_success = render_or_empty(tags.auth_recovery_followup_success);
        let auth_recovery_followup_status = render_or_empty(tags.auth_recovery_followup_status);

        Self {
            endpoint: tags.endpoint,
            auth_header_attached: tags.auth_header_attached,
            auth_header_name: tags.auth_header_name.unwrap_or(""),
            auth_mode: tags.auth_mode.unwrap_or(""),
            auth_retry_after_unauthorized,
            auth_recovery_mode: tags.auth_recovery_mode.unwrap_or(""),
            auth_recovery_phase: tags.auth_recovery_phase.unwrap_or(""),
            auth_connection_reused,
            auth_request_id: tags.auth_request_id.unwrap_or(""),
            auth_cf_ray: tags.auth_cf_ray.unwrap_or(""),
            auth_error: tags.auth_error.unwrap_or(""),
            auth_error_code: tags.auth_error_code.unwrap_or(""),
            auth_recovery_followup_success,
            auth_recovery_followup_status,
        }
    }
}
/// Emits the request/auth feedback tags for `tags` without any auth-env
/// fields.
///
/// Compiled only for tests; the non-test path in this file is
/// `emit_feedback_request_tags_with_auth_env`, which emits the same tags plus
/// the auth-env presence fields.
#[cfg(test)]
pub(crate) fn emit_feedback_request_tags(tags: &FeedbackRequestTags<'_>) {
    // Resolve every optional field once; the snapshot replaces the per-field
    // locals this function used before the refactor.
    let snapshot = FeedbackRequestSnapshot::from_tags(tags);
    feedback_tags!(
        endpoint = snapshot.endpoint,
        auth_header_attached = snapshot.auth_header_attached,
        auth_header_name = snapshot.auth_header_name,
        auth_mode = snapshot.auth_mode,
        auth_retry_after_unauthorized = snapshot.auth_retry_after_unauthorized,
        auth_recovery_mode = snapshot.auth_recovery_mode,
        auth_recovery_phase = snapshot.auth_recovery_phase,
        auth_connection_reused = snapshot.auth_connection_reused,
        auth_request_id = snapshot.auth_request_id,
        auth_cf_ray = snapshot.auth_cf_ray,
        auth_error = snapshot.auth_error,
        auth_error_code = snapshot.auth_error_code,
        auth_recovery_followup_success = snapshot.auth_recovery_followup_success,
        auth_recovery_followup_status = snapshot.auth_recovery_followup_status
    );
}
/// Emits the request/auth feedback tags for `tags` together with the auth-env
/// fields from `auth_env`.
///
/// Optional values are flattened before emission: a missing provider env-key
/// name is emitted as `""`, and a missing provider-key presence flag is
/// emitted as an empty string instead of `true`/`false`.
pub(crate) fn emit_feedback_request_tags_with_auth_env(
    tags: &FeedbackRequestTags<'_>,
    auth_env: &AuthEnvTelemetry,
) {
    let snapshot = FeedbackRequestSnapshot::from_tags(tags);

    // Flatten the optional auth-env values up front so the macro call below
    // stays a plain key/value list.
    let provider_key_name = auth_env.provider_env_key_name.as_deref().unwrap_or("");
    let provider_key_present = auth_env
        .provider_env_key_present
        .map_or_else(String::new, |present| present.to_string());

    feedback_tags!(
        endpoint = snapshot.endpoint,
        auth_header_attached = snapshot.auth_header_attached,
        auth_header_name = snapshot.auth_header_name,
        auth_mode = snapshot.auth_mode,
        auth_retry_after_unauthorized = snapshot.auth_retry_after_unauthorized,
        auth_recovery_mode = snapshot.auth_recovery_mode,
        auth_recovery_phase = snapshot.auth_recovery_phase,
        auth_connection_reused = snapshot.auth_connection_reused,
        auth_request_id = snapshot.auth_request_id,
        auth_cf_ray = snapshot.auth_cf_ray,
        auth_error = snapshot.auth_error,
        auth_error_code = snapshot.auth_error_code,
        auth_recovery_followup_success = snapshot.auth_recovery_followup_success,
        auth_recovery_followup_status = snapshot.auth_recovery_followup_status,
        auth_env_openai_api_key_present = auth_env.openai_api_key_env_present,
        auth_env_codex_api_key_present = auth_env.codex_api_key_env_present,
        auth_env_codex_api_key_enabled = auth_env.codex_api_key_env_enabled,
        auth_env_provider_key_name = provider_key_name,
        auth_env_provider_key_present = provider_key_present,
        auth_env_refresh_token_url_override_present = auth_env.refresh_token_url_override_present
    );
}