Add auth env observability (#14905)

CXC-410 Emit Env Var Status with `/feedback` report

Add more observability on top of #14611 

[Unset](https://openai.sentry.io/issues/7340419168/?project=4510195390611458&query=019cfa8d-c1ba-7002-96fa-e35fc340551d&referrer=issue-stream)

[Set](https://openai.sentry.io/issues/7340426331/?project=4510195390611458&query=019cfa91-aba1-7823-ab7e-762edfbc0ed4&referrer=issue-stream)
<img width="1063" height="610" alt="image"
src="https://github.com/user-attachments/assets/937ab026-1c2d-4757-81d5-5f31b853113e"
/>


###### Summary
- Adds auth-env telemetry that records whether key auth-related env
overrides were present on session start and request paths.
- Threads those auth-env fields through `/responses`, websocket, and
`/models` telemetry and feedback metadata.
- Buckets custom provider `env_key` configuration to a safe
`"configured"` value instead of emitting raw config text.
- Keeps the slice observability-only: no raw token values or raw URLs
are emitted.

###### Rationale (from spec findings)
- 401 and auth-path debugging needs a way to distinguish env-driven auth
paths from sessions with no auth env override.
- Startup and model-refresh failures need the same auth-env diagnostics
as normal request failures.
- Feedback and Sentry tags need the same auth-env signal as OTel events
so reports can be triaged consistently.
- Custom provider config is user-controlled text, so the telemetry
contract must stay presence-only / bucketed.

###### Scope
- Adds a small `AuthEnvTelemetry` bundle for env presence collection and
threads it through the main request/session telemetry paths.
- Does not add endpoint/base-url/provider-header/geo routing attribution
or broader telemetry API redesign.

###### Trade-offs
- `provider_env_key_name` is bucketed to `"configured"` instead of
preserving the literal configured env var name.
- `/models` is included because startup/model-refresh auth failures need
the same diagnostics, but broader parity work remains out of scope.
- This slice keeps the existing telemetry APIs and layers auth-env
fields onto them rather than redesigning the metadata model.

###### Client follow-up
- Add the separate endpoint/base-url attribution slice if routing-source
diagnosis is still needed.
- Add provider-header or residency attribution only if auth-env presence
proves insufficient in real reports.
- Revisit whether any additional auth-related env inputs need safe
bucketing after more 401 triage data.

###### Testing
- `cargo test -p codex-core emit_feedback_request_tags -- --nocapture`
- `cargo test -p codex-core
collect_auth_env_telemetry_buckets_provider_env_key_name -- --nocapture`
- `cargo test -p codex-core
models_request_telemetry_emits_auth_env_feedback_tags_on_failure --
--nocapture`
- `cargo test -p codex-otel
otel_export_routing_policy_routes_api_request_auth_observability --
--nocapture`
- `cargo test -p codex-otel
otel_export_routing_policy_routes_websocket_connect_auth_observability
-- --nocapture`
- `cargo test -p codex-otel
otel_export_routing_policy_routes_websocket_request_transport_observability
-- --nocapture`
- `cargo test -p codex-core --no-run --message-format short`
- `cargo test -p codex-otel --no-run --message-format short`

---------

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Colin Young
2026-03-17 14:26:27 -07:00
committed by GitHub
parent ee756eb80f
commit 0d2ff40a58
12 changed files with 770 additions and 161 deletions

View File

@@ -34,6 +34,8 @@ use crate::api_bridge::CoreAuthProvider;
use crate::api_bridge::auth_provider_from_auth;
use crate::api_bridge::map_api_error;
use crate::auth::UnauthorizedRecovery;
use crate::auth_env_telemetry::AuthEnvTelemetry;
use crate::auth_env_telemetry::collect_auth_env_telemetry;
use codex_api::CompactClient as ApiCompactClient;
use codex_api::CompactionInput as ApiCompactionInput;
use codex_api::MemoriesClient as ApiMemoriesClient;
@@ -106,7 +108,7 @@ use crate::response_debug_context::telemetry_transport_error_message;
use crate::tools::spec::create_tools_json_for_responses_api;
use crate::util::FeedbackRequestTags;
use crate::util::emit_feedback_auth_recovery_tags;
use crate::util::emit_feedback_request_tags;
use crate::util::emit_feedback_request_tags_with_auth_env;
pub const OPENAI_BETA_HEADER: &str = "OpenAI-Beta";
pub const X_CODEX_TURN_STATE_HEADER: &str = "x-codex-turn-state";
@@ -138,6 +140,7 @@ struct ModelClientState {
auth_manager: Option<Arc<AuthManager>>,
conversation_id: ThreadId,
provider: ModelProviderInfo,
auth_env_telemetry: AuthEnvTelemetry,
session_source: SessionSource,
model_verbosity: Option<VerbosityConfig>,
responses_websockets_enabled_by_feature: bool,
@@ -267,11 +270,16 @@ impl ModelClient {
include_timing_metrics: bool,
beta_features_header: Option<String>,
) -> Self {
let codex_api_key_env_enabled = auth_manager
.as_ref()
.is_some_and(|manager| manager.codex_api_key_env_enabled());
let auth_env_telemetry = collect_auth_env_telemetry(&provider, codex_api_key_env_enabled);
Self {
state: Arc::new(ModelClientState {
auth_manager,
conversation_id,
provider,
auth_env_telemetry,
session_source,
model_verbosity,
responses_websockets_enabled_by_feature,
@@ -362,6 +370,7 @@ impl ModelClient {
PendingUnauthorizedRetry::default(),
),
RequestRouteTelemetry::for_endpoint(RESPONSES_COMPACT_ENDPOINT),
self.state.auth_env_telemetry.clone(),
);
let client =
ApiCompactClient::new(transport, client_setup.api_provider, client_setup.api_auth)
@@ -430,6 +439,7 @@ impl ModelClient {
PendingUnauthorizedRetry::default(),
),
RequestRouteTelemetry::for_endpoint(MEMORIES_SUMMARIZE_ENDPOINT),
self.state.auth_env_telemetry.clone(),
);
let client =
ApiMemoriesClient::new(transport, client_setup.api_provider, client_setup.api_auth)
@@ -474,11 +484,13 @@ impl ModelClient {
session_telemetry: &SessionTelemetry,
auth_context: AuthRequestTelemetryContext,
request_route_telemetry: RequestRouteTelemetry,
auth_env_telemetry: AuthEnvTelemetry,
) -> Arc<dyn RequestTelemetry> {
let telemetry = Arc::new(ApiTelemetry::new(
session_telemetry.clone(),
auth_context,
request_route_telemetry,
auth_env_telemetry,
));
let request_telemetry: Arc<dyn RequestTelemetry> = telemetry;
request_telemetry
@@ -561,6 +573,7 @@ impl ModelClient {
session_telemetry,
auth_context,
request_route_telemetry,
self.state.auth_env_telemetry.clone(),
);
let websocket_connect_timeout = self.state.provider.websocket_connect_timeout();
let start = Instant::now();
@@ -601,27 +614,30 @@ impl ModelClient {
response_debug.auth_error.as_deref(),
response_debug.auth_error_code.as_deref(),
);
emit_feedback_request_tags(&FeedbackRequestTags {
endpoint: request_route_telemetry.endpoint,
auth_header_attached: auth_context.auth_header_attached,
auth_header_name: auth_context.auth_header_name,
auth_mode: auth_context.auth_mode,
auth_retry_after_unauthorized: Some(auth_context.retry_after_unauthorized),
auth_recovery_mode: auth_context.recovery_mode,
auth_recovery_phase: auth_context.recovery_phase,
auth_connection_reused: Some(false),
auth_request_id: response_debug.request_id.as_deref(),
auth_cf_ray: response_debug.cf_ray.as_deref(),
auth_error: response_debug.auth_error.as_deref(),
auth_error_code: response_debug.auth_error_code.as_deref(),
auth_recovery_followup_success: auth_context
.retry_after_unauthorized
.then_some(result.is_ok()),
auth_recovery_followup_status: auth_context
.retry_after_unauthorized
.then_some(status)
.flatten(),
});
emit_feedback_request_tags_with_auth_env(
&FeedbackRequestTags {
endpoint: request_route_telemetry.endpoint,
auth_header_attached: auth_context.auth_header_attached,
auth_header_name: auth_context.auth_header_name,
auth_mode: auth_context.auth_mode,
auth_retry_after_unauthorized: Some(auth_context.retry_after_unauthorized),
auth_recovery_mode: auth_context.recovery_mode,
auth_recovery_phase: auth_context.recovery_phase,
auth_connection_reused: Some(false),
auth_request_id: response_debug.request_id.as_deref(),
auth_cf_ray: response_debug.cf_ray.as_deref(),
auth_error: response_debug.auth_error.as_deref(),
auth_error_code: response_debug.auth_error_code.as_deref(),
auth_recovery_followup_success: auth_context
.retry_after_unauthorized
.then_some(result.is_ok()),
auth_recovery_followup_status: auth_context
.retry_after_unauthorized
.then_some(status)
.flatten(),
},
&self.state.auth_env_telemetry,
);
result
}
@@ -1030,6 +1046,7 @@ impl ModelClientSession {
session_telemetry,
request_auth_context,
RequestRouteTelemetry::for_endpoint(RESPONSES_ENDPOINT),
self.client.state.auth_env_telemetry.clone(),
);
let compression = self.responses_request_compression(client_setup.auth.as_ref());
let options = self.build_responses_options(turn_metadata_header, compression);
@@ -1190,11 +1207,13 @@ impl ModelClientSession {
session_telemetry: &SessionTelemetry,
auth_context: AuthRequestTelemetryContext,
request_route_telemetry: RequestRouteTelemetry,
auth_env_telemetry: AuthEnvTelemetry,
) -> (Arc<dyn RequestTelemetry>, Arc<dyn SseTelemetry>) {
let telemetry = Arc::new(ApiTelemetry::new(
session_telemetry.clone(),
auth_context,
request_route_telemetry,
auth_env_telemetry,
));
let request_telemetry: Arc<dyn RequestTelemetry> = telemetry.clone();
let sse_telemetry: Arc<dyn SseTelemetry> = telemetry;
@@ -1206,11 +1225,13 @@ impl ModelClientSession {
session_telemetry: &SessionTelemetry,
auth_context: AuthRequestTelemetryContext,
request_route_telemetry: RequestRouteTelemetry,
auth_env_telemetry: AuthEnvTelemetry,
) -> Arc<dyn WebsocketTelemetry> {
let telemetry = Arc::new(ApiTelemetry::new(
session_telemetry.clone(),
auth_context,
request_route_telemetry,
auth_env_telemetry,
));
let websocket_telemetry: Arc<dyn WebsocketTelemetry> = telemetry;
websocket_telemetry
@@ -1663,6 +1684,7 @@ struct ApiTelemetry {
session_telemetry: SessionTelemetry,
auth_context: AuthRequestTelemetryContext,
request_route_telemetry: RequestRouteTelemetry,
auth_env_telemetry: AuthEnvTelemetry,
}
impl ApiTelemetry {
@@ -1670,11 +1692,13 @@ impl ApiTelemetry {
session_telemetry: SessionTelemetry,
auth_context: AuthRequestTelemetryContext,
request_route_telemetry: RequestRouteTelemetry,
auth_env_telemetry: AuthEnvTelemetry,
) -> Self {
Self {
session_telemetry,
auth_context,
request_route_telemetry,
auth_env_telemetry,
}
}
}
@@ -1708,29 +1732,32 @@ impl RequestTelemetry for ApiTelemetry {
debug.auth_error.as_deref(),
debug.auth_error_code.as_deref(),
);
emit_feedback_request_tags(&FeedbackRequestTags {
endpoint: self.request_route_telemetry.endpoint,
auth_header_attached: self.auth_context.auth_header_attached,
auth_header_name: self.auth_context.auth_header_name,
auth_mode: self.auth_context.auth_mode,
auth_retry_after_unauthorized: Some(self.auth_context.retry_after_unauthorized),
auth_recovery_mode: self.auth_context.recovery_mode,
auth_recovery_phase: self.auth_context.recovery_phase,
auth_connection_reused: None,
auth_request_id: debug.request_id.as_deref(),
auth_cf_ray: debug.cf_ray.as_deref(),
auth_error: debug.auth_error.as_deref(),
auth_error_code: debug.auth_error_code.as_deref(),
auth_recovery_followup_success: self
.auth_context
.retry_after_unauthorized
.then_some(error.is_none()),
auth_recovery_followup_status: self
.auth_context
.retry_after_unauthorized
.then_some(status)
.flatten(),
});
emit_feedback_request_tags_with_auth_env(
&FeedbackRequestTags {
endpoint: self.request_route_telemetry.endpoint,
auth_header_attached: self.auth_context.auth_header_attached,
auth_header_name: self.auth_context.auth_header_name,
auth_mode: self.auth_context.auth_mode,
auth_retry_after_unauthorized: Some(self.auth_context.retry_after_unauthorized),
auth_recovery_mode: self.auth_context.recovery_mode,
auth_recovery_phase: self.auth_context.recovery_phase,
auth_connection_reused: None,
auth_request_id: debug.request_id.as_deref(),
auth_cf_ray: debug.cf_ray.as_deref(),
auth_error: debug.auth_error.as_deref(),
auth_error_code: debug.auth_error_code.as_deref(),
auth_recovery_followup_success: self
.auth_context
.retry_after_unauthorized
.then_some(error.is_none()),
auth_recovery_followup_status: self
.auth_context
.retry_after_unauthorized
.then_some(status)
.flatten(),
},
&self.auth_env_telemetry,
);
}
}
@@ -1759,29 +1786,32 @@ impl WebsocketTelemetry for ApiTelemetry {
error_message.as_deref(),
connection_reused,
);
emit_feedback_request_tags(&FeedbackRequestTags {
endpoint: self.request_route_telemetry.endpoint,
auth_header_attached: self.auth_context.auth_header_attached,
auth_header_name: self.auth_context.auth_header_name,
auth_mode: self.auth_context.auth_mode,
auth_retry_after_unauthorized: Some(self.auth_context.retry_after_unauthorized),
auth_recovery_mode: self.auth_context.recovery_mode,
auth_recovery_phase: self.auth_context.recovery_phase,
auth_connection_reused: Some(connection_reused),
auth_request_id: debug.request_id.as_deref(),
auth_cf_ray: debug.cf_ray.as_deref(),
auth_error: debug.auth_error.as_deref(),
auth_error_code: debug.auth_error_code.as_deref(),
auth_recovery_followup_success: self
.auth_context
.retry_after_unauthorized
.then_some(error.is_none()),
auth_recovery_followup_status: self
.auth_context
.retry_after_unauthorized
.then_some(status)
.flatten(),
});
emit_feedback_request_tags_with_auth_env(
&FeedbackRequestTags {
endpoint: self.request_route_telemetry.endpoint,
auth_header_attached: self.auth_context.auth_header_attached,
auth_header_name: self.auth_context.auth_header_name,
auth_mode: self.auth_context.auth_mode,
auth_retry_after_unauthorized: Some(self.auth_context.retry_after_unauthorized),
auth_recovery_mode: self.auth_context.recovery_mode,
auth_recovery_phase: self.auth_context.recovery_phase,
auth_connection_reused: Some(connection_reused),
auth_request_id: debug.request_id.as_deref(),
auth_cf_ray: debug.cf_ray.as_deref(),
auth_error: debug.auth_error.as_deref(),
auth_error_code: debug.auth_error_code.as_deref(),
auth_recovery_followup_success: self
.auth_context
.retry_after_unauthorized
.then_some(error.is_none()),
auth_recovery_followup_status: self
.auth_context
.retry_after_unauthorized
.then_some(status)
.flatten(),
},
&self.auth_env_telemetry,
);
}
fn on_ws_event(