feat(otel): Centralize OTEL metric names and shared tag builders (#14117)

This cleans up a bunch of metric plumbing that had started to drift.

The main change is making `codex-otel` the canonical home for shared
metric definitions and metric tag helpers. I moved the turn/thread
metric names that were still duplicated elsewhere into the OTEL metric
registry, added a shared `metrics::tags` module for common tag keys and
session tag construction, and updated `SessionTelemetry` to build its
metadata tags through that shared path.

On the `codex-core` side, TTFT/TTFM now use the shared metric-name
constants instead of local string definitions. I also switched the
obvious remaining turn/thread metric call sites over to the shared
constants, and added a small helper so TTFT/TTFM can attach an optional
sanitized `client.name` tag from `TurnContext`.

This should make follow-on telemetry work less ad hoc:
- one canonical place for metric names
- one canonical place for common metric tag keys/builders
- less duplication between `codex-core` and `codex-otel`
This commit is contained in:
Owen Lin
2026-03-09 12:46:42 -07:00
committed by GitHub
parent 6ad448b658
commit da991bdf3a
7 changed files with 156 additions and 67 deletions

View File

@@ -33,6 +33,9 @@ use crate::protocol::TurnCompleteEvent;
use crate::state::ActiveTurn;
use crate::state::RunningTask;
use crate::state::TaskKind;
use codex_otel::metrics::names::TURN_E2E_DURATION_METRIC;
use codex_otel::metrics::names::TURN_TOKEN_USAGE_METRIC;
use codex_otel::metrics::names::TURN_TOOL_CALL_METRIC;
use codex_protocol::items::TurnItem;
use codex_protocol::models::ContentItem;
use codex_protocol::models::ResponseInputItem;
@@ -145,7 +148,7 @@ impl Session {
let timer = turn_context
.session_telemetry
.start_timer("codex.turn.e2e_duration_ms", &[])
.start_timer(TURN_E2E_DURATION_METRIC, &[])
.ok();
let done_clone = Arc::clone(&done);
@@ -278,7 +281,7 @@ impl Session {
},
);
self.services.session_telemetry.histogram(
"codex.turn.tool.call",
TURN_TOOL_CALL_METRIC,
i64::try_from(turn_tool_calls).unwrap_or(i64::MAX),
&[tmp_mem],
);
@@ -301,27 +304,27 @@ impl Session {
.max(0),
};
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.total_tokens,
&[("token_type", "total"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.input_tokens,
&[("token_type", "input"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.cached_input(),
&[("token_type", "cached_input"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.output_tokens,
&[("token_type", "output"), tmp_mem],
);
self.services.session_telemetry.histogram(
"codex.turn.token_usage",
TURN_TOKEN_USAGE_METRIC,
turn_token_usage.reasoning_output_tokens,
&[("token_type", "reasoning_output"), tmp_mem],
);