feat: show runtime metrics in console (#10278)

Summary of changes:

- Adds a new feature flag: runtime_metrics
  - Declared in core/src/features.rs
  - Added to core/config.schema.json
  - Wired into OTEL init in core/src/otel_init.rs

- Enables on-demand runtime metric snapshots in OTEL
  - Adds runtime_metrics: bool to otel/src/config.rs
  - Enables experimental custom reader features in otel/Cargo.toml
  - Adds snapshot/reset/summary APIs in:
    - otel/src/lib.rs
    - otel/src/metrics/client.rs
    - otel/src/metrics/config.rs
    - otel/src/metrics/error.rs

- Defines metric names and a runtime summary builder
  - New files:
    - otel/src/metrics/names.rs
    - otel/src/metrics/runtime_metrics.rs
  - Summarizes totals for:
    - Tool calls
    - API requests
    - SSE/streaming events

- Instruments metrics collection in OTEL manager
  - otel/src/traces/otel_manager.rs now records:
    - API call counts + durations
    - SSE event counts + durations (success/failure)
    - Tool call metrics now use shared constants

- Surfaces runtime metrics in the TUI
  - Resets runtime metrics at turn start in tui/src/chatwidget.rs
  - Displays metrics in the final separator line in tui/src/history_cell.rs

- Adds tests
  - New OTEL tests:
    - otel/tests/suite/snapshot.rs
    - otel/tests/suite/runtime_summary.rs
  - New TUI test:
    - final_message_separator_includes_runtime_metrics in tui/src/history_cell.rs

Scope:
- 19 files changed
- ~652 insertions, 38 deletions


<img width="922" height="169" alt="Screenshot 2026-01-30 at 4 11 34 PM"
src="https://github.com/user-attachments/assets/1efd754d-a16d-4564-83a5-f4442fd2f998"
/>
This commit is contained in:
Anton Panasenko
2026-01-30 22:20:02 -08:00
committed by GitHub
parent a8c9e386e7
commit 8660ad6c64
19 changed files with 659 additions and 38 deletions

View File

@@ -22,13 +22,20 @@ use opentelemetry_otlp::WithTonicConfig;
use opentelemetry_otlp::tonic_types::metadata::MetadataMap;
use opentelemetry_otlp::tonic_types::transport::ClientTlsConfig;
use opentelemetry_sdk::Resource;
use opentelemetry_sdk::metrics::InstrumentKind;
use opentelemetry_sdk::metrics::ManualReader;
use opentelemetry_sdk::metrics::PeriodicReader;
use opentelemetry_sdk::metrics::Pipeline;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use opentelemetry_sdk::metrics::Temporality;
use opentelemetry_sdk::metrics::data::ResourceMetrics;
use opentelemetry_sdk::metrics::reader::MetricReader;
use opentelemetry_semantic_conventions as semconv;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::Weak;
use std::time::Duration;
use tracing::debug;
@@ -37,6 +44,39 @@ const METER_NAME: &str = "codex";
const DURATION_UNIT: &str = "ms";
const DURATION_DESCRIPTION: &str = "Duration in milliseconds.";
/// Cloneable handle around a reference-counted [`ManualReader`].
///
/// The wrapper lets one `ManualReader` be registered with the meter provider
/// while the metrics client keeps its own `Arc` to the same reader for
/// on-demand collection.
#[derive(Clone, Debug)]
struct SharedManualReader {
    /// The shared reader that every trait call is forwarded to.
    inner: Arc<ManualReader>,
}

impl SharedManualReader {
    /// Wrap an already shared manual reader.
    fn new(inner: Arc<ManualReader>) -> Self {
        SharedManualReader { inner }
    }
}
impl MetricReader for SharedManualReader {
fn register_pipeline(&self, pipeline: Weak<Pipeline>) {
self.inner.register_pipeline(pipeline);
}
fn collect(&self, rm: &mut ResourceMetrics) -> opentelemetry_sdk::error::OTelSdkResult {
self.inner.collect(rm)
}
fn force_flush(&self) -> opentelemetry_sdk::error::OTelSdkResult {
self.inner.force_flush()
}
fn shutdown_with_timeout(&self, timeout: Duration) -> opentelemetry_sdk::error::OTelSdkResult {
self.inner.shutdown_with_timeout(timeout)
}
fn temporality(&self, kind: InstrumentKind) -> Temporality {
self.inner.temporality(kind)
}
}
#[derive(Debug)]
struct MetricsClientInner {
meter_provider: SdkMeterProvider,
@@ -44,6 +84,7 @@ struct MetricsClientInner {
counters: Mutex<HashMap<String, Counter<u64>>>,
histograms: Mutex<HashMap<String, Histogram<f64>>>,
duration_histograms: Mutex<HashMap<String, Histogram<f64>>>,
runtime_reader: Option<Arc<ManualReader>>,
default_tags: BTreeMap<String, String>,
}
@@ -144,26 +185,41 @@ pub struct MetricsClient(std::sync::Arc<MetricsClientInner>);
impl MetricsClient {
/// Build a metrics client from configuration and validate defaults.
pub fn new(config: MetricsConfig) -> Result<Self> {
validate_tags(&config.default_tags)?;
let MetricsConfig {
environment,
service_name,
service_version,
exporter,
export_interval,
runtime_reader,
default_tags,
} = config;
validate_tags(&default_tags)?;
let resource = Resource::builder()
.with_service_name(config.service_name.clone())
.with_service_name(service_name)
.with_attributes(vec![
KeyValue::new(
semconv::attribute::SERVICE_VERSION,
config.service_version.clone(),
),
KeyValue::new(ENV_ATTRIBUTE, config.environment.clone()),
KeyValue::new(semconv::attribute::SERVICE_VERSION, service_version),
KeyValue::new(ENV_ATTRIBUTE, environment),
])
.build();
let (meter_provider, meter) = match config.exporter {
let runtime_reader = runtime_reader.then(|| {
Arc::new(
ManualReader::builder()
.with_temporality(Temporality::Delta)
.build(),
)
});
let (meter_provider, meter) = match exporter {
MetricsExporter::InMemory(exporter) => {
build_provider(resource, exporter, config.export_interval)
build_provider(resource, exporter, export_interval, runtime_reader.clone())
}
MetricsExporter::Otlp(exporter) => {
let exporter = build_otlp_metric_exporter(exporter, Temporality::Delta)?;
build_provider(resource, exporter, config.export_interval)
build_provider(resource, exporter, export_interval, runtime_reader.clone())
}
};
@@ -173,7 +229,8 @@ impl MetricsClient {
counters: Mutex::new(HashMap::new()),
histograms: Mutex::new(HashMap::new()),
duration_histograms: Mutex::new(HashMap::new()),
default_tags: config.default_tags,
runtime_reader,
default_tags,
})))
}
@@ -209,6 +266,18 @@ impl MetricsClient {
Ok(Timer::new(name, tags, self))
}
/// Collect a runtime metrics snapshot without shutting down the provider.
///
/// # Errors
///
/// - [`MetricsError::RuntimeSnapshotUnavailable`] when the client was built
///   without a runtime reader.
/// - [`MetricsError::RuntimeSnapshotCollect`] when the reader fails to
///   collect; the SDK error is preserved as the source.
pub fn snapshot(&self) -> Result<ResourceMetrics> {
    let reader = self
        .0
        .runtime_reader
        .as_ref()
        .ok_or(MetricsError::RuntimeSnapshotUnavailable)?;

    // Start from an empty container and let the reader fill it in place.
    let mut collected = ResourceMetrics::default();
    match reader.collect(&mut collected) {
        Ok(()) => Ok(collected),
        Err(source) => Err(MetricsError::RuntimeSnapshotCollect { source }),
    }
}
/// Flush metrics and stop the underlying OTEL meter provider.
pub fn shutdown(&self) -> Result<()> {
self.0.shutdown()
@@ -219,6 +288,7 @@ fn build_provider<E>(
resource: Resource,
exporter: E,
interval: Option<Duration>,
runtime_reader: Option<Arc<ManualReader>>,
) -> (SdkMeterProvider, Meter)
where
E: opentelemetry_sdk::metrics::exporter::PushMetricExporter + 'static,
@@ -228,10 +298,11 @@ where
reader_builder = reader_builder.with_interval(interval);
}
let reader = reader_builder.build();
let provider = SdkMeterProvider::builder()
.with_resource(resource)
.with_reader(reader)
.build();
let mut provider_builder = SdkMeterProvider::builder().with_resource(resource);
if let Some(reader) = runtime_reader {
provider_builder = provider_builder.with_reader(SharedManualReader::new(reader));
}
let provider = provider_builder.with_reader(reader).build();
let meter = provider.meter(METER_NAME);
(provider, meter)
}

View File

@@ -19,6 +19,7 @@ pub struct MetricsConfig {
pub(crate) service_version: String,
pub(crate) exporter: MetricsExporter,
pub(crate) export_interval: Option<Duration>,
pub(crate) runtime_reader: bool,
pub(crate) default_tags: BTreeMap<String, String>,
}
@@ -35,6 +36,7 @@ impl MetricsConfig {
service_version: service_version.into(),
exporter: MetricsExporter::Otlp(exporter),
export_interval: None,
runtime_reader: false,
default_tags: BTreeMap::new(),
}
}
@@ -52,6 +54,7 @@ impl MetricsConfig {
service_version: service_version.into(),
exporter: MetricsExporter::InMemory(exporter),
export_interval: None,
runtime_reader: false,
default_tags: BTreeMap::new(),
}
}
@@ -62,6 +65,12 @@ impl MetricsConfig {
self
}
/// Enable a manual reader for on-demand runtime snapshots.
pub fn with_runtime_reader(mut self) -> Self {
self.runtime_reader = true;
self
}
/// Add a default tag that will be sent with every metric.
pub fn with_tag(mut self, key: impl Into<String>, value: impl Into<String>) -> Result<Self> {
let key = key.into();

View File

@@ -34,4 +34,13 @@ pub enum MetricsError {
#[source]
source: opentelemetry_sdk::error::OTelSdkError,
},
#[error("runtime metrics snapshot reader is not enabled")]
RuntimeSnapshotUnavailable,
#[error("failed to collect runtime metrics snapshot from metrics reader")]
RuntimeSnapshotCollect {
#[source]
source: opentelemetry_sdk::error::OTelSdkError,
},
}

View File

@@ -1,6 +1,8 @@
mod client;
mod config;
mod error;
pub(crate) mod names;
pub(crate) mod runtime_metrics;
pub(crate) mod timer;
pub(crate) mod validation;

View File

@@ -0,0 +1,6 @@
// Metric names used by the runtime metrics summary. Each activity has a
// counter-style name and a matching `.duration_ms` name, which the summary
// builder reads as a histogram.

// Tool calls.
pub(crate) const TOOL_CALL_COUNT_METRIC: &str = "codex.tool.call";
pub(crate) const TOOL_CALL_DURATION_METRIC: &str = "codex.tool.call.duration_ms";
// API requests.
pub(crate) const API_CALL_COUNT_METRIC: &str = "codex.api_request";
pub(crate) const API_CALL_DURATION_METRIC: &str = "codex.api_request.duration_ms";
// SSE (streaming) events.
pub(crate) const SSE_EVENT_COUNT_METRIC: &str = "codex.sse_event";
pub(crate) const SSE_EVENT_DURATION_METRIC: &str = "codex.sse_event.duration_ms";

View File

@@ -0,0 +1,101 @@
use crate::metrics::names::API_CALL_COUNT_METRIC;
use crate::metrics::names::API_CALL_DURATION_METRIC;
use crate::metrics::names::SSE_EVENT_COUNT_METRIC;
use crate::metrics::names::SSE_EVENT_DURATION_METRIC;
use crate::metrics::names::TOOL_CALL_COUNT_METRIC;
use crate::metrics::names::TOOL_CALL_DURATION_METRIC;
use opentelemetry_sdk::metrics::data::AggregatedMetrics;
use opentelemetry_sdk::metrics::data::Metric;
use opentelemetry_sdk::metrics::data::MetricData;
use opentelemetry_sdk::metrics::data::ResourceMetrics;
/// Aggregated totals for one category of runtime activity.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RuntimeMetricTotals {
    /// Number of recorded occurrences.
    pub count: u64,
    /// Accumulated duration, in whole milliseconds.
    pub duration_ms: u64,
}

impl RuntimeMetricTotals {
    /// Returns `true` when both the count and the accumulated duration are zero.
    pub fn is_empty(self) -> bool {
        matches!(
            self,
            Self {
                count: 0,
                duration_ms: 0,
            }
        )
    }
}
/// Per-category totals extracted from a single metrics snapshot.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RuntimeMetricsSummary {
    /// Totals for tool calls.
    pub tool_calls: RuntimeMetricTotals,
    /// Totals for API requests.
    pub api_calls: RuntimeMetricTotals,
    /// Totals for SSE/streaming events.
    pub streaming_events: RuntimeMetricTotals,
}

impl RuntimeMetricsSummary {
    /// Returns `true` when every category reports zero count and zero duration.
    pub fn is_empty(self) -> bool {
        [self.tool_calls, self.api_calls, self.streaming_events]
            .iter()
            .all(|totals| totals.is_empty())
    }

    /// Build a summary by totalling the known count and duration metrics
    /// found in `snapshot`. Metrics that are absent contribute zero.
    pub(crate) fn from_snapshot(snapshot: &ResourceMetrics) -> Self {
        // Each category pairs a counter metric with a duration histogram.
        let totals_for = |count_metric: &str, duration_metric: &str| RuntimeMetricTotals {
            count: sum_counter(snapshot, count_metric),
            duration_ms: sum_histogram_ms(snapshot, duration_metric),
        };

        Self {
            tool_calls: totals_for(TOOL_CALL_COUNT_METRIC, TOOL_CALL_DURATION_METRIC),
            api_calls: totals_for(API_CALL_COUNT_METRIC, API_CALL_DURATION_METRIC),
            streaming_events: totals_for(SSE_EVENT_COUNT_METRIC, SSE_EVENT_DURATION_METRIC),
        }
    }
}
fn sum_counter(snapshot: &ResourceMetrics, name: &str) -> u64 {
snapshot
.scope_metrics()
.flat_map(opentelemetry_sdk::metrics::data::ScopeMetrics::metrics)
.filter(|metric| metric.name() == name)
.map(sum_counter_metric)
.sum()
}
/// Add up the data point values of a `u64` sum metric.
///
/// Any other aggregation or number type contributes 0.
fn sum_counter_metric(metric: &Metric) -> u64 {
    let AggregatedMetrics::U64(MetricData::Sum(counter)) = metric.data() else {
        return 0;
    };
    counter.data_points().map(|point| point.value()).sum()
}
fn sum_histogram_ms(snapshot: &ResourceMetrics, name: &str) -> u64 {
snapshot
.scope_metrics()
.flat_map(opentelemetry_sdk::metrics::data::ScopeMetrics::metrics)
.filter(|metric| metric.name() == name)
.map(sum_histogram_metric_ms)
.sum()
}
fn sum_histogram_metric_ms(metric: &Metric) -> u64 {
match metric.data() {
AggregatedMetrics::F64(MetricData::Histogram(histogram)) => histogram
.data_points()
.map(|point| f64_to_u64(point.sum()))
.sum(),
_ => 0,
}
}
/// Convert a floating-point quantity to `u64`, rounding to the nearest integer.
///
/// NaN, infinities, zero and negative values all map to 0; finite values above
/// `u64::MAX` saturate at `u64::MAX`.
fn f64_to_u64(value: f64) -> u64 {
    let is_positive_finite = value.is_finite() && value > 0.0;
    if is_positive_finite {
        // Cap before rounding so the final cast never sees an out-of-range value.
        let capped = value.min(u64::MAX as f64);
        capped.round() as u64
    } else {
        0
    }
}