Inline tiny voice helpers

- Inline the trivial realtime input-device wrapper and the test-only config-range helper.
- Keep the 24 kHz PCM16 selection helpers as the policy boundary.

Co-authored-by: Codex <noreply@openai.com>
Apply 24 kHz capture preference to realtime
2026-05-05 13:51:29 +03:00 · 2026-02-27 09:34:45 -08:00 · 2026-02-27 09:23:54 -08:00 · 2026-02-27 09:14:16 -08:00
1 changed file with 151 additions and 32 deletions
--- a/codex-rs/tui/src/voice.rs
+++ b/codex-rs/tui/src/voice.rs
@@ -10,6 +10,8 @@ use codex_login::CodexAuth;
 use codex_protocol::protocol::ConversationAudioParams;
 use codex_protocol::protocol::Op;
 use codex_protocol::protocol::RealtimeAudioFrame;
+use cpal::SampleRate;
+use cpal::SupportedStreamConfigRange;
 use cpal::traits::DeviceTrait;
 use cpal::traits::HostTrait;
 use cpal::traits::StreamTrait;
@@ -27,6 +29,9 @@ use tracing::error;
 use tracing::info;
 use tracing::trace;

+const OPENAI_TRANSCRIPTION_MODEL: &str = "gpt-4o-mini-transcribe"; // "model" form field value
+const TRANSCRIPTION_TARGET_SAMPLE_RATE: u32 = 24_000; // rate requested via cpal::SampleRate
+
 struct TranscriptionAuthContext {
    mode: AuthMode,
    bearer_token: String,
@@ -51,7 +56,8 @@ pub struct VoiceCapture {

 impl VoiceCapture {
    pub fn start() -> Result<Self, String> {
-        let (device, config) = select_default_input_device_and_config()?;
+        let (device, default_config) = select_default_input_device_and_config()?;
+        let config = preferred_pcm16_24khz_input_config(&device).unwrap_or(default_config);

        let sample_rate = config.sample_rate().0;
        let channels = config.channels();
@@ -75,7 +81,9 @@ impl VoiceCapture {
    }

    pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result<Self, String> {
-        let (device, config) = select_realtime_input_device_and_config(config)?;
+        let (device, default_config) =
+            crate::audio_device::select_configured_input_device_and_config(config)?;
+        let config = preferred_pcm16_24khz_input_config(&device).unwrap_or(default_config);

        let sample_rate = config.sample_rate().0;
        let channels = config.channels();
@@ -221,8 +229,8 @@ pub fn transcribe_async(
            return;
        }

-        // Encode entire clip as normalized WAV.
-        let wav_bytes = match encode_wav_normalized(&audio) {
+        // Encode the captured PCM16 clip into WAV without rewriting sample values.
+        let wav_bytes = match encode_wav_pcm16(&audio) {
            Ok(b) => b,
            Err(e) => {
                error!("failed to encode wav: {e}");
@@ -274,10 +282,48 @@ fn select_default_input_device_and_config()
    Ok((device, config))
 }

-fn select_realtime_input_device_and_config(
-    config: &Config,
-) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
-    crate::audio_device::select_configured_input_device_and_config(config)
+fn preferred_pcm16_24khz_input_config(
+    device: &cpal::Device,
+) -> Option<cpal::SupportedStreamConfig> {
+    let configs = device.supported_input_configs().ok()?; // a query error becomes None
+    let mut best_config = None;
+    let mut best_rank = None;
+    // Keep the lowest-ranked candidate; `<` is strict, so ties keep the first range seen.
+    for config_range in configs {
+        let Some(rank) = pcm16_24khz_input_config_rank(&config_range) else {
+            continue; // unranked ranges are never candidates
+        };
+        if best_rank.is_none_or(|best| rank < best) {
+            best_rank = Some(rank);
+            best_config =
+                Some(config_range.with_sample_rate(SampleRate(TRANSCRIPTION_TARGET_SAMPLE_RATE)));
+        }
+    }
+    // Still None when no range was ranked; the callers fall back to their default config.
+    best_config
+}
+
+fn pcm16_24khz_input_config_rank(config_range: &SupportedStreamConfigRange) -> Option<u8> {
+    if config_range.sample_format() != cpal::SampleFormat::I16 {
+        return None; // only I16 ranges are ranked
+    }
+    // The range must contain the target sample rate (bounds inclusive) to be ranked.
+    let min_sample_rate = config_range.min_sample_rate().0;
+    let max_sample_rate = config_range.max_sample_rate().0;
+    if !(min_sample_rate..=max_sample_rate).contains(&TRANSCRIPTION_TARGET_SAMPLE_RATE) {
+        return None;
+    }
+    // Lower is better: one channel beats several; a range pinned to the target beats a wider one.
+    let channel_rank = if config_range.channels() == 1 { 0 } else { 1 };
+    let exact_rate_rank = if min_sample_rate == TRANSCRIPTION_TARGET_SAMPLE_RATE
+        && max_sample_rate == TRANSCRIPTION_TARGET_SAMPLE_RATE
+    {
+        0
+    } else {
+        1
+    };
+    // Channel count dominates: 0 = mono/exact, 1 = mono/ranged, 2 = multi/exact, 3 = multi/ranged.
+    Some(channel_rank * 2 + exact_rate_rank)
 }

 fn build_input_stream(
@@ -671,7 +717,7 @@ fn clip_duration_seconds(audio: &RecordedAudio) -> f32 {
    }
 }

-fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
+fn encode_wav_pcm16(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
    let mut wav_bytes: Vec<u8> = Vec::new();
    let spec = WavSpec {
        channels: audio.channels,
@@ -683,29 +729,9 @@ fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
    let mut writer =
        WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?;

-    // Simple peak normalization with headroom to improve audibility on quiet inputs.
-    let segment = &audio.data[..];
-    let mut peak: i16 = 0;
-    for &s in segment {
-        let a = s.unsigned_abs();
-        if a > peak.unsigned_abs() {
-            peak = s;
-        }
-    }
-    let peak_abs = (peak as i32).unsigned_abs() as i32;
-    let target = (i16::MAX as f32) * 0.9; // leave some headroom
-    let gain: f32 = if peak_abs > 0 {
-        target / (peak_abs as f32)
-    } else {
-        1.0
-    };
-
-    for &s in segment {
-        let v = ((s as f32) * gain)
-            .round()
-            .clamp(i16::MIN as f32, i16::MAX as f32) as i16;
+    for &sample in &audio.data {
        writer
-            .write_sample(v)
+            .write_sample(sample)
            .map_err(|_| "failed writing wav sample".to_string())?;
    }
    writer
@@ -782,7 +808,7 @@ async fn transcribe_bytes(
                .mime_str("audio/wav")
                .map_err(|e| format!("failed to set mime: {e}"))?;
            let mut form = reqwest::multipart::Form::new()
-                .text("model", "gpt-4o-transcribe")
+                .text("model", OPENAI_TRANSCRIPTION_MODEL)
                .part("file", part);
            if let Some(context) = context {
                form = form.text("prompt", context);
@@ -834,3 +860,96 @@ async fn transcribe_bytes(
        Ok(text)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    #[test]
+    fn pcm16_24khz_input_config_rank_prefers_exact_mono_i16_24khz() {
+        let exact_mono = SupportedStreamConfigRange::new(
+            1,
+            SampleRate(24_000),
+            SampleRate(24_000),
+            cpal::SupportedBufferSize::Unknown,
+            cpal::SampleFormat::I16,
+        );
+        let ranged_mono = SupportedStreamConfigRange::new(
+            1,
+            SampleRate(16_000),
+            SampleRate(48_000),
+            cpal::SupportedBufferSize::Unknown,
+            cpal::SampleFormat::I16,
+        );
+        let exact_stereo = SupportedStreamConfigRange::new(
+            2,
+            SampleRate(24_000),
+            SampleRate(24_000),
+            cpal::SupportedBufferSize::Unknown,
+            cpal::SampleFormat::I16,
+        );
+        let exact_f32 = SupportedStreamConfigRange::new(
+            1,
+            SampleRate(24_000),
+            SampleRate(24_000),
+            cpal::SupportedBufferSize::Unknown,
+            cpal::SampleFormat::F32,
+        );
+        // Expected ranks follow channel_rank * 2 + exact_rate_rank; the F32 range gets no rank.
+        assert_eq!(pcm16_24khz_input_config_rank(&exact_mono), Some(0));
+        assert_eq!(pcm16_24khz_input_config_rank(&ranged_mono), Some(1));
+        assert_eq!(pcm16_24khz_input_config_rank(&exact_stereo), Some(2));
+        assert_eq!(pcm16_24khz_input_config_rank(&exact_f32), None);
+    }
+
+    #[test]
+    fn pcm16_24khz_input_config_rank_rejects_non_24khz_ranges() {
+        let too_low = SupportedStreamConfigRange::new(
+            1,
+            SampleRate(8_000),
+            SampleRate(16_000),
+            cpal::SupportedBufferSize::Unknown,
+            cpal::SampleFormat::I16,
+        );
+        let too_high = SupportedStreamConfigRange::new(
+            1,
+            SampleRate(32_000),
+            SampleRate(48_000),
+            cpal::SupportedBufferSize::Unknown,
+            cpal::SampleFormat::I16,
+        );
+        // Neither range contains 24_000, so both must be left unranked.
+        assert_eq!(pcm16_24khz_input_config_rank(&too_low), None);
+        assert_eq!(pcm16_24khz_input_config_rank(&too_high), None);
+    }
+
+    #[test]
+    fn encode_wav_pcm16_preserves_input_samples() {
+        let audio = RecordedAudio {
+            data: vec![-30_000, 0, 12_345, 30_000],
+            sample_rate: 24_000,
+            channels: 1,
+        };
+
+        let wav = encode_wav_pcm16(&audio).expect("wav encoding should succeed");
+        let mut reader =
+            hound::WavReader::new(Cursor::new(wav)).expect("wav reader should open encoded bytes");
+        let spec = reader.spec();
+        let samples = reader
+            .samples::<i16>()
+            .collect::<Result<Vec<_>, _>>()
+            .expect("samples should decode");
+        // The decoded header and samples must equal what was passed to the encoder.
+        assert_eq!(spec.sample_rate, 24_000);
+        assert_eq!(spec.channels, 1);
+        assert_eq!(spec.bits_per_sample, 16);
+        assert_eq!(spec.sample_format, SampleFormat::Int);
+        assert_eq!(samples, audio.data);
+    }
+
+    #[test]
+    fn openai_transcription_model_constant_matches_expected_model() {
+        assert_eq!(OPENAI_TRANSCRIPTION_MODEL, "gpt-4o-mini-transcribe");
+    }
+}