Compare commits

...

3 Commits

Author SHA1 Message Date
Ahmed Ibrahim
115d5586c6 Inline tiny voice helpers
- inline the trivial realtime input-device wrapper and test-only config-range helper
- keep the 24 kHz PCM16 selection helpers as the policy boundary

Co-authored-by: Codex <noreply@openai.com>
2026-02-27 09:34:45 -08:00
Ahmed Ibrahim
371e55746e Apply 24 kHz capture preference to realtime
- reuse the 24 kHz PCM16 input-config preference for realtime microphone capture
- keep realtime transport and model behavior unchanged

Co-authored-by: Codex <noreply@openai.com>
2026-02-27 09:23:54 -08:00
Ahmed Ibrahim
672208792b Improve voice transcription capture
- prefer 24 kHz PCM16 input for push-to-talk when supported and preserve captured samples in WAV uploads
- switch API transcription requests to gpt-4o-mini-transcribe

Co-authored-by: Codex <noreply@openai.com>
2026-02-27 09:14:16 -08:00

View File

@@ -10,6 +10,8 @@ use codex_login::CodexAuth;
use codex_protocol::protocol::ConversationAudioParams;
use codex_protocol::protocol::Op;
use codex_protocol::protocol::RealtimeAudioFrame;
use cpal::SampleRate;
use cpal::SupportedStreamConfigRange;
use cpal::traits::DeviceTrait;
use cpal::traits::HostTrait;
use cpal::traits::StreamTrait;
@@ -27,6 +29,9 @@ use tracing::error;
use tracing::info;
use tracing::trace;
/// Model name sent as the `model` field of the multipart transcription request.
const OPENAI_TRANSCRIPTION_MODEL: &str = "gpt-4o-mini-transcribe";
/// Sample rate, in Hz, requested from the input device when it offers a PCM16 config
/// whose sample-rate range includes this value.
const TRANSCRIPTION_TARGET_SAMPLE_RATE: u32 = 24_000;
struct TranscriptionAuthContext {
mode: AuthMode,
bearer_token: String,
@@ -51,7 +56,8 @@ pub struct VoiceCapture {
impl VoiceCapture {
pub fn start() -> Result<Self, String> {
let (device, config) = select_default_input_device_and_config()?;
let (device, default_config) = select_default_input_device_and_config()?;
let config = preferred_pcm16_24khz_input_config(&device).unwrap_or(default_config);
let sample_rate = config.sample_rate().0;
let channels = config.channels();
@@ -75,7 +81,9 @@ impl VoiceCapture {
}
pub fn start_realtime(config: &Config, tx: AppEventSender) -> Result<Self, String> {
let (device, config) = select_realtime_input_device_and_config(config)?;
let (device, default_config) =
crate::audio_device::select_configured_input_device_and_config(config)?;
let config = preferred_pcm16_24khz_input_config(&device).unwrap_or(default_config);
let sample_rate = config.sample_rate().0;
let channels = config.channels();
@@ -221,8 +229,8 @@ pub fn transcribe_async(
return;
}
// Encode entire clip as normalized WAV.
let wav_bytes = match encode_wav_normalized(&audio) {
// Encode the captured PCM16 clip into WAV without rewriting sample values.
let wav_bytes = match encode_wav_pcm16(&audio) {
Ok(b) => b,
Err(e) => {
error!("failed to encode wav: {e}");
@@ -274,10 +282,48 @@ fn select_default_input_device_and_config()
Ok((device, config))
}
fn select_realtime_input_device_and_config(
config: &Config,
) -> Result<(cpal::Device, cpal::SupportedStreamConfig), String> {
crate::audio_device::select_configured_input_device_and_config(config)
/// Picks the device's best input config for 24 kHz PCM16 capture, if it offers one.
///
/// Each supported input config range is scored with `pcm16_24khz_input_config_rank`;
/// ranges that score `None` are ignored. The range with the lowest rank wins and is
/// pinned to `TRANSCRIPTION_TARGET_SAMPLE_RATE`.
///
/// Returns `None` when the supported configs cannot be queried or when no range
/// qualifies.
fn preferred_pcm16_24khz_input_config(
    device: &cpal::Device,
) -> Option<cpal::SupportedStreamConfig> {
    device
        .supported_input_configs()
        .ok()?
        .filter_map(|candidate| {
            let rank = pcm16_24khz_input_config_rank(&candidate)?;
            Some((rank, candidate))
        })
        // `min_by_key` yields the first of several equal minima, so the earliest
        // candidate still wins a tie.
        .min_by_key(|(rank, _)| *rank)
        .map(|(_, winner)| {
            winner.with_sample_rate(SampleRate(TRANSCRIPTION_TARGET_SAMPLE_RATE))
        })
}
/// Scores how well an input config range fits the 24 kHz PCM16 capture preference.
///
/// Returns `None` when the range's sample format is not `I16`, or when its
/// sample-rate bounds do not include `TRANSCRIPTION_TARGET_SAMPLE_RATE`.
/// Otherwise returns a rank where lower is better:
///
/// * `0` — one channel, range is exactly the target rate
/// * `1` — one channel, wider range that contains the target rate
/// * `2` — other channel counts, range is exactly the target rate
/// * `3` — other channel counts, wider range that contains the target rate
fn pcm16_24khz_input_config_rank(config_range: &SupportedStreamConfigRange) -> Option<u8> {
    let target = TRANSCRIPTION_TARGET_SAMPLE_RATE;
    let lowest = config_range.min_sample_rate().0;
    let highest = config_range.max_sample_rate().0;

    let is_pcm16 = config_range.sample_format() == cpal::SampleFormat::I16;
    let covers_target = lowest <= target && target <= highest;
    if !is_pcm16 || !covers_target {
        return None;
    }

    // The channel count dominates the ordering; an exact-rate range only breaks ties.
    let channel_penalty: u8 = match config_range.channels() {
        1 => 0,
        _ => 2,
    };
    let range_penalty: u8 = if lowest == target && highest == target {
        0
    } else {
        1
    };
    Some(channel_penalty + range_penalty)
}
fn build_input_stream(
@@ -671,7 +717,7 @@ fn clip_duration_seconds(audio: &RecordedAudio) -> f32 {
}
}
fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
fn encode_wav_pcm16(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
let mut wav_bytes: Vec<u8> = Vec::new();
let spec = WavSpec {
channels: audio.channels,
@@ -683,29 +729,9 @@ fn encode_wav_normalized(audio: &RecordedAudio) -> Result<Vec<u8>, String> {
let mut writer =
WavWriter::new(&mut cursor, spec).map_err(|_| "failed to create wav writer".to_string())?;
// Simple peak normalization with headroom to improve audibility on quiet inputs.
let segment = &audio.data[..];
let mut peak: i16 = 0;
for &s in segment {
let a = s.unsigned_abs();
if a > peak.unsigned_abs() {
peak = s;
}
}
let peak_abs = (peak as i32).unsigned_abs() as i32;
let target = (i16::MAX as f32) * 0.9; // leave some headroom
let gain: f32 = if peak_abs > 0 {
target / (peak_abs as f32)
} else {
1.0
};
for &s in segment {
let v = ((s as f32) * gain)
.round()
.clamp(i16::MIN as f32, i16::MAX as f32) as i16;
for &sample in &audio.data {
writer
.write_sample(v)
.write_sample(sample)
.map_err(|_| "failed writing wav sample".to_string())?;
}
writer
@@ -782,7 +808,7 @@ async fn transcribe_bytes(
.mime_str("audio/wav")
.map_err(|e| format!("failed to set mime: {e}"))?;
let mut form = reqwest::multipart::Form::new()
.text("model", "gpt-4o-transcribe")
.text("model", OPENAI_TRANSCRIPTION_MODEL)
.part("file", part);
if let Some(context) = context {
form = form.text("prompt", context);
@@ -834,3 +860,96 @@ async fn transcribe_bytes(
Ok(text)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Builds a supported-config range with an unknown buffer size for the ranking tests.
    fn config_range(
        channels: cpal::ChannelCount,
        min_hz: u32,
        max_hz: u32,
        format: cpal::SampleFormat,
    ) -> SupportedStreamConfigRange {
        SupportedStreamConfigRange::new(
            channels,
            SampleRate(min_hz),
            SampleRate(max_hz),
            cpal::SupportedBufferSize::Unknown,
            format,
        )
    }

    /// Ranks increase from exact mono, to ranged mono, to exact stereo; non-I16 is rejected.
    #[test]
    fn pcm16_24khz_input_config_rank_prefers_exact_mono_i16_24khz() {
        let exact_mono = config_range(1, 24_000, 24_000, cpal::SampleFormat::I16);
        let ranged_mono = config_range(1, 16_000, 48_000, cpal::SampleFormat::I16);
        let exact_stereo = config_range(2, 24_000, 24_000, cpal::SampleFormat::I16);
        let exact_f32 = config_range(1, 24_000, 24_000, cpal::SampleFormat::F32);

        assert_eq!(pcm16_24khz_input_config_rank(&exact_mono), Some(0));
        assert_eq!(pcm16_24khz_input_config_rank(&ranged_mono), Some(1));
        assert_eq!(pcm16_24khz_input_config_rank(&exact_stereo), Some(2));
        assert_eq!(pcm16_24khz_input_config_rank(&exact_f32), None);
    }

    /// Ranges lying entirely below or entirely above 24 kHz never qualify.
    #[test]
    fn pcm16_24khz_input_config_rank_rejects_non_24khz_ranges() {
        let too_low = config_range(1, 8_000, 16_000, cpal::SampleFormat::I16);
        let too_high = config_range(1, 32_000, 48_000, cpal::SampleFormat::I16);

        assert_eq!(pcm16_24khz_input_config_rank(&too_low), None);
        assert_eq!(pcm16_24khz_input_config_rank(&too_high), None);
    }

    /// Round-trips a clip through the encoder and checks the header and samples.
    #[test]
    fn encode_wav_pcm16_preserves_input_samples() {
        let audio = RecordedAudio {
            data: vec![-30_000, 0, 12_345, 30_000],
            sample_rate: 24_000,
            channels: 1,
        };

        let wav = encode_wav_pcm16(&audio).expect("wav encoding should succeed");
        let mut reader =
            hound::WavReader::new(Cursor::new(wav)).expect("wav reader should open encoded bytes");
        let spec = reader.spec();
        let decoded: Result<Vec<i16>, _> = reader.samples::<i16>().collect();
        let samples = decoded.expect("samples should decode");

        assert_eq!(spec.sample_rate, 24_000);
        assert_eq!(spec.channels, 1);
        assert_eq!(spec.bits_per_sample, 16);
        assert_eq!(spec.sample_format, SampleFormat::Int);
        assert_eq!(samples, audio.data);
    }

    /// Pins the model name so an accidental change shows up as a test failure.
    #[test]
    fn openai_transcription_model_constant_matches_expected_model() {
        assert_eq!(OPENAI_TRANSCRIPTION_MODEL, "gpt-4o-mini-transcribe");
    }
}