diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 2b5e2ee..0d62f4b 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -2170,11 +2170,22 @@ Echo cancellation mode for removing speaker bleed-through from the microphone si - `"auto"` - Use GTCRN neural speech enhancement on mic audio before transcription, followed by a phrase-level transcript dedup pass. The GTCRN model (~523 KB) is automatically downloaded on first `voxtype meeting start`. - `"disabled"` - No enhancement. Use this if you have system-level echo cancellation configured (e.g., PipeWire's `echo-cancel` module) or if you don't use loopback capture. +### vad_threshold + +**Type:** Float +**Default:** `0.01` +**Required:** No + +RMS threshold for meeting chunk voice activity detection. Lower values are more permissive and can help quiet microphones; higher values skip more low-level noise before transcription. Set to `0.0` to disable this pre-transcription gate. + +For quiet USB/XLR mics, try `0.001`. + **Example:** ```toml [meeting.audio] loopback_device = "auto" echo_cancel = "auto" # GTCRN enhancement + transcript dedup +vad_threshold = 0.001 # Optional: quiet mic tuning ``` --- diff --git a/docs/MEETING_MODE.md b/docs/MEETING_MODE.md index 9f1add9..854a3e0 100644 --- a/docs/MEETING_MODE.md +++ b/docs/MEETING_MODE.md @@ -254,6 +254,10 @@ mic_device = "default" # Loopback device for capturing remote participants' audio # "auto" = auto-detect, "disabled" = mic only, or a specific device name loopback_device = "auto" + +# RMS threshold for meeting voice activity detection (default: 0.01) +# Lower to 0.001 for quiet mics; set 0.0 to disable this pre-transcription gate +vad_threshold = 0.01 ``` Setting `loopback_device = "auto"` lets voxtype capture system audio (the other side of a call). When loopback is active, speaker attribution can distinguish between "You" (from the mic) and "Remote" (from system audio). 
diff --git a/docs/USER_MANUAL.md b/docs/USER_MANUAL.md index b939da8..db510f2 100644 --- a/docs/USER_MANUAL.md +++ b/docs/USER_MANUAL.md @@ -1983,6 +1983,7 @@ max_duration_mins = 180 # Maximum meeting length (0 = unlimited) mic_device = "default" # Microphone (uses audio.device if not set) loopback_device = "auto" # Capture remote participants: "auto", "disabled", or device name echo_cancel = "auto" # GTCRN neural enhancement + transcript dedup +vad_threshold = 0.01 # Lower to 0.001 for quiet mics; 0.0 disables meeting VAD [meeting.diarization] enabled = true diff --git a/src/config.rs b/src/config.rs index 25d4bd0..fa59542 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1330,6 +1330,11 @@ pub struct MeetingAudioConfig { /// and set this to "disabled". #[serde(default = "default_echo_cancel")] pub echo_cancel: String, + + /// RMS threshold for meeting chunk voice activity detection. + /// Lower values are more permissive; 0.0 disables the pre-transcription gate. + #[serde(default = "default_meeting_vad_threshold")] + pub vad_threshold: f32, } fn default_mic_device() -> String { @@ -1344,12 +1349,17 @@ fn default_echo_cancel() -> String { "auto".to_string() } +fn default_meeting_vad_threshold() -> f32 { + 0.01 +} + impl Default for MeetingAudioConfig { fn default() -> Self { Self { mic_device: default_mic_device(), loopback_device: default_loopback(), echo_cancel: default_echo_cancel(), + vad_threshold: default_meeting_vad_threshold(), } } } @@ -3505,6 +3515,7 @@ mod tests { let config = MeetingAudioConfig::default(); assert_eq!(config.mic_device, "default"); assert_eq!(config.loopback_device, "auto"); + assert_eq!(config.vad_threshold, 0.01); } #[test] @@ -3592,6 +3603,7 @@ mod tests { [meeting.audio] mic_device = "hw:1" loopback_device = "disabled" + vad_threshold = 0.001 [meeting.diarization] enabled = false @@ -3607,6 +3619,7 @@ mod tests { let config: Config = toml::from_str(toml_str).unwrap(); assert_eq!(config.meeting.audio.mic_device, "hw:1"); 
assert_eq!(config.meeting.audio.loopback_device, "disabled"); + assert_eq!(config.meeting.audio.vad_threshold, 0.001); assert!(!config.meeting.diarization.enabled); assert_eq!(config.meeting.diarization.backend, "ml"); assert_eq!(config.meeting.diarization.max_speakers, 5); diff --git a/src/daemon.rs b/src/daemon.rs index 6e50c7a..fe43d16 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -768,6 +768,7 @@ impl Daemon { }, retain_audio: self.config.meeting.retain_audio, max_duration_mins: self.config.meeting.max_duration_mins, + vad_threshold: self.config.meeting.audio.vad_threshold, diarization: diarization_config, }; @@ -790,7 +791,18 @@ impl Daemon { "disabled" | "" => None, other => Some(other), }; - match audio::DualCapture::new(&self.config.audio, loopback_device) { + let mut meeting_audio_config = self.config.audio.clone(); + let meeting_mic_device = self.config.meeting.audio.mic_device.as_str(); + if !matches!(meeting_mic_device, "default" | "") { + tracing::info!( + "Meeting mic override: {} (dictation uses {})", + meeting_mic_device, + self.config.audio.device + ); + meeting_audio_config.device = + self.config.meeting.audio.mic_device.clone(); + } + match audio::DualCapture::new(&meeting_audio_config, loopback_device) { Ok(mut capture) => { if let Err(e) = capture.start().await { tracing::error!("Failed to start meeting audio: {}", e); @@ -873,12 +885,24 @@ impl Daemon { /// Stop the current meeting async fn stop_meeting(&mut self) -> Result<()> { - if let Some(mut daemon) = self.meeting_daemon.take() { - // Stop audio capture + if self.meeting_daemon.is_some() { + // Stop audio capture and keep any samples that arrived since the last poll. 
if let Some(mut capture) = self.meeting_audio_capture.take() { - let _ = capture.stop().await; + match capture.stop().await { + Ok(dual_samples) => { + self.meeting_mic_buffer.extend(dual_samples.mic); + self.meeting_loopback_buffer.extend(dual_samples.loopback); + } + Err(e) => { + tracing::warn!("Failed to stop meeting audio cleanly: {}", e); + } + } } + // Flush the final partial chunk so speech near stop is not dropped. + self.process_buffered_meeting_audio(true).await; + + let mut daemon = self.meeting_daemon.take().expect("checked above"); match daemon.stop().await { Ok(meeting_id) => { self.update_meeting_state("idle", None); @@ -964,6 +988,109 @@ impl Daemon { 16000 * self.config.meeting.chunk_duration_secs as usize } + async fn process_meeting_audio_pair(&mut self, mic_chunk: Vec, loopback_chunk: Vec) { + #[cfg_attr(not(feature = "onnx-common"), allow(unused_mut))] + let mut mic_chunk = mic_chunk; + + // Enhance mic audio with GTCRN if available (removes echo/noise) + #[cfg(feature = "onnx-common")] + { + if !mic_chunk.is_empty() { + if let Some(ref enhancer) = self.speech_enhancer { + match enhancer.enhance(&mic_chunk) { + Ok(enhanced) => { + tracing::debug!( + "GTCRN enhanced mic chunk ({} samples)", + enhanced.len() + ); + mic_chunk = enhanced; + } + Err(e) => { + tracing::warn!("GTCRN enhancement failed, using raw mic: {}", e); + } + } + } + } + } + + if let Some(ref mut daemon) = self.meeting_daemon { + let mut had_loopback = false; + + if !mic_chunk.is_empty() { + match daemon + .process_chunk_with_source(mic_chunk, meeting::data::AudioSource::Microphone) + .await + { + Ok(Some(segments)) => { + tracing::debug!("Processed mic chunk with {} segments", segments.len()); + } + Ok(None) => {} + Err(e) => { + tracing::error!("Error processing mic chunk: {}", e); + } + } + } + + if !loopback_chunk.is_empty() { + match daemon + .process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback) + .await + { + Ok(Some(segments)) => { + 
tracing::debug!( + "Processed loopback chunk with {} segments", + segments.len() + ); + if !segments.is_empty() { + had_loopback = true; + } + } + Ok(None) => {} + Err(e) => { + tracing::error!("Error processing loopback chunk: {}", e); + } + } + } + + // Dedup bleed-through: strip echoed phrases from mic segments + if had_loopback { + if let Some(ref mut meeting) = daemon.current_meeting_mut() { + let removed = meeting.transcript.dedup_bleed_through(); + if removed > 0 { + tracing::info!("Removed {} bleed-through word(s) via dedup", removed); + } + } + } + } + } + + async fn process_buffered_meeting_audio(&mut self, include_tail: bool) { + let chunk_samples = self.meeting_chunk_samples(); + + while self.meeting_mic_buffer.len() >= chunk_samples { + let mic_chunk: Vec = self.meeting_mic_buffer.drain(..chunk_samples).collect(); + let loopback_len = self.meeting_loopback_buffer.len().min(chunk_samples); + let loopback_chunk: Vec = + self.meeting_loopback_buffer.drain(..loopback_len).collect(); + self.process_meeting_audio_pair(mic_chunk, loopback_chunk) + .await; + } + + if include_tail { + let mic_tail = std::mem::take(&mut self.meeting_mic_buffer); + let loopback_tail = std::mem::take(&mut self.meeting_loopback_buffer); + if !mic_tail.is_empty() || !loopback_tail.is_empty() { + tracing::debug!( + mic_samples = mic_tail.len(), + loopback_samples = loopback_tail.len(), + "Processing final meeting audio tail" + ); + self.process_meeting_audio_pair(mic_tail, loopback_tail) + .await; + } + } + } + /// Reset state to idle and run post_output_command to reset compositor submap /// Call this when exiting from recording/transcribing without normal output flow async fn reset_to_idle(&mut self, state: &mut State) { @@ -2683,72 +2810,7 @@ impl Daemon { self.meeting_mic_buffer.extend(dual_samples.mic); self.meeting_loopback_buffer.extend(dual_samples.loopback); - // Check if mic buffer has enough samples for a chunk - let chunk_samples = self.meeting_chunk_samples(); - if 
self.meeting_mic_buffer.len() >= chunk_samples { - let mic_chunk: Vec = self.meeting_mic_buffer.drain(..chunk_samples).collect(); - - // Also drain loopback buffer up to the same amount - let loopback_len = self.meeting_loopback_buffer.len().min(chunk_samples); - let loopback_chunk: Vec = self.meeting_loopback_buffer.drain(..loopback_len).collect(); - - // Enhance mic audio with GTCRN if available (removes echo/noise) - #[cfg(feature = "onnx-common")] - let mic_chunk = if let Some(ref enhancer) = self.speech_enhancer { - match enhancer.enhance(&mic_chunk) { - Ok(enhanced) => { - tracing::debug!("GTCRN enhanced mic chunk ({} samples)", enhanced.len()); - enhanced - } - Err(e) => { - tracing::warn!("GTCRN enhancement failed, using raw mic: {}", e); - mic_chunk - } - } - } else { - mic_chunk - }; - - if let Some(ref mut daemon) = self.meeting_daemon { - // Process mic chunk - let mut had_loopback = false; - match daemon.process_chunk_with_source(mic_chunk, meeting::data::AudioSource::Microphone).await { - Ok(Some(segments)) => { - tracing::debug!("Processed mic chunk with {} segments", segments.len()); - } - Ok(None) => {} - Err(e) => { - tracing::error!("Error processing mic chunk: {}", e); - } - } - - // Process loopback chunk if non-empty - if !loopback_chunk.is_empty() { - match daemon.process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback).await { - Ok(Some(segments)) => { - tracing::debug!("Processed loopback chunk with {} segments", segments.len()); - if !segments.is_empty() { - had_loopback = true; - } - } - Ok(None) => {} - Err(e) => { - tracing::error!("Error processing loopback chunk: {}", e); - } - } - } - - // Dedup bleed-through: strip echoed phrases from mic segments - if had_loopback { - if let Some(ref mut meeting) = daemon.current_meeting_mut() { - let removed = meeting.transcript.dedup_bleed_through(); - if removed > 0 { - tracing::info!("Removed {} bleed-through word(s) via dedup", removed); - } - } - } - } - } + 
self.process_buffered_meeting_audio(false).await; } // Check meeting timeout diff --git a/src/main.rs b/src/main.rs index 6279e7e..0372dca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1359,6 +1359,7 @@ async fn run_meeting_command(config: &config::Config, action: MeetingAction) -> }, retain_audio: config.meeting.retain_audio, max_duration_mins: config.meeting.max_duration_mins, + vad_threshold: config.meeting.audio.vad_threshold, diarization: None, }; diff --git a/src/meeting/chunk.rs b/src/meeting/chunk.rs index c624b6e..06831b4 100644 --- a/src/meeting/chunk.rs +++ b/src/meeting/chunk.rs @@ -265,7 +265,14 @@ impl ChunkProcessor { // Check for speech if !self.vad.contains_speech(&samples) { - tracing::debug!("Chunk {} has no speech, skipping", chunk_id); + tracing::debug!( + chunk_id, + source = %source, + duration_secs = samples.len() as f32 / self.config.sample_rate as f32, + rms = VoiceActivityDetector::calculate_rms(&samples), + threshold = self.config.vad_threshold, + "Meeting chunk skipped: no speech detected" + ); return Ok(ProcessedChunk { chunk_id, segments: vec![], @@ -276,7 +283,8 @@ impl ChunkProcessor { // Transcribe the chunk tracing::info!( - "Transcribing chunk {} ({:.1}s of audio)", + "Transcribing {:?} chunk {} ({:.1}s of audio)", + source, chunk_id, samples.len() as f32 / self.config.sample_rate as f32 ); diff --git a/src/meeting/diarization/simple.rs b/src/meeting/diarization/simple.rs index dd4e317..3494230 100644 --- a/src/meeting/diarization/simple.rs +++ b/src/meeting/diarization/simple.rs @@ -11,20 +11,12 @@ use crate::meeting::data::AudioSource; use crate::meeting::TranscriptSegment; /// Simple diarizer using audio source for attribution -pub struct SimpleDiarizer { - /// Minimum gap between segments to merge (ms) - merge_gap_ms: u64, -} +pub struct SimpleDiarizer; impl SimpleDiarizer { /// Create a new simple diarizer pub fn new() -> Self { - Self { merge_gap_ms: 500 } - } - - /// Create with custom merge gap - pub fn 
with_merge_gap(merge_gap_ms: u64) -> Self { - Self { merge_gap_ms } + Self } /// Convert audio source to speaker ID @@ -52,8 +44,10 @@ impl Diarizer for SimpleDiarizer { ) -> Vec { let speaker = Self::source_to_speaker(source); - // Convert transcript segments to diarized segments - let mut diarized: Vec = transcript_segments + // Preserve transcript segment boundaries. The caller applies diarized + // output back to transcript segments positionally, so returning fewer + // segments would leave later transcript segments unlabeled. + transcript_segments .iter() .map(|seg| DiarizedSegment { speaker: speaker.clone(), @@ -62,12 +56,7 @@ impl Diarizer for SimpleDiarizer { text: seg.text.clone(), confidence: 1.0, // High confidence for source-based attribution }) - .collect(); - - // Merge consecutive segments from the same speaker - self.merge_consecutive(&mut diarized); - - diarized + .collect() } fn name(&self) -> &'static str { @@ -75,39 +64,6 @@ impl Diarizer for SimpleDiarizer { } } -impl SimpleDiarizer { - /// Merge consecutive segments from the same speaker if they're close together - fn merge_consecutive(&self, segments: &mut Vec) { - if segments.len() < 2 { - return; - } - - let mut i = 0; - while i < segments.len() - 1 { - let current_end = segments[i].end_ms; - let next_start = segments[i + 1].start_ms; - let same_speaker = segments[i].speaker == segments[i + 1].speaker; - let close_enough = next_start.saturating_sub(current_end) <= self.merge_gap_ms; - - if same_speaker && close_enough { - // Clone the text from next segment before modifying - let next_text = segments[i + 1].text.clone(); - let next_end = segments[i + 1].end_ms; - let next_confidence = segments[i + 1].confidence; - - // Merge next into current - segments[i].end_ms = next_end; - segments[i].text.push(' '); - segments[i].text.push_str(&next_text); - segments[i].confidence = (segments[i].confidence + next_confidence) / 2.0; - segments.remove(i + 1); - } else { - i += 1; - } - } - } -} - 
#[cfg(test)] mod tests { use super::*; @@ -139,25 +95,30 @@ mod tests { let result = diarizer.diarize(&[], AudioSource::Microphone, &segments); - // Should merge into one segment since same speaker and close together - assert_eq!(result.len(), 1); + // Should preserve transcript boundaries and label every segment. + assert_eq!(result.len(), 2); assert_eq!(result[0].speaker, SpeakerId::You); - assert_eq!(result[0].text, "Hello World"); + assert_eq!(result[1].speaker, SpeakerId::You); + assert_eq!(result[0].text, "Hello"); + assert_eq!(result[1].text, "World"); } #[test] - fn test_diarize_preserves_separate_segments() { + fn test_diarize_labels_all_loopback_segments() { let diarizer = SimpleDiarizer::new(); let mut seg1 = TranscriptSegment::new(1, 0, 1000, "First".to_string(), 0); - seg1.source = AudioSource::Microphone; - let mut seg2 = TranscriptSegment::new(2, 5000, 6000, "Second".to_string(), 0); - seg2.source = AudioSource::Microphone; + seg1.source = AudioSource::Loopback; + let mut seg2 = TranscriptSegment::new(2, 1000, 2000, "Second".to_string(), 0); + seg2.source = AudioSource::Loopback; let segments = vec![seg1, seg2]; - let result = diarizer.diarize(&[], AudioSource::Microphone, &segments); + let result = diarizer.diarize(&[], AudioSource::Loopback, &segments); - // Should keep separate due to large gap assert_eq!(result.len(), 2); + assert_eq!(result[0].speaker, SpeakerId::Remote); + assert_eq!(result[1].speaker, SpeakerId::Remote); + assert_eq!(result[0].text, "First"); + assert_eq!(result[1].text, "Second"); } #[test] diff --git a/src/meeting/mod.rs b/src/meeting/mod.rs index 368532b..4982d34 100644 --- a/src/meeting/mod.rs +++ b/src/meeting/mod.rs @@ -59,6 +59,8 @@ pub struct MeetingConfig { pub retain_audio: bool, /// Maximum meeting duration in minutes (0 = unlimited) pub max_duration_mins: u32, + /// RMS threshold for meeting chunk voice activity detection + pub vad_threshold: f32, /// Diarization configuration (None = disabled) pub diarization: 
Option<DiarizationConfig>, } @@ -71,6 +73,7 @@ impl Default for MeetingConfig { fn default() -> Self { Self { chunk_duration_secs: storage: StorageConfig::default(), retain_audio: false, max_duration_mins: 180, + vad_threshold: 0.01, diarization: None, } } } @@ -301,6 +304,7 @@ impl MeetingDaemon { let chunk_id = self.state.chunks_processed(); let chunk_config = ChunkConfig { chunk_duration_secs: self.config.chunk_duration_secs, + vad_threshold: self.config.vad_threshold, ..Default::default() }; @@ -418,6 +422,7 @@ mod tests { assert!(!config.enabled); assert_eq!(config.chunk_duration_secs, 30); assert_eq!(config.max_duration_mins, 180); + assert_eq!(config.vad_threshold, 0.01); } #[test]