Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -2170,11 +2170,22 @@ Echo cancellation mode for removing speaker bleed-through from the microphone si
- `"auto"` - Use GTCRN neural speech enhancement on mic audio before transcription, followed by a phrase-level transcript dedup pass. The GTCRN model (~523 KB) is automatically downloaded on first `voxtype meeting start`.
- `"disabled"` - No enhancement. Use this if you have system-level echo cancellation configured (e.g., PipeWire's `echo-cancel` module) or if you don't use loopback capture.

### vad_threshold

**Type:** Float
**Default:** `0.01`
**Required:** No

RMS threshold for meeting chunk voice activity detection. Lower values are more permissive and can help quiet microphones; higher values skip more low-level noise before transcription. Set to `0.0` to disable this pre-transcription gate.

For quiet USB/XLR mics, try `0.001`.

**Example:**
```toml
[meeting.audio]
loopback_device = "auto"
echo_cancel = "auto" # GTCRN enhancement + transcript dedup
vad_threshold = 0.001 # Optional: quiet mic tuning
```

---
Expand Down
4 changes: 4 additions & 0 deletions docs/MEETING_MODE.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@ mic_device = "default"
# Loopback device for capturing remote participants' audio
# "auto" = auto-detect, "disabled" = mic only, or a specific device name
loopback_device = "auto"

# RMS threshold for meeting voice activity detection (default: 0.01)
# Lower to 0.001 for quiet mics; set 0.0 to disable this pre-transcription gate
vad_threshold = 0.01
```

Setting `loopback_device = "auto"` lets voxtype capture system audio (the other side of a call). When loopback is active, speaker attribution can distinguish between "You" (from the mic) and "Remote" (from system audio).
Expand Down
1 change: 1 addition & 0 deletions docs/USER_MANUAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -1983,6 +1983,7 @@ max_duration_mins = 180 # Maximum meeting length (0 = unlimited)
mic_device = "default" # Microphone (uses audio.device if not set)
loopback_device = "auto" # Capture remote participants: "auto", "disabled", or device name
echo_cancel = "auto" # GTCRN neural enhancement + transcript dedup
vad_threshold = 0.01 # Lower to 0.001 for quiet mics; 0.0 disables meeting VAD

[meeting.diarization]
enabled = true
Expand Down
13 changes: 13 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,11 @@ pub struct MeetingAudioConfig {
/// and set this to "disabled".
#[serde(default = "default_echo_cancel")]
pub echo_cancel: String,

/// RMS threshold for meeting chunk voice activity detection.
/// Lower values are more permissive; 0.0 disables the pre-transcription gate.
#[serde(default = "default_meeting_vad_threshold")]
pub vad_threshold: f32,
}

fn default_mic_device() -> String {
Expand All @@ -1344,12 +1349,17 @@ fn default_echo_cancel() -> String {
"auto".to_string()
}

/// Serde default for `MeetingAudioConfig::vad_threshold` — the RMS gate
/// applied to meeting chunks before transcription (documented as `0.01`).
fn default_meeting_vad_threshold() -> f32 {
    1e-2
}

impl Default for MeetingAudioConfig {
fn default() -> Self {
Self {
mic_device: default_mic_device(),
loopback_device: default_loopback(),
echo_cancel: default_echo_cancel(),
vad_threshold: default_meeting_vad_threshold(),
}
}
}
Expand Down Expand Up @@ -3505,6 +3515,7 @@ mod tests {
let config = MeetingAudioConfig::default();
assert_eq!(config.mic_device, "default");
assert_eq!(config.loopback_device, "auto");
assert_eq!(config.vad_threshold, 0.01);
}

#[test]
Expand Down Expand Up @@ -3592,6 +3603,7 @@ mod tests {
[meeting.audio]
mic_device = "hw:1"
loopback_device = "disabled"
vad_threshold = 0.001

[meeting.diarization]
enabled = false
Expand All @@ -3607,6 +3619,7 @@ mod tests {
let config: Config = toml::from_str(toml_str).unwrap();
assert_eq!(config.meeting.audio.mic_device, "hw:1");
assert_eq!(config.meeting.audio.loopback_device, "disabled");
assert_eq!(config.meeting.audio.vad_threshold, 0.001);
assert!(!config.meeting.diarization.enabled);
assert_eq!(config.meeting.diarization.backend, "ml");
assert_eq!(config.meeting.diarization.max_speakers, 5);
Expand Down
202 changes: 132 additions & 70 deletions src/daemon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,7 @@ impl Daemon {
},
retain_audio: self.config.meeting.retain_audio,
max_duration_mins: self.config.meeting.max_duration_mins,
vad_threshold: self.config.meeting.audio.vad_threshold,
diarization: diarization_config,
};

Expand All @@ -790,7 +791,18 @@ impl Daemon {
"disabled" | "" => None,
other => Some(other),
};
match audio::DualCapture::new(&self.config.audio, loopback_device) {
let mut meeting_audio_config = self.config.audio.clone();
let meeting_mic_device = self.config.meeting.audio.mic_device.as_str();
if !matches!(meeting_mic_device, "default" | "") {
tracing::info!(
"Meeting mic override: {} (dictation uses {})",
meeting_mic_device,
self.config.audio.device
);
meeting_audio_config.device =
self.config.meeting.audio.mic_device.clone();
}
match audio::DualCapture::new(&meeting_audio_config, loopback_device) {
Ok(mut capture) => {
if let Err(e) = capture.start().await {
tracing::error!("Failed to start meeting audio: {}", e);
Expand Down Expand Up @@ -873,12 +885,24 @@ impl Daemon {

/// Stop the current meeting
async fn stop_meeting(&mut self) -> Result<()> {
if let Some(mut daemon) = self.meeting_daemon.take() {
// Stop audio capture
if self.meeting_daemon.is_some() {
// Stop audio capture and keep any samples that arrived since the last poll.
if let Some(mut capture) = self.meeting_audio_capture.take() {
let _ = capture.stop().await;
match capture.stop().await {
Ok(dual_samples) => {
self.meeting_mic_buffer.extend(dual_samples.mic);
self.meeting_loopback_buffer.extend(dual_samples.loopback);
}
Err(e) => {
tracing::warn!("Failed to stop meeting audio cleanly: {}", e);
}
}
}

// Flush the final partial chunk so speech near stop is not dropped.
self.process_buffered_meeting_audio(true).await;

let mut daemon = self.meeting_daemon.take().expect("checked above");
match daemon.stop().await {
Ok(meeting_id) => {
self.update_meeting_state("idle", None);
Expand Down Expand Up @@ -964,6 +988,109 @@ impl Daemon {
16000 * self.config.meeting.chunk_duration_secs as usize
}

/// Feed one mic/loopback chunk pair into the active meeting pipeline.
///
/// Order matters and is deliberate: (1) optionally enhance the mic audio
/// with GTCRN, (2) transcribe the mic chunk, (3) transcribe the loopback
/// chunk, (4) if loopback produced segments, dedup echoed phrases out of
/// the mic transcript. Either chunk may be empty, in which case that
/// source is skipped. If no meeting daemon is active, both chunks are
/// silently dropped.
async fn process_meeting_audio_pair(&mut self, mic_chunk: Vec<f32>, loopback_chunk: Vec<f32>) {
    // `mut` is only exercised by the GTCRN path below; suppress the
    // unused_mut lint when the onnx-common feature is compiled out.
    #[cfg_attr(not(feature = "onnx-common"), allow(unused_mut))]
    let mut mic_chunk = mic_chunk;

    // Enhance mic audio with GTCRN if available (removes echo/noise)
    #[cfg(feature = "onnx-common")]
    {
        if !mic_chunk.is_empty() {
            if let Some(ref enhancer) = self.speech_enhancer {
                match enhancer.enhance(&mic_chunk) {
                    Ok(enhanced) => {
                        tracing::debug!(
                            "GTCRN enhanced mic chunk ({} samples)",
                            enhanced.len()
                        );
                        mic_chunk = enhanced;
                    }
                    Err(e) => {
                        // Enhancement is best-effort: fall back to the raw
                        // mic samples rather than dropping the chunk.
                        tracing::warn!("GTCRN enhancement failed, using raw mic: {}", e);
                    }
                }
            }
        }
    }

    if let Some(ref mut daemon) = self.meeting_daemon {
        // Set only when the loopback chunk yields at least one transcript
        // segment; gates the bleed-through dedup pass below.
        let mut had_loopback = false;

        if !mic_chunk.is_empty() {
            match daemon
                .process_chunk_with_source(mic_chunk, meeting::data::AudioSource::Microphone)
                .await
            {
                Ok(Some(segments)) => {
                    tracing::debug!("Processed mic chunk with {} segments", segments.len());
                }
                Ok(None) => {}
                Err(e) => {
                    // Log and continue; a failed mic chunk should not stop
                    // loopback processing for the same time window.
                    tracing::error!("Error processing mic chunk: {}", e);
                }
            }
        }

        if !loopback_chunk.is_empty() {
            match daemon
                .process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback)
                .await
            {
                Ok(Some(segments)) => {
                    tracing::debug!(
                        "Processed loopback chunk with {} segments",
                        segments.len()
                    );
                    if !segments.is_empty() {
                        had_loopback = true;
                    }
                }
                Ok(None) => {}
                Err(e) => {
                    tracing::error!("Error processing loopback chunk: {}", e);
                }
            }
        }

        // Dedup bleed-through: strip echoed phrases from mic segments
        if had_loopback {
            if let Some(ref mut meeting) = daemon.current_meeting_mut() {
                let removed = meeting.transcript.dedup_bleed_through();
                if removed > 0 {
                    tracing::info!("Removed {} bleed-through word(s) via dedup", removed);
                }
            }
        }
    }
}

/// Drain the queued meeting audio buffers into fixed-size chunks and hand
/// each mic/loopback pair to the transcription pipeline.
///
/// When `include_tail` is true, any remaining partial chunk is flushed as
/// well — used at meeting stop so speech near the end is not dropped.
async fn process_buffered_meeting_audio(&mut self, include_tail: bool) {
    let samples_per_chunk = self.meeting_chunk_samples();

    // Emit full-size chunks while enough mic audio is queued. The
    // loopback buffer is drained in lockstep, up to the same length.
    loop {
        if self.meeting_mic_buffer.len() < samples_per_chunk {
            break;
        }
        let mic: Vec<f32> = self.meeting_mic_buffer.drain(..samples_per_chunk).collect();
        let take = samples_per_chunk.min(self.meeting_loopback_buffer.len());
        let loopback: Vec<f32> = self.meeting_loopback_buffer.drain(..take).collect();
        self.process_meeting_audio_pair(mic, loopback).await;
    }

    if !include_tail {
        return;
    }

    // Flush whatever is left, even if shorter than a full chunk.
    let mic_tail = std::mem::take(&mut self.meeting_mic_buffer);
    let loopback_tail = std::mem::take(&mut self.meeting_loopback_buffer);
    if mic_tail.is_empty() && loopback_tail.is_empty() {
        return;
    }
    tracing::debug!(
        mic_samples = mic_tail.len(),
        loopback_samples = loopback_tail.len(),
        "Processing final meeting audio tail"
    );
    self.process_meeting_audio_pair(mic_tail, loopback_tail)
        .await;
}

/// Reset state to idle and run post_output_command to reset compositor submap
/// Call this when exiting from recording/transcribing without normal output flow
async fn reset_to_idle(&mut self, state: &mut State) {
Expand Down Expand Up @@ -2683,72 +2810,7 @@ impl Daemon {
self.meeting_mic_buffer.extend(dual_samples.mic);
self.meeting_loopback_buffer.extend(dual_samples.loopback);

// Check if mic buffer has enough samples for a chunk
let chunk_samples = self.meeting_chunk_samples();
if self.meeting_mic_buffer.len() >= chunk_samples {
let mic_chunk: Vec<f32> = self.meeting_mic_buffer.drain(..chunk_samples).collect();

// Also drain loopback buffer up to the same amount
let loopback_len = self.meeting_loopback_buffer.len().min(chunk_samples);
let loopback_chunk: Vec<f32> = self.meeting_loopback_buffer.drain(..loopback_len).collect();

// Enhance mic audio with GTCRN if available (removes echo/noise)
#[cfg(feature = "onnx-common")]
let mic_chunk = if let Some(ref enhancer) = self.speech_enhancer {
match enhancer.enhance(&mic_chunk) {
Ok(enhanced) => {
tracing::debug!("GTCRN enhanced mic chunk ({} samples)", enhanced.len());
enhanced
}
Err(e) => {
tracing::warn!("GTCRN enhancement failed, using raw mic: {}", e);
mic_chunk
}
}
} else {
mic_chunk
};

if let Some(ref mut daemon) = self.meeting_daemon {
// Process mic chunk
let mut had_loopback = false;
match daemon.process_chunk_with_source(mic_chunk, meeting::data::AudioSource::Microphone).await {
Ok(Some(segments)) => {
tracing::debug!("Processed mic chunk with {} segments", segments.len());
}
Ok(None) => {}
Err(e) => {
tracing::error!("Error processing mic chunk: {}", e);
}
}

// Process loopback chunk if non-empty
if !loopback_chunk.is_empty() {
match daemon.process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback).await {
Ok(Some(segments)) => {
tracing::debug!("Processed loopback chunk with {} segments", segments.len());
if !segments.is_empty() {
had_loopback = true;
}
}
Ok(None) => {}
Err(e) => {
tracing::error!("Error processing loopback chunk: {}", e);
}
}
}

// Dedup bleed-through: strip echoed phrases from mic segments
if had_loopback {
if let Some(ref mut meeting) = daemon.current_meeting_mut() {
let removed = meeting.transcript.dedup_bleed_through();
if removed > 0 {
tracing::info!("Removed {} bleed-through word(s) via dedup", removed);
}
}
}
}
}
self.process_buffered_meeting_audio(false).await;
}

// Check meeting timeout
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1359,6 +1359,7 @@ async fn run_meeting_command(config: &config::Config, action: MeetingAction) ->
},
retain_audio: config.meeting.retain_audio,
max_duration_mins: config.meeting.max_duration_mins,
vad_threshold: config.meeting.audio.vad_threshold,
diarization: None,
};

Expand Down
12 changes: 10 additions & 2 deletions src/meeting/chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,14 @@ impl ChunkProcessor {

// Check for speech
if !self.vad.contains_speech(&samples) {
tracing::debug!("Chunk {} has no speech, skipping", chunk_id);
tracing::debug!(
chunk_id,
source = %source,
duration_secs = samples.len() as f32 / self.config.sample_rate as f32,
rms = VoiceActivityDetector::calculate_rms(&samples),
threshold = self.config.vad_threshold,
"Meeting chunk skipped: no speech detected"
);
return Ok(ProcessedChunk {
chunk_id,
segments: vec![],
Expand All @@ -276,7 +283,8 @@ impl ChunkProcessor {

// Transcribe the chunk
tracing::info!(
"Transcribing chunk {} ({:.1}s of audio)",
"Transcribing {:?} chunk {} ({:.1}s of audio)",
source,
chunk_id,
samples.len() as f32 / self.config.sample_rate as f32
);
Expand Down
Loading