Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 22 additions & 14 deletions src/daemon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2688,9 +2688,16 @@ impl Daemon {
if self.meeting_mic_buffer.len() >= chunk_samples {
let mic_chunk: Vec<f32> = self.meeting_mic_buffer.drain(..chunk_samples).collect();

// Also drain loopback buffer up to the same amount
// Also drain loopback buffer up to the same amount.
// Pad with silence so it matches the mic chunk length —
// keeps the per-source timestamp offsets advancing in
// lockstep with wall-clock even if the loopback monitor
// has a short startup lag or a transient gap.
let loopback_len = self.meeting_loopback_buffer.len().min(chunk_samples);
let loopback_chunk: Vec<f32> = self.meeting_loopback_buffer.drain(..loopback_len).collect();
let mut loopback_chunk: Vec<f32> = self.meeting_loopback_buffer.drain(..loopback_len).collect();
if loopback_chunk.len() < chunk_samples {
loopback_chunk.resize(chunk_samples, 0.0);
}
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Padding `loopback_chunk` up to `chunk_samples` can cause the current VAD implementation to miss real loopback speech when the loopback buffer is lagging. `VoiceActivityDetector::contains_speech` requires more than 10% of frames to be above the threshold; if you pad a short (e.g., 1–2 s) loopback slice to a 30 s chunk, at most about 3–7% of the frames can contain speech, so the speech-frame ratio falls below that threshold and the entire chunk is treated as “no speech”, dropping that audio from transcription. A safer approach is to run VAD/transcription on the actual loopback samples and advance the loopback offset separately (e.g., by adding an explicit “advance offset by N ms” path when the loopback buffer is short or empty), rather than padding before VAD.

Copilot uses AI. Check for mistakes.

// Enhance mic audio with GTCRN if available (removes echo/noise)
#[cfg(feature = "onnx-common")]
Expand Down Expand Up @@ -2722,20 +2729,21 @@ impl Daemon {
}
}

// Process loopback chunk if non-empty
if !loopback_chunk.is_empty() {
match daemon.process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback).await {
Ok(Some(segments)) => {
tracing::debug!("Processed loopback chunk with {} segments", segments.len());
if !segments.is_empty() {
had_loopback = true;
}
}
Ok(None) => {}
Err(e) => {
tracing::error!("Error processing loopback chunk: {}", e);
// Process the loopback chunk. Always process it (even when
// it is only padded silence) so the loopback source offset
// advances in lockstep with the mic; VAD filters out
// the silent frames without running transcription.
match daemon.process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback).await {
Ok(Some(segments)) => {
tracing::debug!("Processed loopback chunk with {} segments", segments.len());
if !segments.is_empty() {
had_loopback = true;
}
}
Ok(None) => {}
Err(e) => {
tracing::error!("Error processing loopback chunk: {}", e);
}
}

// Dedup bleed-through: strip echoed phrases from mic segments
Expand Down
24 changes: 18 additions & 6 deletions src/meeting/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ pub struct MeetingDaemon {
/// Previous chunk's post-processed text, tracked per audio source
/// so mic and loopback contexts don't bleed into each other
last_chunk_text: HashMap<AudioSource, String>,
/// Cumulative audio duration consumed per source, in milliseconds.
/// Used to compute per-source start offsets so mic and loopback
/// timelines stay anchored to real wall-clock elapsed time instead
/// of being pushed forward by the other source's segments.
source_offsets: HashMap<AudioSource, u64>,
}

impl MeetingDaemon {
Expand Down Expand Up @@ -157,6 +162,7 @@ impl MeetingDaemon {
event_tx,
post_processor,
last_chunk_text: HashMap::new(),
source_offsets: HashMap::new(),
})
}

Expand Down Expand Up @@ -224,6 +230,7 @@ impl MeetingDaemon {

self.state = std::mem::take(&mut self.state).stop();
self.last_chunk_text.clear();
self.source_offsets.clear();

// Finalize meeting
if let Some(ref mut meeting) = self.current_meeting {
Expand Down Expand Up @@ -304,12 +311,11 @@ impl MeetingDaemon {
..Default::default()
};

// Calculate start offset
let start_offset_ms = if let Some(ref meeting) = self.current_meeting {
meeting.transcript.duration_ms()
} else {
0
};
// Start offset is tracked per source: each source has its own wall-clock
// timeline. Deriving this from transcript.duration_ms() would conflate
// mic and loopback, pushing every new chunk past the other source's end
// and roughly doubling apparent meeting length on dual-track captures.
let start_offset_ms = *self.source_offsets.entry(source).or_insert(0);

let mut processor = ChunkProcessor::new(chunk_config, transcriber.clone());
let mut buffer = processor.new_buffer(chunk_id, source, start_offset_ms);
Expand All @@ -319,6 +325,12 @@ impl MeetingDaemon {
.process_chunk(buffer)
.map_err(crate::error::VoxtypeError::Transcribe)?;

// Advance the per-source offset by the actual audio duration consumed,
// regardless of whether VAD found speech in this chunk.
if let Some(offset) = self.source_offsets.get_mut(&source) {
*offset += result.audio_duration_ms;
}

// Post-process segment text if configured
if let Some(ref post_processor) = self.post_processor {
let context = self.last_chunk_text.get(&source).cloned();
Expand Down