diff --git a/src/daemon.rs b/src/daemon.rs index 6e50c7a..a3a46e8 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -2688,7 +2688,13 @@ impl Daemon { if self.meeting_mic_buffer.len() >= chunk_samples { let mic_chunk: Vec = self.meeting_mic_buffer.drain(..chunk_samples).collect(); - // Also drain loopback buffer up to the same amount + // Also drain loopback buffer up to the same amount. + // Do NOT pad to chunk_samples — diluting real speech + // with silence can drop the VAD speech-frame ratio + // below threshold and cause remote audio to be + // discarded. Instead we reconcile timestamp offsets + // after both sources have been dispatched via + // sync_source_offsets. let loopback_len = self.meeting_loopback_buffer.len().min(chunk_samples); let loopback_chunk: Vec = self.meeting_loopback_buffer.drain(..loopback_len).collect(); @@ -2722,7 +2728,12 @@ impl Daemon { } } - // Process loopback chunk if non-empty + // Process loopback chunk if we have any real samples. + // If the monitor is lagging we simply skip — the + // sync_source_offsets call below bumps loopback's + // timestamp offset to match mic so the next + // loopback chunk lands at the correct wall-clock + // position. if !loopback_chunk.is_empty() { match daemon.process_chunk_with_source(loopback_chunk, meeting::data::AudioSource::Loopback).await { Ok(Some(segments)) => { @@ -2738,6 +2749,11 @@ impl Daemon { } } + // Reconcile per-source offsets so any source that + // received a short or skipped chunk this iteration + // catches up to wall-clock before the next one. + daemon.sync_source_offsets(); + // Dedup bleed-through: strip echoed phrases from mic segments if had_loopback { if let Some(ref mut meeting) = daemon.current_meeting_mut() { diff --git a/src/meeting/mod.rs b/src/meeting/mod.rs index 368532b..fef2591 100644 --- a/src/meeting/mod.rs +++ b/src/meeting/mod.rs @@ -110,6 +110,11 @@ pub struct MeetingDaemon { /// Previous chunk's post-processed text, tracked per audio source /// so mic and loopback contexts don't bleed into each other last_chunk_text: HashMap, + /// Cumulative audio duration consumed per source, in milliseconds. + /// Used to compute per-source start offsets so mic and loopback + /// timelines stay anchored to real wall-clock elapsed time instead + /// of being pushed forward by the other source's segments. + source_offsets: HashMap, } impl MeetingDaemon { @@ -157,6 +162,7 @@ impl MeetingDaemon { event_tx, post_processor, last_chunk_text: HashMap::new(), + source_offsets: HashMap::new(), }) } @@ -224,6 +230,7 @@ impl MeetingDaemon { self.state = std::mem::take(&mut self.state).stop(); self.last_chunk_text.clear(); + self.source_offsets.clear(); // Finalize meeting if let Some(ref mut meeting) = self.current_meeting { @@ -270,6 +277,18 @@ impl MeetingDaemon { self.current_meeting.as_ref().map(|m| m.metadata.id) } + /// Align all per-source timestamp offsets to the furthest-advanced source. + /// + /// Call this after dispatching the per-iteration chunks for all sources so + /// any source that received a short or empty chunk (e.g. loopback monitor + /// lagging at startup) catches up to wall-clock before the next iteration. + pub fn sync_source_offsets(&mut self) { + let max_offset = self.source_offsets.values().copied().max().unwrap_or(0); + for offset in self.source_offsets.values_mut() { + *offset = max_offset; + } + } + /// Get mutable access to current meeting data (for dedup, etc.) pub fn current_meeting_mut(&mut self) -> Option<&mut MeetingData> { self.current_meeting.as_mut() @@ -304,12 +323,21 @@ impl MeetingDaemon { ..Default::default() }; - // Calculate start offset - let start_offset_ms = if let Some(ref meeting) = self.current_meeting { - meeting.transcript.duration_ms() - } else { - 0 - }; + // Start offset is tracked per source: each source has its own wall-clock + // timeline. Deriving this from transcript.duration_ms() would conflate + // mic and loopback, pushing every new chunk past the other source's end + // and roughly doubling apparent meeting length on dual-track captures. + let start_offset_ms = *self.source_offsets.entry(source).or_insert(0); + + // Advance the per-source offset up front, based on the input sample + // count, so the offset tracks wall-clock even when transcription errors + // or VAD skips this chunk (the caller has already drained these samples + // from its buffer, so the time has elapsed regardless). + let audio_duration_ms = + (samples.len() as f64 / chunk_config.sample_rate as f64 * 1000.0) as u64; + if let Some(offset) = self.source_offsets.get_mut(&source) { + *offset += audio_duration_ms; + } let mut processor = ChunkProcessor::new(chunk_config, transcriber.clone()); let mut buffer = processor.new_buffer(chunk_id, source, start_offset_ms);