diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 2b5e2ee..badab09 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -2249,6 +2249,41 @@ Timeout for summarization requests. --- +## `[corpus]` - Training Corpus Capture + +Autosaves push-to-talk sessions as paired `(audio, text, metadata)` artifacts so you can build a dataset for training or evaluating LLM post-processing. + +### `enabled` (bool) +**Default:** `false` + +When true, each successful push-to-talk recording writes a set of files to the corpus directory. No data is captured when disabled. + +### `path` (string) +**Default:** `"auto"` (resolves to `~/.local/share/voxtype/corpus/`) + +Directory where corpus artifacts are written. Created automatically if it does not exist. + +### Files per session + +Each session produces files sharing a timestamped stem (e.g. `2026-04-20T14-32-05_a7f3`): + +| File | Always present? | Content | +|------|-----------------|---------| +| `.wav` | yes | 16 kHz mono int16 audio passed to the transcriber | +| `.raw.txt` | yes | Raw ASR output | +| `.processed.txt` | only if it differs from raw | Text after spoken punctuation / replacements | +| `.post.txt` | only if post-processor ran | Final text delivered as output | +| `.json` | yes | Metadata sidecar (model, engine, language, profile, duration, ...) | + +### Overrides + +| Layer | Setting | +|-------|---------| +| CLI | `--corpus` / `--no-corpus` / `--corpus-path <DIR>` | +| Env | `VOXTYPE_CORPUS_ENABLED=true`, `VOXTYPE_CORPUS_PATH=/path` | + +--- + ## [status] Controls status display icons for Waybar and other tray integrations. 
diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 0f9ed3e..335dda7 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -1050,6 +1050,21 @@ journalctl --user -u pulseaudio -n 20 --- +## Corpus files aren't appearing + +If you enabled `[corpus] enabled = true` (or `--corpus` / `VOXTYPE_CORPUS_ENABLED=1`) but no files appear in the corpus directory after recording: + +1. Check the daemon logs (`-vv` or `journalctl --user -u voxtype`). Look for one of: + - `Failed to open corpus dir "...": ...` - the directory cannot be created (permission, read-only mount). Check the path and filesystem permissions. + - `Corpus save failed: ...` - a single session failed to write. Common causes: disk full, directory turned read-only mid-session, or the file system doesn't support long filenames. +2. If eager-mode recordings are saved but recordings on the main (non-eager) path still produce no files, confirm that the recording actually produced transcribed text. Empty transcriptions are intentionally skipped before reaching the corpus save site. +3. Verify the resolved path. If `path = "auto"`, the corpus is written to `~/.local/share/voxtype/corpus/`. Check with: + ```bash + ls -la ~/.local/share/voxtype/corpus/ + ``` + +--- + ## Getting Help If you're still having issues: diff --git a/docs/USER_MANUAL.md b/docs/USER_MANUAL.md index b939da8..524d883 100644 --- a/docs/USER_MANUAL.md +++ b/docs/USER_MANUAL.md @@ -2249,6 +2249,34 @@ voxtype setup dms --qml # Output raw QML (for scripting) --- +## Building a Training Corpus + +Voxtype can autosave every push-to-talk session as an `(audio, raw_text, processed_text, post_text, metadata)` tuple so you can iteratively build a dataset for training or evaluating LLM post-processing. The `processed_text` stage (spoken-punctuation and word replacements) is written only when it differs from the raw transcription. 
+ +Enable it in your config: + +```toml +[corpus] +enabled = true +path = "auto" # ~/.local/share/voxtype/corpus/ +``` + +Or via CLI for a single run: + +```bash +voxtype --corpus --corpus-path ~/my-corpus +``` + +Every successful recording produces files sharing a timestamped stem (e.g. `2026-04-20T14-32-05_a7f3.wav`, `.raw.txt`, optional `.processed.txt` when it differs from raw, `.post.txt` when a post-processor ran, `.json`). The sidecar `text_stages` object indicates which text files are present. See `docs/CONFIGURATION.md` for the full schema. + +The corpus is designed for downstream tooling: HuggingFace `datasets` (`load_dataset("audiofolder", ...)`), pandas, or a custom script that reads the JSON sidecars and text files. + +### Secrets caveat + +The sidecar JSON records the `post_process_command` string that was used for each recording. If you embed credentials directly in the command (e.g. `API_KEY=sk-... curl ...`), those credentials end up in every sidecar. Prefer script files that read secrets from env vars or config files, and keep the command in your voxtype config limited to the script name or a non-sensitive invocation. + +--- + ## Feedback We want to hear from you! Voxtype is a young project and your feedback helps make it better. 
diff --git a/src/cli.rs b/src/cli.rs index 88763b0..561d314 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -316,6 +316,25 @@ pub struct Cli { #[arg(long, value_name = "MS", help_heading = "VAD", hide_short_help = true)] pub vad_min_speech_ms: Option, + // -- Corpus capture -- + + /// Enable corpus capture for this daemon run (overrides config) + #[arg(long, help_heading = "Corpus", + long_help = "Autosave every push-to-talk session as an\n\ + (audio, raw, processed, post, metadata) tuple into the corpus directory.\n\ + The processed and post files are only written when they differ from raw\n\ + or when a post-processor runs, respectively.\n\ + Useful for building a training corpus for LLM post-processing.")] + pub corpus: bool, + + /// Disable corpus capture for this daemon run (overrides config) + #[arg(long, help_heading = "Corpus", conflicts_with = "corpus")] + pub no_corpus: bool, + + /// Override corpus storage directory (implies --corpus unless --no-corpus is set) + #[arg(long, value_name = "DIR", help_heading = "Corpus")] + pub corpus_path: Option, + #[command(subcommand)] pub command: Option, } @@ -1889,4 +1908,25 @@ mod tests { _ => panic!("Expected Record command"), } } + + #[test] + fn corpus_flags_parse() { + use clap::Parser; + let cli = Cli::try_parse_from(["voxtype", "--corpus", "--corpus-path", "/tmp/c"]).unwrap(); + assert!(cli.corpus); + assert!(!cli.no_corpus); + assert_eq!(cli.corpus_path, Some(std::path::PathBuf::from("/tmp/c"))); + } + + #[test] + fn corpus_and_no_corpus_conflict() { + use clap::Parser; + let result = Cli::try_parse_from(["voxtype", "--corpus", "--no-corpus"]); + assert!(result.is_err(), "expected conflict error"); + let msg = match result { + Err(e) => e.to_string(), + Ok(_) => panic!("expected Err"), + }; + assert!(msg.contains("--no-corpus") || msg.contains("--corpus")); + } } diff --git a/src/config.rs b/src/config.rs index 25d4bd0..5ddb9ca 100644 --- a/src/config.rs +++ b/src/config.rs @@ -295,6 +295,20 @@ on_transcription = 
true # [profiles.code] # post_process_command = "ollama run llama3.2:1b 'Format as code comment...'" # output_mode = "clipboard" + +[corpus] +# Autosave push-to-talk sessions as paired (audio, text, metadata) tuples +# for building a training corpus to improve LLM post-processing. +# When enabled, each recording produces a set of files in the corpus path: +# _.wav — 16 kHz mono int16 audio +# _.raw.txt — raw ASR output +# _.processed.txt — text after replacements/spoken punctuation (if different) +# _.post.txt — text after LLM post-processing (if post-processor ran) +# _.json — metadata sidecar +enabled = false + +# Storage path ("auto" = ~/.local/share/voxtype/corpus/) +path = "auto" "#; /// Hotkey activation mode @@ -363,6 +377,10 @@ pub struct Config { #[serde(default)] pub meeting: MeetingConfig, + /// Corpus capture configuration (post-processing training) + #[serde(default)] + pub corpus: CorpusConfig, + /// Optional path to state file for external integrations (e.g., Waybar) /// When set, the daemon writes current state ("idle", "recording", "transcribing") /// to this file whenever state changes. @@ -1183,6 +1201,21 @@ pub enum TranscriptionEngine { Omnilingual, } +impl TranscriptionEngine { + /// Stable lowercase identifier used in logs, corpus metadata, etc. + pub fn name(&self) -> &'static str { + match self { + TranscriptionEngine::Whisper => "whisper", + TranscriptionEngine::Parakeet => "parakeet", + TranscriptionEngine::Moonshine => "moonshine", + TranscriptionEngine::SenseVoice => "sensevoice", + TranscriptionEngine::Paraformer => "paraformer", + TranscriptionEngine::Dolphin => "dolphin", + TranscriptionEngine::Omnilingual => "omnilingual", + } + } +} + /// VAD backend selection /// /// Determines which voice activity detection algorithm to use. @@ -1486,6 +1519,33 @@ impl Default for MeetingConfig { } } +/// Corpus capture configuration — autosaves push-to-talk sessions +/// as paired (audio, text, metadata) artifacts for training/evaluation. 
+#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct CorpusConfig { + /// Enable corpus capture (default: false) + #[serde(default)] + pub enabled: bool, + + /// Storage path for corpus artifacts ("auto" for default location) + /// Default: ~/.local/share/voxtype/corpus/ + #[serde(default = "default_corpus_path")] + pub path: String, +} + +impl Default for CorpusConfig { + fn default() -> Self { + Self { + enabled: false, + path: default_corpus_path(), + } + } +} + +fn default_corpus_path() -> String { + "auto".to_string() +} + /// Notification configuration #[derive(Debug, Clone, Deserialize, Serialize)] pub struct NotificationConfig { @@ -1877,6 +1937,7 @@ impl Default for Config { vad: VadConfig::default(), status: StatusConfig::default(), meeting: MeetingConfig::default(), + corpus: CorpusConfig::default(), state_file: Some("auto".to_string()), profiles: HashMap::new(), } @@ -3776,4 +3837,60 @@ mod tests { let config: Config = toml::from_str(toml_str).unwrap(); assert!(config.hotkey.profile_modifiers.is_empty()); } + + #[test] + fn test_corpus_config_defaults() { + let config = Config::default(); + assert!(!config.corpus.enabled); + assert_eq!(config.corpus.path, "auto"); + } + + #[test] + fn test_corpus_config_parses_from_toml() { + let toml = r#" +[hotkey] +key = "SCROLLLOCK" + +[audio] +device = "default" +sample_rate = 16000 +max_duration_secs = 60 + +[output] +mode = "type" + +[corpus] +enabled = true +path = "/tmp/corpus" +"#; + let config: Config = toml::from_str(toml).unwrap(); + assert!(config.corpus.enabled); + assert_eq!(config.corpus.path, "/tmp/corpus"); + } + + #[test] + fn test_corpus_config_omitted_uses_defaults() { + let toml = r#" +[hotkey] +key = "SCROLLLOCK" + +[audio] +device = "default" +sample_rate = 16000 +max_duration_secs = 60 + +[output] +mode = "type" +"#; + let config: Config = toml::from_str(toml).unwrap(); + assert!(!config.corpus.enabled); + assert_eq!(config.corpus.path, "auto"); + } + + #[test] + fn 
default_config_template_parses_and_has_corpus_defaults() { + let config: Config = toml::from_str(DEFAULT_CONFIG).unwrap(); + assert!(!config.corpus.enabled); + assert_eq!(config.corpus.path, "auto"); + } } diff --git a/src/corpus/mod.rs b/src/corpus/mod.rs new file mode 100644 index 0000000..e2dabe6 --- /dev/null +++ b/src/corpus/mod.rs @@ -0,0 +1,394 @@ +//! Corpus capture for post-processing training. +//! +//! Writes push-to-talk sessions to a flat directory as +//! `(audio.wav, raw.txt, [processed.txt,] [post.txt,] meta.json)` tuples. + +use chrono::{DateTime, Local}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum CorpusError { + #[error("Corpus IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("Corpus WAV encode error: {0}")] + Wav(#[from] hound::Error), + + #[error("Corpus metadata serialization error: {0}")] + Json(#[from] serde_json::Error), +} + +/// Corpus capture configuration. `path` is the directory that artifacts will +/// be written to; callers are expected to resolve `"auto"` and any tilde +/// expansion before constructing this, but both absolute and relative paths +/// are accepted as-is. +#[derive(Debug, Clone)] +pub struct CorpusConfig { + pub path: PathBuf, +} + +/// Writes corpus artifacts to a base directory. +pub struct CorpusWriter { + base_dir: PathBuf, +} + +impl CorpusWriter { + /// Open (creating the directory if missing) a corpus writer at the given path. + pub fn open(config: CorpusConfig) -> Result { + std::fs::create_dir_all(&config.path)?; + Ok(Self { base_dir: config.path }) + } + + pub fn base_dir(&self) -> &std::path::Path { + &self.base_dir + } +} + +/// Format a timestamp + 4-hex suffix into a filesystem-safe stem. +/// Example: `2026-04-20T14-32-05_a7f3` +fn session_stem(dt: DateTime, hex: &str) -> String { + // RFC 3339 second-precision with `:` → `-` for filesystem friendliness. 
+ let ts = dt.format("%Y-%m-%dT%H-%M-%S").to_string(); + format!("{ts}_{hex}") +} + +/// Encode f32 samples as int16 mono PCM WAV into an already-opened file. +/// The file is consumed so that callers can atomically claim the path via +/// `OpenOptions::create_new` before writing. +fn write_wav(file: std::fs::File, samples: &[f32], sample_rate: u32) -> Result<(), CorpusError> { + let spec = hound::WavSpec { + channels: 1, + sample_rate, + bits_per_sample: 16, + sample_format: hound::SampleFormat::Int, + }; + let mut writer = hound::WavWriter::new(std::io::BufWriter::new(file), spec)?; + for &s in samples { + let clamped = s.clamp(-1.0, 1.0); + let sample_i16 = (clamped * i16::MAX as f32) as i16; + writer.write_sample(sample_i16)?; + } + writer.finalize()?; + Ok(()) +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TextStages { + pub raw: bool, + pub processed: bool, + pub post: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SessionSidecar { + pub id: String, + pub recorded_at: DateTime, + pub duration_secs: f32, + pub sample_rate: u32, + pub engine: String, + pub model: String, + pub language: Option, + pub profile: Option, + pub post_process_command: Option, + pub voxtype_version: String, + pub text_stages: TextStages, +} + +/// A complete recording session ready to be persisted. +pub struct CorpusSession { + pub samples: Vec, + pub sample_rate: u32, + pub raw_text: String, + pub processed_text: String, + pub post_text: Option, + pub engine: String, + pub model: String, + pub language: Option, + pub profile: Option, + pub post_process_command: Option, + pub duration_secs: f32, + pub recorded_at: DateTime, +} + +impl CorpusWriter { + /// Persist a session. Returns the session id (filename stem) on success. + /// + /// Synchronous — callers should run this on `tokio::task::spawn_blocking`. 
+ pub fn save(&self, session: CorpusSession) -> Result { + // Atomically claim a unique stem by creating its `.wav` file with + // `create_new` (fails on AlreadyExists). Retry up to 3 times with a + // fresh hex suffix when two saves in the same clock second collide. + let (stem, wav_file) = { + let mut last_err: Option = None; + let mut claimed: Option<(String, std::fs::File)> = None; + for _ in 0..3 { + let candidate = session_stem(session.recorded_at, &random_hex4()); + let wav_path = self.base_dir.join(format!("{candidate}.wav")); + match std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&wav_path) + { + Ok(f) => { + claimed = Some((candidate, f)); + break; + } + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + last_err = Some(e); + continue; + } + Err(e) => return Err(e.into()), + } + } + match claimed { + Some(pair) => pair, + None => { + tracing::warn!("Corpus: 3 filename collisions; skipping session"); + return Err(CorpusError::Io(last_err.unwrap_or_else(|| { + std::io::Error::new(std::io::ErrorKind::AlreadyExists, "stem collision") + }))); + } + } + }; + + write_wav(wav_file, &session.samples, session.sample_rate)?; + + let raw_path = self.base_dir.join(format!("{stem}.raw.txt")); + std::fs::write(&raw_path, &session.raw_text)?; + + let processed_written = session.processed_text != session.raw_text; + if processed_written { + let p = self.base_dir.join(format!("{stem}.processed.txt")); + std::fs::write(&p, &session.processed_text)?; + } + + let post_written = session.post_text.is_some(); + if let Some(ref post) = session.post_text { + let p = self.base_dir.join(format!("{stem}.post.txt")); + std::fs::write(&p, post)?; + } + + let sidecar = SessionSidecar { + id: stem.clone(), + recorded_at: session.recorded_at, + duration_secs: session.duration_secs, + sample_rate: session.sample_rate, + engine: session.engine, + model: session.model, + language: session.language, + profile: session.profile, + post_process_command: 
session.post_process_command, + voxtype_version: env!("CARGO_PKG_VERSION").to_string(), + text_stages: TextStages { + raw: true, + processed: processed_written, + post: post_written, + }, + }; + let json_path = self.base_dir.join(format!("{stem}.json")); + let json = serde_json::to_string_pretty(&sidecar)?; + std::fs::write(&json_path, json)?; + + Ok(stem) + } +} + +/// Generate a 4-character random hex suffix (uses uuid v4 for entropy). +fn random_hex4() -> String { + let u = uuid::Uuid::new_v4(); + // Take the first 4 hex chars of the simple representation. + u.simple().to_string()[..4].to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn open_creates_base_dir() { + let tmp = tempfile::tempdir().unwrap(); + let target = tmp.path().join("nested").join("corpus"); + let cfg = CorpusConfig { path: target.clone() }; + let writer = CorpusWriter::open(cfg).expect("open should succeed"); + assert!(target.exists()); + assert_eq!(writer.base_dir(), target); + } + + #[test] + fn stem_has_expected_shape() { + use chrono::{Local, TimeZone}; + let dt = Local.with_ymd_and_hms(2026, 4, 20, 14, 32, 5).unwrap(); + let stem = session_stem(dt, "a7f3"); + assert_eq!(stem, "2026-04-20T14-32-05_a7f3"); + } + + #[test] + fn random_hex_is_four_chars() { + let hex = random_hex4(); + assert_eq!(hex.len(), 4); + assert!(hex.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn distinct_sessions_have_distinct_stems() { + use chrono::Local; + let now = Local::now(); + let a = session_stem(now, &random_hex4()); + let b = session_stem(now, &random_hex4()); + // Extremely unlikely to collide on a 16-bit random suffix within one test run. 
+ assert_ne!(a, b); + } + + #[test] + fn sidecar_serializes_with_all_fields() { + use chrono::{Local, TimeZone}; + let dt = Local.with_ymd_and_hms(2026, 4, 20, 14, 32, 5).unwrap(); + let sidecar = SessionSidecar { + id: "2026-04-20T14-32-05_a7f3".to_string(), + recorded_at: dt, + duration_secs: 4.73, + sample_rate: 16_000, + engine: "whisper".to_string(), + model: "ggml-large-v3-q8_0".to_string(), + language: Some("hu".to_string()), + profile: Some("translate".to_string()), + post_process_command: Some("openrouter-translate".to_string()), + voxtype_version: env!("CARGO_PKG_VERSION").to_string(), + text_stages: TextStages { raw: true, processed: false, post: true }, + }; + let s = serde_json::to_string(&sidecar).unwrap(); + assert!(s.contains("\"id\":\"2026-04-20T14-32-05_a7f3\"")); + assert!(s.contains("\"language\":\"hu\"")); + assert!(s.contains("\"text_stages\":{")); + assert!(s.contains("\"processed\":false")); + // Round-trip: the compact JSON produced above must deserialize back to the same values. + let parsed: SessionSidecar = serde_json::from_str(&s).unwrap(); + assert_eq!(parsed.id, "2026-04-20T14-32-05_a7f3"); + assert_eq!(parsed.language.as_deref(), Some("hu")); + assert!(!parsed.text_stages.processed); + } + + #[test] + fn sidecar_serializes_with_null_optionals() { + use chrono::Local; + let sidecar = SessionSidecar { + id: "x".to_string(), + recorded_at: Local::now(), + duration_secs: 1.0, + sample_rate: 16_000, + engine: "whisper".to_string(), + model: "tiny".to_string(), + language: None, + profile: None, + post_process_command: None, + voxtype_version: "0.0.0".to_string(), + text_stages: TextStages { raw: true, processed: false, post: false }, + }; + let s = serde_json::to_string(&sidecar).unwrap(); + assert!(s.contains("\"language\":null")); + assert!(s.contains("\"post_process_command\":null")); + } + + #[test] + fn wav_roundtrip_int16_16khz() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("out.wav"); + + // 1 second (16_000 samples) of a linear ramp from -0.5 up to just under 0.5, as mono f32. 
+ let samples: Vec = (0..16_000).map(|i| (i as f32) / 16_000.0 - 0.5).collect(); + let file = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&path) + .expect("wav file create"); + write_wav(file, &samples, 16_000).expect("wav write"); + + let mut reader = hound::WavReader::open(&path).expect("wav open"); + let spec = reader.spec(); + assert_eq!(spec.channels, 1); + assert_eq!(spec.sample_rate, 16_000); + assert_eq!(spec.bits_per_sample, 16); + let out: Vec = reader.samples::().map(|s| s.unwrap()).collect(); + assert_eq!(out.len(), 16_000); + } + + fn sample_session(raw: &str, processed: &str, post: Option<&str>) -> CorpusSession { + use chrono::{Local, TimeZone}; + CorpusSession { + samples: vec![0.0; 16_000], // 1 second of silence + sample_rate: 16_000, + raw_text: raw.to_string(), + processed_text: processed.to_string(), + post_text: post.map(String::from), + engine: "whisper".to_string(), + model: "tiny".to_string(), + language: Some("en".to_string()), + profile: None, + post_process_command: post.map(|_| "my-llm".to_string()), + duration_secs: 1.0, + recorded_at: Local.with_ymd_and_hms(2026, 4, 20, 14, 32, 5).unwrap(), + } + } + + #[test] + fn save_writes_full_quadruplet() { + let tmp = tempfile::tempdir().unwrap(); + let writer = CorpusWriter::open(CorpusConfig { + path: tmp.path().to_path_buf(), + }).unwrap(); + + let session = sample_session("hello world", "Hello, world.", Some("Hello, world!")); + let id = writer.save(session).expect("save"); + + assert!(tmp.path().join(format!("{id}.wav")).exists()); + assert!(tmp.path().join(format!("{id}.raw.txt")).exists()); + assert!(tmp.path().join(format!("{id}.processed.txt")).exists()); + assert!(tmp.path().join(format!("{id}.post.txt")).exists()); + assert!(tmp.path().join(format!("{id}.json")).exists()); + + let raw = std::fs::read_to_string(tmp.path().join(format!("{id}.raw.txt"))).unwrap(); + assert_eq!(raw, "hello world"); + let post = 
std::fs::read_to_string(tmp.path().join(format!("{id}.post.txt"))).unwrap(); + assert_eq!(post, "Hello, world!"); + } + + #[test] + fn save_elides_processed_when_equal_to_raw() { + let tmp = tempfile::tempdir().unwrap(); + let writer = CorpusWriter::open(CorpusConfig { + path: tmp.path().to_path_buf(), + }).unwrap(); + + let session = sample_session("same", "same", Some("different")); + let id = writer.save(session).unwrap(); + + assert!(!tmp.path().join(format!("{id}.processed.txt")).exists()); + assert!(tmp.path().join(format!("{id}.post.txt")).exists()); + + let json = std::fs::read_to_string(tmp.path().join(format!("{id}.json"))).unwrap(); + let parsed: SessionSidecar = serde_json::from_str(&json).unwrap(); + assert!(!parsed.text_stages.processed); + assert!(parsed.text_stages.post); + } + + #[test] + fn save_elides_post_when_none() { + let tmp = tempfile::tempdir().unwrap(); + let writer = CorpusWriter::open(CorpusConfig { + path: tmp.path().to_path_buf(), + }).unwrap(); + + let session = sample_session("raw", "processed", None); + let id = writer.save(session).unwrap(); + + assert!(!tmp.path().join(format!("{id}.post.txt")).exists()); + assert!(tmp.path().join(format!("{id}.processed.txt")).exists()); + + let json = std::fs::read_to_string(tmp.path().join(format!("{id}.json"))).unwrap(); + let parsed: SessionSidecar = serde_json::from_str(&json).unwrap(); + assert!(!parsed.text_stages.post); + assert!(parsed.post_process_command.is_none()); + } +} diff --git a/src/daemon.rs b/src/daemon.rs index 6e50c7a..08282cc 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -498,6 +498,10 @@ pub struct Daemon { >, // Background task for transcription (allows cancel during transcription) transcription_task: Option>, + // Model override that was active for the in-flight transcription_task, so + // corpus metadata can record the actual model used even after the + // Recording → Transcribing state transition drops the override. 
+ pending_model_override: Option, // Background tasks for eager chunk transcriptions (chunk_index, task) eager_chunk_tasks: Vec<( usize, @@ -519,6 +523,9 @@ pub struct Daemon { // GTCRN speech enhancer for mic echo cancellation #[cfg(feature = "onnx-common")] speech_enhancer: Option>, + /// Corpus writer for post-processing training data capture. + /// None when `config.corpus.enabled == false` or startup failed. + corpus_writer: Option>, // Media players that were paused when recording started (for resume on stop) paused_media_players: Vec, } @@ -607,6 +614,7 @@ impl Daemon { model_manager: None, model_load_task: None, transcription_task: None, + pending_model_override: None, eager_chunk_tasks: Vec::new(), vad, meeting_daemon: None, @@ -617,6 +625,7 @@ impl Daemon { meeting_event_rx: None, #[cfg(feature = "onnx-common")] speech_enhancer: None, + corpus_writer: None, paused_media_players: Vec::new(), } } @@ -1196,6 +1205,13 @@ impl Daemon { let duration = state.recording_duration().unwrap_or_default(); tracing::info!("Recording stopped ({:.1}s)", duration.as_secs_f32()); + // Preserve the active model_override so corpus metadata can record it + // after the Recording → Transcribing state transition drops it. + self.pending_model_override = match state { + State::Recording { model_override, .. } => model_override.clone(), + _ => None, + }; + // Play audio feedback self.play_feedback(SoundEvent::RecordingStop); @@ -1285,6 +1301,8 @@ impl Daemon { &mut self, state: &mut State, result: std::result::Result, + captured_audio: Vec, + model_override: Option, ) { match result { Ok(Ok(text)) => { @@ -1313,6 +1331,9 @@ impl Daemon { ); } + // Save before it's consumed by the final_text block (needed for corpus capture). 
+ let processed_text_for_corpus = processed_text.clone(); + // Check for profile override from CLI flags let profile_override = read_profile_override(); let active_profile = profile_override @@ -1337,6 +1358,7 @@ impl Daemon { } }); // Apply post-processing command (profile overrides default) + let mut post_processor_ran = false; let final_text = if let Some(profile) = active_profile { if let Some(ref cmd) = profile.post_process_command { let timeout_ms = profile.post_process_timeout_ms.unwrap_or(30000); @@ -1358,6 +1380,7 @@ impl Daemon { .await; tracing::info!("Post-processed: changed: {}", result != processed_text); tracing::debug!("Post-processed result: {:?}", result); + post_processor_ran = true; result } else { // Profile exists but has no post_process_command, use default @@ -1369,6 +1392,7 @@ impl Daemon { .await; tracing::info!("Post-processed: changed: {}", result != processed_text); tracing::debug!("Post-processed result: {:?}", result); + post_processor_ran = true; result } else { processed_text @@ -1382,6 +1406,7 @@ impl Daemon { .await; tracing::info!("Post-processed: changed: {}", result != processed_text); tracing::debug!("Post-processed result: {:?}", result); + post_processor_ran = true; result } else { processed_text @@ -1391,6 +1416,58 @@ impl Daemon { self.last_dictation = Some((final_text.clone(), Instant::now())); + // Corpus capture: fire-and-forget save of this session. + // Skip when captured_audio is empty — this is a defensive guard for + // callers that didn't have a buffer to pass in (e.g. error paths). The + // eager-mode path also threads its accumulated buffer through here. 
+ if let Some(writer) = self.corpus_writer.clone().filter(|_| !captured_audio.is_empty()) { + let post_ran = post_processor_ran; + let active_profile_for_corpus = profile_override + .as_ref() + .and_then(|name| self.config.get_profile(name)); + let post_cmd: Option = if post_ran { + active_profile_for_corpus + .and_then(|p| p.post_process_command.clone()) + .or_else(|| { + self.config + .output + .post_process + .as_ref() + .map(|pp| pp.command.clone()) + }) + } else { + None + }; + + let duration_secs = (captured_audio.len() as f32) + / (self.config.audio.sample_rate as f32); + let session = crate::corpus::CorpusSession { + samples: captured_audio, + sample_rate: self.config.audio.sample_rate, + raw_text: text.clone(), + processed_text: processed_text_for_corpus.clone(), + post_text: if post_ran { Some(final_text.clone()) } else { None }, + engine: self.config.engine.name().to_string(), + model: model_override.clone().unwrap_or_else(|| self.config.model_name().to_string()), + language: if self.config.whisper.language.is_auto() { + None + } else { + Some(self.config.whisper.language.primary().to_string()) + }, + profile: profile_override.clone(), + post_process_command: post_cmd, + duration_secs, + recorded_at: chrono::Local::now(), + }; + + tokio::task::spawn_blocking(move || { + match writer.save(session) { + Ok(id) => tracing::debug!("Corpus: saved session {}", id), + Err(e) => tracing::warn!("Corpus save failed: {}", e), + } + }); + } + if smart_submit { tracing::debug!( "Smart auto-submit: final text after post-processing: {:?}", @@ -1560,6 +1637,38 @@ impl Daemon { // Mark any orphaned active meetings as completed cleanup_stale_meetings(&self.config); + // Initialize corpus writer if enabled. 
+ self.corpus_writer = if self.config.corpus.enabled { + // Resolve the configured location here: the corpus writer uses the path it + // is given as-is. "auto" maps to the default data dir, and so does an empty + // value, which would otherwise resolve relative to the current working + // directory. A leading `~/` is expanded against $HOME because only shells + // expand it; values from the config file or environment arrive verbatim. + let configured = self.config.corpus.path.as_str(); + let resolved = match configured { + "" | "auto" => Config::data_dir().join("corpus"), + _ => match (configured.strip_prefix("~/"), std::env::var_os("HOME")) { + (Some(rest), Some(home)) => std::path::PathBuf::from(home).join(rest), + _ => std::path::PathBuf::from(configured), + }, + }; + let cfg = crate::corpus::CorpusConfig { + path: resolved.clone(), + }; + match crate::corpus::CorpusWriter::open(cfg) { + Ok(w) => { + tracing::info!("Corpus capture enabled at {:?}", resolved); + Some(std::sync::Arc::new(w)) + } + Err(e) => { + tracing::warn!("Failed to open corpus dir {:?}: {}", resolved, e); + None + } + } + } else { + None + }; + // Write PID file for external control via signals self.pid_file_path = write_pid_file(); @@ -1914,9 +2015,17 @@ impl Daemon { self.update_state("transcribing"); if let Some(text) = self.finish_eager_recording(&mut state, transcriber).await { - // Move to outputting state and handle via transcription result flow + // Extract accumulated audio and model_override before transitioning state. + // finish_eager_recording clones internally, so the state's buffer is still populated here. + let (captured_audio, captured_model_override) = match &mut state { + State::EagerRecording { accumulated_audio, model_override, .. 
} => ( + std::mem::take(accumulated_audio), + model_override.clone(), + ), + _ => (Vec::new(), None), + }; state = State::Transcribing { audio: Vec::new() }; - self.handle_transcription_result(&mut state, Ok(Ok(text))).await; + self.handle_transcription_result(&mut state, Ok(Ok(text)), captured_audio, captured_model_override).await; } else { tracing::debug!("Eager recording produced empty result"); self.reset_to_idle(&mut state).await; @@ -2097,8 +2206,17 @@ impl Daemon { self.update_state("transcribing"); if let Some(text) = self.finish_eager_recording(&mut state, transcriber).await { + // Extract accumulated audio and model_override before transitioning state. + // finish_eager_recording clones internally, so the state's buffer is still populated here. 
+ let (captured_audio, captured_model_override) = match &mut state { + State::EagerRecording { accumulated_audio, model_override, .. } => ( + std::mem::take(accumulated_audio), + model_override.clone(), + ), + _ => (Vec::new(), None), + }; state = State::Transcribing { audio: Vec::new() }; - self.handle_transcription_result(&mut state, Ok(Ok(text))).await; + self.handle_transcription_result(&mut state, Ok(Ok(text)), captured_audio, captured_model_override).await; } else { tracing::debug!("Eager recording produced empty result"); self.reset_to_idle(&mut state).await; @@ -2159,6 +2277,7 @@ impl Daemon { if let Some(task) = self.transcription_task.take() { task.abort(); } + self.pending_model_override = None; cleanup_output_mode_override(); cleanup_model_override(); @@ -2344,8 +2463,17 @@ impl Daemon { self.update_state("transcribing"); if let Some(text) = self.finish_eager_recording(&mut state, transcriber).await { + // Extract accumulated audio and model_override before transitioning state. + // finish_eager_recording clones internally, so the state's buffer is still populated here. + let (captured_audio, captured_model_override) = match &mut state { + State::EagerRecording { accumulated_audio, model_override, .. } => ( + std::mem::take(accumulated_audio), + model_override.clone(), + ), + _ => (Vec::new(), None), + }; state = State::Transcribing { audio: Vec::new() }; - self.handle_transcription_result(&mut state, Ok(Ok(text))).await; + self.handle_transcription_result(&mut state, Ok(Ok(text)), captured_audio, captured_model_override).await; } else { tracing::debug!("Eager recording timeout produced empty result"); self.reset_to_idle(&mut state).await; @@ -2535,8 +2663,17 @@ impl Daemon { self.update_state("transcribing"); if let Some(text) = self.finish_eager_recording(&mut state, transcriber).await { + // Extract accumulated audio and model_override before transitioning state. 
+ // finish_eager_recording clones internally, so the state's buffer is still populated here. + let (captured_audio, captured_model_override) = match &mut state { + State::EagerRecording { accumulated_audio, model_override, .. } => ( + std::mem::take(accumulated_audio), + model_override.clone(), + ), + _ => (Vec::new(), None), + }; state = State::Transcribing { audio: Vec::new() }; - self.handle_transcription_result(&mut state, Ok(Ok(text))).await; + self.handle_transcription_result(&mut state, Ok(Ok(text)), captured_audio, captured_model_override).await; } else { tracing::debug!("Eager recording produced empty result"); self.reset_to_idle(&mut state).await; @@ -2553,7 +2690,13 @@ impl Daemon { } }, if self.transcription_task.is_some() => { self.transcription_task = None; - self.handle_transcription_result(&mut state, result).await; + let captured_audio = if let State::Transcribing { audio } = &mut state { + std::mem::take(audio) + } else { + Vec::new() + }; + let model_override = self.pending_model_override.take(); + self.handle_transcription_result(&mut state, result, captured_audio, model_override).await; } // Check for cancel during transcription @@ -2565,6 +2708,7 @@ impl Daemon { if let Some(task) = self.transcription_task.take() { task.abort(); } + self.pending_model_override = None; cleanup_output_mode_override(); cleanup_model_override(); diff --git a/src/lib.rs b/src/lib.rs index 301291a..a7ca427 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,6 +71,7 @@ pub mod audio; pub mod cli; pub mod config; +pub mod corpus; pub mod cpu; pub mod daemon; pub mod eager; diff --git a/src/main.rs b/src/main.rs index 6279e7e..57b6e68 100644 --- a/src/main.rs +++ b/src/main.rs @@ -162,6 +162,34 @@ async fn main() -> anyhow::Result<()> { config.hotkey.model_modifier = Some(model_modifier); } + // Corpus overrides + if let Ok(val) = std::env::var("VOXTYPE_CORPUS_ENABLED") { + match val.to_lowercase().as_str() { + "1" | "true" | "yes" | "on" => config.corpus.enabled = true, 
+ "0" | "false" | "no" | "off" => config.corpus.enabled = false, + other => tracing::warn!( + "Ignoring VOXTYPE_CORPUS_ENABLED={:?} (expected one of: 1/0, true/false, yes/no, on/off)", + other + ), + } + } + if let Ok(path) = std::env::var("VOXTYPE_CORPUS_PATH") { + config.corpus.path = path; + } + if let Some(path) = &cli.corpus_path { + config.corpus.path = path.display().to_string(); + // --corpus-path implies enabled unless explicitly disabled + if !cli.no_corpus { + config.corpus.enabled = true; + } + } + if cli.corpus { + config.corpus.enabled = true; + } + if cli.no_corpus { + config.corpus.enabled = false; + } + // Whisper overrides if let Some(delay) = cli.pre_type_delay { config.output.pre_type_delay_ms = delay;