peteonrails · materemias · Apr 20, 2026
@@ -2249,6 +2249,41 @@ Timeout for summarization requests.
 
 ---
 
+## `[corpus]` - Training Corpus Capture
+
+Autosaves push-to-talk sessions as paired `(audio, text, metadata)` artifacts so you can build a dataset for training or evaluating LLM post-processing.
+
+### `enabled` (bool)
+**Default:** `false`
+
+When true, each successful push-to-talk recording writes a set of files to the corpus directory. No data is captured when disabled.
+
+### `path` (string)
+**Default:** `"auto"` (resolves to `~/.local/share/voxtype/corpus/`)
+
+Directory where corpus artifacts are written. Created automatically if it does not exist.
+
+### Files per session
+
+Each session produces files sharing a timestamped stem (e.g. `2026-04-20T14-32-05_a7f3`):
+
+| File | Always present? | Content |
+|------|-----------------|---------|
+| `<stem>.wav` | yes | 16 kHz mono int16 audio passed to the transcriber |
+| `<stem>.raw.txt` | yes | Raw ASR output |
+| `<stem>.processed.txt` | only if differs from raw | Text after spoken punctuation / replacements |
+| `<stem>.post.txt` | only if post-processor ran | Final text delivered as output |
+| `<stem>.json` | yes | Metadata sidecar (model, engine, language, profile, duration, ...) |
+
+### Overrides
+
+| Layer | Setting |
+|-------|---------|
+| CLI | `--corpus` / `--no-corpus` / `--corpus-path <DIR>` |
+| Env | `VOXTYPE_CORPUS_ENABLED=true`, `VOXTYPE_CORPUS_PATH=/path` |
+
+---
+
 ## [status]
 
 Controls status display icons for Waybar and other tray integrations.

@@ -1050,6 +1050,21 @@ journalctl --user -u pulseaudio -n 20
 
 ---
 
+## Corpus files aren't appearing
+
+If you set `enabled = true` under `[corpus]` (or passed `--corpus` / set `VOXTYPE_CORPUS_ENABLED=true`) but no files appear in the corpus directory after recording:
+
+1. Check the daemon logs (`-vv` or `journalctl --user -u voxtype`). Look for one of:
+   - `Failed to open corpus dir "...": ...` - the directory cannot be created (permission, read-only mount). Check the path and filesystem permissions.
+   - `Corpus save failed: ...` - a single session failed to write. Common causes: disk full, directory turned read-only mid-session, or the file system doesn't support long filenames.
+2. If eager-mode recordings were saved but you are now using the main (non-eager) path and no files appear, confirm that the recording actually produced transcribed text. Empty transcriptions are intentionally skipped before reaching the corpus save site.
+3. Verify the resolved path. If `path = "auto"`, corpus writes to `~/.local/share/voxtype/corpus/`. Check with:
+   ```bash
+   ls -la ~/.local/share/voxtype/corpus/
+   ```
+
+---
+
 ## Getting Help
 
 If you're still having issues:

@@ -2249,6 +2249,34 @@ voxtype setup dms --qml        # Output raw QML (for scripting)
 
 ---
 
+## Building a Training Corpus
+
+Voxtype can autosave every push-to-talk session as an `(audio, raw_text, processed_text, post_text, metadata)` tuple so you can iteratively build a dataset for training or evaluating LLM post-processing. The `processed_text` stage (spoken-punctuation and word replacements) is written only when it differs from the raw transcription, and the `post_text` stage is written only when a post-processor ran.
+
+Enable it in your config:
+
+```toml
+[corpus]
+enabled = true
+path = "auto"  # ~/.local/share/voxtype/corpus/
+```
+
+Or via CLI for a single run:
+
+```bash
+voxtype --corpus --corpus-path ~/my-corpus
+```
+
+Every successful recording produces files sharing a timestamped stem (e.g. `2026-04-20T14-32-05_a7f3`): `.wav`, `.raw.txt`, an optional `.processed.txt` (written only when it differs from raw), an optional `.post.txt` (written only when a post-processor ran), and `.json`. The sidecar `text_stages` object indicates which text files are present. See `docs/CONFIGURATION.md` for the full schema.
+
+The corpus is designed for downstream tooling: HuggingFace `datasets` (`load_dataset("audiofolder", ...)`), pandas, or a custom script that reads the JSON sidecars and text files.
+
+### Secrets caveat
+
+The sidecar JSON records the `post_process_command` string that was used for each recording. If you embed credentials directly in the command (e.g. `API_KEY=sk-... curl ...`), those credentials end up in every sidecar. Prefer script files that read secrets from env vars or config files, and keep the command in your voxtype config limited to the script name or a non-sensitive invocation.
+
+---
+
 ## Feedback
 
 We want to hear from you! Voxtype is a young project and your feedback helps make it better.

@@ -316,6 +316,25 @@ pub struct Cli {
     #[arg(long, value_name = "MS", help_heading = "VAD", hide_short_help = true)]
     pub vad_min_speech_ms: Option<u32>,
 
+    // -- Corpus capture --
+
+    /// Enable corpus capture for this daemon run (overrides config)
+    #[arg(long, help_heading = "Corpus",
+        long_help = "Autosave every push-to-talk session as an\n\
+        (audio, raw, processed, post, metadata) tuple into the corpus directory.\n\
+        The processed and post files are only written when they differ from raw\n\
+        or when a post-processor runs, respectively.\n\
+        Useful for building a training corpus for LLM post-processing.")]
+    pub corpus: bool,
+
+    /// Disable corpus capture for this daemon run (overrides config)
+    #[arg(long, help_heading = "Corpus", conflicts_with = "corpus")]
+    pub no_corpus: bool,
+
+    /// Override corpus storage directory (implies --corpus unless --no-corpus is set)
+    #[arg(long, value_name = "DIR", help_heading = "Corpus")]
+    pub corpus_path: Option<std::path::PathBuf>,
+
     #[command(subcommand)]
     pub command: Option<Commands>,
 }
@@ -1889,4 +1908,25 @@ mod tests {
             _ => panic!("Expected Record command"),
         }
     }
+
+    #[test]
+    fn corpus_flags_parse() { // --corpus and --corpus-path given together must both be captured
+        use clap::Parser;
+        let cli = Cli::try_parse_from(["voxtype", "--corpus", "--corpus-path", "/tmp/c"]).unwrap();
+        assert!(cli.corpus);
+        assert!(!cli.no_corpus); // --no-corpus was not passed, so it must stay false
+        assert_eq!(cli.corpus_path, Some(std::path::PathBuf::from("/tmp/c")));
+    }
+
+    #[test]
+    fn corpus_and_no_corpus_conflict() { // passing both flags at once must be rejected by the parser
+        use clap::Parser;
+        let result = Cli::try_parse_from(["voxtype", "--corpus", "--no-corpus"]);
+        assert!(result.is_err(), "expected conflict error");
+        let msg = match result { // pull out the rendered error text for inspection
+            Err(e) => e.to_string(),
+            Ok(_) => panic!("expected Err"),
+        };
+        assert!(msg.contains("--no-corpus") || msg.contains("--corpus")); // message should name at least one of the two flags
+    }
 }
@@ -295,6 +295,20 @@ on_transcription = true
 # [profiles.code]
 # post_process_command = "ollama run llama3.2:1b 'Format as code comment...'"
 # output_mode = "clipboard"
+
+[corpus]
+# Autosave push-to-talk sessions as paired (audio, text, metadata) tuples
+# for building a training corpus to improve LLM post-processing.
+# When enabled, each recording produces a set of files in the corpus path:
+#   <timestamp>_<id>.wav          — 16 kHz mono int16 audio
+#   <timestamp>_<id>.raw.txt      — raw ASR output
+#   <timestamp>_<id>.processed.txt — text after replacements/spoken punctuation (if different)
+#   <timestamp>_<id>.post.txt     — text after LLM post-processing (if post-processor ran)
+#   <timestamp>_<id>.json         — metadata sidecar
+enabled = false
+
+# Storage path ("auto" = ~/.local/share/voxtype/corpus/)
+path = "auto"
 "#;
 
 /// Hotkey activation mode
@@ -363,6 +377,10 @@ pub struct Config {
     #[serde(default)]
     pub meeting: MeetingConfig,
 
+    /// Corpus capture configuration (post-processing training)
+    #[serde(default)]
+    pub corpus: CorpusConfig,
+
     /// Optional path to state file for external integrations (e.g., Waybar)
     /// When set, the daemon writes current state ("idle", "recording", "transcribing")
     /// to this file whenever state changes.
@@ -1183,6 +1201,21 @@ pub enum TranscriptionEngine {
     Omnilingual,
 }
 
+impl TranscriptionEngine {
+    /// Returns the fixed lowercase identifier for this engine variant (for logs, corpus metadata, etc.).
+    pub fn name(&self) -> &'static str {
+        match self { // no wildcard arm: every variant is listed explicitly
+            TranscriptionEngine::Whisper => "whisper",
+            TranscriptionEngine::Parakeet => "parakeet",
+            TranscriptionEngine::Moonshine => "moonshine",
+            TranscriptionEngine::SenseVoice => "sensevoice",
+            TranscriptionEngine::Paraformer => "paraformer",
+            TranscriptionEngine::Dolphin => "dolphin",
+            TranscriptionEngine::Omnilingual => "omnilingual",
+        }
+    }
+}
+
 /// VAD backend selection
 ///
 /// Determines which voice activity detection algorithm to use.
@@ -1486,6 +1519,33 @@ impl Default for MeetingConfig {
     }
 }
 
+/// Settings for the `[corpus]` config section: autosaving push-to-talk sessions
+/// as paired (audio, text, metadata) artifacts for training/evaluation.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CorpusConfig {
+    /// Whether corpus capture is on; `false` when the key is omitted
+    #[serde(default)]
+    pub enabled: bool,
+
+    /// Directory for corpus artifacts; the literal "auto" selects the default
+    /// location (~/.local/share/voxtype/corpus/)
+    #[serde(default = "default_corpus_path")]
+    pub path: String,
+}
+
+impl Default for CorpusConfig {
+    fn default() -> Self { // same values as the per-field serde defaults: disabled, path "auto"
+        Self {
+            enabled: false,
+            path: default_corpus_path(),
+        }
+    }
+}
+
+fn default_corpus_path() -> String { // serde default provider for `CorpusConfig::path`
+    "auto".to_string() // placeholder value, not a real directory; resolution happens outside this function
+}
+
 /// Notification configuration
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct NotificationConfig {
@@ -1877,6 +1937,7 @@ impl Default for Config {
             vad: VadConfig::default(),
             status: StatusConfig::default(),
             meeting: MeetingConfig::default(),
+            corpus: CorpusConfig::default(),
             state_file: Some("auto".to_string()),
             profiles: HashMap::new(),
         }
@@ -3776,4 +3837,60 @@ mod tests {
         let config: Config = toml::from_str(toml_str).unwrap();
         assert!(config.hotkey.profile_modifiers.is_empty());
     }
+
+    #[test]
+    fn test_corpus_config_defaults() { // Config::default() must leave corpus capture off with path "auto"
+        let config = Config::default();
+        assert!(!config.corpus.enabled);
+        assert_eq!(config.corpus.path, "auto");
+    }
+
+    #[test]
+    fn test_corpus_config_parses_from_toml() { // an explicit [corpus] table must override both defaults
+        let toml = r#"
+[hotkey]
+key = "SCROLLLOCK"
+
+[audio]
+device = "default"
+sample_rate = 16000
+max_duration_secs = 60
+
+[output]
+mode = "type"
+
+[corpus]
+enabled = true
+path = "/tmp/corpus"
+"#;
+        let config: Config = toml::from_str(toml).unwrap();
+        assert!(config.corpus.enabled);
+        assert_eq!(config.corpus.path, "/tmp/corpus"); // explicit path is kept verbatim
+    }
+
+    #[test]
+    fn test_corpus_config_omitted_uses_defaults() { // a config with no [corpus] table must still parse
+        let toml = r#"
+[hotkey]
+key = "SCROLLLOCK"
+
+[audio]
+device = "default"
+sample_rate = 16000
+max_duration_secs = 60
+
+[output]
+mode = "type"
+"#;
+        let config: Config = toml::from_str(toml).unwrap();
+        assert!(!config.corpus.enabled); // missing section falls back to disabled
+        assert_eq!(config.corpus.path, "auto");
+    }
+
+    #[test]
+    fn default_config_template_parses_and_has_corpus_defaults() { // guards the shipped DEFAULT_CONFIG text against drift
+        let config: Config = toml::from_str(DEFAULT_CONFIG).unwrap();
+        assert!(!config.corpus.enabled);
+        assert_eq!(config.corpus.path, "auto");
+    }
 }