peteonrails · sjawhar · Mar 23, 2026
@@ -51,6 +51,10 @@ hound = "3"  # WAV file reading/writing
 # HTTP client for remote transcription
 ureq = { version = "2", features = ["json"] }
 
+deepgram = { version = "0.9", default-features = false, features = ["listen"] }
+# TLS crypto provider for rustls (required by deepgram's WebSocket client)
+rustls = { version = "0.23", features = ["ring"] }
+
 # JSON parsing (for CLI backend)
 serde_json = "1"
 

@@ -2652,6 +2652,79 @@ voxtype setup --download --model medium.en
 
 ---
 
+## Streaming Mode (Deepgram)
+
+Real-time transcription via the Deepgram WebSocket API. Transcription begins as soon as you start speaking, and results are streamed back while you are still recording.
+
+### When to Use
+
+- **Real-time feedback:** See transcription results while still recording
+- **Long recordings:** No waiting for transcription to complete after recording stops
+- **Latency-sensitive applications:** Immediate text output for live captioning or accessibility
+- **Internet available:** Requires a stable internet connection to Deepgram servers
+
+### Prerequisites
+
+1. Deepgram account (free tier available at https://console.deepgram.com)
+2. API key from your Deepgram account
+3. Internet connection to Deepgram servers
+
+### Configuration
+
+Enable streaming mode and set your API key:
+
+```toml
+[whisper]
+mode = "streaming"
+streaming_api_key = "your-deepgram-api-key"
+```
+
+Or use an environment variable:
+
+```bash
+export VOXTYPE_DEEPGRAM_API_KEY="your-deepgram-api-key"
+voxtype daemon
+```
+
+Or pass it as a CLI flag:
+
+```bash
+voxtype --whisper-mode streaming --streaming-api-key "your-key" daemon
+```
+
+### Streaming Options
+
+```toml
+[whisper]
+mode = "streaming"
+streaming_api_key = "your-api-key"
+streaming_model = "nova-3"
+streaming_endpoint = "wss://api.deepgram.com/v1/listen"
+```
+
+`streaming_model` chooses the Deepgram model.
+`streaming_endpoint` lets you target self-hosted or custom endpoints.
+
+### CLI Flags
+
+```bash
+voxtype --whisper-mode streaming daemon
+voxtype --streaming-model nova-3 daemon
+voxtype --streaming-endpoint "wss://custom.server/v1/listen" daemon
+voxtype --streaming-api-key "your-key" daemon
+```
+
+### Limitations
+
+- Requires internet access
+- Requires valid Deepgram API credentials
+- `voxtype transcribe file.wav` is not supported in streaming mode
+- Local model preparation is skipped in streaming mode
+
+### Privacy
+
+Streaming mode sends audio to Deepgram servers for transcription. See https://deepgram.com/privacy for details.
+
 ## Deprecated Options
 
 The following configuration options are deprecated but still supported for backwards compatibility. They will log a warning when used.

@@ -17,6 +17,7 @@ Solutions to common issues when using Voxtype.
 - [Performance Issues](#performance-issues)
 - [Systemd Service Issues](#systemd-service-issues)
 - [Debug Mode](#debug-mode)
+- [Streaming Mode Issues](#streaming-mode-issues)
 
 ---
 
@@ -1024,6 +1025,54 @@ Include:
 
 ---
 
+## Streaming Mode Issues
+
+### Deepgram API key is required
+
+**Error:** `Deepgram API key is required for streaming mode. Set VOXTYPE_DEEPGRAM_API_KEY environment variable.`
+
+Set one of:
+
+```bash
+export VOXTYPE_DEEPGRAM_API_KEY="your-api-key"
+```
+
+```toml
+[whisper]
+mode = "streaming"
+streaming_api_key = "your-api-key"
+```
+
+### Failed to open Deepgram stream
+
+**Error:** `Failed to open Deepgram stream: ...`
+
+- Check internet connection
+- Verify API key and endpoint
+- Try another model (`streaming_model = "nova-2"`)
+
+### Deepgram stream finish timed out
+
+**Error:** `Deepgram stream finish timed out ...`
+
+- Retry on a stable network
+- Switch to local mode if needed:
+
+```toml
+[whisper]
+mode = "local"
+```
+
+### Streaming mode not available for file transcription
+
+**Error:** `Streaming mode is only available for live recording via the daemon...`
+
+Use local mode for file input:
+
+```bash
+voxtype --whisper-mode local transcribe file.wav
+```
+
 ## Feedback
 
 We want to hear from you! Voxtype is a young project and your feedback helps make it better.

@@ -16,6 +16,7 @@ Voxtype is a push-to-talk voice-to-text tool for Linux. Optimized for Wayland, w
 - [Improving Transcription Accuracy](#improving-transcription-accuracy)
 - [Whisper Models](#whisper-models)
 - [Remote Whisper Servers](#remote-whisper-servers)
+- [Streaming Mode (Deepgram)](#streaming-mode-deepgram)
 - [CLI Backend (whisper-cli)](#cli-backend-whisper-cli)
 - [Eager Processing](#eager-processing)
 - [Output Modes](#output-modes)
@@ -2177,6 +2178,64 @@ voxtype setup dms --qml        # Output raw QML (for scripting)
 
 ---
 
+## Streaming Mode (Deepgram)
+
+Real-time transcription using Deepgram's WebSocket API. Audio is transcribed as you speak, with results appearing instantly.
+
+### Quick Start
+
+1. Create a Deepgram account at https://console.deepgram.com (free tier available)
+2. Get your API key from the Deepgram console
+3. Set your API key in config or environment:
+
+```toml
+[whisper]
+mode = "streaming"
+streaming_api_key = "your-api-key-here"
+```
+
+```bash
+export VOXTYPE_DEEPGRAM_API_KEY="your-api-key-here"
+voxtype daemon
+```
+
+### How It Works
+
+1. Opens a WebSocket connection when recording starts
+2. Sends audio chunks while you speak
+3. Receives transcription results in real time
+4. Returns final text when recording stops
+
+### Configuration
+
+```toml
+[whisper]
+mode = "streaming"
+streaming_api_key = "your-api-key"
+streaming_model = "nova-3"
+streaming_endpoint = "wss://api.deepgram.com/v1/listen"
+```
+
+### Differences from Local
+
+| Feature | Streaming | Local |
+|---------|-----------|-------|
+| Requires internet | Yes | No |
+| Real-time results | Yes | No |
+| Model download | No | Yes |
+| Privacy | Audio sent to Deepgram | Stays local |
+
+### Troubleshooting
+
+- Deepgram API key missing: set `VOXTYPE_DEEPGRAM_API_KEY` or `streaming_api_key`
+- Failed to open stream: check network and API key
+- Finish timeout: retry or switch to local mode
+- Empty transcripts: verify sample rate/language/model
+
+### Privacy Considerations
+
+Streaming mode sends audio to Deepgram servers for transcription. Review https://deepgram.com/privacy.
+
 ## Feedback
 
 We want to hear from you! Voxtype is a young project and your feedback helps make it better.

@@ -93,7 +93,6 @@ pub struct Cli {
     pub model_modifier: Option<String>,
 
     // -- Whisper --
-
     /// Disable context window optimization for short recordings
     #[arg(long, help_heading = "Whisper")]
     pub no_whisper_context_optimization: bool,
@@ -123,7 +122,7 @@ pub struct Cli {
     #[arg(long, help_heading = "Whisper")]
     pub on_demand_loading: bool,
 
-    /// Whisper execution mode: local, remote, or cli
+    /// Whisper execution mode: local, remote, cli, or streaming
     #[arg(long, value_name = "MODE", help_heading = "Whisper")]
     pub whisper_mode: Option<String>,
 
@@ -147,8 +146,26 @@ pub struct Cli {
     #[arg(long, value_name = "KEY", help_heading = "Whisper")]
     pub remote_api_key: Option<String>,
 
-    // -- Audio --
+    #[arg(long, value_name = "MODEL", help_heading = "Whisper")]
+    pub streaming_model: Option<String>,
+
+    #[arg(long, value_name = "URL", help_heading = "Whisper")]
+    pub streaming_endpoint: Option<String>,
+
+    #[arg(long, value_name = "KEY", help_heading = "Whisper")]
+    pub streaming_api_key: Option<String>,
+
+    /// Endpointing duration in milliseconds for Deepgram streaming.
+    /// Controls silence duration before finalizing a transcript segment (default: Deepgram's default).
+    #[arg(long, value_name = "MS", help_heading = "Whisper")]
+    pub streaming_endpointing_ms: Option<u32>,
+
+    /// Timeout in seconds for finalizing Deepgram streaming transcription after recording stops.
+    /// Longer recordings may need more time. Default: 15 seconds.
+    #[arg(long, value_name = "SECS", help_heading = "Whisper")]
+    pub streaming_finish_timeout_secs: Option<u64>,
 
+    // -- Audio --
     /// Audio input device name (or "default" for system default)
     #[arg(long, value_name = "DEVICE", help_heading = "Audio")]
     pub audio_device: Option<String>,
@@ -166,7 +183,6 @@ pub struct Cli {
     pub no_audio_feedback: bool,
 
     // -- Output --
-
     /// Delay before typing starts (ms), helps prevent first character drop
     #[arg(long, value_name = "MS", help_heading = "Output")]
     pub pre_type_delay: Option<u32>,
@@ -219,7 +235,11 @@ pub struct Cli {
     pub fallback_to_clipboard: bool,
 
     /// Disable clipboard fallback
-    #[arg(long, conflicts_with = "fallback_to_clipboard", help_heading = "Output")]
+    #[arg(
+        long,
+        conflicts_with = "fallback_to_clipboard",
+        help_heading = "Output"
+    )]
     pub no_fallback_to_clipboard: bool,
 
     /// Enable spoken punctuation conversion (e.g., say "period" to get ".")
@@ -259,7 +279,6 @@ pub struct Cli {
     pub pre_recording_command: Option<String>,
 
     // -- VAD --
-
     /// Enable Voice Activity Detection (filter silence before transcription)
     #[arg(long, help_heading = "VAD")]
     pub vad: bool,