91 changes: 91 additions & 0 deletions Cargo.lock


5 changes: 5 additions & 0 deletions Cargo.toml
@@ -75,6 +75,9 @@ ndarray = { version = "0.16", optional = true }
tokenizers = { version = "0.20", optional = true, default-features = false, features = ["onig"] }
rustfft = { version = "6", optional = true }

# OpenVINO GenAI backend (Intel NPU/CPU/GPU via OpenVINO GenAI WhisperPipeline)
openvino-genai = { version = "0.10.0", optional = true, features = ["runtime-linking"] }

# CPU count for thread detection
num_cpus = "1.16"

@@ -126,6 +129,8 @@ dolphin-tensorrt = ["dolphin", "ort/tensorrt"]
omnilingual = ["onnx-common"]
omnilingual-cuda = ["omnilingual", "ort/cuda"]
omnilingual-tensorrt = ["omnilingual", "ort/tensorrt"]
# OpenVINO Whisper backend (Intel NPU/CPU/GPU via OpenVINO GenAI WhisperPipeline)
openvino-whisper = ["dep:openvino-genai"]

[build-dependencies]
clap = { version = "4", features = ["derive"] }
31 changes: 29 additions & 2 deletions README.md
@@ -12,7 +12,7 @@ Hold a hotkey (default: ScrollLock) while speaking, release to transcribe and ou

- **Works on any Linux desktop** - Uses compositor keybindings (Hyprland, Sway, River) with evdev fallback for X11 and other environments
- **Fully offline by default** - Uses whisper.cpp for local transcription, with optional remote server support
- **7 transcription engines** - Whisper, Parakeet, Moonshine, SenseVoice, Paraformer, Dolphin, and Omnilingual (see [Supported Engines](#supported-engines) below)
- **8 transcription engines** - Whisper, Parakeet, Moonshine, SenseVoice, Paraformer, Dolphin, Omnilingual, and OpenVINO Whisper (see [Supported Engines](#supported-engines) below)
- **Chinese, Japanese, Korean, and 1600+ languages** - SenseVoice, Dolphin, and Omnilingual add native support for CJK and other non-Latin scripts
- **Meeting mode** - Continuous meeting transcription with chunked processing, speaker attribution, and export to Markdown, JSON, SRT, or VTT
- **Fallback chain** - Types via wtype (best CJK support), falls back to dotool (keyboard layout support), ydotool, then clipboard
@@ -331,11 +331,12 @@ Voxtype ships separate binaries for Whisper and ONNX engines. Use `voxtype setup`
| **Paraformer** | zh+en, zh+yue+en | Non-autoregressive (ONNX) | Chinese-English bilingual |
| **Dolphin** | 40 languages + 22 Chinese dialects | CTC E-Branchformer (ONNX) | Eastern languages (no English) |
| **Omnilingual** | 1600+ languages | wav2vec2 CTC (ONNX) | Low-resource and rare languages |
| **OpenVINO Whisper** | 99 languages | Encoder-decoder (OpenVINO) | Intel NPU (Lunar Lake), CPU/GPU fallback |

To set the engine in your config:

```toml
engine = "sensevoice" # or: whisper, parakeet, moonshine, paraformer, dolphin, omnilingual
engine = "sensevoice" # or: whisper, parakeet, moonshine, paraformer, dolphin, omnilingual, openvino
```

Or override on the command line:
@@ -392,6 +393,32 @@ cargo build --release --features gpu-metal
cargo build --release --features gpu-hipblas
```

### Intel NPU (Lunar Lake, Arrow Lake, Meteor Lake)

Intel NPU acceleration uses OpenVINO GenAI with Whisper models exported in OpenVINO IR format:

```bash
# Build with OpenVINO support
cargo build --release --features openvino-whisper

# Install OpenVINO GenAI runtime libraries (required at runtime)
pip install openvino-genai

# Download a model
voxtype setup model # Select an OpenVINO model

# Configure
cat >> ~/.config/voxtype/config.toml << 'EOF'
engine = "openvino"

[openvino]
model = "base.en-int8"
device = "NPU"
EOF
```

NPU inference requires the Intel NPU driver (`intel-npu-driver`) to be installed. Set `device = "CPU"` to fall back to CPU inference on systems without an NPU.
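A quick way to see whether the driver is loaded is to probe for its device node. This is a sketch, not part of voxtype: the `/dev/accel/accel0` path is an assumption based on the accel interface that the Intel NPU driver typically exposes.

```rust
use std::path::Path;

// Hypothetical helper: report whether an Intel NPU accel device node is present.
// The device path is an assumption; your system may expose a different node.
fn npu_available() -> bool {
    Path::new("/dev/accel/accel0").exists()
}

fn main() {
    if npu_available() {
        println!("NPU device node found");
    } else {
        println!("No NPU device node; consider device = \"CPU\"");
    }
}
```

If the check fails on a machine that should have an NPU, verify that `intel-npu-driver` is installed and the kernel module is loaded before changing the voxtype config.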

### Performance Comparison

Results vary by hardware. Example on AMD RX 6800:
86 changes: 86 additions & 0 deletions docs/CONFIGURATION.md
@@ -27,6 +27,7 @@ Selects which speech-to-text engine to use for transcription.
- `whisper` - OpenAI Whisper via whisper.cpp (default, recommended)
- `parakeet` - NVIDIA Parakeet via ONNX Runtime (experimental, requires special binary)
- `moonshine` - Moonshine encoder-decoder transformer via ONNX Runtime (experimental, requires special binary)
- `openvino` - OpenVINO Whisper for Intel NPU/CPU/GPU (requires `--features openvino-whisper`)

**Example:**
```toml
@@ -1132,6 +1133,91 @@ on_demand_loading = false # Keep model loaded for fast response

---

## [openvino]

Configuration for the OpenVINO Whisper speech-to-text engine. This section is only used when `engine = "openvino"`. Requires building with `--features openvino-whisper` and OpenVINO GenAI runtime libraries installed.

### openvino.model

**Type:** String
**Default:** `"base.en"`

Model name, or an absolute path to a directory containing the OpenVINO IR model files. The directory must contain `openvino_encoder_model.xml/.bin`, `openvino_decoder_model.xml/.bin`, and `tokenizer.json`.

Short names available: `"base.en-int8"`, `"base.en-fp16"`, `"small.en-int8"`, `"base-int8"`, `"large-v3-int8"`.

### openvino.device

**Type:** String
**Default:** `"NPU"`

OpenVINO device to run inference on. Options: `"NPU"` (Intel Neural Processing Unit), `"CPU"`, `"GPU"`, `"AUTO"` (automatic device selection).
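For example, a minimal fragment that lets OpenVINO pick a device at load time instead of pinning the NPU (useful on machines where NPU availability varies):

```toml
[openvino]
# "AUTO" lets the OpenVINO runtime select an available device automatically
device = "AUTO"
```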

### openvino.quantized

**Type:** Boolean
**Default:** `true`

Prefer int8-quantized model variants. Int8 models are smaller and run faster on the NPU. Set to `false` to use fp16 models, which may be slightly more accurate.

### openvino.language

**Type:** String
**Default:** `"en"`

Language code for transcription. Uses Whisper language codes: `"en"`, `"zh"`, `"fr"`, `"de"`, `"ja"`, `"ko"`, etc.

### openvino.translate

**Type:** Boolean
**Default:** `false`

When `true`, translates non-English speech to English instead of transcribing it in the source language.

### openvino.threads

**Type:** Integer (optional)
**Default:** System-detected

Number of CPU threads for inference. Only applies when `device = "CPU"`.

### openvino.on_demand_loading

**Type:** Boolean
**Default:** `false`

When `true`, loads the model only when recording starts. When `false`, keeps the model loaded for faster response.

### openvino.openvino_dir

**Type:** String (optional)
**Default:** None (automatic discovery)
**Environment variable:** `VOXTYPE_OPENVINO_DIR`

Path to the OpenVINO GenAI installation directory containing shared libraries. When set, voxtype loads `libopenvino_genai_c.so` from this directory instead of relying on automatic discovery via `LD_LIBRARY_PATH`, `OPENVINO_INSTALL_DIR`, or system package paths.

The library is searched in these subdirectories:
- `<openvino_dir>/`
- `<openvino_dir>/runtime/lib/intel64/`
- `<openvino_dir>/runtime/lib/intel64/Release/`

This is useful when you have a custom OpenVINO build or an installation in a non-standard location (for example, a `pip install openvino-genai` environment or a manually extracted archive).
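The search order above can be sketched as a small function. This is an illustration of the documented probe list, not voxtype's actual implementation; the function name is hypothetical.

```rust
use std::path::{Path, PathBuf};

// Hypothetical sketch: given openvino_dir, build the candidate paths that
// would be probed for libopenvino_genai_c.so, in the documented order.
fn candidate_paths(openvino_dir: &Path) -> Vec<PathBuf> {
    let lib = "libopenvino_genai_c.so";
    vec![
        openvino_dir.join(lib),
        openvino_dir.join("runtime/lib/intel64").join(lib),
        openvino_dir.join("runtime/lib/intel64/Release").join(lib),
    ]
}

fn main() {
    // Example directory only; substitute your own installation path.
    for p in candidate_paths(Path::new("/opt/intel/openvino")) {
        println!("would probe: {}", p.display());
    }
}
```

The first candidate that exists would win, so placing the library directly in `openvino_dir` takes precedence over the `runtime/lib/intel64` subdirectories.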

**Example:**
```toml
engine = "openvino"

[openvino]
model = "base.en-int8"
device = "NPU"
quantized = true
language = "en"
on_demand_loading = false
openvino_dir = "/opt/intel/openvino"
```

---

## [output]

Controls how transcribed text is delivered.
27 changes: 14 additions & 13 deletions src/audio/dual_capture.rs
@@ -59,7 +59,8 @@ impl ParecLoopback {
fn start(&mut self) -> Result<(), AudioError> {
let mut child = std::process::Command::new("parec")
.args([
"--device", &self.source,
"--device",
&self.source,
"--format=float32le",
"--channels=1",
"--rate=16000",
@@ -70,7 +71,9 @@
.spawn()
.map_err(|e| AudioError::Connection(format!("Failed to start parec: {}", e)))?;

let mut stdout = child.stdout.take()
let mut stdout = child
.stdout
.take()
.ok_or_else(|| AudioError::Connection("Failed to capture parec stdout".to_string()))?;

self.child = Some(child);
@@ -160,18 +163,16 @@ impl DualCapture {

let loopback = match loopback_device {
Some("disabled") | Some("") | None => None,
Some("auto") => {
match Self::find_monitor_source() {
Some(source) => {
tracing::info!("Auto-detected loopback source: {}", source);
Some(ParecLoopback::new(source))
}
None => {
tracing::warn!("No monitor source found, using mic only");
None
}
Some("auto") => match Self::find_monitor_source() {
Some(source) => {
tracing::info!("Auto-detected loopback source: {}", source);
Some(ParecLoopback::new(source))
}
}
None => {
tracing::warn!("No monitor source found, using mic only");
None
}
},
Some(device) => {
tracing::info!("Using configured loopback source: {}", device);
Some(ParecLoopback::new(device.to_string()))
24 changes: 8 additions & 16 deletions src/audio/enhance.rs
@@ -80,31 +80,23 @@ impl GtcrnEnhancer {
mix_data[i * 2 + 1] = bin.im;
}

let mix_tensor =
Tensor::<f32>::from_array(([1usize, FREQ_BINS, 1, 2], mix_data)).map_err(|e| {
format!("Failed to create mix tensor: {}", e)
})?;
let mix_tensor = Tensor::<f32>::from_array(([1usize, FREQ_BINS, 1, 2], mix_data))
.map_err(|e| format!("Failed to create mix tensor: {}", e))?;

let conv_tensor = Tensor::<f32>::from_array((
[2usize, 1, 16, 16, 33],
conv_cache.clone(),
))
.map_err(|e| format!("Failed to create conv_cache tensor: {}", e))?;
let conv_tensor =
Tensor::<f32>::from_array(([2usize, 1, 16, 16, 33], conv_cache.clone()))
.map_err(|e| format!("Failed to create conv_cache tensor: {}", e))?;

let tra_tensor =
Tensor::<f32>::from_array(([2usize, 3, 1, 1, 16], tra_cache.clone()))
.map_err(|e| format!("Failed to create tra_cache tensor: {}", e))?;
let tra_tensor = Tensor::<f32>::from_array(([2usize, 3, 1, 1, 16], tra_cache.clone()))
.map_err(|e| format!("Failed to create tra_cache tensor: {}", e))?;

let inter_tensor =
Tensor::<f32>::from_array(([2usize, 1, 33, 16], inter_cache.clone()))
.map_err(|e| format!("Failed to create inter_cache tensor: {}", e))?;

let inputs: Vec<(std::borrow::Cow<str>, ort::session::SessionInputValue)> = vec![
(std::borrow::Cow::Borrowed("mix"), mix_tensor.into()),
(
std::borrow::Cow::Borrowed("conv_cache"),
conv_tensor.into(),
),
(std::borrow::Cow::Borrowed("conv_cache"), conv_tensor.into()),
(std::borrow::Cow::Borrowed("tra_cache"), tra_tensor.into()),
(
std::borrow::Cow::Borrowed("inter_cache"),