diff --git a/Cargo.lock b/Cargo.lock index 878c75af..d3d67357 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -618,12 +618,35 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "env_filter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +dependencies = [ + "log", + "regex", +] + [[package]] name = "env_home" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" +[[package]] +name = "env_logger" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -1130,6 +1153,30 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jiff" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", +] + +[[package]] +name = "jiff-static" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "jni" version = "0.21.1" @@ -1715,6 +1762,49 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "openvino-finder" +version = "0.10.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0bc7fe2de763ebbf43d49bafd42a604d13e70f5030f1060d07dbd7b2d4d1b58" +dependencies = [ + "cfg-if", + "log", +] + +[[package]] +name = "openvino-genai" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e580e5e0e03d658d34653f6601488a3a726b55f699af474b92af8b26d823c5" +dependencies = [ + "openvino-finder", + "openvino-genai-sys", +] + +[[package]] +name = "openvino-genai-sys" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f5705b891e0beca7ca6d81f2dcf1027cad1f42a814f9ec1fb1b083f567cbbc" +dependencies = [ + "env_logger", + "libloading 0.8.9", + "openvino-finder", + "openvino-sys", +] + +[[package]] +name = "openvino-sys" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89fdf449e2a4706e2a34bd602b7a2d86c92d61216cfef2b639ba1acbb68929a7" +dependencies = [ + "env_logger", + "libloading 0.8.9", + "openvino-finder", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -2906,6 +2996,7 @@ dependencies = [ "nix 0.29.0", "notify", "num_cpus", + "openvino-genai", "ort", "parakeet-rs", "pidlock", diff --git a/Cargo.toml b/Cargo.toml index 7e794de1..96f43521 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,9 @@ ndarray = { version = "0.16", optional = true } tokenizers = { version = "0.20", optional = true, default-features = false, features = ["onig"] } rustfft = { version = "6", optional = true } +# OpenVINO GenAI backend (Intel NPU/CPU/GPU via OpenVINO GenAI WhisperPipeline) +openvino-genai = { version = "0.10.0", optional = true, features = ["runtime-linking"] } + # CPU count for thread detection num_cpus = "1.16" @@ -126,6 +129,8 @@ dolphin-tensorrt = ["dolphin", "ort/tensorrt"] omnilingual = ["onnx-common"] omnilingual-cuda = ["omnilingual", "ort/cuda"] omnilingual-tensorrt = ["omnilingual", "ort/tensorrt"] +# OpenVINO Whisper backend (Intel NPU/CPU/GPU 
via OpenVINO GenAI WhisperPipeline) +openvino-whisper = ["dep:openvino-genai"] [build-dependencies] clap = { version = "4", features = ["derive"] } diff --git a/README.md b/README.md index 223371d1..f5fb7995 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Hold a hotkey (default: ScrollLock) while speaking, release to transcribe and ou - **Works on any Linux desktop** - Uses compositor keybindings (Hyprland, Sway, River) with evdev fallback for X11 and other environments - **Fully offline by default** - Uses whisper.cpp for local transcription, with optional remote server support -- **7 transcription engines** - Whisper, Parakeet, Moonshine, SenseVoice, Paraformer, Dolphin, and Omnilingual (see [Supported Engines](#supported-engines) below) +- **8 transcription engines** - Whisper, Parakeet, Moonshine, SenseVoice, Paraformer, Dolphin, Omnilingual, and OpenVINO Whisper (see [Supported Engines](#supported-engines) below) - **Chinese, Japanese, Korean, and 1600+ languages** - SenseVoice, Dolphin, and Omnilingual add native support for CJK and other non-Latin scripts - **Meeting mode** - Continuous meeting transcription with chunked processing, speaker attribution, and export to Markdown, JSON, SRT, or VTT - **Fallback chain** - Types via wtype (best CJK support), falls back to dotool (keyboard layout support), ydotool, then clipboard @@ -331,11 +331,12 @@ Voxtype ships separate binaries for Whisper and ONNX engines. 
Use `voxtype setup | **Paraformer** | zh+en, zh+yue+en | Non-autoregressive (ONNX) | Chinese-English bilingual | | **Dolphin** | 40 languages + 22 Chinese dialects | CTC E-Branchformer (ONNX) | Eastern languages (no English) | | **Omnilingual** | 1600+ languages | wav2vec2 CTC (ONNX) | Low-resource and rare languages | +| **OpenVINO Whisper** | 99 languages | Encoder-decoder (OpenVINO) | Intel NPU (Lunar Lake), CPU/GPU fallback | To set the engine in your config: ```toml -engine = "sensevoice" # or: whisper, parakeet, moonshine, paraformer, dolphin, omnilingual +engine = "sensevoice" # or: whisper, parakeet, moonshine, paraformer, dolphin, omnilingual, openvino ``` Or override on the command line: @@ -392,6 +393,32 @@ cargo build --release --features gpu-metal cargo build --release --features gpu-hipblas ``` +### Intel NPU (Lunar Lake, Arrow Lake, Meteor Lake) + +Intel NPU acceleration uses OpenVINO GenAI with Whisper models exported in OpenVINO IR format: + +```bash +# Build with OpenVINO support +cargo build --release --features openvino-whisper + +# Install OpenVINO GenAI runtime libraries (required at runtime) +pip install openvino-genai + +# Download a model +voxtype setup model # Select an OpenVINO model + +# Configure +cat >> ~/.config/voxtype/config.toml << 'EOF' +engine = "openvino" + +[openvino] +model = "base.en-int8" +device = "NPU" +EOF +``` + +The NPU requires the Intel NPU driver (`intel-npu-driver`). Set `device = "CPU"` to fall back to CPU inference on systems without an NPU. + ### Performance Comparison Results vary by hardware. Example on AMD RX 6800: diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index a74104aa..5930e127 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -27,6 +27,7 @@ Selects which speech-to-text engine to use for transcription. 
- `whisper` - OpenAI Whisper via whisper.cpp (default, recommended) - `parakeet` - NVIDIA Parakeet via ONNX Runtime (experimental, requires special binary) - `moonshine` - Moonshine encoder-decoder transformer via ONNX Runtime (experimental, requires special binary) +- `openvino` - OpenVINO Whisper for Intel NPU/CPU/GPU (requires `--features openvino-whisper`) **Example:** ```toml @@ -1132,6 +1133,91 @@ on_demand_loading = false # Keep model loaded for fast response --- +## [openvino] + +Configuration for the OpenVINO Whisper speech-to-text engine. This section is only used when `engine = "openvino"`. Requires building with `--features openvino-whisper` and OpenVINO GenAI runtime libraries installed. + +### openvino.model + +**Type:** String +**Default:** `"base.en"` + +Model name or absolute path to directory containing OpenVINO IR model files. The directory must contain `openvino_encoder_model.xml/.bin`, `openvino_decoder_model.xml/.bin`, and `tokenizer.json`. + +Short names available: `"base.en-int8"`, `"base.en-fp16"`, `"small.en-int8"`, `"base-int8"`, `"large-v3-int8"`. + +### openvino.device + +**Type:** String +**Default:** `"NPU"` + +OpenVINO device to run inference on. Options: `"NPU"` (Intel Neural Processing Unit), `"CPU"`, `"GPU"`, `"AUTO"` (automatic device selection). + +### openvino.quantized + +**Type:** Boolean +**Default:** `true` + +Prefer int8 quantized model variants. Int8 models are smaller and run faster on NPU. Set to `false` for fp16 models which may have slightly higher accuracy. + +### openvino.language + +**Type:** String +**Default:** `"en"` + +Language code for transcription. Uses Whisper language codes: `"en"`, `"zh"`, `"fr"`, `"de"`, `"ja"`, `"ko"`, etc. + +### openvino.translate + +**Type:** Boolean +**Default:** `false` + +When true, translates non-English speech to English instead of transcribing in the source language. 
+ +### openvino.threads + +**Type:** Integer (optional) +**Default:** System-detected + +Number of CPU threads for inference. Only applies when `device = "CPU"`. + +### openvino.on_demand_loading + +**Type:** Boolean +**Default:** `false` + +When `true`, loads the model only when recording starts. When `false`, keeps the model loaded for faster response. + +### openvino.openvino_dir + +**Type:** String (optional) +**Default:** None (automatic discovery) +**Environment variable:** `VOXTYPE_OPENVINO_DIR` + +Path to the OpenVINO GenAI installation directory containing shared libraries. When set, voxtype loads `libopenvino_genai_c.so` from this directory instead of relying on automatic discovery via `LD_LIBRARY_PATH`, `OPENVINO_INSTALL_DIR`, or system package paths. + +The library is searched in these subdirectories: +- `/` +- `/runtime/lib/intel64/` +- `/runtime/lib/intel64/Release/` + +This is useful when you have a custom OpenVINO build or an installation in a non-standard location (for example, a `pip install openvino-genai` environment or a manual extract). + +**Example:** +```toml +engine = "openvino" + +[openvino] +model = "base.en-int8" +device = "NPU" +quantized = true +language = "en" +on_demand_loading = false +openvino_dir = "/opt/intel/openvino" +``` + +--- + ## [output] Controls how transcribed text is delivered. 
diff --git a/src/audio/dual_capture.rs b/src/audio/dual_capture.rs index 09b4383b..58ef94b1 100644 --- a/src/audio/dual_capture.rs +++ b/src/audio/dual_capture.rs @@ -59,7 +59,8 @@ impl ParecLoopback { fn start(&mut self) -> Result<(), AudioError> { let mut child = std::process::Command::new("parec") .args([ - "--device", &self.source, + "--device", + &self.source, "--format=float32le", "--channels=1", "--rate=16000", @@ -70,7 +71,9 @@ impl ParecLoopback { .spawn() .map_err(|e| AudioError::Connection(format!("Failed to start parec: {}", e)))?; - let mut stdout = child.stdout.take() + let mut stdout = child + .stdout + .take() .ok_or_else(|| AudioError::Connection("Failed to capture parec stdout".to_string()))?; self.child = Some(child); @@ -160,18 +163,16 @@ impl DualCapture { let loopback = match loopback_device { Some("disabled") | Some("") | None => None, - Some("auto") => { - match Self::find_monitor_source() { - Some(source) => { - tracing::info!("Auto-detected loopback source: {}", source); - Some(ParecLoopback::new(source)) - } - None => { - tracing::warn!("No monitor source found, using mic only"); - None - } + Some("auto") => match Self::find_monitor_source() { + Some(source) => { + tracing::info!("Auto-detected loopback source: {}", source); + Some(ParecLoopback::new(source)) } - } + None => { + tracing::warn!("No monitor source found, using mic only"); + None + } + }, Some(device) => { tracing::info!("Using configured loopback source: {}", device); Some(ParecLoopback::new(device.to_string())) diff --git a/src/audio/enhance.rs b/src/audio/enhance.rs index 2ca8242d..c7212432 100644 --- a/src/audio/enhance.rs +++ b/src/audio/enhance.rs @@ -80,20 +80,15 @@ impl GtcrnEnhancer { mix_data[i * 2 + 1] = bin.im; } - let mix_tensor = - Tensor::::from_array(([1usize, FREQ_BINS, 1, 2], mix_data)).map_err(|e| { - format!("Failed to create mix tensor: {}", e) - })?; + let mix_tensor = Tensor::::from_array(([1usize, FREQ_BINS, 1, 2], mix_data)) + .map_err(|e| 
format!("Failed to create mix tensor: {}", e))?; - let conv_tensor = Tensor::::from_array(( - [2usize, 1, 16, 16, 33], - conv_cache.clone(), - )) - .map_err(|e| format!("Failed to create conv_cache tensor: {}", e))?; + let conv_tensor = + Tensor::::from_array(([2usize, 1, 16, 16, 33], conv_cache.clone())) + .map_err(|e| format!("Failed to create conv_cache tensor: {}", e))?; - let tra_tensor = - Tensor::::from_array(([2usize, 3, 1, 1, 16], tra_cache.clone())) - .map_err(|e| format!("Failed to create tra_cache tensor: {}", e))?; + let tra_tensor = Tensor::::from_array(([2usize, 3, 1, 1, 16], tra_cache.clone())) + .map_err(|e| format!("Failed to create tra_cache tensor: {}", e))?; let inter_tensor = Tensor::::from_array(([2usize, 1, 33, 16], inter_cache.clone())) @@ -101,10 +96,7 @@ impl GtcrnEnhancer { let inputs: Vec<(std::borrow::Cow, ort::session::SessionInputValue)> = vec![ (std::borrow::Cow::Borrowed("mix"), mix_tensor.into()), - ( - std::borrow::Cow::Borrowed("conv_cache"), - conv_tensor.into(), - ), + (std::borrow::Cow::Borrowed("conv_cache"), conv_tensor.into()), (std::borrow::Cow::Borrowed("tra_cache"), tra_tensor.into()), ( std::borrow::Cow::Borrowed("inter_cache"), diff --git a/src/cli.rs b/src/cli.rs index 2ce4d64d..00fab804 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -68,7 +68,7 @@ pub struct Cli { #[arg(long, value_name = "MODEL")] pub model: Option, - /// Override transcription engine: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual + /// Override transcription engine: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual, openvino #[arg(long, value_name = "ENGINE")] pub engine: Option, @@ -93,7 +93,6 @@ pub struct Cli { pub model_modifier: Option, // -- Whisper -- - /// Disable context window optimization for short recordings #[arg(long, help_heading = "Whisper")] pub no_whisper_context_optimization: bool, @@ -148,7 +147,6 @@ pub struct Cli { pub remote_api_key: Option, // -- Audio -- - /// Audio input 
device name (or "default" for system default) #[arg(long, value_name = "DEVICE", help_heading = "Audio")] pub audio_device: Option, @@ -166,7 +164,6 @@ pub struct Cli { pub no_audio_feedback: bool, // -- Output -- - /// Delay before typing starts (ms), helps prevent first character drop #[arg(long, value_name = "MS", help_heading = "Output")] pub pre_type_delay: Option, @@ -219,7 +216,11 @@ pub struct Cli { pub fallback_to_clipboard: bool, /// Disable clipboard fallback - #[arg(long, conflicts_with = "fallback_to_clipboard", help_heading = "Output")] + #[arg( + long, + conflicts_with = "fallback_to_clipboard", + help_heading = "Output" + )] pub no_fallback_to_clipboard: bool, /// Enable spoken punctuation conversion (e.g., say "period" to get ".") @@ -259,7 +260,6 @@ pub struct Cli { pub pre_recording_command: Option, // -- VAD -- - /// Enable Voice Activity Detection (filter silence before transcription) #[arg(long, help_heading = "VAD")] pub vad: bool, @@ -291,7 +291,7 @@ pub enum Commands { /// Path to audio file file: std::path::PathBuf, - /// Override transcription engine: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual + /// Override transcription engine: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual, openvino #[arg(long, value_name = "ENGINE")] engine: Option, }, @@ -836,6 +836,21 @@ pub enum SetupAction { status: bool, }, + /// Manage NPU acceleration (Intel NPU via OpenVINO) + Npu { + /// Enable NPU acceleration (set engine to OpenVINO, download model if needed) + #[arg(long)] + enable: bool, + + /// Disable NPU acceleration (revert engine to Whisper) + #[arg(long)] + disable: bool, + + /// Show NPU hardware and configuration status + #[arg(long)] + status: bool, + }, + /// Switch between Whisper and ONNX transcription engines Onnx { /// Enable ONNX engine (switch to ONNX binary) diff --git a/src/config.rs b/src/config.rs index 6a1e4691..fada9487 100644 --- a/src/config.rs +++ b/src/config.rs @@ 
-325,6 +325,10 @@ pub struct Config { #[serde(default)] pub omnilingual: Option, + /// OpenVINO Whisper configuration (optional, only used when engine = "openvino") + #[serde(default)] + pub openvino: Option, + /// Text processing configuration (replacements, spoken punctuation) #[serde(default)] pub text: TextConfig, @@ -1108,6 +1112,72 @@ impl Default for OmnilingualConfig { } } +/// OpenVINO Whisper speech-to-text configuration (Intel NPU/CPU/GPU via OpenVINO GenAI) +/// Requires: cargo build --features openvino-whisper +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct OpenVinoConfig { + /// Model name or path to directory containing OpenVINO IR model files. + /// Names with quantization: "base.en-int8", "small.en-fp16", "tiny-int4", etc. + /// Short names also work: "base.en" (uses `quantized` to pick int8/fp16) + pub model: String, + + /// OpenVINO device to run inference on (default: "NPU") + /// Options: "NPU", "CPU", "GPU", "AUTO" + #[serde(default = "default_openvino_device")] + pub device: String, + + /// Use int8 quantized model variants for better NPU performance (default: true) + #[serde(default = "default_true")] + pub quantized: bool, + + /// Number of CPU threads for CPU inference (ignored for NPU/GPU) + #[serde(default)] + pub threads: Option, + + /// Language for transcription (default: "en") + /// Uses Whisper language codes: "en", "zh", "fr", "de", "ja", etc. + #[serde(default = "default_openvino_language")] + pub language: String, + + /// Enable translation to English (default: false) + #[serde(default)] + pub translate: bool, + + /// Load model on-demand when recording starts (true) or keep loaded (false) + #[serde(default = "default_on_demand_loading")] + pub on_demand_loading: bool, + + /// Path to the OpenVINO GenAI installation directory containing shared libraries. + /// When set, loads libopenvino_genai_c.so from this path instead of relying + /// on automatic discovery (LD_LIBRARY_PATH, OPENVINO_INSTALL_DIR, etc.). 
+ /// Also settable via VOXTYPE_OPENVINO_DIR environment variable. + #[serde(default)] + pub openvino_dir: Option, +} + +fn default_openvino_device() -> String { + "NPU".to_string() +} + +fn default_openvino_language() -> String { + "en".to_string() +} + +impl Default for OpenVinoConfig { + fn default() -> Self { + Self { + model: "base.en".to_string(), + device: default_openvino_device(), + quantized: true, + threads: None, + language: default_openvino_language(), + translate: false, + on_demand_loading: false, + openvino_dir: None, + } + } +} + /// Transcription engine selection (which ASR technology to use) #[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Default)] #[serde(rename_all = "lowercase")] @@ -1133,6 +1203,10 @@ pub enum TranscriptionEngine { /// Use Omnilingual (FunASR 50+ language CTC encoder via ONNX Runtime) /// Requires: cargo build --features omnilingual Omnilingual, + /// Use OpenVINO Whisper (Intel NPU/CPU/GPU via OpenVINO Runtime) + /// Requires: cargo build --features openvino-whisper + #[serde(rename = "openvino")] + OpenVino, } /// VAD backend selection @@ -1789,6 +1863,7 @@ impl Default for Config { paraformer: None, dolphin: None, omnilingual: None, + openvino: None, text: TextConfig::default(), vad: VadConfig::default(), status: StatusConfig::default(), @@ -1897,6 +1972,11 @@ impl Config { .as_ref() .map(|o| o.on_demand_loading) .unwrap_or(false), + TranscriptionEngine::OpenVino => self + .openvino + .as_ref() + .map(|o| o.on_demand_loading) + .unwrap_or(false), } } @@ -1934,6 +2014,11 @@ impl Config { .as_ref() .map(|o| o.model.as_str()) .unwrap_or("omnilingual (not configured)"), + TranscriptionEngine::OpenVino => self + .openvino + .as_ref() + .map(|o| o.model.as_str()) + .unwrap_or("openvino (not configured)"), } } @@ -2002,6 +2087,7 @@ pub fn load_config(path: Option<&Path>) -> Result { "paraformer" => config.engine = TranscriptionEngine::Paraformer, "dolphin" => config.engine = TranscriptionEngine::Dolphin, 
"omnilingual" => config.engine = TranscriptionEngine::Omnilingual, + "openvino" => config.engine = TranscriptionEngine::OpenVino, _ => tracing::warn!("Unknown VOXTYPE_ENGINE value: {}", engine), } } @@ -2096,6 +2182,12 @@ pub fn load_config(path: Option<&Path>) -> Result { config.text.smart_auto_submit = parse_bool_env(&val); } + // OpenVINO + if let Ok(dir) = std::env::var("VOXTYPE_OPENVINO_DIR") { + let openvino = config.openvino.get_or_insert_with(OpenVinoConfig::default); + openvino.openvino_dir = Some(dir); + } + Ok(config) } diff --git a/src/cpu.rs b/src/cpu.rs index a67ecd3f..5fb94941 100644 --- a/src/cpu.rs +++ b/src/cpu.rs @@ -36,7 +36,10 @@ pub fn install_sigill_handler() { } unsafe { - libc::signal(libc::SIGILL, sigill_handler as *const () as libc::sighandler_t); + libc::signal( + libc::SIGILL, + sigill_handler as *const () as libc::sighandler_t, + ); } } diff --git a/src/daemon.rs b/src/daemon.rs index c661e25a..fba1ce25 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -661,7 +661,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { if let Some(ref t) = transcriber_preloaded { Ok(t.clone()) } else { @@ -1567,7 +1568,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { // Parakeet/Moonshine uses its own model loading transcriber_preloaded = Some(Arc::from(crate::transcribe::create_transcriber( &self.config, @@ -1665,7 +1667,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | 
crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { let config = self.config.clone(); self.model_load_task = Some(tokio::task::spawn_blocking(move || { crate::transcribe::create_transcriber(&config).map(Arc::from) @@ -1688,7 +1691,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { if let Some(ref t) = transcriber_preloaded { let transcriber = t.clone(); tokio::task::spawn_blocking(move || { @@ -1848,7 +1852,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { let config = self.config.clone(); self.model_load_task = Some(tokio::task::spawn_blocking(move || { crate::transcribe::create_transcriber(&config).map(Arc::from) @@ -1871,7 +1876,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { if let Some(ref t) = transcriber_preloaded { let transcriber = t.clone(); tokio::task::spawn_blocking(move || { @@ -2280,7 +2286,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | 
crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { let config = self.config.clone(); self.model_load_task = Some(tokio::task::spawn_blocking(move || { crate::transcribe::create_transcriber(&config).map(Arc::from) @@ -2302,7 +2309,8 @@ impl Daemon { | crate::config::TranscriptionEngine::SenseVoice | crate::config::TranscriptionEngine::Paraformer | crate::config::TranscriptionEngine::Dolphin - | crate::config::TranscriptionEngine::Omnilingual => { + | crate::config::TranscriptionEngine::Omnilingual + | crate::config::TranscriptionEngine::OpenVino => { if let Some(ref t) = transcriber_preloaded { let transcriber = t.clone(); tokio::task::spawn_blocking(move || { diff --git a/src/main.rs b/src/main.rs index e13ba0a1..09017e8e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -135,9 +135,10 @@ async fn main() -> anyhow::Result<()> { "paraformer" => config.engine = config::TranscriptionEngine::Paraformer, "dolphin" => config.engine = config::TranscriptionEngine::Dolphin, "omnilingual" => config.engine = config::TranscriptionEngine::Omnilingual, + "openvino" => config.engine = config::TranscriptionEngine::OpenVino, _ => { eprintln!( - "Error: Invalid engine '{}'. Valid options: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual", + "Error: Invalid engine '{}'. Valid options: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual, openvino", engine ); std::process::exit(1); @@ -357,8 +358,9 @@ async fn main() -> anyhow::Result<()> { "paraformer" => config.engine = config::TranscriptionEngine::Paraformer, "dolphin" => config.engine = config::TranscriptionEngine::Dolphin, "omnilingual" => config.engine = config::TranscriptionEngine::Omnilingual, + "openvino" => config.engine = config::TranscriptionEngine::OpenVino, _ => { - eprintln!("Error: Invalid engine '{}'. 
Valid options: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual", engine_name); + eprintln!("Error: Invalid engine '{}'. Valid options: whisper, parakeet, moonshine, sensevoice, paraformer, dolphin, omnilingual, openvino", engine_name); std::process::exit(1); } } @@ -475,6 +477,22 @@ async fn main() -> anyhow::Result<()> { setup::gpu::show_status(); } } + Some(SetupAction::Npu { + enable, + disable, + status, + }) => { + warn_if_root("npu"); + if status { + setup::npu::show_status(); + } else if enable { + setup::npu::enable()?; + } else if disable { + setup::npu::disable()?; + } else { + setup::npu::show_status(); + } + } Some(SetupAction::Onnx { enable, disable, @@ -1222,8 +1240,8 @@ async fn show_config(config: &config::Config) -> anyhow::Result<()> { if path.is_dir() { let name = entry.file_name().to_string_lossy().to_string(); if name.contains("sensevoice") { - let has_model = path.join("model.int8.onnx").exists() - || path.join("model.onnx").exists(); + let has_model = + path.join("model.int8.onnx").exists() || path.join("model.onnx").exists(); let has_tokens = path.join("tokens.txt").exists(); if has_model && has_tokens { sensevoice_models.push(name); diff --git a/src/meeting/data.rs b/src/meeting/data.rs index efddf33a..a2aa0f55 100644 --- a/src/meeting/data.rs +++ b/src/meeting/data.rs @@ -57,7 +57,6 @@ pub enum AudioSource { Unknown, } - impl std::fmt::Display for AudioSource { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -361,7 +360,6 @@ pub enum MeetingStatus { Cancelled, } - /// Metadata for a meeting #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MeetingMetadata { diff --git a/src/meeting/diarization/ml.rs b/src/meeting/diarization/ml.rs index 743bccb7..077df1dd 100644 --- a/src/meeting/diarization/ml.rs +++ b/src/meeting/diarization/ml.rs @@ -10,7 +10,6 @@ use crate::meeting::data::AudioSource; use crate::meeting::TranscriptSegment; use std::collections::HashMap; use 
std::path::PathBuf; -use std::sync::Mutex; #[cfg(feature = "ml-diarization")] use ort::session::Session; @@ -140,12 +139,13 @@ impl MlDiarizer { #[cfg(feature = "ml-diarization")] pub fn extract_embedding(&self, samples: &[f32]) -> Result, String> { let mutex = self.session.as_ref().ok_or("Model not loaded")?; - let mut session = mutex.lock().map_err(|e| format!("Session lock poisoned: {}", e))?; + let mut session = mutex + .lock() + .map_err(|e| format!("Session lock poisoned: {}", e))?; // Prepare input tensor: [batch=1, samples] - let input_tensor = - Tensor::::from_array(([1usize, samples.len()], samples.to_vec())) - .map_err(|e| format!("Failed to create input tensor: {}", e))?; + let input_tensor = Tensor::::from_array(([1usize, samples.len()], samples.to_vec())) + .map_err(|e| format!("Failed to create input tensor: {}", e))?; // Run inference let outputs = session @@ -235,7 +235,7 @@ impl Default for MlDiarizer { impl Diarizer for MlDiarizer { fn diarize( &self, - samples: &[f32], + _samples: &[f32], _source: AudioSource, transcript_segments: &[TranscriptSegment], ) -> Vec { diff --git a/src/meeting/mod.rs b/src/meeting/mod.rs index 3f98ca2c..dafbcdae 100644 --- a/src/meeting/mod.rs +++ b/src/meeting/mod.rs @@ -246,7 +246,8 @@ impl MeetingDaemon { &mut self, samples: Vec, ) -> Result>> { - self.process_chunk_with_source(samples, AudioSource::Microphone).await + self.process_chunk_with_source(samples, AudioSource::Microphone) + .await } /// Process a chunk of audio with a specific source label diff --git a/src/meeting/state.rs b/src/meeting/state.rs index ee38b623..49572ed1 100644 --- a/src/meeting/state.rs +++ b/src/meeting/state.rs @@ -36,8 +36,7 @@ impl ChunkState { } /// Meeting transcription state -#[derive(Debug, Clone)] -#[derive(Default)] +#[derive(Debug, Clone, Default)] pub enum MeetingState { /// No meeting in progress #[default] @@ -74,7 +73,6 @@ pub enum MeetingState { }, } - impl MeetingState { /// Create a new idle state pub fn new() -> Self { 
diff --git a/src/output/mod.rs b/src/output/mod.rs index 22344a9f..24000f14 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -93,13 +93,14 @@ pub fn is_parakeet_binary_active() -> bool { /// Get the engine icon for notifications based on configured engine pub fn engine_icon(engine: crate::config::TranscriptionEngine) -> &'static str { match engine { - crate::config::TranscriptionEngine::Parakeet => "\u{1F99C}", // 🦜 + crate::config::TranscriptionEngine::Parakeet => "\u{1F99C}", // 🦜 crate::config::TranscriptionEngine::Whisper => "\u{1F5E3}\u{FE0F}", // 🗣️ - crate::config::TranscriptionEngine::Moonshine => "\u{1F319}", // 🌙 - crate::config::TranscriptionEngine::SenseVoice => "\u{1F442}", // 👂 - crate::config::TranscriptionEngine::Paraformer => "\u{1F4AC}", // 💬 - crate::config::TranscriptionEngine::Dolphin => "\u{1F42C}", // 🐬 - crate::config::TranscriptionEngine::Omnilingual => "\u{1F30D}", // 🌍 + crate::config::TranscriptionEngine::Moonshine => "\u{1F319}", // 🌙 + crate::config::TranscriptionEngine::SenseVoice => "\u{1F442}", // 👂 + crate::config::TranscriptionEngine::Paraformer => "\u{1F4AC}", // 💬 + crate::config::TranscriptionEngine::Dolphin => "\u{1F42C}", // 🐬 + crate::config::TranscriptionEngine::Omnilingual => "\u{1F30D}", // 🌍 + crate::config::TranscriptionEngine::OpenVino => "\u{1F9E0}", // 🧠 } } diff --git a/src/setup/gpu.rs b/src/setup/gpu.rs index 3870c151..deac16d2 100644 --- a/src/setup/gpu.rs +++ b/src/setup/gpu.rs @@ -573,10 +573,26 @@ pub fn show_status() { if is_parakeet { // Show ONNX backends (check both new and legacy names) let onnx_backends = [ - ("voxtype-onnx-avx2", "voxtype-parakeet-avx2", "ONNX CPU (AVX2)"), - ("voxtype-onnx-avx512", "voxtype-parakeet-avx512", "ONNX CPU (AVX-512)"), - ("voxtype-onnx-cuda", "voxtype-parakeet-cuda", "ONNX GPU (CUDA)"), - ("voxtype-onnx-rocm", "voxtype-parakeet-rocm", "ONNX GPU (ROCm)"), + ( + "voxtype-onnx-avx2", + "voxtype-parakeet-avx2", + "ONNX CPU (AVX2)", + ), + ( + "voxtype-onnx-avx512", + 
"voxtype-parakeet-avx512", + "ONNX CPU (AVX-512)", + ), + ( + "voxtype-onnx-cuda", + "voxtype-parakeet-cuda", + "ONNX GPU (CUDA)", + ), + ( + "voxtype-onnx-rocm", + "voxtype-parakeet-rocm", + "ONNX GPU (ROCm)", + ), ]; // Get current symlink target @@ -709,16 +725,15 @@ fn detect_best_parakeet_gpu_backend() -> Option<(&'static str, &'static str)> { let gpus = detect_gpus(); // Helper to find installed binary, preferring new name over legacy - let find_binary = - |new_name: &'static str, legacy_name: &'static str| -> Option<&'static str> { - if Path::new(VOXTYPE_LIB_DIR).join(new_name).exists() { - Some(new_name) - } else if Path::new(VOXTYPE_LIB_DIR).join(legacy_name).exists() { - Some(legacy_name) - } else { - None - } - }; + let find_binary = |new_name: &'static str, legacy_name: &'static str| -> Option<&'static str> { + if Path::new(VOXTYPE_LIB_DIR).join(new_name).exists() { + Some(new_name) + } else if Path::new(VOXTYPE_LIB_DIR).join(legacy_name).exists() { + Some(legacy_name) + } else { + None + } + }; // Check for AMD GPU and ROCm binary let has_amd = gpus.iter().any(|g| g.vendor == GpuVendor::Amd); @@ -905,23 +920,20 @@ fn detect_best_cpu_backend() -> Backend { /// Detect the best ONNX CPU backend for this system fn detect_best_parakeet_cpu_backend() -> Option<&'static str> { // Helper to find installed binary, preferring new name over legacy - let find_binary = - |new_name: &'static str, legacy_name: &'static str| -> Option<&'static str> { - if Path::new(VOXTYPE_LIB_DIR).join(new_name).exists() { - Some(new_name) - } else if Path::new(VOXTYPE_LIB_DIR).join(legacy_name).exists() { - Some(legacy_name) - } else { - None - } - }; + let find_binary = |new_name: &'static str, legacy_name: &'static str| -> Option<&'static str> { + if Path::new(VOXTYPE_LIB_DIR).join(new_name).exists() { + Some(new_name) + } else if Path::new(VOXTYPE_LIB_DIR).join(legacy_name).exists() { + Some(legacy_name) + } else { + None + } + }; // Check for AVX-512 support if let Ok(cpuinfo) = 
fs::read_to_string("/proc/cpuinfo") { if cpuinfo.contains("avx512f") { - if let Some(binary) = - find_binary("voxtype-onnx-avx512", "voxtype-parakeet-avx512") - { + if let Some(binary) = find_binary("voxtype-onnx-avx512", "voxtype-parakeet-avx512") { return Some(binary); } } diff --git a/src/setup/mod.rs b/src/setup/mod.rs index a80165a9..4dbb3733 100644 --- a/src/setup/mod.rs +++ b/src/setup/mod.rs @@ -6,6 +6,7 @@ //! - Interactive model selection //! - Output chain detection //! - GPU backend management +//! - NPU backend management (Intel NPU via OpenVINO) //! - Parakeet backend management //! - Compositor integration (modifier key fix) @@ -13,6 +14,7 @@ pub mod compositor; pub mod dms; pub mod gpu; pub mod model; +pub mod npu; pub mod parakeet; pub mod systemd; pub mod vad; @@ -460,21 +462,25 @@ pub async fn run_setup( let models_dir = Config::models_dir(); - // Check if model_override is a Parakeet or SenseVoice model + // Check if model_override is a Parakeet, SenseVoice, or OpenVINO model let is_parakeet = model_override .map(model::is_parakeet_model) .unwrap_or(false); let is_sensevoice = model_override .map(model::is_sensevoice_model) .unwrap_or(false); + let is_openvino = model_override + .map(model::is_openvino_model) + .unwrap_or(false); // Use model_override if provided, otherwise use config default (for Whisper) let _model_name: &str = match model_override { Some(name) => { - // Validate the model name (check Whisper, Parakeet, and SenseVoice) + // Validate the model name (check Whisper, Parakeet, SenseVoice, and OpenVINO) if !model::is_valid_model(name) && !model::is_parakeet_model(name) && !model::is_sensevoice_model(name) + && !model::is_openvino_model(name) { let valid = model::valid_model_names().join(", "); anyhow::bail!("Unknown model '{}'. 
Valid models are: {}", name, valid); @@ -484,7 +490,72 @@ pub async fn run_setup( None => &config.whisper.model, }; - if is_sensevoice { + if is_openvino { + // Handle OpenVINO model + #[allow(unused_variables)] + let model_name = model_override.unwrap(); // Safe: is_openvino implies Some + + if !quiet { + println!("\nOpenVINO Whisper model..."); + } + + #[cfg(not(feature = "openvino-whisper"))] + { + print_failure(&format!( + "OpenVINO model '{}' requires the 'openvino-whisper' feature", + model_name + )); + println!(" Rebuild with: cargo build --features openvino-whisper"); + anyhow::bail!("openvino-whisper feature not enabled"); + } + + #[cfg(feature = "openvino-whisper")] + { + let dir_name = model::openvino_dir_name(model_name).unwrap(); + let model_path = models_dir.join(dir_name); + let model_valid = + model_path.exists() && model::validate_openvino_model(&model_path).is_ok(); + + if model_valid { + if !quiet { + let size = std::fs::read_dir(&model_path) + .map(|entries| { + entries + .flatten() + .filter_map(|e| e.metadata().ok()) + .map(|m| m.len() as f64 / 1024.0 / 1024.0) + .sum::() + }) + .unwrap_or(0.0); + print_success(&format!("Model ready: {} ({:.0} MB)", model_name, size)); + } + // Update config to use OpenVINO + model::set_openvino_config(model_name)?; + if !quiet { + print_success(&format!( + "Config updated: engine = \"openvino\", model = \"{}\"", + model_name + )); + } + } else if download { + model::download_openvino_model(model_name)?; + // Update config to use OpenVINO + model::set_openvino_config(model_name)?; + if !quiet { + print_success(&format!( + "Config updated: engine = \"openvino\", model = \"{}\"", + model_name + )); + } + } else if !quiet { + print_info(&format!("Model '{}' not downloaded yet", model_name)); + println!( + " Run: voxtype setup --download --model {}", + model_name + ); + } + } + } else if is_sensevoice { // Handle SenseVoice model #[allow(unused_variables)] let model_name = model_override.unwrap(); // Safe: 
is_sensevoice implies Some @@ -831,6 +902,70 @@ pub async fn run_checks(config: &Config) -> anyhow::Result<()> { } } + // Check OpenVINO models + println!("\nOpenVINO Whisper Models:"); + + let mut openvino_models: Vec<(String, u64)> = Vec::new(); + if let Ok(entries) = std::fs::read_dir(&models_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let name = entry.file_name().to_string_lossy().to_string(); + if name.starts_with("openvino-whisper") + && model::validate_openvino_model(&path).is_ok() + { + let size = std::fs::read_dir(&path) + .map(|entries| { + entries + .flatten() + .filter_map(|e| e.metadata().ok()) + .map(|m| m.len()) + .sum() + }) + .unwrap_or(0); + openvino_models.push((name, size)); + } + } + } + } + + if openvino_models.is_empty() { + print_info("No OpenVINO models found"); + println!(" Download with: voxtype setup --download --model base.en-int8"); + } else { + for (name, size) in &openvino_models { + let size_mb = *size as f64 / 1024.0 / 1024.0; + print_success(&format!("Model '{}' installed ({:.0} MB)", name, size_mb)); + } + } + + // Check if OpenVINO is configured but model is missing + if config.engine == crate::config::TranscriptionEngine::OpenVino { + if let Some(ref openvino_config) = config.openvino { + let configured_model = &openvino_config.model; + // Resolve the dir name for the configured model + let dir_name = model::openvino_dir_name(configured_model); + let model_found = match dir_name { + Some(dir) => openvino_models.iter().any(|(name, _)| name == dir), + None => false, + }; + if !model_found { + print_failure(&format!( + "Configured OpenVINO model '{}' not found", + configured_model + )); + println!( + " Download with: voxtype setup --download --model {}", + configured_model + ); + all_ok = false; + } + } else { + print_failure("Engine set to 'openvino' but [openvino] config section is missing"); + all_ok = false; + } + } + // Check if Parakeet is configured but model is missing if 
config.engine == crate::config::TranscriptionEngine::Parakeet { if let Some(ref parakeet_config) = config.parakeet { diff --git a/src/setup/model.rs b/src/setup/model.rs index e5059240..4b5566fb 100644 --- a/src/setup/model.rs +++ b/src/setup/model.rs @@ -320,10 +320,7 @@ const SENSEVOICE_MODELS: &[SenseVoiceModelInfo] = &[ size_mb: 938, description: "Full precision (larger, slightly better accuracy)", languages: "zh/en/ja/ko/yue", - files: &[ - ("model.onnx", "model.onnx"), - ("tokens.txt", "tokens.txt"), - ], + files: &[("model.onnx", "model.onnx"), ("tokens.txt", "tokens.txt")], huggingface_repo: "csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", }, ]; @@ -384,20 +381,18 @@ struct DolphinModelInfo { huggingface_repo: &'static str, } -const DOLPHIN_MODELS: &[DolphinModelInfo] = &[ - DolphinModelInfo { - name: "base", - dir_name: "dolphin-base", - size_mb: 198, - description: "Dictation-optimized (recommended)", - languages: "en/zh", - files: &[ - ("model.int8.onnx", "model.int8.onnx"), - ("tokens.txt", "tokens.txt"), - ], - huggingface_repo: "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02", - }, -]; +const DOLPHIN_MODELS: &[DolphinModelInfo] = &[DolphinModelInfo { + name: "base", + dir_name: "dolphin-base", + size_mb: 198, + description: "Dictation-optimized (recommended)", + languages: "en/zh", + files: &[ + ("model.int8.onnx", "model.int8.onnx"), + ("tokens.txt", "tokens.txt"), + ], + huggingface_repo: "csukuangfj/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02", +}]; // ============================================================================= // Omnilingual Model Definitions @@ -413,20 +408,15 @@ struct OmnilingualModelInfo { huggingface_repo: &'static str, } -const OMNILINGUAL_MODELS: &[OmnilingualModelInfo] = &[ - OmnilingualModelInfo { - name: "300m", - dir_name: "omnilingual-300m", - size_mb: 3900, - description: "1600+ languages, 300M params", - languages: "1600+ langs", - files: &[ - ("model.onnx", 
"model.onnx"), - ("tokens.txt", "tokens.txt"), - ], - huggingface_repo: "csukuangfj/sherpa-onnx-omnilingual-asr-1600-languages-300M-ctc-2025-11-12", - }, -]; +const OMNILINGUAL_MODELS: &[OmnilingualModelInfo] = &[OmnilingualModelInfo { + name: "300m", + dir_name: "omnilingual-300m", + size_mb: 3900, + description: "1600+ languages, 300M params", + languages: "1600+ langs", + files: &[("model.onnx", "model.onnx"), ("tokens.txt", "tokens.txt")], + huggingface_repo: "csukuangfj/sherpa-onnx-omnilingual-asr-1600-languages-300M-ctc-2025-11-12", +}]; // ============================================================================= // Whisper Model Functions @@ -459,6 +449,7 @@ pub async fn interactive_select() -> anyhow::Result<()> { let is_paraformer_engine = matches!(config.engine, TranscriptionEngine::Paraformer); let is_dolphin_engine = matches!(config.engine, TranscriptionEngine::Dolphin); let is_omnilingual_engine = matches!(config.engine, TranscriptionEngine::Omnilingual); + let is_openvino_engine = matches!(config.engine, TranscriptionEngine::OpenVino); let current_whisper_model = &config.whisper.model; let current_parakeet_model = config.parakeet.as_ref().map(|p| p.model.as_str()); let current_moonshine_model = config.moonshine.as_ref().map(|m| m.model.as_str()); @@ -466,12 +457,14 @@ pub async fn interactive_select() -> anyhow::Result<()> { let current_paraformer_model = config.paraformer.as_ref().map(|p| p.model.as_str()); let current_dolphin_model = config.dolphin.as_ref().map(|d| d.model.as_str()); let current_omnilingual_model = config.omnilingual.as_ref().map(|o| o.model.as_str()); + let current_openvino_model = config.openvino.as_ref().map(|o| o.model.as_str()); let parakeet_available = cfg!(feature = "parakeet"); let moonshine_available = cfg!(feature = "moonshine"); let sensevoice_available = cfg!(feature = "sensevoice"); let paraformer_available = cfg!(feature = "paraformer"); let dolphin_available = cfg!(feature = "dolphin"); let omnilingual_available = 
cfg!(feature = "omnilingual"); + let openvino_available = cfg!(feature = "openvino-whisper"); let whisper_count = MODELS.len(); let parakeet_count = PARAKEET_MODELS.len(); let moonshine_count = MOONSHINE_MODELS.len(); @@ -479,6 +472,7 @@ pub async fn interactive_select() -> anyhow::Result<()> { let paraformer_count = PARAFORMER_MODELS.len(); let dolphin_count = DOLPHIN_MODELS.len(); let omnilingual_count = OMNILINGUAL_MODELS.len(); + let openvino_count = OPENVINO_MODELS.len(); let available_count = |available: bool, count: usize| if available { count } else { 0 }; let total_count = whisper_count @@ -487,7 +481,8 @@ pub async fn interactive_select() -> anyhow::Result<()> { + available_count(sensevoice_available, sensevoice_count) + available_count(paraformer_available, paraformer_count) + available_count(dolphin_available, dolphin_count) - + available_count(omnilingual_available, omnilingual_count); + + available_count(omnilingual_available, omnilingual_count) + + available_count(openvino_available, openvino_count); // --- Whisper Section --- println!("--- Whisper (OpenAI, 99+ languages) ---\n"); @@ -635,8 +630,8 @@ pub async fn interactive_select() -> anyhow::Result<()> { } // --- Paraformer Section --- - let paraformer_offset = sensevoice_offset - + available_count(sensevoice_available, sensevoice_count); + let paraformer_offset = + sensevoice_offset + available_count(sensevoice_available, sensevoice_count); println!("\n--- Paraformer (FunASR, Chinese + English) ---\n"); if paraformer_available { @@ -669,8 +664,8 @@ pub async fn interactive_select() -> anyhow::Result<()> { } // --- Dolphin Section --- - let dolphin_offset = paraformer_offset - + available_count(paraformer_available, paraformer_count); + let dolphin_offset = + paraformer_offset + available_count(paraformer_available, paraformer_count); println!("\n--- Dolphin (dictation-optimized CTC) ---\n"); if dolphin_available { @@ -703,8 +698,7 @@ pub async fn interactive_select() -> anyhow::Result<()> { } // 
--- Omnilingual Section --- - let omnilingual_offset = dolphin_offset - + available_count(dolphin_available, dolphin_count); + let omnilingual_offset = dolphin_offset + available_count(dolphin_available, dolphin_count); println!("\n--- Omnilingual (FunASR, 50+ languages) ---\n"); if omnilingual_available { @@ -736,6 +730,46 @@ pub async fn interactive_select() -> anyhow::Result<()> { println!(" \x1b[90m(not available - rebuild with --features omnilingual)\x1b[0m"); } + // --- OpenVINO Section --- + let openvino_offset = + omnilingual_offset + available_count(omnilingual_available, omnilingual_count); + println!("\n--- OpenVINO Whisper (Intel NPU/CPU/GPU via OpenVINO) ---\n"); + + if openvino_available { + for (i, model) in OPENVINO_MODELS.iter().enumerate() { + let model_path = models_dir.join(model.dir_name); + let installed = model_path.exists() && validate_openvino_model(&model_path).is_ok(); + + let is_current = is_openvino_engine && current_openvino_model == Some(model.name); + let star = if is_current { "*" } else { " " }; + + let status = if installed { + "\x1b[32m[installed]\x1b[0m" + } else { + "" + }; + + let lang = if model.name.contains(".en") { + "en" + } else { + "multi" + }; + + println!( + " {}[{:>2}] {:<28} (~{:>4} MB) {} - {} {}", + star, + openvino_offset + i + 1, + model.name, + model.size_mb, + lang, + model.description, + status + ); + } + } else { + println!(" \x1b[90m(not available - rebuild with --features openvino-whisper)\x1b[0m"); + } + println!("\n [ 0] Cancel\n"); // Get user selection @@ -766,13 +800,43 @@ pub async fn interactive_select() -> anyhow::Result<()> { handle_sensevoice_selection(sensevoice_index).await } else if paraformer_available && selection <= paraformer_offset + paraformer_count { let idx = selection - paraformer_offset; - handle_onnx_engine_selection("paraformer", PARAFORMER_MODELS.iter().map(|m| (m.name, m.dir_name, m.size_mb, m.files, m.huggingface_repo)).collect(), idx, validate_onnx_ctc_model).await + 
handle_onnx_engine_selection( + "paraformer", + PARAFORMER_MODELS + .iter() + .map(|m| (m.name, m.dir_name, m.size_mb, m.files, m.huggingface_repo)) + .collect(), + idx, + validate_onnx_ctc_model, + ) + .await } else if dolphin_available && selection <= dolphin_offset + dolphin_count { let idx = selection - dolphin_offset; - handle_onnx_engine_selection("dolphin", DOLPHIN_MODELS.iter().map(|m| (m.name, m.dir_name, m.size_mb, m.files, m.huggingface_repo)).collect(), idx, validate_onnx_ctc_model).await + handle_onnx_engine_selection( + "dolphin", + DOLPHIN_MODELS + .iter() + .map(|m| (m.name, m.dir_name, m.size_mb, m.files, m.huggingface_repo)) + .collect(), + idx, + validate_onnx_ctc_model, + ) + .await } else if omnilingual_available && selection <= omnilingual_offset + omnilingual_count { let idx = selection - omnilingual_offset; - handle_onnx_engine_selection("omnilingual", OMNILINGUAL_MODELS.iter().map(|m| (m.name, m.dir_name, m.size_mb, m.files, m.huggingface_repo)).collect(), idx, validate_onnx_ctc_model).await + handle_onnx_engine_selection( + "omnilingual", + OMNILINGUAL_MODELS + .iter() + .map(|m| (m.name, m.dir_name, m.size_mb, m.files, m.huggingface_repo)) + .collect(), + idx, + validate_onnx_ctc_model, + ) + .await + } else if openvino_available && selection <= openvino_offset + openvino_count { + let idx = selection - openvino_offset; + handle_openvino_selection(idx).await } else { println!("\nInvalid selection."); Ok(()) @@ -1161,7 +1225,39 @@ pub fn list_installed() { } if !found { - println!(" No models installed."); + println!(" No Whisper models installed."); + } + + // List installed OpenVINO models + println!("\nInstalled OpenVINO Whisper Models\n"); + println!("=================================\n"); + + let mut openvino_found = false; + + for model in OPENVINO_MODELS { + let model_path = models_dir.join(model.dir_name); + + if model_path.exists() && validate_openvino_model(&model_path).is_ok() { + let size = std::fs::read_dir(&model_path) + 
.map(|entries| { + entries + .flatten() + .filter_map(|e| e.metadata().ok()) + .map(|m| m.len() as f64 / 1024.0 / 1024.0) + .sum::() + }) + .unwrap_or(0.0); + + println!(" {} ({:.0} MB) - {}", model.name, size, model.description); + openvino_found = true; + } + } + + if !openvino_found { + println!(" No OpenVINO models installed."); + } + + if !found && !openvino_found { println!("\n Run 'voxtype setup model' to download a model."); } } @@ -1862,8 +1958,7 @@ pub fn validate_sensevoice_model(path: &Path) -> anyhow::Result<()> { anyhow::bail!("Model directory does not exist: {:?}", path); } - let has_model = - path.join("model.int8.onnx").exists() || path.join("model.onnx").exists(); + let has_model = path.join("model.int8.onnx").exists() || path.join("model.onnx").exists(); let has_tokens = path.join("tokens.txt").exists(); if has_model && has_tokens { @@ -2128,6 +2223,79 @@ fn validate_onnx_ctc_model(path: &Path) -> anyhow::Result<()> { } /// Generic handler for ONNX engine model selection (download/config/restart) +/// Handle OpenVINO model selection (download/config) +async fn handle_openvino_selection(selection: usize) -> anyhow::Result<()> { + let models_dir = Config::models_dir(); + + if selection == 0 || selection > OPENVINO_MODELS.len() { + println!("\nCancelled."); + return Ok(()); + } + + let model = &OPENVINO_MODELS[selection - 1]; + let model_path = models_dir.join(model.dir_name); + + // Check if already installed + if model_path.exists() && validate_openvino_model(&model_path).is_ok() { + println!("\nModel '{}' is already installed.\n", model.name); + println!(" [1] Set as default model (update config)"); + println!(" [2] Re-download"); + println!(" [0] Cancel\n"); + + print!("Select option [1]: "); + io::stdout().flush()?; + + let mut choice = String::new(); + io::stdin().read_line(&mut choice)?; + let choice = choice.trim(); + + match choice { + "" | "1" => { + update_config_openvino(model.name)?; + restart_daemon_if_running().await; + return Ok(()); 
+ } + "2" => { + // Continue to download below + } + _ => { + println!("Cancelled."); + return Ok(()); + } + } + } + + // Download the model + download_openvino_model(model.name)?; + + // Update config and restart daemon + update_config_openvino(model.name)?; + restart_daemon_if_running().await; + + Ok(()) +} + +/// Update config to use OpenVINO engine with a specific model (with output) +fn update_config_openvino(model_name: &str) -> anyhow::Result<()> { + if let Some(config_path) = Config::default_path() { + if config_path.exists() { + let content = std::fs::read_to_string(&config_path)?; + let updated = update_openvino_in_config(&content, model_name); + std::fs::write(&config_path, updated)?; + print_success(&format!( + "Config updated: engine = \"openvino\", model = \"{}\"", + model_name + )); + Ok(()) + } else { + print_info("No config file found. Run 'voxtype setup' first."); + Ok(()) + } + } else { + anyhow::bail!("Could not determine config path") + } +} + async fn handle_onnx_engine_selection( engine_name: &str, models: Vec<(&str, &str, u32, &[(&str, &str)], &str)>, @@ -2179,7 +2347,10 @@ async fn handle_onnx_engine_selection( // Validate validate_fn(&model_path)?; - print_success(&format!("Model '{}' downloaded to {:?}", dir_name, model_path)); + print_success(&format!( + "Model '{}' downloaded to {:?}", + dir_name, model_path + )); // Update config and restart daemon update_config_engine(engine_name, name)?; @@ -2210,10 +2381,7 @@ fn download_onnx_model( continue; } - let url = format!( - "https://huggingface.co/{}/resolve/main/{}", - repo, repo_path - ); + let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, repo_path); println!("Downloading {}...", local_filename); @@ -2328,9 +2496,520 @@ fn update_engine_in_config(config: &str, engine_name: &str, model_name: &str) -> } if !has_section { - result.push_str(&format!("\n[{}]\nmodel = \"{}\"\n", engine_name, model_name)); + result.push_str(&format!( + "\n[{}]\nmodel = \"{}\"\n", + 
engine_name, model_name + )); + } + + if !config.ends_with('\n') && result.ends_with('\n') { + result.pop(); + } + + result +} + +// --- OpenVINO Whisper Models --- + +struct OpenVinoModelInfo { + /// Short config name (e.g., "base.en-int8", "small-fp16") + name: &'static str, + /// Directory name under models/ + dir_name: &'static str, + size_mb: u32, + description: &'static str, + /// Quantization type + quantization: &'static str, + huggingface_repo: &'static str, +} + +/// Files common to all OpenVINO Whisper model repos +const OPENVINO_MODEL_FILES: &[&str] = &[ + "openvino_encoder_model.xml", + "openvino_encoder_model.bin", + "openvino_decoder_model.xml", + "openvino_decoder_model.bin", + "openvino_tokenizer.xml", + "openvino_tokenizer.bin", + "openvino_detokenizer.xml", + "openvino_detokenizer.bin", + "tokenizer.json", + "config.json", + "generation_config.json", +]; + +const OPENVINO_MODELS: &[OpenVinoModelInfo] = &[ + // --- Tiny models --- + OpenVinoModelInfo { + name: "tiny-int4", + dir_name: "openvino-whisper-tiny-int4-ov", + size_mb: 25, + description: "Multilingual, int4 quantized (smallest)", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-tiny-int4-ov", + }, + OpenVinoModelInfo { + name: "tiny-int8", + dir_name: "openvino-whisper-tiny-int8-ov", + size_mb: 50, + description: "Multilingual, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-tiny-int8-ov", + }, + OpenVinoModelInfo { + name: "tiny-fp16", + dir_name: "openvino-whisper-tiny-fp16-ov", + size_mb: 80, + description: "Multilingual, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-tiny-fp16-ov", + }, + OpenVinoModelInfo { + name: "tiny.en-int4", + dir_name: "openvino-whisper-tiny.en-int4-ov", + size_mb: 25, + description: "English, int4 quantized (smallest)", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-tiny.en-int4-ov", + }, + OpenVinoModelInfo { + name: "tiny.en-int8", + dir_name: 
"openvino-whisper-tiny.en-int8-ov", + size_mb: 50, + description: "English, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-tiny.en-int8-ov", + }, + OpenVinoModelInfo { + name: "tiny.en-fp16", + dir_name: "openvino-whisper-tiny.en-fp16-ov", + size_mb: 80, + description: "English, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-tiny.en-fp16-ov", + }, + // --- Base models --- + OpenVinoModelInfo { + name: "base-int4", + dir_name: "openvino-whisper-base-int4-ov", + size_mb: 55, + description: "Multilingual, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-base-int4-ov", + }, + OpenVinoModelInfo { + name: "base-int8", + dir_name: "openvino-whisper-base-int8-ov", + size_mb: 100, + description: "Multilingual, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-base-int8-ov", + }, + OpenVinoModelInfo { + name: "base-fp16", + dir_name: "openvino-whisper-base-fp16-ov", + size_mb: 145, + description: "Multilingual, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-base-fp16-ov", + }, + OpenVinoModelInfo { + name: "base.en-int4", + dir_name: "openvino-whisper-base.en-int4-ov", + size_mb: 55, + description: "English, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-base.en-int4-ov", + }, + OpenVinoModelInfo { + name: "base.en-int8", + dir_name: "openvino-whisper-base.en-int8-ov", + size_mb: 100, + description: "English, int8 quantized (best for NPU)", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-base.en-int8-ov", + }, + OpenVinoModelInfo { + name: "base.en-fp16", + dir_name: "openvino-whisper-base.en-fp16-ov", + size_mb: 145, + description: "English, fp16 (higher accuracy)", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-base.en-fp16-ov", + }, + // --- Small models --- + OpenVinoModelInfo { + name: "small-int4", + dir_name: "openvino-whisper-small-int4-ov", + size_mb: 160, + description: 
"Multilingual, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-small-int4-ov", + }, + OpenVinoModelInfo { + name: "small-int8", + dir_name: "openvino-whisper-small-int8-ov", + size_mb: 300, + description: "Multilingual, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-small-int8-ov", + }, + OpenVinoModelInfo { + name: "small-fp16", + dir_name: "openvino-whisper-small-fp16-ov", + size_mb: 470, + description: "Multilingual, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-small-fp16-ov", + }, + OpenVinoModelInfo { + name: "small.en-int4", + dir_name: "openvino-whisper-small.en-int4-ov", + size_mb: 160, + description: "English, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-small.en-int4-ov", + }, + OpenVinoModelInfo { + name: "small.en-int8", + dir_name: "openvino-whisper-small.en-int8-ov", + size_mb: 300, + description: "English, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-small.en-int8-ov", + }, + OpenVinoModelInfo { + name: "small.en-fp16", + dir_name: "openvino-whisper-small.en-fp16-ov", + size_mb: 470, + description: "English, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-small.en-fp16-ov", + }, + // --- Medium models --- + OpenVinoModelInfo { + name: "medium-int4", + dir_name: "openvino-whisper-medium-int4-ov", + size_mb: 400, + description: "Multilingual, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-medium-int4-ov", + }, + OpenVinoModelInfo { + name: "medium-int8", + dir_name: "openvino-whisper-medium-int8-ov", + size_mb: 780, + description: "Multilingual, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-medium-int8-ov", + }, + OpenVinoModelInfo { + name: "medium-fp16", + dir_name: "openvino-whisper-medium-fp16-ov", + size_mb: 1500, + description: "Multilingual, fp16", + quantization: "fp16", + huggingface_repo: 
"OpenVINO/whisper-medium-fp16-ov", + }, + OpenVinoModelInfo { + name: "medium.en-int4", + dir_name: "openvino-whisper-medium.en-int4-ov", + size_mb: 400, + description: "English, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-medium.en-int4-ov", + }, + OpenVinoModelInfo { + name: "medium.en-int8", + dir_name: "openvino-whisper-medium.en-int8-ov", + size_mb: 780, + description: "English, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-medium.en-int8-ov", + }, + OpenVinoModelInfo { + name: "medium.en-fp16", + dir_name: "openvino-whisper-medium.en-fp16-ov", + size_mb: 1500, + description: "English, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-medium.en-fp16-ov", + }, + // --- Large-v3 models --- + OpenVinoModelInfo { + name: "large-v3-int4", + dir_name: "openvino-whisper-large-v3-int4-ov", + size_mb: 850, + description: "Multilingual, best accuracy, int4 quantized", + quantization: "int4", + huggingface_repo: "OpenVINO/whisper-large-v3-int4-ov", + }, + OpenVinoModelInfo { + name: "large-v3-int8", + dir_name: "openvino-whisper-large-v3-int8-ov", + size_mb: 1600, + description: "Multilingual, best accuracy, int8 quantized", + quantization: "int8", + huggingface_repo: "OpenVINO/whisper-large-v3-int8-ov", + }, + OpenVinoModelInfo { + name: "large-v3-fp16", + dir_name: "openvino-whisper-large-v3-fp16-ov", + size_mb: 3100, + description: "Multilingual, best accuracy, fp16", + quantization: "fp16", + huggingface_repo: "OpenVINO/whisper-large-v3-fp16-ov", + }, + // --- Distil-whisper models (distilled, faster) --- + OpenVinoModelInfo { + name: "distil-large-v2-int4", + dir_name: "openvino-distil-whisper-large-v2-int4-ov", + size_mb: 500, + description: "Distilled large-v2, int4 quantized (fast)", + quantization: "int4", + huggingface_repo: "OpenVINO/distil-whisper-large-v2-int4-ov", + }, + OpenVinoModelInfo { + name: "distil-large-v2-int8", + dir_name: 
"openvino-distil-whisper-large-v2-int8-ov",
        size_mb: 950,
        description: "Distilled large-v2, int8 quantized (fast)",
        quantization: "int8",
        huggingface_repo: "OpenVINO/distil-whisper-large-v2-int8-ov",
    },
    OpenVinoModelInfo {
        name: "distil-large-v2-fp16",
        dir_name: "openvino-distil-whisper-large-v2-fp16-ov",
        size_mb: 1800,
        description: "Distilled large-v2, fp16 (fast)",
        quantization: "fp16",
        huggingface_repo: "OpenVINO/distil-whisper-large-v2-fp16-ov",
    },
    OpenVinoModelInfo {
        name: "distil-large-v3-int4",
        dir_name: "openvino-distil-whisper-large-v3-int4-ov",
        size_mb: 400,
        description: "Distilled large-v3, int4 quantized (fast)",
        quantization: "int4",
        huggingface_repo: "OpenVINO/distil-whisper-large-v3-int4-ov",
    },
    OpenVinoModelInfo {
        name: "distil-large-v3-int8",
        dir_name: "openvino-distil-whisper-large-v3-int8-ov",
        size_mb: 750,
        description: "Distilled large-v3, int8 quantized (fast)",
        quantization: "int8",
        huggingface_repo: "OpenVINO/distil-whisper-large-v3-int8-ov",
    },
    OpenVinoModelInfo {
        name: "distil-large-v3-fp16",
        dir_name: "openvino-distil-whisper-large-v3-fp16-ov",
        size_mb: 1400,
        description: "Distilled large-v3, fp16 (fast)",
        quantization: "fp16",
        huggingface_repo: "OpenVINO/distil-whisper-large-v3-fp16-ov",
    },
];

/// Download an OpenVINO Whisper model by name.
///
/// Looks `model_name` up in `OPENVINO_MODELS`, creates the model directory
/// under the configured models dir, downloads each file in
/// `OPENVINO_MODEL_FILES` from the model's HuggingFace repo via `curl`
/// (skipping files that already exist on disk), and finally validates that
/// all required model files are present.
///
/// # Errors
/// Fails when the name is unknown, the destination path is not valid UTF-8,
/// a download fails, `curl` is unavailable, or validation finds missing files.
pub fn download_openvino_model(model_name: &str) -> anyhow::Result<()> {
    let model = OPENVINO_MODELS
        .iter()
        .find(|m| m.name == model_name)
        .ok_or_else(|| {
            let valid: Vec<&str> = OPENVINO_MODELS.iter().map(|m| m.name).collect();
            anyhow::anyhow!(
                "Unknown OpenVINO model: {}. Valid options: {}",
                model_name,
                valid.join(", ")
            )
        })?;

    let models_dir = Config::models_dir();
    let model_path = models_dir.join(model.dir_name);

    std::fs::create_dir_all(&model_path)?;

    println!(
        "\nDownloading OpenVINO Whisper {} (~{} MB, {})...\n",
        model.name, model.size_mb, model.quantization
    );

    for filename in OPENVINO_MODEL_FILES {
        let file_path = model_path.join(filename);

        // Resumable behavior: a file that already exists is assumed complete;
        // final validation below catches truly missing files.
        if file_path.exists() {
            println!(" {} already exists, skipping", filename);
            continue;
        }

        let url = format!(
            "https://huggingface.co/{}/resolve/main/{}",
            model.huggingface_repo, filename
        );

        println!(" Downloading {}...", filename);

        // Fix: the previous fallback (`unwrap_or("file")`) silently downloaded
        // into ./file when the path was not valid UTF-8. Fail loudly instead.
        let file_path_str = file_path.to_str().ok_or_else(|| {
            anyhow::anyhow!("Model path is not valid UTF-8: {:?}", file_path)
        })?;

        let status = Command::new("curl")
            .args(["-L", "--progress-bar", "-o", file_path_str, &url])
            .status();

        match status {
            Ok(exit_status) if exit_status.success() => {}
            Ok(exit_status) => {
                print_failure(&format!(
                    "Download failed: curl exited with code {}",
                    exit_status.code().unwrap_or(-1)
                ));
                // Remove the partial file so a retry starts clean.
                let _ = std::fs::remove_file(&file_path);
                anyhow::bail!("Download failed for {}", filename)
            }
            Err(e) => {
                print_failure(&format!("Failed to run curl: {}", e));
                print_info("Please ensure curl is installed");
                anyhow::bail!("curl not available: {}", e)
            }
        }
    }

    // Validate critical files
    validate_openvino_model(&model_path).inspect_err(|_| {
        print_failure("Model download incomplete. Missing required files.");
    })?;

    print_success(&format!(
        "OpenVINO model '{}' downloaded to {:?}",
        model.name, model_path
    ));

    Ok(())
}

/// Get list of valid OpenVINO model names
pub fn valid_openvino_model_names() -> Vec<&'static str> {
    OPENVINO_MODELS.iter().map(|m| m.name).collect()
}

/// Check if a model name is an OpenVINO model
pub fn is_openvino_model(name: &str) -> bool {
    OPENVINO_MODELS.iter().any(|m| m.name == name)
}

/// Get the directory name for an OpenVINO model
pub fn openvino_dir_name(name: &str) -> Option<&'static str> {
    OPENVINO_MODELS
        .iter()
        .find(|m| m.name == name)
        .map(|m| m.dir_name)
}

/// Validate that an OpenVINO model directory has required files
/// (encoder/decoder IR pairs plus the tokenizer).
pub fn validate_openvino_model(path: &std::path::Path) -> anyhow::Result<()> {
    let required = [
        "openvino_encoder_model.xml",
        "openvino_encoder_model.bin",
        "openvino_decoder_model.xml",
        "openvino_decoder_model.bin",
        "tokenizer.json",
    ];
    for file in &required {
        if !path.join(file).exists() {
            anyhow::bail!("Missing required file: {}", file);
        }
    }
    Ok(())
}

/// Update config to use OpenVINO engine with a specific model.
/// No-op (returns Ok) when the config file does not exist yet.
pub fn set_openvino_config(model_name: &str) -> anyhow::Result<()> {
    if let Some(config_path) = Config::default_path() {
        if config_path.exists() {
            let content = std::fs::read_to_string(&config_path)?;
            let updated = update_openvino_in_config(&content, model_name);
            std::fs::write(&config_path, updated)?;
        }
        Ok(())
    } else {
        anyhow::bail!("Could not determine config path")
    }
}

/// Rewrite config text so `engine = "openvino"` at top level and
/// `model = "<model_name>"` inside `[openvino]`, adding either if missing.
/// Pure string transform; preserves all other lines and comments.
fn update_openvino_in_config(config: &str, model_name: &str) -> String {
    let mut result = String::new();
    let mut has_engine_line = false;
    let mut has_openvino_section = false;
    let mut in_openvino_section = false;
    let mut openvino_model_updated = false;

    for line in config.lines() {
        let trimmed = line.trim();

        // Track sections
        if 
trimmed.starts_with('[') {
            // If we were in openvino section and didn't update model, add it
            // (handles an [openvino] section that had no model key).
            if in_openvino_section && !openvino_model_updated {
                result.push_str(&format!("model = \"{}\"\n", model_name));
                openvino_model_updated = true;
            }
            in_openvino_section = trimmed == "[openvino]";
            if in_openvino_section {
                has_openvino_section = true;
            }
        }

        // Update or add engine line at the top level
        // NOTE(review): starts_with("engine") would also match keys like
        // "engine_x"; assumes no such keys exist in the schema — verify.
        if trimmed.starts_with("engine") && !trimmed.starts_with('[') {
            result.push_str("engine = \"openvino\"\n");
            has_engine_line = true;
        }
        // Update model line in openvino section
        else if in_openvino_section && trimmed.starts_with("model") {
            result.push_str(&format!("model = \"{}\"\n", model_name));
            openvino_model_updated = true;
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }

    // If we were in openvino section at EOF and didn't update model, add it
    if in_openvino_section && !openvino_model_updated {
        result.push_str(&format!("model = \"{}\"\n", model_name));
    }

    // Add engine line if not present
    // Inserted before the first non-empty, non-comment line so it lands at
    // the top level, ahead of any [section].
    if !has_engine_line {
        let mut new_result = String::new();
        let mut engine_added = false;
        for line in result.lines() {
            let trimmed = line.trim();
            if !engine_added
                && !trimmed.is_empty()
                && !trimmed.starts_with('#')
                && !trimmed.starts_with("engine")
            {
                new_result.push_str("engine = \"openvino\"\n\n");
                engine_added = true;
            }
            new_result.push_str(line);
            new_result.push('\n');
        }
        result = new_result;
    }

    // Add [openvino] section if not present
    if !has_openvino_section {
        result.push_str(&format!("\n[openvino]\nmodel = \"{}\"\n", model_name));
    }

    // Remove trailing newline if original didn't have one
    if !config.ends_with('\n') && result.ends_with('\n') {
        result.pop();
    }
diff --git a/src/setup/npu.rs b/src/setup/npu.rs
new file mode 100644
index 00000000..ae966ea2
--- /dev/null
+++ b/src/setup/npu.rs
@@ -0,0 +1,291 @@
//! NPU backend management for voxtype
//!
//!
Manages Intel NPU acceleration via OpenVINO. Unlike GPU (which switches binaries
//! via symlinks), NPU is config-driven: it sets `engine = "openvino"` in config.toml
//! and uses the same binary compiled with the `openvino-whisper` feature.

use crate::config::Config;
use std::path::Path;

/// Model downloaded and configured by `enable()` when none is present yet.
const DEFAULT_OPENVINO_MODEL: &str = "base.en-int8";

/// Check if NPU hardware is present (/dev/accel/accel* devices)
///
/// True only when /dev/accel exists, is readable, and contains at least one
/// entry whose file name starts with "accel". Any I/O error reads as "absent".
fn detect_npu_hardware() -> bool {
    Path::new("/dev/accel").is_dir()
        && std::fs::read_dir("/dev/accel")
            .map(|entries| {
                entries.filter_map(|e| e.ok()).any(|e| {
                    e.file_name()
                        .to_str()
                        .map(|n| n.starts_with("accel"))
                        .unwrap_or(false)
                })
            })
            .unwrap_or(false)
}

/// Check if the intel-npu-driver kernel module is loaded
// Probes sysfs for the intel_vpu module directory (the module shipped by
// intel-npu-driver).
fn check_npu_driver() -> bool {
    Path::new("/sys/module/intel_vpu").exists()
}

/// Check if an OpenVINO model is already downloaded
// Resolves the default model's directory name and validates its files on disk.
fn has_openvino_model() -> bool {
    let models_dir = Config::models_dir();
    if let Some(dir_name) = super::model::openvino_dir_name(DEFAULT_OPENVINO_MODEL) {
        let model_path = models_dir.join(dir_name);
        super::model::validate_openvino_model(&model_path).is_ok()
    } else {
        false
    }
}

/// Update the engine field in config.toml to a specific value, preserving everything else
fn update_engine_in_config(content: &str, engine: &str) -> String {
    let mut result = String::new();
    let mut engine_updated = false;

    for line in content.lines() {
        let trimmed = line.trim();
        // Replace any top-level `engine = ...` line; section headers can't
        // match because they start with '['.
        if trimmed.starts_with("engine") && trimmed.contains('=') && !trimmed.starts_with('[') {
            result.push_str(&format!("engine = \"{}\"\n", engine));
            engine_updated = true;
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }

    // If no engine line existed, don't add one -- set_openvino_config handles that for enable
    if !engine_updated && engine == "whisper" {
        // Nothing to do: there was no engine line, and the default is already whisper
    }

    // Preserve trailing newline behavior
+ if !content.ends_with('\n') && result.ends_with('\n') { + result.pop(); + } + + result +} + +/// Show NPU hardware and configuration status +pub fn show_status() { + println!("NPU Status\n"); + + // Hardware detection + println!("Hardware:"); + if detect_npu_hardware() { + super::print_success("NPU device detected (/dev/accel/)"); + } else { + super::print_failure("No NPU device found (/dev/accel/)"); + } + + if check_npu_driver() { + super::print_success("intel_vpu kernel module loaded"); + } else { + super::print_failure("intel_vpu kernel module not loaded"); + super::print_info("Install intel-npu-driver and reboot"); + } + + // Feature check + println!("\nBuild:"); + if cfg!(feature = "openvino-whisper") { + super::print_success("openvino-whisper feature compiled in"); + } else { + super::print_failure("openvino-whisper feature not compiled"); + super::print_info( + "Install a build with openvino-whisper enabled, or rebuild with --features openvino-whisper", + ); + } + + // Config check + println!("\nConfiguration:"); + if let Some(config_path) = Config::default_path() { + if config_path.exists() { + if let Ok(content) = std::fs::read_to_string(&config_path) { + let engine_is_openvino = content.lines().any(|line| { + let trimmed = line.trim(); + trimmed.starts_with("engine") + && trimmed.contains("openvino") + && !trimmed.starts_with('#') + && !trimmed.starts_with('[') + }); + if engine_is_openvino { + super::print_success("Engine set to OpenVINO"); + } else { + super::print_info("Engine is not set to OpenVINO"); + } + } + } + } + + // Model check + if has_openvino_model() { + super::print_success(&format!( + "Default OpenVINO model ({}) installed", + DEFAULT_OPENVINO_MODEL + )); + } else { + super::print_info(&format!( + "Default OpenVINO model ({}) not installed", + DEFAULT_OPENVINO_MODEL + )); + } +} + +/// Enable NPU acceleration +pub fn enable() -> anyhow::Result<()> { + #[cfg(not(feature = "openvino-whisper"))] + { + anyhow::bail!( + "NPU support 
requires the openvino-whisper feature.\n\ + Install a build with openvino-whisper enabled, or rebuild with:\n \ + cargo build --features openvino-whisper" + ); + } + + #[cfg(feature = "openvino-whisper")] + { + println!("Enabling NPU acceleration (OpenVINO)...\n"); + + // Check hardware (warn, don't fail) + if detect_npu_hardware() { + super::print_success("NPU device detected"); + } else { + super::print_warning( + "No NPU device found at /dev/accel/. OpenVINO will fall back to CPU.\n \ + If you have an Intel NPU, ensure intel-npu-driver is installed and reboot.", + ); + } + + if !check_npu_driver() { + super::print_warning( + "intel_vpu kernel module not loaded.\n \ + Install intel-npu-driver and reboot to use NPU hardware.", + ); + } + + // Download default model if needed + if !has_openvino_model() { + println!(); + super::model::download_openvino_model(DEFAULT_OPENVINO_MODEL)?; + } else { + super::print_success(&format!( + "OpenVINO model '{}' already installed", + DEFAULT_OPENVINO_MODEL + )); + } + + // Update config to use OpenVINO engine + super::model::set_openvino_config(DEFAULT_OPENVINO_MODEL)?; + super::print_success("Config updated: engine = \"openvino\""); + + println!(); + println!("NPU acceleration enabled (OpenVINO engine)."); + println!(); + println!("Restart voxtype to apply:"); + println!(" systemctl --user restart voxtype"); + + Ok(()) + } +} + +/// Disable NPU acceleration (revert to Whisper engine) +pub fn disable() -> anyhow::Result<()> { + if let Some(config_path) = Config::default_path() { + if config_path.exists() { + let content = std::fs::read_to_string(&config_path)?; + let updated = update_engine_in_config(&content, "whisper"); + std::fs::write(&config_path, updated)?; + super::print_success("Config updated: engine = \"whisper\""); + } else { + super::print_info("No config file found, nothing to disable"); + } + } else { + anyhow::bail!("Could not determine config path"); + } + + println!(); + println!("NPU acceleration disabled 
(reverted to Whisper engine)."); + println!("The [openvino] config section has been preserved for easy re-enable."); + println!(); + println!("Restart voxtype to apply:"); + println!(" systemctl --user restart voxtype"); + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_update_engine_to_whisper() { + let config = r#"engine = "openvino" + +[whisper] +model = "base.en" + +[openvino] +model = "base.en-int8" +"#; + let result = update_engine_in_config(config, "whisper"); + assert!(result.contains("engine = \"whisper\"")); + assert!(!result.contains("engine = \"openvino\"")); + // openvino section preserved + assert!(result.contains("[openvino]")); + assert!(result.contains("model = \"base.en-int8\"")); + } + + #[test] + fn test_update_engine_to_openvino() { + let config = r#"engine = "whisper" + +[whisper] +model = "base.en" +"#; + let result = update_engine_in_config(config, "openvino"); + assert!(result.contains("engine = \"openvino\"")); + assert!(!result.contains("engine = \"whisper\"")); + } + + #[test] + fn test_update_engine_no_engine_line() { + let config = r#"[whisper] +model = "base.en" +"#; + let result = update_engine_in_config(config, "whisper"); + // No engine line existed, and we're setting to whisper (the default) -- no change + assert!(!result.contains("engine =")); + assert!(result.contains("[whisper]")); + } + + #[test] + fn test_update_engine_preserves_comments() { + let config = r#"# Main config +engine = "openvino" + +# Whisper settings +[whisper] +model = "base.en" +"#; + let result = update_engine_in_config(config, "whisper"); + assert!(result.contains("# Main config")); + assert!(result.contains("# Whisper settings")); + assert!(result.contains("engine = \"whisper\"")); + } + + #[test] + fn test_detect_npu_hardware_returns_false_without_device() { + // In test/CI environments, /dev/accel typically doesn't exist + // This just verifies the function doesn't panic + let _result = detect_npu_hardware(); + } + + 
#[test] + fn test_check_npu_driver_returns_false_without_module() { + // In test/CI environments, intel_vpu module typically isn't loaded + let _result = check_npu_driver(); + } +} diff --git a/src/transcribe/ctc.rs b/src/transcribe/ctc.rs index 16906f00..cb0e09a3 100644 --- a/src/transcribe/ctc.rs +++ b/src/transcribe/ctc.rs @@ -135,9 +135,8 @@ fn tokens_to_string( /// Format: each line is "token_string token_id" (space-separated). /// The token string may contain spaces, so we split from the right. pub fn load_tokens(path: &Path) -> Result, TranscribeError> { - let content = std::fs::read_to_string(path).map_err(|e| { - TranscribeError::InitFailed(format!("Failed to read tokens.txt: {}", e)) - })?; + let content = std::fs::read_to_string(path) + .map_err(|e| TranscribeError::InitFailed(format!("Failed to read tokens.txt: {}", e)))?; let mut tokens = HashMap::new(); for line in content.lines() { @@ -174,11 +173,7 @@ mod tests { fn test_load_tokens() { let temp_dir = TempDir::new().unwrap(); let tokens_path = temp_dir.path().join("tokens.txt"); - fs::write( - &tokens_path, - " 0\n 1\nhello 2\nworld 3\n", - ) - .unwrap(); + fs::write(&tokens_path, " 0\n 1\nhello 2\nworld 3\n").unwrap(); let tokens = load_tokens(&tokens_path).unwrap(); assert_eq!(tokens.get(&0), Some(&"".to_string())); diff --git a/src/transcribe/dolphin.rs b/src/transcribe/dolphin.rs index c6674cf9..4a1b4b42 100644 --- a/src/transcribe/dolphin.rs +++ b/src/transcribe/dolphin.rs @@ -81,9 +81,7 @@ impl DolphinTranscriber { TranscribeError::InitFailed(format!("ONNX session builder failed: {}", e)) })? .with_intra_threads(threads) - .map_err(|e| { - TranscribeError::InitFailed(format!("Failed to set threads: {}", e)) - })? + .map_err(|e| TranscribeError::InitFailed(format!("Failed to set threads: {}", e)))? 
.commit_from_file(&model_file) .map_err(|e| { TranscribeError::InitFailed(format!( @@ -155,21 +153,15 @@ impl Transcriber for DolphinTranscriber { // x: shape [1, T, 80] let (x_data, _offset) = features.into_raw_vec_and_offset(); - let x_tensor = - Tensor::::from_array(([1usize, num_frames, feat_dim], x_data)).map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create input tensor: {}", - e - )) + let x_tensor = Tensor::::from_array(([1usize, num_frames, feat_dim], x_data)) + .map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to create input tensor: {}", e)) })?; // x_len: shape [1] (i64) - let x_len_tensor = Tensor::::from_array(([1usize], vec![num_frames as i64])) - .map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create length tensor: {}", - e - )) + let x_len_tensor = + Tensor::::from_array(([1usize], vec![num_frames as i64])).map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to create length tensor: {}", e)) })?; // Run inference @@ -180,10 +172,7 @@ impl Transcriber for DolphinTranscriber { let inputs: Vec<(std::borrow::Cow, ort::session::SessionInputValue)> = vec![ (std::borrow::Cow::Borrowed("x"), x_tensor.into()), - ( - std::borrow::Cow::Borrowed("x_len"), - x_len_tensor.into(), - ), + (std::borrow::Cow::Borrowed("x_len"), x_len_tensor.into()), ]; let outputs = session.run(inputs).map_err(|e| { @@ -349,10 +338,7 @@ fn read_cmvn_from_metadata(session: &Session) -> Result<(Vec, Vec), Tr ))); } - tracing::debug!( - "Loaded CMVN stats: {} dimensions", - neg_mean.len() - ); + tracing::debug!("Loaded CMVN stats: {} dimensions", neg_mean.len()); Ok((neg_mean, inv_stddev)) } @@ -382,10 +368,7 @@ fn resolve_model_path(model: &str) -> Result { } // Check sherpa-onnx naming convention - let sherpa_name = format!( - "sherpa-onnx-{}-ctc-multi-lang", - model_dir_name - ); + let sherpa_name = format!("sherpa-onnx-{}-ctc-multi-lang", model_dir_name); let sherpa_path = models_dir.join(&sherpa_name); if 
sherpa_path.exists() { return Ok(sherpa_path); diff --git a/src/transcribe/fbank.rs b/src/transcribe/fbank.rs index 625226fc..a9ffe787 100644 --- a/src/transcribe/fbank.rs +++ b/src/transcribe/fbank.rs @@ -1,11 +1,15 @@ //! Shared Fbank (log-mel filterbank) feature extraction //! -//! Used by SenseVoice, Paraformer, and FireRedASR backends. These models share -//! identical preprocessing: 80-dim Fbank features, LFR stacking (m=7, n=6), -//! and CMVN normalization with the same constants (16kHz, 25ms/10ms frames, +//! Used by SenseVoice, Paraformer, FireRedASR, and OpenVINO Whisper backends. +//! SenseVoice/Paraformer share identical preprocessing: 80-dim Fbank features, +//! LFR stacking (m=7, n=6), and CMVN normalization (16kHz, 25ms/10ms frames, //! Hamming window, 0.97 pre-emphasis). //! -//! Pipeline: Audio (f32, 16kHz) -> Fbank (80-dim) -> LFR (560-dim) -> CMVN +//! OpenVINO Whisper uses different settings: Hann window, no pre-emphasis, +//! no int16 scaling, FFT size 400. Use `FbankConfig::whisper()` for this. +//! +//! Pipeline (CTC models): Audio (f32, 16kHz) -> Fbank (80-dim) -> LFR (560-dim) -> CMVN +//! 
Pipeline (Whisper): Audio (f32, 16kHz) -> Fbank (80-dim) -> pad/transpose use ndarray::Array2; use rustfft::num_complex::Complex; @@ -29,6 +33,17 @@ const DEFAULT_FRAME_SHIFT: usize = 160; /// Default pre-emphasis coefficient const DEFAULT_PREEMPH_COEFF: f32 = 0.97; +/// Window function type for STFT +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum WindowType { + /// Hamming window: 0.54 - 0.46 * cos(2π*n/(N-1)) + /// Used by SenseVoice, Paraformer, and other Kaldi-style models + Hamming, + /// Hann window: 0.5 * (1 - cos(2π*n/(N-1))) + /// Used by Whisper + Hann, +} + /// Default LFR window size (stack 7 consecutive frames) const DEFAULT_LFR_M: usize = 7; @@ -43,6 +58,10 @@ pub struct FbankConfig { pub frame_length: usize, pub frame_shift: usize, pub preemph_coeff: f32, + /// Window function type (Hamming for Kaldi-style, Hann for Whisper) + pub window_type: WindowType, + /// Scale audio to int16 range before processing (Kaldi convention) + pub scale_to_int16: bool, } impl Default for FbankConfig { @@ -54,6 +73,25 @@ impl Default for FbankConfig { frame_length: DEFAULT_FRAME_LENGTH, frame_shift: DEFAULT_FRAME_SHIFT, preemph_coeff: DEFAULT_PREEMPH_COEFF, + window_type: WindowType::Hamming, + scale_to_int16: true, + } + } +} + +impl FbankConfig { + /// Configuration for Whisper-style mel extraction. + /// Hann window, no pre-emphasis, no int16 scaling, FFT size matches frame length. 
+ pub fn whisper() -> Self { + Self { + sample_rate: DEFAULT_SAMPLE_RATE, + fft_size: DEFAULT_FRAME_LENGTH, // Whisper uses FFT size = frame length (400) + num_mels: DEFAULT_NUM_MELS, + frame_length: DEFAULT_FRAME_LENGTH, + frame_shift: DEFAULT_FRAME_SHIFT, + preemph_coeff: 0.0, + window_type: WindowType::Hann, + scale_to_int16: false, } } } @@ -110,15 +148,24 @@ impl FbankExtractor { let frame_shift = self.config.frame_shift; let fft_size = self.config.fft_size; - // Scale to int16 range (kaldi convention) - let scaled: Vec = samples.iter().map(|&s| s * 32768.0).collect(); + // Optionally scale to int16 range (Kaldi convention, not used by Whisper) + let scaled: Vec = if self.config.scale_to_int16 { + samples.iter().map(|&s| s * 32768.0).collect() + } else { + samples.to_vec() + }; - // Pre-emphasis - let mut emphasized = Vec::with_capacity(scaled.len()); - emphasized.push(scaled[0]); - for i in 1..scaled.len() { - emphasized.push(scaled[i] - self.config.preemph_coeff * scaled[i - 1]); - } + // Pre-emphasis (coefficient 0.0 = no pre-emphasis, used by Whisper) + let emphasized = if self.config.preemph_coeff > 0.0 { + let mut emp = Vec::with_capacity(scaled.len()); + emp.push(scaled[0]); + for i in 1..scaled.len() { + emp.push(scaled[i] - self.config.preemph_coeff * scaled[i - 1]); + } + emp + } else { + scaled + }; // Compute number of frames let num_frames = if emphasized.len() >= frame_length { @@ -131,11 +178,14 @@ impl FbankExtractor { return Array2::zeros((0, num_mels)); } - // Pre-compute Hamming window - let hamming: Vec = (0..frame_length) + // Pre-compute window function + let window: Vec = (0..frame_length) .map(|n| { - 0.54 - 0.46 - * (2.0 * std::f32::consts::PI * n as f32 / (frame_length as f32 - 1.0)).cos() + let x = 2.0 * std::f32::consts::PI * n as f32 / (frame_length as f32 - 1.0); + match self.config.window_type { + WindowType::Hamming => 0.54 - 0.46 * x.cos(), + WindowType::Hann => 0.5 * (1.0 - x.cos()), + } }) .collect(); @@ -151,7 +201,7 @@ impl 
FbankExtractor { // Window the frame let mut fft_input: Vec> = Vec::with_capacity(fft_size); for i in 0..frame_length { - fft_input.push(Complex::new(emphasized[start + i] * hamming[i], 0.0)); + fft_input.push(Complex::new(emphasized[start + i] * window[i], 0.0)); } // Zero-pad to fft_size fft_input.resize(fft_size, Complex::new(0.0, 0.0)); @@ -235,11 +285,7 @@ pub fn apply_cmvn(features: &mut Array2, neg_mean: &[f32], inv_stddev: &[f3 /// /// Returns num_mels triangular filters, each with fft_size/2+1 coefficients. /// Uses the standard mel scale: mel = 1127 * ln(1 + f/700) -pub fn compute_mel_filterbank( - num_mels: usize, - fft_size: usize, - sample_rate: f32, -) -> Vec> { +pub fn compute_mel_filterbank(num_mels: usize, fft_size: usize, sample_rate: f32) -> Vec> { let num_bins = fft_size / 2 + 1; let max_freq = sample_rate / 2.0; diff --git a/src/transcribe/mod.rs b/src/transcribe/mod.rs index 445025fc..2a221c8b 100644 --- a/src/transcribe/mod.rs +++ b/src/transcribe/mod.rs @@ -11,6 +11,7 @@ //! - Optionally Paraformer via ONNX Runtime (when `paraformer` feature is enabled) //! - Optionally Dolphin via ONNX Runtime (when `dolphin` feature is enabled) //! - Optionally Omnilingual via ONNX Runtime (when `omnilingual` feature is enabled) +//! 
- Optionally OpenVINO Whisper for Intel NPU/CPU/GPU (when `openvino-whisper` feature is enabled) pub mod cli; pub mod remote; @@ -54,6 +55,9 @@ pub mod dolphin; #[cfg(feature = "omnilingual")] pub mod omnilingual; +#[cfg(feature = "openvino-whisper")] +pub mod openvino_whisper; + use crate::config::{Config, TranscriptionEngine, WhisperConfig, WhisperMode}; use crate::error::TranscribeError; use crate::setup::gpu; @@ -175,6 +179,19 @@ pub fn create_transcriber(config: &Config) -> Result, Trans "Omnilingual engine requested but voxtype was not compiled with --features omnilingual" .to_string(), )), + #[cfg(feature = "openvino-whisper")] + TranscriptionEngine::OpenVino => { + let default_config = crate::config::OpenVinoConfig::default(); + let openvino_config = config.openvino.as_ref().unwrap_or(&default_config); + Ok(Box::new(openvino_whisper::OpenVinoTranscriber::new( + openvino_config, + )?)) + } + #[cfg(not(feature = "openvino-whisper"))] + TranscriptionEngine::OpenVino => Err(TranscribeError::InitFailed( + "OpenVINO engine requested but voxtype was not compiled with --features openvino-whisper" + .to_string(), + )), } } diff --git a/src/transcribe/omnilingual.rs b/src/transcribe/omnilingual.rs index 142e4819..dd65c3f7 100644 --- a/src/transcribe/omnilingual.rs +++ b/src/transcribe/omnilingual.rs @@ -76,9 +76,7 @@ impl OmnilingualTranscriber { TranscribeError::InitFailed(format!("ONNX session builder failed: {}", e)) })? .with_intra_threads(threads) - .map_err(|e| { - TranscribeError::InitFailed(format!("Failed to set threads: {}", e)) - })? + .map_err(|e| TranscribeError::InitFailed(format!("Failed to set threads: {}", e)))? 
.commit_from_file(&model_file) .map_err(|e| { TranscribeError::InitFailed(format!( @@ -124,10 +122,7 @@ impl Transcriber for OmnilingualTranscriber { // x: shape [1, num_samples] let x_tensor = Tensor::::from_array(([1usize, num_samples], normalized)).map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create input tensor: {}", - e - )) + TranscribeError::InferenceFailed(format!("Failed to create input tensor: {}", e)) })?; // Run inference @@ -212,7 +207,11 @@ impl Transcriber for OmnilingualTranscriber { fn normalize_audio(samples: &[f32]) -> Vec { let n = samples.len() as f32; let mean: f32 = samples.iter().sum::() / n; - let variance: f32 = samples.iter().map(|&s| (s - mean) * (s - mean)).sum::() / n; + let variance: f32 = samples + .iter() + .map(|&s| (s - mean) * (s - mean)) + .sum::() + / n; let inv_stddev = 1.0 / (variance + 1e-5_f32).sqrt(); samples.iter().map(|&s| (s - mean) * inv_stddev).collect() diff --git a/src/transcribe/openvino_whisper.rs b/src/transcribe/openvino_whisper.rs new file mode 100644 index 00000000..89974123 --- /dev/null +++ b/src/transcribe/openvino_whisper.rs @@ -0,0 +1,628 @@ +//! OpenVINO GenAI Whisper speech-to-text transcription +//! +//! Uses the OpenVINO GenAI WhisperPipeline to run Whisper models on Intel NPU, CPU, +//! or GPU. The pipeline handles mel spectrogram extraction, encoder-decoder inference, +//! and tokenization internally. +//! +//! Models are in OpenVINO IR format from HuggingFace (OpenVINO/whisper-* repos), +//! exported via `optimum-cli export openvino`. + +use super::Transcriber; +use crate::config::OpenVinoConfig; +use crate::error::TranscribeError; +use openvino_genai::WhisperPipeline; +use std::path::PathBuf; +use std::sync::Mutex; + +/// OpenVINO GenAI Whisper transcriber for Intel NPU/CPU/GPU. +/// +/// Pipeline creation is deferred to `prepare()` (called when recording starts), +/// hiding the load latency behind recording time. 
The pipeline is cached and
/// reused across transcriptions. If `prepare()` was not called, creation happens
/// on first `transcribe()` call.
pub struct OpenVinoTranscriber {
    /// Lazily-created GenAI pipeline; `None` until `prepare()`/first `transcribe()`.
    pipeline: Mutex<Option<WhisperPipeline>>,
    /// Resolved model directory containing the OpenVINO IR files.
    model_dir: PathBuf,
    /// Cloned config, kept for deferred pipeline creation.
    config: OpenVinoConfig,
}

impl OpenVinoTranscriber {
    /// Create a new OpenVINO GenAI Whisper transcriber.
    ///
    /// Resolves the model directory and optionally creates the pipeline immediately
    /// (when `on_demand_loading` is false). The expensive pipeline creation can be
    /// deferred to `prepare()` or first `transcribe()`.
    ///
    /// # Errors
    /// `ModelNotFound` when the encoder IR is missing; `InitFailed` when eager
    /// pipeline creation fails.
    pub fn new(config: &OpenVinoConfig) -> Result<Self, TranscribeError> {
        let model_dir = resolve_model_path(&config.model, config.quantized)?;

        tracing::info!(
            "Initializing OpenVINO GenAI Whisper from {:?} (device={}, quantized={})",
            model_dir,
            config.device,
            config.quantized
        );

        // Sanity check that the model directory has expected files
        let encoder_xml = model_dir.join("openvino_encoder_model.xml");
        if !encoder_xml.exists() {
            return Err(TranscribeError::ModelNotFound(format!(
                "OpenVINO Whisper encoder model not found: {}\n \
                Run 'voxtype setup model' to download, or manually from:\n \
                https://huggingface.co/OpenVINO/whisper-{}",
                encoder_xml.display(),
                config.model
            )));
        }

        // GenAI manages its own threading; surface the ignored setting loudly.
        if config.threads.is_some() {
            tracing::warn!(
                "OpenVINO GenAI WhisperPipeline does not support thread count configuration; \
                the 'threads' setting will be ignored"
            );
        }

        // Defer the (slow) pipeline creation when on-demand loading is requested.
        let pipeline = if config.on_demand_loading {
            None
        } else {
            Some(Self::create_pipeline(&model_dir, config)?)
        };

        tracing::info!("OpenVINO GenAI Whisper initialized");

        Ok(Self {
            pipeline: Mutex::new(pipeline),
            model_dir,
            config: config.clone(),
        })
    }

    /// Load the OpenVINO GenAI shared library, using a custom path if configured.
+ fn load_library(config: &OpenVinoConfig) -> Result<(), TranscribeError> { + if let Some(ref dir) = config.openvino_dir { + let lib_path = find_genai_library(dir)?; + // Preload OpenVINO dependency libraries with RTLD_GLOBAL so that dlopen + // can resolve the DT_NEEDED entries in libopenvino_genai_c.so. The OpenVINO + // shared libraries don't set RPATH/RUNPATH, and glibc caches LD_LIBRARY_PATH + // at startup so setting it at runtime has no effect. + if let Some(lib_dir) = lib_path.parent() { + preload_openvino_deps(lib_dir); + } + tracing::info!( + "Loading OpenVINO GenAI library from: {}", + lib_path.display() + ); + openvino_genai::load_from(&lib_path).map_err(|e| { + TranscribeError::InitFailed(format!( + "Failed to load OpenVINO GenAI library from {}: {}\n \ + Ensure libopenvino_genai_c.so exists in the specified openvino_dir.", + lib_path.display(), + e + )) + }) + } else { + openvino_genai::load().map_err(|e| { + TranscribeError::InitFailed(format!( + "Failed to load OpenVINO GenAI library: {}\n \ + Install OpenVINO GenAI: pip install openvino-genai\n \ + Or set openvino_dir in [openvino] config to the library directory.", + e + )) + }) + } + } + + /// Create the WhisperPipeline for the configured device. 
fn create_pipeline(
        model_dir: &std::path::Path,
        config: &OpenVinoConfig,
    ) -> Result<WhisperPipeline, TranscribeError> {
        let start = std::time::Instant::now();

        Self::load_library(config)?;

        let model_path_str = model_dir.to_str().ok_or_else(|| {
            TranscribeError::InitFailed("Model path contains invalid UTF-8".to_string())
        })?;

        // NPU failures get a more actionable error message below.
        let is_npu = config.device.to_uppercase() == "NPU";

        let pipeline = WhisperPipeline::new(model_path_str, &config.device).map_err(|e| {
            if is_npu {
                TranscribeError::InitFailed(format!(
                    "Failed to create OpenVINO GenAI Whisper pipeline for NPU: {}\n \
                    Ensure intel-npu-driver is installed.\n \
                    Check: ls /dev/accel/accel*\n \
                    Or set device = \"CPU\" in [openvino] config.",
                    e
                ))
            } else {
                TranscribeError::InitFailed(format!(
                    "Failed to create OpenVINO GenAI Whisper pipeline for {}: {}",
                    config.device, e
                ))
            }
        })?;

        tracing::info!(
            "OpenVINO GenAI Whisper pipeline created in {:.2}s (device={})",
            start.elapsed().as_secs_f32(),
            config.device,
        );

        Ok(pipeline)
    }

    /// Ensure the pipeline is created, creating on first use if needed.
+ fn ensure_pipeline( + &self, + ) -> Result>, TranscribeError> { + let mut guard = self.pipeline.lock().map_err(|e| { + TranscribeError::InferenceFailed(format!("Pipeline lock poisoned: {}", e)) + })?; + + if guard.is_none() { + tracing::info!("Pipeline not yet created, creating now (prepare() was not called)"); + *guard = Some(Self::create_pipeline(&self.model_dir, &self.config)?); + } + + Ok(guard) + } +} + +impl Transcriber for OpenVinoTranscriber { + fn prepare(&self) { + let mut guard = match self.pipeline.lock() { + Ok(g) => g, + Err(e) => { + tracing::error!("Pipeline lock error in prepare(): {}", e); + return; + } + }; + + if guard.is_some() { + tracing::debug!("Pipeline already created, skipping prepare()"); + return; + } + + tracing::info!( + "Creating OpenVINO GenAI Whisper pipeline for {} (triggered by prepare())...", + self.config.device + ); + match Self::create_pipeline(&self.model_dir, &self.config) { + Ok(p) => { + *guard = Some(p); + tracing::info!("OpenVINO GenAI pipeline creation complete"); + } + Err(e) => { + tracing::error!("Failed to create pipeline in prepare(): {}", e); + } + } + } + + fn transcribe(&self, samples: &[f32]) -> Result { + if samples.is_empty() { + return Err(TranscribeError::AudioFormat( + "Empty audio buffer".to_string(), + )); + } + + let duration_secs = samples.len() as f32 / 16000.0; + tracing::debug!( + "Transcribing {:.2}s of audio ({} samples) with OpenVINO GenAI (device={})", + duration_secs, + samples.len(), + self.config.device + ); + + let start = std::time::Instant::now(); + + // Get pipeline and run inference + let mut guard = self.ensure_pipeline()?; + let pipeline = guard.as_mut().unwrap(); + + // Get config from the pipeline (inherits model-specific token IDs). + // A standalone WhisperGenerationConfig::new() uses generic defaults that may + // not match the model; WhisperGenerationConfig::from_json() with the model's + // generation_config.json is the alternative for standalone creation. 
+ let mut gen_config = pipeline.get_generation_config().map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to get generation config: {}", e)) + })?; + + // Only set language/task on multilingual models (*.en models are English-only + // and reject language/task overrides) + let is_multilingual = gen_config.get_is_multilingual().unwrap_or(false); + + if is_multilingual { + // GenAI expects language tokens in "<|xx|>" format (matching lang_to_id keys + // in generation_config.json), while voxtype config uses bare codes like "en" + let lang = &self.config.language; + let lang_token = if lang.starts_with("<|") { + lang.to_string() + } else { + format!("<|{}|>", lang) + }; + gen_config.set_language(&lang_token).map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to set language: {}", e)) + })?; + + let task = if self.config.translate { + "translate" + } else { + "transcribe" + }; + gen_config.set_task(task).map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to set task: {}", e)) + })?; + } else if self.config.translate { + tracing::warn!( + "Translation requested but model is not multilingual; ignoring translate setting" + ); + } + + gen_config.set_return_timestamps(false).map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to set return_timestamps: {}", e)) + })?; + + let results = pipeline.generate(samples, Some(&gen_config)).map_err(|e| { + TranscribeError::InferenceFailed(format!("OpenVINO GenAI inference failed: {}", e)) + })?; + + let text = results.get_string().map_err(|e| { + TranscribeError::InferenceFailed(format!("Failed to get transcription string: {}", e)) + })?; + + let result = text.trim().to_string(); + + // Log performance metrics if available + if let Ok(metrics) = results.get_perf_metrics() { + if let Ok((gen_dur, _)) = metrics.get_generate_duration() { + tracing::debug!("GenAI generate duration: {:.0}ms", gen_dur); + } + } + + tracing::info!( + "OpenVINO GenAI transcription completed in {:.2}s: 
{:?}",
+            start.elapsed().as_secs_f32(),
+            if result.chars().count() > 50 {
+                format!("{}...", result.chars().take(50).collect::<String>())
+            } else {
+                result.clone()
+            }
+        );
+
+        Ok(result)
+    }
+}
+
+/// Find the libopenvino_genai_c shared library in a directory.
+fn find_genai_library(dir: &str) -> Result<PathBuf, TranscribeError> {
+    let dir_path = PathBuf::from(dir);
+    if !dir_path.is_dir() {
+        return Err(TranscribeError::InitFailed(format!(
+            "openvino_dir is not a directory: {}",
+            dir
+        )));
+    }
+
+    let lib_name = format!(
+        "{}openvino_genai_c{}",
+        std::env::consts::DLL_PREFIX,
+        std::env::consts::DLL_SUFFIX
+    );
+    let direct = dir_path.join(&lib_name);
+    if direct.is_file() {
+        return Ok(direct);
+    }
+
+    // Search known subdirectories
+    for subdir in &["runtime/lib/intel64", "runtime/lib/intel64/Release", "."] {
+        let path = dir_path.join(subdir).join(&lib_name);
+        if path.is_file() {
+            return Ok(path);
+        }
+    }
+
+    Err(TranscribeError::InitFailed(format!(
+        "{} not found in {}\n \
+         Set openvino_dir to the directory containing the library,\n \
+         or to the OpenVINO installation root.",
+        lib_name, dir
+    )))
+}
+
+/// Preload OpenVINO dependency libraries from the given directory using
+/// `RTLD_LAZY | RTLD_GLOBAL`. This makes their symbols globally available so
+/// that the subsequent dlopen of `libopenvino_genai_c.so` can resolve its
+/// DT_NEEDED entries without requiring LD_LIBRARY_PATH to be set before
+/// process startup.
+fn preload_openvino_deps(lib_dir: &std::path::Path) {
+    use std::ffi::CString;
+
+    // Order matters: libopenvino.so first (base dependency), then the others.
+    let deps = ["libopenvino.so", "libopenvino_c.so", "libopenvino_genai.so"];
+
+    for name in &deps {
+        let path = lib_dir.join(name);
+        if !path.exists() {
+            continue;
+        }
+        let Some(c_path) = path.to_str().and_then(|s| CString::new(s).ok()) else {
+            continue;
+        };
+        let handle = unsafe { libc::dlopen(c_path.as_ptr(), libc::RTLD_LAZY | libc::RTLD_GLOBAL) };
+        if handle.is_null() {
+            tracing::warn!("Failed to preload {}", path.display());
+        } else {
+            tracing::debug!("Preloaded {}", path.display());
+            // Intentionally not calling dlclose — keep symbols available.
+        }
+    }
+}
+
+/// Check if a model name already includes a quantization suffix (-int4, -int8, -fp16)
+fn has_quant_suffix(name: &str) -> bool {
+    name.ends_with("-int4") || name.ends_with("-int8") || name.ends_with("-fp16")
+}
+
+/// Resolve model name to directory path.
+///
+/// Handles several naming conventions:
+/// - Absolute paths: used directly
+/// - Full dir names: "openvino-whisper-base.en-int8-ov"
+/// - Short names with quantization: "base.en-int8" (from `voxtype setup model`)
+/// - Short names without quantization: "base.en" (uses `quantized` flag)
+/// - Distil models: "distil-large-v2-int8" → "openvino-distil-whisper-large-v2-int8-ov"
+fn resolve_model_path(model: &str, quantized: bool) -> Result<PathBuf, TranscribeError> {
+    // If it's already an absolute path, use it directly
+    let path = PathBuf::from(model);
+    if path.is_absolute() && path.exists() {
+        return Ok(path);
+    }
+
+    // If the model name already has a quantization suffix, don't add another one.
+    // Names from `voxtype setup model` include quantization (e.g., "base.en-int8").
+    let already_quantized = has_quant_suffix(model);
+    let quant_suffix = if already_quantized {
+        ""
+    } else if quantized {
+        "-int8"
+    } else {
+        "-fp16"
+    };
+
+    // Build candidate directory names.
+    // Models from setup have names like "base.en-int8" → dir "openvino-whisper-base.en-int8-ov"
+    // Distil models: "distil-large-v2-int8" → dir "openvino-distil-whisper-large-v2-int8-ov"
+    let mut candidates: Vec<String> = Vec::new();
+
+    if model.starts_with("openvino-") {
+        // Already a full directory name (e.g., "openvino-whisper-base.en-int8-ov")
+        candidates.push(model.to_string());
+    } else if model.starts_with("whisper-") {
+        // e.g., "whisper-base.en-int8" → "openvino-whisper-base.en-int8-ov"
+        candidates.push(format!("openvino-{}{}-ov", model, quant_suffix));
+        candidates.push(format!("openvino-{}-ov", model));
+    } else if let Some(rest) = model.strip_prefix("distil-") {
+        // e.g., "distil-large-v2-int8" → "openvino-distil-whisper-large-v2-int8-ov"
+        candidates.push(format!(
+            "openvino-distil-whisper-{}{}-ov",
+            rest, quant_suffix
+        ));
+        candidates.push(format!("openvino-distil-whisper-{}-ov", rest));
+        // Also try the non-distil pattern in case naming differs
+        candidates.push(format!("openvino-whisper-{}{}-ov", model, quant_suffix));
+        candidates.push(format!("openvino-whisper-{}-ov", model));
+    } else {
+        // Short name: "base.en-int8" or "base.en"
+        candidates.push(format!("openvino-whisper-{}{}-ov", model, quant_suffix));
+        candidates.push(format!("openvino-whisper-{}-ov", model));
+    }
+
+    // Search locations
+    let models_dir = crate::config::Config::models_dir();
+    let mut search_paths: Vec<PathBuf> = Vec::new();
+    for candidate in &candidates {
+        search_paths.push(models_dir.join(candidate));
+    }
+    for candidate in &candidates {
+        search_paths.push(PathBuf::from(candidate));
+        search_paths.push(PathBuf::from("models").join(candidate));
+    }
+
+    for search_path in &search_paths {
+        if search_path.exists() && search_path.join("openvino_encoder_model.xml").exists() {
+            return Ok(search_path.clone());
+        }
+    }
+
+    // Not found - build helpful error message
+    let searched: Vec<String> = search_paths
+        .iter()
+        .map(|p| format!("  - {}", p.display()))
+        .collect();
+
+    let
model_with_quant = if already_quantized { + model.to_string() + } else { + format!("{}{}", model, quant_suffix) + }; + let hf_repo = format!("whisper-{}-ov", model_with_quant); + + Err(TranscribeError::ModelNotFound(format!( + "OpenVINO Whisper model '{}' not found. Looked in:\n{}\n\n \ + Run 'voxtype setup model' to download, or manually from:\n \ + https://huggingface.co/OpenVINO/{}", + model, + searched.join("\n"), + hf_repo + ))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resolve_model_path_absolute() { + let result = resolve_model_path("/nonexistent/path", false); + assert!(result.is_err()); + } + + #[test] + fn test_resolve_model_path_not_found() { + let result = resolve_model_path("nonexistent-model", true); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("not found")); + assert!(err.contains("huggingface.co")); + } + + #[test] + fn test_has_quant_suffix() { + assert!(has_quant_suffix("base.en-int8")); + assert!(has_quant_suffix("tiny-int4")); + assert!(has_quant_suffix("large-v3-fp16")); + assert!(has_quant_suffix("distil-large-v2-int8")); + assert!(!has_quant_suffix("base.en")); + assert!(!has_quant_suffix("large-v3")); + assert!(!has_quant_suffix("tiny")); + } + + #[test] + fn test_resolve_no_double_quant_suffix() { + // When model name already has quantization (from `voxtype setup model`), + // should NOT produce doubled suffixes like "base.en-int8-int8" + let result = resolve_model_path("base.en-int8", true); + match result { + Ok(path) => { + // Model exists on disk - verify it resolved to the right dir + let dir_name = path.file_name().unwrap().to_str().unwrap(); + assert_eq!(dir_name, "openvino-whisper-base.en-int8-ov"); + } + Err(err) => { + let err = err.to_string(); + assert!( + err.contains("openvino-whisper-base.en-int8-ov"), + "Expected 'openvino-whisper-base.en-int8-ov' in error, got: {}", + err + ); + assert!( + !err.contains("base.en-int8-int8"), + "Found doubled 
quantization suffix in error: {}", + err + ); + } + } + } + + #[test] + fn test_resolve_short_name_gets_quant_suffix() { + // Short name without quantization should get suffix from `quantized` flag + let result = resolve_model_path("base.en", true); + match result { + Ok(path) => { + let dir_name = path.file_name().unwrap().to_str().unwrap(); + assert_eq!(dir_name, "openvino-whisper-base.en-int8-ov"); + } + Err(err) => { + let err = err.to_string(); + assert!( + err.contains("openvino-whisper-base.en-int8-ov"), + "Expected int8 suffix for quantized=true, got: {}", + err + ); + } + } + + // Use a model name unlikely to exist on disk to test fp16 path + let result = resolve_model_path("nonexistent-model", false); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("openvino-whisper-nonexistent-model-fp16-ov"), + "Expected fp16 suffix for quantized=false, got: {}", + err + ); + } + + #[test] + fn test_resolve_distil_model_path() { + // Use a distil model that won't exist on disk + let result = resolve_model_path("distil-large-v2-int4", true); + match result { + Ok(path) => { + let dir_name = path.file_name().unwrap().to_str().unwrap(); + assert_eq!(dir_name, "openvino-distil-whisper-large-v2-int4-ov"); + } + Err(err) => { + let err = err.to_string(); + assert!( + err.contains("openvino-distil-whisper-large-v2-int4-ov"), + "Expected distil dir pattern in error, got: {}", + err + ); + } + } + } + + /// Real-life integration test: loads a WAV file and transcribes with OpenVINO GenAI. + /// Requires: model files in ~/.local/share/voxtype/models/, OpenVINO GenAI libs, NPU device. 
+    /// Run with: cargo test --features openvino-whisper -- test_openvino_real --nocapture --ignored
+    #[test]
+    #[ignore]
+    fn test_openvino_real_transcription() {
+        let _ = tracing_subscriber::fmt()
+            .with_env_filter("debug")
+            .try_init();
+
+        // Load WAV file (16-bit PCM, mono, 16kHz)
+        let wav_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("tests/fixtures/sensevoice/ja.wav");
+        assert!(wav_path.exists(), "Test WAV not found: {:?}", wav_path);
+
+        let mut reader = hound::WavReader::open(&wav_path).expect("Failed to open WAV");
+        let spec = reader.spec();
+        assert_eq!(spec.sample_rate, 16000, "Expected 16kHz audio");
+        assert_eq!(spec.channels, 1, "Expected mono audio");
+
+        let samples: Vec<f32> = reader
+            .samples::<i16>()
+            .map(|s| s.unwrap() as f32 / 32768.0)
+            .collect();
+        println!(
+            "Loaded {} samples ({:.2}s)",
+            samples.len(),
+            samples.len() as f32 / 16000.0
+        );
+
+        // Create transcriber - use env vars for device and model override
+        let device = std::env::var("VOXTYPE_OPENVINO_DEVICE").unwrap_or_else(|_| "CPU".to_string());
+        let model = std::env::var("VOXTYPE_OPENVINO_MODEL").unwrap_or_else(|_| "base".to_string());
+        let config = OpenVinoConfig {
+            model,
+            device: device.clone(),
+            quantized: true,
+            openvino_dir: std::env::var("VOXTYPE_OPENVINO_DIR").ok(),
+            ..OpenVinoConfig::default()
+        };
+
+        let transcriber =
+            OpenVinoTranscriber::new(&config).expect("Failed to create OpenVINO transcriber");
+
+        // Prepare (create pipeline)
+        println!("Creating pipeline for NPU...");
+        transcriber.prepare();
+
+        // Transcribe
+        println!("Transcribing...");
+        let result = transcriber.transcribe(&samples);
+        match &result {
+            Ok(text) => println!("Transcription result: {:?}", text),
+            Err(e) => println!("Transcription error: {}", e),
+        }
+        assert!(result.is_ok(), "Transcription failed: {:?}", result.err());
+
+        let text = result.unwrap();
+        assert!(!text.is_empty(), "Transcription produced empty text");
+        println!("SUCCESS: {:?}", text);
+    }
+}
diff
--git a/src/transcribe/paraformer.rs b/src/transcribe/paraformer.rs index 24016192..8389561c 100644 --- a/src/transcribe/paraformer.rs +++ b/src/transcribe/paraformer.rs @@ -14,8 +14,8 @@ //! Languages: zh+en (bilingual), zh+yue+en (trilingual) //! Model files: model.int8.onnx (or model.onnx), tokens.txt, am.mvn -use super::fbank::{self, FbankExtractor, LfrConfig}; use super::ctc; +use super::fbank::{self, FbankExtractor, LfrConfig}; use super::Transcriber; use crate::config::ParaformerConfig; use crate::error::TranscribeError; @@ -82,9 +82,7 @@ impl ParaformerTranscriber { TranscribeError::InitFailed(format!("ONNX session builder failed: {}", e)) })? .with_intra_threads(threads) - .map_err(|e| { - TranscribeError::InitFailed(format!("Failed to set threads: {}", e)) - })? + .map_err(|e| TranscribeError::InitFailed(format!("Failed to set threads: {}", e)))? .commit_from_file(&model_file) .map_err(|e| { TranscribeError::InitFailed(format!( @@ -169,19 +167,13 @@ impl Transcriber for ParaformerTranscriber { let (x_data, _offset) = features.into_raw_vec_and_offset(); let speech_tensor = Tensor::::from_array(([1usize, num_frames, feat_dim], x_data)) .map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create speech tensor: {}", - e - )) + TranscribeError::InferenceFailed(format!("Failed to create speech tensor: {}", e)) })?; // speech_lengths: shape [1] let lengths_tensor = Tensor::::from_array(([1usize], vec![num_frames as i32])) .map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create lengths tensor: {}", - e - )) + TranscribeError::InferenceFailed(format!("Failed to create lengths tensor: {}", e)) })?; // 5. 
Run inference @@ -272,10 +264,7 @@ fn decode_paraformer_output( } else if shape_dims.len() == 2 { // [batch, seq_len] - pre-argmaxed token IDs as f32 let seq_len = shape_dims[1] as usize; - let token_ids: Vec = data[..seq_len] - .iter() - .map(|&v| v as u32) - .collect(); + let token_ids: Vec = data[..seq_len].iter().map(|&v| v as u32).collect(); Ok(tokens_to_text(&token_ids, tokens)) } else { Err(TranscribeError::InferenceFailed(format!( @@ -349,9 +338,8 @@ fn tokens_to_text(token_ids: &[u32], tokens: &HashMap) -> String { fn read_cmvn_from_kaldi_mvn( path: &std::path::Path, ) -> Result<(Vec, Vec), TranscribeError> { - let data = std::fs::read(path).map_err(|e| { - TranscribeError::InitFailed(format!("Failed to read am.mvn: {}", e)) - })?; + let data = std::fs::read(path) + .map_err(|e| TranscribeError::InitFailed(format!("Failed to read am.mvn: {}", e)))?; let mut pos = 0; @@ -429,7 +417,12 @@ fn read_cmvn_from_kaldi_mvn( ))); } - tracing::debug!("am.mvn: {} rows x {} cols, double={}", rows, cols, is_double); + tracing::debug!( + "am.mvn: {} rows x {} cols, double={}", + rows, + cols, + is_double + ); // Read matrix data let feat_dim = cols - 1; // last column is the count @@ -515,9 +508,7 @@ fn read_cmvn_from_metadata(session: &Session) -> Result<(Vec, Vec), Tr })?; let inv_stddev_str = metadata.custom("inv_stddev").ok_or_else(|| { - TranscribeError::InitFailed( - "Model metadata missing 'inv_stddev' key".to_string(), - ) + TranscribeError::InitFailed("Model metadata missing 'inv_stddev' key".to_string()) })?; let neg_mean: Vec = neg_mean_str diff --git a/src/transcribe/sensevoice.rs b/src/transcribe/sensevoice.rs index 60c76e7a..24978b74 100644 --- a/src/transcribe/sensevoice.rs +++ b/src/transcribe/sensevoice.rs @@ -10,8 +10,8 @@ //! Supports languages: auto, zh, en, ja, ko, yue //! 
Model files: model.int8.onnx (or model.onnx), tokens.txt -use super::fbank::{self, FbankExtractor, LfrConfig}; use super::ctc::{self, CtcConfig}; +use super::fbank::{self, FbankExtractor, LfrConfig}; use super::Transcriber; use crate::config::SenseVoiceConfig; use crate::error::TranscribeError; @@ -81,9 +81,7 @@ impl SenseVoiceTranscriber { TranscribeError::InitFailed(format!("ONNX session builder failed: {}", e)) })? .with_intra_threads(threads) - .map_err(|e| { - TranscribeError::InitFailed(format!("Failed to set threads: {}", e)) - })? + .map_err(|e| TranscribeError::InitFailed(format!("Failed to set threads: {}", e)))? .commit_from_file(&model_file) .map_err(|e| { TranscribeError::InitFailed(format!( @@ -171,28 +169,19 @@ impl Transcriber for SenseVoiceTranscriber { let (x_data, _offset) = features.into_raw_vec_and_offset(); let x_tensor = Tensor::::from_array(([1usize, num_frames, feat_dim], x_data)) .map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create input tensor: {}", - e - )) + TranscribeError::InferenceFailed(format!("Failed to create input tensor: {}", e)) })?; // x_length: shape [1] let x_length_tensor = Tensor::::from_array(([1usize], vec![num_frames as i32])) .map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create length tensor: {}", - e - )) + TranscribeError::InferenceFailed(format!("Failed to create length tensor: {}", e)) })?; // language: shape [1] let language_tensor = Tensor::::from_array(([1usize], vec![self.language_id])) .map_err(|e| { - TranscribeError::InferenceFailed(format!( - "Failed to create language tensor: {}", - e - )) + TranscribeError::InferenceFailed(format!("Failed to create language tensor: {}", e)) })?; // text_norm: shape [1] @@ -212,9 +201,18 @@ impl Transcriber for SenseVoiceTranscriber { let inputs: Vec<(std::borrow::Cow, ort::session::SessionInputValue)> = vec![ (std::borrow::Cow::Borrowed("x"), x_tensor.into()), - (std::borrow::Cow::Borrowed("x_length"), 
x_length_tensor.into()), - (std::borrow::Cow::Borrowed("language"), language_tensor.into()), - (std::borrow::Cow::Borrowed("text_norm"), text_norm_tensor.into()), + ( + std::borrow::Cow::Borrowed("x_length"), + x_length_tensor.into(), + ), + ( + std::borrow::Cow::Borrowed("language"), + language_tensor.into(), + ), + ( + std::borrow::Cow::Borrowed("text_norm"), + text_norm_tensor.into(), + ), ]; let outputs = session.run(inputs).map_err(|e| { @@ -249,11 +247,7 @@ impl Transcriber for SenseVoiceTranscriber { } else if shape_dims.len() == 2 { // Pre-argmaxed output: each value is already a token ID let time_steps = shape_dims[1] as usize; - ctc::decode_pre_argmax( - &logits_data[..time_steps], - &self.tokens, - &self.ctc_config, - ) + ctc::decode_pre_argmax(&logits_data[..time_steps], &self.tokens, &self.ctc_config) } else { return Err(TranscribeError::InferenceFailed(format!( "Unexpected logits shape: {:?}", diff --git a/src/vad/mod.rs b/src/vad/mod.rs index 0e016bcc..0bb25636 100644 --- a/src/vad/mod.rs +++ b/src/vad/mod.rs @@ -62,7 +62,8 @@ pub fn create_vad(config: &Config) -> Result VadBackend::Energy, + | TranscriptionEngine::Omnilingual + | TranscriptionEngine::OpenVino => VadBackend::Energy, } } explicit => explicit,