diff --git a/plugins/_kokoro_tts/api/status.py b/plugins/_kokoro_tts/api/status.py index 3b1321972e..03cee2b33c 100644 --- a/plugins/_kokoro_tts/api/status.py +++ b/plugins/_kokoro_tts/api/status.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib.metadata from helpers.api import ApiHandler, Request, Response @@ -8,6 +10,10 @@ class Status(ApiHandler): async def process(self, input: dict, request: Request) -> dict | Response: migration.ensure_migrated() + cfg = runtime.get_config() + remote_url = cfg.get("remote_url", "") + + # Local model status (always reported) package_version = "" package_error = "" try: @@ -15,17 +21,26 @@ async def process(self, input: dict, request: Request) -> dict | Response: except Exception as e: package_error = str(e) - return { + result = { "plugin": "_kokoro_tts", "enabled": runtime.is_globally_enabled(), - "config": runtime.get_config(), + "config": cfg, "model": { "ready": await runtime.is_downloaded(), - "loading": await runtime.is_downloading(), - }, - "package": { + "loading": runtime.is_updating_model, "version": package_version, - "error": package_error, + "error": package_error or None, }, "fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.", } + + # Remote health status (only if configured) + if remote_url: + remote_healthy, remote_error = await runtime.is_remote_healthy() + result["remote"] = { + "url": remote_url, + "healthy": remote_healthy, + "error": remote_error or None, + } + + return result diff --git a/plugins/_kokoro_tts/api/synthesize.py b/plugins/_kokoro_tts/api/synthesize.py index 5530f90039..405cff5640 100644 --- a/plugins/_kokoro_tts/api/synthesize.py +++ b/plugins/_kokoro_tts/api/synthesize.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from helpers.api import ApiHandler, Request, Response from plugins._kokoro_tts.helpers import runtime @@ -12,11 +14,11 @@ async def process(self, input: dict, request: Request) -> dict | Response: return Response(status=400, response="Missing text") try: - audio = await runtime.synthesize_sentences([text]) + audio, mime_type = await runtime.synthesize_sentences([text]) return { "success": True, "audio": audio, - "mime_type": "audio/wav", + "mime_type": mime_type, } except Exception as e: return {"success": False, "error": str(e)} diff --git a/plugins/_kokoro_tts/default_config.yaml b/plugins/_kokoro_tts/default_config.yaml index 85be3ad699..6d54780496 100644 --- a/plugins/_kokoro_tts/default_config.yaml +++ b/plugins/_kokoro_tts/default_config.yaml @@ -1,2 +1,4 @@ voice: am_puck,am_onyx speed: 1.1 +remote_url: +response_format: mp3 diff --git a/plugins/_kokoro_tts/helpers/runtime.py b/plugins/_kokoro_tts/helpers/runtime.py index 365886a2e0..767144ab93 100644 --- a/plugins/_kokoro_tts/helpers/runtime.py +++ b/plugins/_kokoro_tts/helpers/runtime.py @@ -6,6 +6,7 @@ import warnings from typing import Any +import aiohttp import soundfile as sf from helpers import plugins @@ -26,6 +27,16 @@ DEFAULT_CONFIG = { "voice": "am_puck,am_onyx", "speed": 1.1, + "remote_url": "", + "response_format": "mp3", +} + +VALID_FORMATS = {"wav", "mp3", "opus", "flac"} +MIME_TYPES = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "opus": "audio/opus", + "flac": "audio/flac", } _pipeline = None @@ -48,6 +59,14 @@ def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]: except (TypeError, ValueError): pass + remote_url = str(config.get("remote_url", normalized["remote_url"]) or "").strip() + if remote_url: + normalized["remote_url"] = remote_url.rstrip("/") + + response_format = str(config.get("response_format", normalized["response_format"]) or "").strip().lower() + if response_format in VALID_FORMATS: + normalized["response_format"] = response_format + return normalized @@ -106,20 +125,89 @@ async def is_downloaded() -> bool: return _pipeline is not None +async def is_remote_healthy() -> tuple[bool, str]: + """Check if the remote Kokoro-FastAPI server is reachable. + + Returns (healthy, error_message). If no remote_url is configured, + returns (False, "Not configured"). + """ + cfg = get_config() + remote_url = cfg.get("remote_url", "") + if not remote_url: + return False, "Not configured" + + try: + async with aiohttp.ClientSession() as session: + async with session.get( + f"{remote_url}/health", + timeout=aiohttp.ClientTimeout(total=5), + ) as resp: + if resp.status == 200: + return True, "" + return False, f"HTTP {resp.status}" + except Exception as e: + return False, str(e) + + async def synthesize_sentences( sentences: list[str], config: dict[str, Any] | None = None -) -> str: +) -> tuple[str, str]: cfg = normalize_config(config or get_config()) - return await _synthesize_sentences( + remote_url = str(cfg.get("remote_url", "")) + + if remote_url: + return await _synthesize_remote( + sentences, + voice=str(cfg["voice"]), + speed=float(cfg["speed"]), + remote_url=remote_url, + response_format=str(cfg["response_format"]), + ) + + return await _synthesize_local( sentences, voice=str(cfg["voice"]), speed=float(cfg["speed"]), ) -async def _synthesize_sentences( +async def _synthesize_remote( + sentences: list[str], + *, + voice: str, + speed: float, + remote_url: str, + response_format: str, +) -> tuple[str, str]: + text = " ".join(s.strip() for s in sentences if s.strip()) + if not text: + return "", MIME_TYPES.get(response_format, "audio/mpeg") + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + f"{remote_url}/v1/audio/speech", + json={ + "model": "kokoro", + "input": text, + "voice": voice, + "response_format": response_format, + "speed": speed, + }, + timeout=aiohttp.ClientTimeout(total=30), + ) as resp: + resp.raise_for_status() + audio_bytes = await resp.read() + mime_type = MIME_TYPES.get(response_format, "audio/mpeg") + return base64.b64encode(audio_bytes).decode("utf-8"), mime_type + except Exception as e: + PrintStyle.error(f"Error in remote Kokoro TTS synthesis: {e}") + raise + + +async def _synthesize_local( sentences: list[str], *, voice: str, speed: float -) -> str: +) -> tuple[str, str]: await _preload() combined_audio: list[float] = [] @@ -136,11 +224,11 @@ async def _synthesize_sentences( combined_audio.extend(audio_numpy.tolist()) if not combined_audio: - return "" + return "", "audio/wav" buffer = io.BytesIO() sf.write(buffer, combined_audio, 24000, format="WAV") - return base64.b64encode(buffer.getvalue()).decode("utf-8") + return base64.b64encode(buffer.getvalue()).decode("utf-8"), "audio/wav" except Exception as e: - PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}") + PrintStyle.error(f"Error in local Kokoro TTS synthesis: {e}") raise diff --git a/plugins/_kokoro_tts/webui/config.html b/plugins/_kokoro_tts/webui/config.html index 755258cc26..4ca69341bc 100644 --- a/plugins/_kokoro_tts/webui/config.html +++ b/plugins/_kokoro_tts/webui/config.html @@ -9,20 +9,46 @@
Kokoro TTS
- Configure the built-in Kokoro voice provider. When this plugin is disabled, - spoken output falls back to the browser speech API. + Configure the Kokoro voice provider. Supports both local model synthesis + and a remote Kokoro-FastAPI service. When disabled, spoken output falls + back to the browser speech API. +
+ +
+
+
Remote URL
+
URL of a remote Kokoro-FastAPI service (e.g. http://localhost:18890). Leave empty to use local model synthesis.
+
+
+ +
Voice
-
Kokoro voice identifier passed to the backend pipeline.
+
Kokoro voice identifier (e.g. am_puck, am_onyx, or blend voices with + like am_puck+am_onyx).
+
+
+
Audio Format
+
Output format for remote synthesis. Local synthesis always outputs WAV.
+
+
+ +
+
+
Speed
@@ -36,4 +62,4 @@
- + \ No newline at end of file diff --git a/plugins/_kokoro_tts/webui/kokoro-tts-store.js b/plugins/_kokoro_tts/webui/kokoro-tts-store.js index 715836ae48..6c0693269b 100644 --- a/plugins/_kokoro_tts/webui/kokoro-tts-store.js +++ b/plugins/_kokoro_tts/webui/kokoro-tts-store.js @@ -14,10 +14,12 @@ const model = { config: { voice: "", speed: 1.1, + remote_url: "", + response_format: "mp3", }, modelReady: false, modelLoading: false, - packageVersion: "", + remoteHealthy: false, providerCleanup: null, async initRuntime() { @@ -42,10 +44,12 @@ const model = { this.config = { voice: status?.config?.voice || "", speed: Number(status?.config?.speed || 1.1), + remote_url: status?.config?.remote_url || "", + response_format: status?.config?.response_format || "mp3", }; this.modelReady = !!status?.model?.ready; this.modelLoading = !!status?.model?.loading; - this.packageVersion = status?.package?.version || ""; + this.remoteHealthy = !!status?.remote?.healthy; if (this.enabled) { this.registerProvider(); @@ -77,7 +81,7 @@ const model = { return { audioBase64: result.audio || "", - mimeType: result.mime_type || "audio/wav", + mimeType: result.mime_type || "audio/mpeg", }; }, }); diff --git a/plugins/_kokoro_tts/webui/main.html b/plugins/_kokoro_tts/webui/main.html index 58595b3df1..183de9d1cd 100644 --- a/plugins/_kokoro_tts/webui/main.html +++ b/plugins/_kokoro_tts/webui/main.html @@ -16,9 +16,8 @@
Kokoro TTS
- Built-in Kokoro speech synthesis. Dependency installation remains on the - Docker/bootstrap path; disabling this plugin returns spoken output to the - browser fallback. + Built-in Kokoro speech synthesis. When disabled, spoken output falls back + to the browser speech API.
@@ -29,13 +28,15 @@
- Model + Local Model
-
- Package - -
+
@@ -48,6 +49,18 @@ Speed
+
@@ -130,4 +143,4 @@ } - + \ No newline at end of file