diff --git a/plugins/_kokoro_tts/api/status.py b/plugins/_kokoro_tts/api/status.py index 3b1321972e..03cee2b33c 100644 --- a/plugins/_kokoro_tts/api/status.py +++ b/plugins/_kokoro_tts/api/status.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib.metadata from helpers.api import ApiHandler, Request, Response @@ -8,6 +10,10 @@ class Status(ApiHandler): async def process(self, input: dict, request: Request) -> dict | Response: migration.ensure_migrated() + cfg = runtime.get_config() + remote_url = cfg.get("remote_url", "") + + # Local model status (always reported) package_version = "" package_error = "" try: @@ -15,17 +21,26 @@ async def process(self, input: dict, request: Request) -> dict | Response: except Exception as e: package_error = str(e) - return { + result = { "plugin": "_kokoro_tts", "enabled": runtime.is_globally_enabled(), - "config": runtime.get_config(), + "config": cfg, "model": { "ready": await runtime.is_downloaded(), - "loading": await runtime.is_downloading(), - }, - "package": { + "loading": runtime.is_updating_model, "version": package_version, - "error": package_error, + "error": package_error or None, }, "fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.", } + + # Remote health status (only if configured) + if remote_url: + remote_healthy, remote_error = await runtime.is_remote_healthy() + result["remote"] = { + "url": remote_url, + "healthy": remote_healthy, + "error": remote_error or None, + } + + return result diff --git a/plugins/_kokoro_tts/api/synthesize.py b/plugins/_kokoro_tts/api/synthesize.py index 5530f90039..405cff5640 100644 --- a/plugins/_kokoro_tts/api/synthesize.py +++ b/plugins/_kokoro_tts/api/synthesize.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from helpers.api import ApiHandler, Request, Response from plugins._kokoro_tts.helpers import runtime @@ -12,11 +14,11 @@ async def process(self, input: dict, request: Request) -> dict | Response: return Response(status=400, response="Missing text") try: - audio = await runtime.synthesize_sentences([text]) + audio, mime_type = await runtime.synthesize_sentences([text]) return { "success": True, "audio": audio, - "mime_type": "audio/wav", + "mime_type": mime_type, } except Exception as e: return {"success": False, "error": str(e)} diff --git a/plugins/_kokoro_tts/default_config.yaml b/plugins/_kokoro_tts/default_config.yaml index 85be3ad699..6d54780496 100644 --- a/plugins/_kokoro_tts/default_config.yaml +++ b/plugins/_kokoro_tts/default_config.yaml @@ -1,2 +1,4 @@ voice: am_puck,am_onyx speed: 1.1 +remote_url: +response_format: mp3 diff --git a/plugins/_kokoro_tts/helpers/runtime.py b/plugins/_kokoro_tts/helpers/runtime.py index 365886a2e0..767144ab93 100644 --- a/plugins/_kokoro_tts/helpers/runtime.py +++ b/plugins/_kokoro_tts/helpers/runtime.py @@ -6,6 +6,7 @@ import warnings from typing import Any +import aiohttp import soundfile as sf from helpers import plugins @@ -26,6 +27,16 @@ DEFAULT_CONFIG = { "voice": "am_puck,am_onyx", "speed": 1.1, + "remote_url": "", + "response_format": "mp3", +} + +VALID_FORMATS = {"wav", "mp3", "opus", "flac"} +MIME_TYPES = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "opus": "audio/opus", + "flac": "audio/flac", } _pipeline = None @@ -48,6 +59,14 @@ def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]: except (TypeError, ValueError): pass + remote_url = str(config.get("remote_url", normalized["remote_url"]) or "").strip() + if remote_url: + normalized["remote_url"] = remote_url.rstrip("/") + + response_format = str(config.get("response_format", normalized["response_format"]) or "").strip().lower() + if response_format in VALID_FORMATS: + normalized["response_format"] = response_format + return normalized @@ -106,20 +125,89 @@ async def is_downloaded() -> bool: return _pipeline is not None +async def is_remote_healthy() -> tuple[bool, str]: + """Check if the remote Kokoro-FastAPI server is reachable. + + Returns (healthy, error_message). If no remote_url is configured, + returns (False, "Not configured"). + """ + cfg = get_config() + remote_url = cfg.get("remote_url", "") + if not remote_url: + return False, "Not configured" + + try: + async with aiohttp.ClientSession() as session: + async with session.get( + f"{remote_url}/health", + timeout=aiohttp.ClientTimeout(total=5), + ) as resp: + if resp.status == 200: + return True, "" + return False, f"HTTP {resp.status}" + except Exception as e: + return False, str(e) + + async def synthesize_sentences( sentences: list[str], config: dict[str, Any] | None = None -) -> str: +) -> tuple[str, str]: cfg = normalize_config(config or get_config()) - return await _synthesize_sentences( + remote_url = str(cfg.get("remote_url", "")) + + if remote_url: + return await _synthesize_remote( + sentences, + voice=str(cfg["voice"]), + speed=float(cfg["speed"]), + remote_url=remote_url, + response_format=str(cfg["response_format"]), + ) + + return await _synthesize_local( sentences, voice=str(cfg["voice"]), speed=float(cfg["speed"]), ) -async def _synthesize_sentences( +async def _synthesize_remote( + sentences: list[str], + *, + voice: str, + speed: float, + remote_url: str, + response_format: str, +) -> tuple[str, str]: + text = " ".join(s.strip() for s in sentences if s.strip()) + if not text: + return "", MIME_TYPES.get(response_format, "audio/mpeg") + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + f"{remote_url}/v1/audio/speech", + json={ + "model": "kokoro", + "input": text, + "voice": voice, + "response_format": response_format, + "speed": speed, + }, + timeout=aiohttp.ClientTimeout(total=30), + ) as resp: + resp.raise_for_status() + audio_bytes = await resp.read() + mime_type = MIME_TYPES.get(response_format, "audio/mpeg") + return base64.b64encode(audio_bytes).decode("utf-8"), mime_type + except Exception as e: + PrintStyle.error(f"Error in remote Kokoro TTS synthesis: {e}") + raise + + +async def _synthesize_local( sentences: list[str], *, voice: str, speed: float -) -> str: +) -> tuple[str, str]: await _preload() combined_audio: list[float] = [] @@ -136,11 +224,11 @@ async def _synthesize_sentences( combined_audio.extend(audio_numpy.tolist()) if not combined_audio: - return "" + return "", "audio/wav" buffer = io.BytesIO() sf.write(buffer, combined_audio, 24000, format="WAV") - return base64.b64encode(buffer.getvalue()).decode("utf-8") + return base64.b64encode(buffer.getvalue()).decode("utf-8"), "audio/wav" except Exception as e: - PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}") + PrintStyle.error(f"Error in local Kokoro TTS synthesis: {e}") raise diff --git a/plugins/_kokoro_tts/webui/config.html b/plugins/_kokoro_tts/webui/config.html index 755258cc26..4ca69341bc 100644 --- a/plugins/_kokoro_tts/webui/config.html +++ b/plugins/_kokoro_tts/webui/config.html @@ -9,20 +9,46 @@