agent0ai · Draco-Lunaris · Jun 3, 2026
diff --git a/plugins/_kokoro_tts/api/status.py b/plugins/_kokoro_tts/api/status.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import importlib.metadata
 
 from helpers.api import ApiHandler, Request, Response
@@ -8,24 +10,37 @@ class Status(ApiHandler):
     async def process(self, input: dict, request: Request) -> dict | Response:
         migration.ensure_migrated()
 
+        cfg = runtime.get_config()
+        remote_url = cfg.get("remote_url", "")
+
+        # Local model status (always reported)
         package_version = ""
         package_error = ""
         try:
             package_version = importlib.metadata.version("kokoro")
         except Exception as e:
             package_error = str(e)
 
-        return {
+        result = {
             "plugin": "_kokoro_tts",
             "enabled": runtime.is_globally_enabled(),
-            "config": runtime.get_config(),
+            "config": cfg,
             "model": {
                 "ready": await runtime.is_downloaded(),
-                "loading": await runtime.is_downloading(),
-            },
-            "package": {
+                "loading": runtime.is_updating_model,  # NOTE(review): attribute read replaces the awaited is_downloading(); confirm this is a bool and not a function
                 "version": package_version,
-                "error": package_error,
+                "error": package_error or None,  # empty string is reported as None
             },
             "fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.",
         }
+
+        # Remote health status (only if configured)
+        if remote_url:
+            remote_healthy, remote_error = await runtime.is_remote_healthy()  # performs an HTTP GET to /health with a 5 s total timeout
+            result["remote"] = {
+                "url": remote_url,
+                "healthy": remote_healthy,
+                "error": remote_error or None,
+            }
+
+        return result
diff --git a/plugins/_kokoro_tts/api/synthesize.py b/plugins/_kokoro_tts/api/synthesize.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from helpers.api import ApiHandler, Request, Response
 from plugins._kokoro_tts.helpers import runtime
 
@@ -12,11 +14,11 @@ async def process(self, input: dict, request: Request) -> dict | Response:
             return Response(status=400, response="Missing text")
 
         try:
-            audio = await runtime.synthesize_sentences([text])
+            audio, mime_type = await runtime.synthesize_sentences([text])
             return {
                 "success": True,
                 "audio": audio,
-                "mime_type": "audio/wav",
+                "mime_type": mime_type,
             }
         except Exception as e:
             return {"success": False, "error": str(e)}
diff --git a/plugins/_kokoro_tts/default_config.yaml b/plugins/_kokoro_tts/default_config.yaml
@@ -1,2 +1,4 @@
 voice: am_puck,am_onyx
 speed: 1.1
+remote_url:
+response_format: mp3
diff --git a/plugins/_kokoro_tts/helpers/runtime.py b/plugins/_kokoro_tts/helpers/runtime.py
@@ -6,6 +6,7 @@
 import warnings
 from typing import Any
 
+import aiohttp
 import soundfile as sf
 
 from helpers import plugins
@@ -26,6 +27,16 @@
 DEFAULT_CONFIG = {
     "voice": "am_puck,am_onyx",
     "speed": 1.1,
+    "remote_url": "",
+    "response_format": "mp3",
+}
+
+VALID_FORMATS = {"wav", "mp3", "opus", "flac"}
+MIME_TYPES = {
+    "wav": "audio/wav",
+    "mp3": "audio/mpeg",
+    "opus": "audio/opus",
+    "flac": "audio/flac",
 }
 
 _pipeline = None
@@ -48,6 +59,14 @@ def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
     except (TypeError, ValueError):
         pass
 
+    remote_url = str(config.get("remote_url", normalized["remote_url"]) or "").strip()
+    if remote_url:
+        normalized["remote_url"] = remote_url.rstrip("/")
+
+    response_format = str(config.get("response_format", normalized["response_format"]) or "").strip().lower()
+    if response_format in VALID_FORMATS:
+        normalized["response_format"] = response_format
+
     return normalized
 
 
@@ -106,20 +125,89 @@ async def is_downloaded() -> bool:
     return _pipeline is not None
 
 
+async def is_remote_healthy() -> tuple[bool, str]:
+    """Check if the remote Kokoro-FastAPI server is reachable.
+
+    Returns (healthy, error_message): (True, "") on HTTP 200 from /health,
+    (False, reason) otherwise; (False, "Not configured") without a remote_url.
+    """
+    remote_url = get_config().get("remote_url", "")
+    if not remote_url:
+        return False, "Not configured"
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(
+                f"{remote_url}/health",
+                timeout=aiohttp.ClientTimeout(total=5),
+            ) as resp:
+                if resp.status == 200:
+                    return True, ""
+                return False, f"HTTP {resp.status}"
+    except Exception as e:
+        # Timeouts stringify to "", so fall back to the exception class name.
+        return False, str(e) or type(e).__name__
+
+
 async def synthesize_sentences(
     sentences: list[str], config: dict[str, Any] | None = None
-) -> str:
+) -> tuple[str, str]:  # (base64-encoded audio, MIME type)
     cfg = normalize_config(config or get_config())
-    return await _synthesize_sentences(
+    remote_url = str(cfg.get("remote_url", ""))
+
+    if remote_url:  # a configured remote server takes precedence over local synthesis
+        return await _synthesize_remote(
+            sentences,
+            voice=str(cfg["voice"]),
+            speed=float(cfg["speed"]),
+            remote_url=remote_url,
+            response_format=str(cfg["response_format"]),
+        )
+
+    return await _synthesize_local(
         sentences,
         voice=str(cfg["voice"]),
         speed=float(cfg["speed"]),
     )
 
 
-async def _synthesize_sentences(
+async def _synthesize_remote(
+    sentences: list[str],
+    *,
+    voice: str,
+    speed: float,
+    remote_url: str,
+    response_format: str,
+) -> tuple[str, str]:
+    """Synthesize speech on the remote server; return (base64 audio, MIME type)."""
+    mime_type = MIME_TYPES.get(response_format, "audio/mpeg")
+    text = " ".join(s.strip() for s in sentences if s.strip())
+    if not text:
+        return "", mime_type
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                f"{remote_url}/v1/audio/speech",
+                json={
+                    "model": "kokoro",
+                    "input": text,
+                    # The default config blends voices with ","; Kokoro-FastAPI blends with "+".
+                    "voice": "+".join(v.strip() for v in voice.split(",") if v.strip()),
+                    "response_format": response_format,
+                    "speed": speed,
+                },
+                timeout=aiohttp.ClientTimeout(total=30),
+            ) as resp:
+                resp.raise_for_status()
+                return base64.b64encode(await resp.read()).decode("utf-8"), mime_type
+    except Exception as e:
+        PrintStyle.error(f"Error in remote Kokoro TTS synthesis: {e}")
+        raise
+
+
+async def _synthesize_local(
     sentences: list[str], *, voice: str, speed: float
-) -> str:
+) -> tuple[str, str]:
     await _preload()
 
     combined_audio: list[float] = []
@@ -136,11 +224,11 @@ async def _synthesize_sentences(
                 combined_audio.extend(audio_numpy.tolist())
 
         if not combined_audio:
-            return ""
+            return "", "audio/wav"
 
         buffer = io.BytesIO()
         sf.write(buffer, combined_audio, 24000, format="WAV")
-        return base64.b64encode(buffer.getvalue()).decode("utf-8")
+        return base64.b64encode(buffer.getvalue()).decode("utf-8"), "audio/wav"
     except Exception as e:
-        PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}")
+        PrintStyle.error(f"Error in local Kokoro TTS synthesis: {e}")
         raise
diff --git a/plugins/_kokoro_tts/webui/config.html b/plugins/_kokoro_tts/webui/config.html
@@ -9,20 +9,46 @@
       <div class="plugin-config-page">
         <div class="section-title">Kokoro TTS</div>
         <div class="section-description">
-          Configure the built-in Kokoro voice provider. When this plugin is disabled,
-          spoken output falls back to the browser speech API.
+          Configure the Kokoro voice provider. Supports both local model synthesis
+          and a remote Kokoro-FastAPI service. When disabled, spoken output falls
+          back to the browser speech API.
+        </div>
+
+        <div class="field">
+          <div class="field-label">
+            <div class="field-title">Remote URL</div>
+            <div class="field-description">URL of a remote Kokoro-FastAPI service (e.g. http://localhost:18890). Leave empty to use local model synthesis.</div>
+          </div>
+          <div class="field-control">
+            <input type="text" x-model="config.remote_url" />
+          </div>
         </div>
 
         <div class="field">
           <div class="field-label">
             <div class="field-title">Voice</div>
-            <div class="field-description">Kokoro voice identifier passed to the backend pipeline.</div>
+            <div class="field-description">Kokoro voice identifier (e.g. am_puck, am_onyx, or blend voices with + like am_puck+am_onyx).</div>
           </div>
           <div class="field-control">
             <input type="text" x-model="config.voice" />
           </div>
         </div>
 
+        <div class="field">
+          <div class="field-label">
+            <div class="field-title">Audio Format</div>
+            <div class="field-description">Output format for remote synthesis. Local synthesis always outputs WAV.</div>
+          </div>
+          <div class="field-control">
+            <select x-model="config.response_format">
+              <option value="mp3">MP3 (recommended)</option>
+              <option value="wav">WAV (uncompressed)</option>
+              <option value="opus">Opus (low bitrate)</option>
+              <option value="flac">FLAC (lossless)</option>
+            </select>
+          </div>
+        </div>
+
         <div class="field">
           <div class="field-label">
             <div class="field-title">Speed</div>
@@ -36,4 +62,4 @@
     </template>
   </div>
 </body>
-</html>
+</html>
diff --git a/plugins/_kokoro_tts/webui/kokoro-tts-store.js b/plugins/_kokoro_tts/webui/kokoro-tts-store.js
@@ -14,10 +14,12 @@ const model = {
   config: {
     voice: "",
     speed: 1.1,
+    remote_url: "",
+    response_format: "mp3",
   },
   modelReady: false,
   modelLoading: false,
-  packageVersion: "",
+  remoteHealthy: false,
   providerCleanup: null,
 
   async initRuntime() {
@@ -42,10 +44,12 @@ const model = {
       this.config = {
         voice: status?.config?.voice || "",
         speed: Number(status?.config?.speed || 1.1),
+        remote_url: status?.config?.remote_url || "",
+        response_format: status?.config?.response_format || "mp3",
       };
       this.modelReady = !!status?.model?.ready;
       this.modelLoading = !!status?.model?.loading;
-      this.packageVersion = status?.package?.version || "";
+      this.remoteHealthy = !!status?.remote?.healthy;
 
       if (this.enabled) {
         this.registerProvider();
@@ -77,7 +81,7 @@ const model = {
 
         return {
           audioBase64: result.audio || "",
-          mimeType: result.mime_type || "audio/wav",
+          mimeType: result.mime_type || "audio/mpeg",
         };
       },
     });

diff --git a/plugins/_kokoro_tts/webui/main.html b/plugins/_kokoro_tts/webui/main.html
@@ -16,9 +16,8 @@
       <div>
         <div class="section-title">Kokoro TTS</div>
         <div class="section-description">
-          Built-in Kokoro speech synthesis. Dependency installation remains on the
-          Docker/bootstrap path; disabling this plugin returns spoken output to the
-          browser fallback.
+          Built-in Kokoro speech synthesis. When disabled, spoken output falls back
+          to the browser speech API.
         </div>
 
         <div class="speech-plugin-grid">
@@ -29,13 +28,15 @@
               <span class="status-badge" :class="$store.kokoroTts.enabled ? 'ok' : 'warn'" x-text="$store.kokoroTts.enabled ? 'Yes' : 'No'"></span>
             </div>
             <div class="status-row">
-              <span class="status-key">Model</span>
+              <span class="status-key">Local Model</span>
               <span class="status-badge" :class="$store.kokoroTts.statusClass" x-text="$store.kokoroTts.statusText"></span>
             </div>
-            <div class="status-row" x-show="$store.kokoroTts.packageVersion">
-              <span class="status-key">Package</span>
-              <span class="status-value" x-text="$store.kokoroTts.packageVersion"></span>
-            </div>
+            <template x-if="$store.kokoroTts.config.remote_url">
+              <div class="status-row">
+                <span class="status-key">Remote</span>
+                <span class="status-badge" :class="$store.kokoroTts.remoteHealthy ? 'ok' : 'warn'" x-text="$store.kokoroTts.remoteHealthy ? 'Healthy' : 'Unreachable'"></span>
+              </div>
+            </template>
           </div>
 
           <div class="speech-plugin-card">
@@ -48,6 +49,18 @@
               <span class="status-key">Speed</span>
               <span class="status-value" x-text="$store.kokoroTts.config.speed"></span>
             </div>
+            <template x-if="$store.kokoroTts.config.remote_url">
+              <div>
+                <div class="status-row">
+                  <span class="status-key">Remote URL</span>
+                  <span class="status-value mono" x-text="$store.kokoroTts.config.remote_url"></span>
+                </div>
+                <div class="status-row">
+                  <span class="status-key">Format</span>
+                  <span class="status-value" x-text="$store.kokoroTts.config.response_format?.toUpperCase()"></span>
+                </div>
+              </div>
+            </template>
           </div>
         </div>
 
@@ -130,4 +143,4 @@
     }
   </style>
 </body>
-</html>
+</html>