diff --git a/.changeset/minimax-plugin-initial.md b/.changeset/minimax-plugin-initial.md new file mode 100644 index 000000000..a6322aca7 --- /dev/null +++ b/.changeset/minimax-plugin-initial.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-minimax": minor +--- + +feat(minimax): port MiniMax TTS plugin from Python agents (HTTP SSE + WebSocket streaming, PCM output) diff --git a/plugins/minimax/README.md b/plugins/minimax/README.md new file mode 100644 index 000000000..a1092ce97 --- /dev/null +++ b/plugins/minimax/README.md @@ -0,0 +1,30 @@ + +# MiniMax plugin for LiveKit Agents + +The Agents Framework is designed for building realtime, programmable +participants that run on servers. Use it to create conversational, multi-modal +voice agents that can see, hear, and understand. + +This package contains the MiniMax plugin, which provides text-to-speech via +MiniMax's `t2a_v2` APIs (both HTTP streaming and WebSocket streaming). + +Refer to the [documentation](https://docs.livekit.io/agents/overview/) for +information on how to use it. See the +[repository](https://github.com/livekit/agents-js) for more information about +the framework as a whole. + +## Installation + +```bash +pnpm add @livekit/agents-plugin-minimax +``` + +## Pre-requisites + +You'll need an API key from MiniMax. It can be set as an environment variable: +`MINIMAX_API_KEY`. You can also override the API endpoint via `MINIMAX_BASE_URL` +(defaults to `https://api-uw.minimax.io`). diff --git a/plugins/minimax/api-extractor.json b/plugins/minimax/api-extractor.json new file mode 100644 index 000000000..baa041649 --- /dev/null +++ b/plugins/minimax/api-extractor.json @@ -0,0 +1,8 @@ +/** + * Config file for API Extractor. For more info, please visit: https://api-extractor.com + */ +{ + "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json", + "extends": "../../api-extractor-shared.json", + "mainEntryPointFilePath": "./dist/index.d.ts" +} diff --git a/plugins/minimax/package.json b/plugins/minimax/package.json new file mode 100644 index 000000000..f458eb275 --- /dev/null +++ b/plugins/minimax/package.json @@ -0,0 +1,52 @@ +{ + "name": "@livekit/agents-plugin-minimax", + "version": "0.1.0", + "description": "MiniMax plugin for LiveKit Node Agents", + "main": "dist/index.js", + "require": "dist/index.cjs", + "types": "dist/index.d.ts", + "exports": { + "import": { + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + }, + "require": { + "types": "./dist/index.d.cts", + "default": "./dist/index.cjs" + } + }, + "author": "LiveKit", + "type": "module", + "repository": "git@github.com:livekit/agents-js.git", + "license": "Apache-2.0", + "files": [ + "dist", + "src", + "README.md" + ], + "scripts": { + "build": "tsup --onSuccess \"pnpm build:types\"", + "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js", + "clean": "rm -rf dist", + "clean:build": "pnpm clean && pnpm build", + "lint": "eslint -f unix \"src/**/*.{ts,js}\"", + "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript", + "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose" + }, + "devDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/agents-plugins-test": "workspace:*", + "@livekit/rtc-node": "catalog:", + "@microsoft/api-extractor": "^7.35.0", + "@types/ws": "catalog:", + "tsup": "^8.3.5", + "typescript": "^5.0.0" + }, + "dependencies": { + "ws": "catalog:" + }, + "peerDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/rtc-node": "catalog:" + } +} diff --git a/plugins/minimax/src/index.ts b/plugins/minimax/src/index.ts new file mode 100644 index 000000000..711dff2bc --- /dev/null +++ b/plugins/minimax/src/index.ts @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Plugin } from '@livekit/agents'; + +export * from './models.js'; +export * from './tts.js'; + +class MiniMaxPlugin extends Plugin { + constructor() { + super({ + title: 'minimax', + version: __PACKAGE_VERSION__, + package: __PACKAGE_NAME__, + }); + } +} + +Plugin.registerPlugin(new MiniMaxPlugin()); diff --git a/plugins/minimax/src/models.ts b/plugins/minimax/src/models.ts new file mode 100644 index 000000000..cf6bbf057 --- /dev/null +++ b/plugins/minimax/src/models.ts @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 27-38 lines +/** Supported MiniMax TTS models. */ +export type TTSModel = + | 'speech-2.8-hd' + | 'speech-2.8-turbo' + | 'speech-2.6-hd' + | 'speech-2.6-turbo' + | 'speech-2.5-hd-preview' + | 'speech-2.5-turbo-preview' + | 'speech-02-hd' + | 'speech-02-turbo' + | 'speech-01-hd' + | 'speech-01-turbo'; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 40-83 lines +/** + * A subset of commonly used MiniMax voice IDs. Any string is accepted by + * {@link TTSOptions.voice} - these literals exist purely for IDE + * auto-completion. See the MiniMax documentation for the full voice list. + */ +export type TTSVoice = + // Social Media Voices + | 'socialmedia_female_2_v1' + | 'socialmedia_female_1_v1' + // Voice Agent Series + | 'voice_agent_Female_Phone_4' + | 'voice_agent_Male_Phone_1' + | 'voice_agent_Male_Phone_2' + // English Voices - Female + | 'English_StressedLady' + | 'English_SentimentalLady' + | 'English_radiant_girl' + // English Voices - Male + | 'English_WiseScholar' + | 'English_Persuasive_Man' + | 'English_Explanatory_Man' + | 'English_Insightful_Speaker' + // Japanese Voices + | 'japanese_male_social_media_1_v2' + | 'japanese_female_social_media_1_v2' + // French Voices + | 'French_CasualMan' + | 'French_Female Journalist' + // Spanish Voices + | 'Spanish_Narrator' + | 'Spanish_WiseScholar' + | 'Spanish_ThoughtfulMan' + // Arabic Voices + | 'Arabic_CalmWoman' + | 'Arabic_FriendlyGuy' + // Portuguese Voices + | 'Portuguese_ThoughtfulLady' + // German Voices + | 'German_PlayfulMan' + | 'German_SweetLady' + // MOSS Audio Series + | 'moss_audio_7c7e7ae2-7356-11f0-9540-7ef9b4b62566' + | 'moss_audio_b118f320-78c0-11f0-bbeb-26e8167c4779' + | 'moss_audio_84f32de9-2363-11f0-b7ab-d255fae1f27b' + | 'moss_audio_82ebf67c-78c8-11f0-8e8e-36b92fbb4f95'; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 89-92 lines +/** + * MiniMax-supported emotions. + * + * @remarks `fluent` is only supported by `speech-2.6-*` models. + */ +export type TTSEmotion = + | 'happy' + | 'sad' + | 'angry' + | 'fearful' + | 'disgusted' + | 'surprised' + | 'neutral' + | 'fluent'; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 94-136 lines +/** Language hint for multilingual performance. */ +export type TTSLanguageBoost = + | 'auto' + | 'Chinese' + | 'Chinese,Yue' + | 'English' + | 'Arabic' + | 'Russian' + | 'Spanish' + | 'French' + | 'Portuguese' + | 'German' + | 'Turkish' + | 'Dutch' + | 'Ukrainian' + | 'Vietnamese' + | 'Indonesian' + | 'Japanese' + | 'Italian' + | 'Korean' + | 'Thai' + | 'Polish' + | 'Romanian' + | 'Greek' + | 'Czech' + | 'Finnish' + | 'Hindi' + | 'Bulgarian' + | 'Danish' + | 'Hebrew' + | 'Malay' + | 'Persian' + | 'Slovak' + | 'Swedish' + | 'Croatian' + | 'Filipino' + | 'Hungarian' + | 'Norwegian' + | 'Slovenian' + | 'Catalan' + | 'Nynorsk' + | 'Tamil' + | 'Afrikaans'; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 139 line +/** + * Valid PCM sample rates accepted by the MiniMax API. + */ +export type TTSSampleRate = 8000 | 16000 | 22050 | 24000 | 32000 | 44100; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 85-86 lines +export const DEFAULT_MODEL: TTSModel = 'speech-02-turbo'; +export const DEFAULT_VOICE_ID: TTSVoice = 'socialmedia_female_2_v1'; +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 142-144 lines +export const DEFAULT_BASE_URL = 'https://api-uw.minimax.io'; diff --git a/plugins/minimax/src/tts.test.ts b/plugins/minimax/src/tts.test.ts new file mode 100644 index 000000000..27d456e17 --- /dev/null +++ b/plugins/minimax/src/tts.test.ts @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, it } from 'vitest'; +import { TTS } from './tts.js'; + +const hasMinimaxConfig = Boolean(process.env.MINIMAX_API_KEY); + +if (hasMinimaxConfig) { + describe('MiniMax TTS', () => { + it('constructs without throwing', () => { + new TTS(); + }); + }); +} else { + describe('MiniMax TTS', () => { + it.skip('requires MINIMAX_API_KEY', () => {}); + }); +} diff --git a/plugins/minimax/src/tts.ts b/plugins/minimax/src/tts.ts new file mode 100644 index 000000000..b83c8af9c --- /dev/null +++ b/plugins/minimax/src/tts.ts @@ -0,0 +1,716 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + type APIConnectOptions, + APIConnectionError, + APIError, + APIStatusError, + APITimeoutError, + AudioByteStream, + Future, + log, + shortuuid, + tokenize, + tts, +} from '@livekit/agents'; +import type { AudioFrame } from '@livekit/rtc-node'; +import { WebSocket } from 'ws'; +import { + DEFAULT_BASE_URL, + DEFAULT_MODEL, + DEFAULT_VOICE_ID, + type TTSEmotion, + type TTSLanguageBoost, + type TTSModel, + type TTSSampleRate, + type TTSVoice, +} from './models.js'; + +const NUM_CHANNELS = 1; +const DEFAULT_SAMPLE_RATE: TTSSampleRate = 24000; +const DEFAULT_BITRATE = 128000; + +/** Configuration options for the MiniMax TTS plugin. */ +export interface TTSOptions { + /** + * MiniMax model name. Defaults to `speech-02-turbo`. + */ + model?: TTSModel | string; + /** + * MiniMax voice id. Defaults to {@link DEFAULT_VOICE_ID}. + */ + voice?: TTSVoice | string; + /** + * Optional emotion override. `fluent` is only supported by `speech-2.6-*` + * models; passing it with a different model throws at construction time. + */ + emotion?: TTSEmotion; + /** Playback speed. Must be in the range `[0.5, 2.0]`. */ + speed?: number; + /** Volume. Must be in the range `(0, 10]`. */ + vol?: number; + /** Pitch adjustment. Must be in the range `[-12, 12]`. */ + pitch?: number; + /** Enable Chinese/English text normalization on the server side. */ + textNormalization?: boolean; + /** + * Pronunciation dictionary, in the format + * `{ "word": ["replacement1", "replacement2"] }`. + */ + pronunciationDict?: Record; + /** Voice strength slider. Range `[-100, 100]`. */ + intensity?: number; + /** Voice timbre (nasal/crisp) slider. Range `[-100, 100]`. */ + timbre?: number; + /** Language hint for multilingual performance. */ + languageBoost?: TTSLanguageBoost; + /** Output PCM sample rate. Defaults to 24000. */ + sampleRate?: TTSSampleRate; + /** Output bitrate (ignored for PCM). Kept for API parity. */ + bitrate?: number; + /** API key. Falls back to `$MINIMAX_API_KEY`. */ + apiKey?: string; + /** + * Base URL of the MiniMax API. Falls back to `$MINIMAX_BASE_URL`, otherwise + * {@link DEFAULT_BASE_URL}. + */ + baseUrl?: string; + /** Tokenizer used when chunking input text for the WebSocket stream. */ + tokenizer?: tokenize.SentenceTokenizer; +} + +interface ResolvedTTSOptions { + model: TTSModel | string; + voice: TTSVoice | string; + emotion?: TTSEmotion; + speed: number; + vol: number; + pitch: number; + textNormalization: boolean; + pronunciationDict?: Record; + intensity?: number; + timbre?: number; + languageBoost?: TTSLanguageBoost; + sampleRate: TTSSampleRate; + bitrate: number; + apiKey: string; + baseUrl: string; + tokenizer: tokenize.SentenceTokenizer; +} + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 239-252 lines +const validateOptions = (opts: { + model?: TTSModel | string; + speed?: number; + intensity?: number; + timbre?: number; + emotion?: TTSEmotion; +}): void => { + if (opts.speed !== undefined && (opts.speed < 0.5 || opts.speed > 2.0)) { + throw new Error(`speed must be between 0.5 and 2.0, but got ${opts.speed}`); + } + if (opts.intensity !== undefined && (opts.intensity < -100 || opts.intensity > 100)) { + throw new Error(`intensity must be between -100 and 100, but got ${opts.intensity}`); + } + if (opts.timbre !== undefined && (opts.timbre < -100 || opts.timbre > 100)) { + throw new Error(`timbre must be between -100 and 100, but got ${opts.timbre}`); + } + if ( + opts.emotion === 'fluent' && + opts.model !== undefined && + !opts.model.startsWith('speech-2.6') + ) { + throw new Error( + `"fluent" emotion is only supported by speech-2.6-* models, but got model "${opts.model}". ` + + 'Please use speech-2.6-hd or speech-2.6-turbo.', + ); + } +}; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 221-252 lines +const resolveOptions = (opts: TTSOptions): ResolvedTTSOptions => { + const apiKey = opts.apiKey ?? process.env.MINIMAX_API_KEY; + if (!apiKey) { + throw new Error( + 'MiniMax API key is required, either as an argument or as $MINIMAX_API_KEY environment variable', + ); + } + + const model = opts.model ?? DEFAULT_MODEL; + validateOptions({ + model, + speed: opts.speed, + intensity: opts.intensity, + timbre: opts.timbre, + emotion: opts.emotion, + }); + + return { + model, + voice: opts.voice ?? DEFAULT_VOICE_ID, + emotion: opts.emotion, + speed: opts.speed ?? 1.0, + vol: opts.vol ?? 1.0, + pitch: opts.pitch ?? 0, + textNormalization: opts.textNormalization ?? false, + pronunciationDict: opts.pronunciationDict, + intensity: opts.intensity, + timbre: opts.timbre, + languageBoost: opts.languageBoost, + sampleRate: opts.sampleRate ?? DEFAULT_SAMPLE_RATE, + bitrate: opts.bitrate ?? DEFAULT_BITRATE, + apiKey, + baseUrl: opts.baseUrl ?? process.env.MINIMAX_BASE_URL ?? DEFAULT_BASE_URL, + tokenizer: opts.tokenizer ?? new tokenize.basic.SentenceTokenizer(), + }; +}; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 689-725 lines (_to_minimax_options) +const toMiniMaxPayload = (opts: ResolvedTTSOptions): Record => { + const config: Record = { + model: opts.model, + voice_setting: { + voice_id: opts.voice, + speed: opts.speed, + vol: opts.vol, + pitch: opts.pitch, + ...(opts.emotion !== undefined ? { emotion: opts.emotion } : {}), + }, + audio_setting: { + sample_rate: opts.sampleRate, + bitrate: opts.bitrate, + // The JS port only exposes PCM output because AudioByteStream expects + // raw PCM samples; decoding mp3/flac on the fly would require an + // external decoder and a matching pipeline in @livekit/agents. + format: 'pcm', + channel: 1, + }, + text_normalization: opts.textNormalization, + }; + + if (opts.languageBoost !== undefined) { + config.language_boost = opts.languageBoost; + } + + if (opts.pronunciationDict) { + config.pronunciation_dict = opts.pronunciationDict; + } + + const voiceModify: Record = {}; + if (opts.intensity !== undefined) voiceModify.intensity = opts.intensity; + if (opts.timbre !== undefined) voiceModify.timbre = opts.timbre; + if (Object.keys(voiceModify).length > 0) { + config.voice_modify = voiceModify; + } + + return config; +}; + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 168-388 lines +export class TTS extends tts.TTS { + #opts: ResolvedTTSOptions; + label = 'minimax.TTS'; + + get model(): string { + return this.#opts.model; + } + + get provider(): string { + return 'MiniMax'; + } + + constructor(opts: TTSOptions = {}) { + const resolved = resolveOptions(opts); + super(resolved.sampleRate, NUM_CHANNELS, { + streaming: true, + alignedTranscript: false, + }); + this.#opts = resolved; + } + + // Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 294-346 lines + updateOptions( + opts: Partial< + Pick< + TTSOptions, + | 'model' + | 'voice' + | 'emotion' + | 'speed' + | 'vol' + | 'pitch' + | 'textNormalization' + | 'pronunciationDict' + | 'intensity' + | 'timbre' + | 'languageBoost' + > + >, + ): void { + const merged = { ...this.#opts, ...opts }; + // Re-validate post-merge so updateOptions surfaces the same errors as the + // constructor. Python silently assigns invalid values; the JS port + // tightens this to produce a clear local error instead of a server-side one. + validateOptions({ + model: merged.model, + speed: merged.speed, + intensity: merged.intensity, + timbre: merged.timbre, + emotion: merged.emotion, + }); + this.#opts = merged; + } + + // Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 371-374 lines + synthesize( + text: string, + connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, + ): tts.ChunkedStream { + return new ChunkedStream(this, text, this.#opts, connOptions, abortSignal); + } + + // Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 376-381 lines + stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream { + return new SynthesizeStream(this, this.#opts, options?.connOptions); + } +} + +const hexToBuffer = (hex: string): Buffer => Buffer.from(hex, 'hex'); + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 575-687 lines +export class ChunkedStream extends tts.ChunkedStream { + label = 'minimax.ChunkedStream'; + #logger = log(); + #opts: ResolvedTTSOptions; + #text: string; + + constructor( + tts: TTS, + text: string, + opts: ResolvedTTSOptions, + connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, + ) { + super(text, tts, connOptions, abortSignal); + this.#text = text; + this.#opts = opts; + } + + // Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 581-687 lines + protected async run(): Promise { + if (!this.#text.trim()) { + this.queue.close(); + return; + } + + const requestId = shortuuid(); + const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS); + + const payload = toMiniMaxPayload(this.#opts); + payload.text = this.#text; + payload.stream = true; + payload.stream_options = { exclude_aggregated_audio: true }; + + const url = `${this.#opts.baseUrl}/v1/t2a_v2`; + + let response: Response; + try { + response = await fetch(url, { + method: 'POST', + headers: { + Authorization: `Bearer ${this.#opts.apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(payload), + signal: this.abortSignal, + }); + } catch (e) { + if (this.abortSignal.aborted) return; + const err = e as Error; + if (err.name === 'AbortError') return; + throw new APIConnectionError({ message: `MiniMax connection failed: ${err.message}` }); + } + + if (!response.ok) { + const body = await response.text().catch(() => ''); + throw new APIStatusError({ + message: `MiniMax HTTP error: ${response.status} ${response.statusText}: ${body}`, + options: { statusCode: response.status, requestId, body: body ? { raw: body } : null }, + }); + } + + const traceId = + response.headers.get('Trace-Id') ?? response.headers.get('X-Trace-Id') ?? requestId; + + const reader = response.body?.getReader(); + if (!reader) { + throw new APIError('MiniMax returned an empty response body'); + } + + let lastFrame: AudioFrame | undefined; + const sendLastFrame = (segmentId: string, final: boolean) => { + if (lastFrame) { + this.queue.put({ requestId: traceId, segmentId, frame: lastFrame, final }); + lastFrame = undefined; + } + }; + + const decoder = new TextDecoder(); + let buffer = ''; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (value === undefined) continue; + buffer += decoder.decode(value, { stream: true }); + + let newlineIdx: number; + // SSE frames are separated by '\n' (MiniMax uses a single newline, not \n\n). + while ((newlineIdx = buffer.indexOf('\n')) !== -1) { + const line = buffer.slice(0, newlineIdx).trim(); + buffer = buffer.slice(newlineIdx + 1); + if (!line) continue; + if (!line.startsWith('data:')) { + this.#logger.warn({ line }, 'unexpected MiniMax SSE line'); + continue; + } + + const data = JSON.parse(line.slice(5).trim()); + + const baseResp = data.base_resp ?? {}; + const statusCode = baseResp.status_code ?? 0; + if (statusCode !== 0) { + throw new APIStatusError({ + message: `MiniMax error [${statusCode}]: ${baseResp.status_msg ?? 'Unknown error'} (trace_id: ${data.trace_id ?? traceId})`, + options: { + statusCode, + requestId: data.trace_id ?? traceId, + body: data, + // MiniMax application-level codes (1xxx) are not HTTP status + // codes, so APIStatusError's default retryability heuristic + // (retryable = !(400 <= code < 500)) does not apply - most + // MiniMax errors (auth, invalid params) are permanent. + retryable: false, + }, + }); + } + + const audioHex = data?.data?.audio as string | undefined; + if (audioHex) { + const audio = hexToBuffer(audioHex); + for (const frame of bstream.write(audio)) { + sendLastFrame(traceId, false); + lastFrame = frame; + } + } + } + } + + for (const frame of bstream.flush()) { + sendLastFrame(traceId, false); + lastFrame = frame; + } + sendLastFrame(traceId, true); + } catch (e) { + if (this.abortSignal.aborted) return; + if (e instanceof APIError) throw e; + const err = e as Error; + if (err.name === 'AbortError') return; + throw new APIConnectionError({ message: `MiniMax streaming failed: ${err.message}` }); + } finally { + try { + reader.releaseLock(); + } catch { + // ignore + } + if (!this.queue.closed) { + this.queue.close(); + } + } + } +} + +// Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 390-573 lines +export class SynthesizeStream extends tts.SynthesizeStream { + label = 'minimax.SynthesizeStream'; + #logger = log(); + #opts: ResolvedTTSOptions; + #tokenStream: tokenize.SentenceStream; + + constructor(tts: TTS, opts: ResolvedTTSOptions, connOptions?: APIConnectOptions) { + super(tts, connOptions); + this.#opts = opts; + this.#tokenStream = opts.tokenizer.stream(); + } + + // Ref: python livekit-plugins/livekit-plugins-minimax/livekit/plugins/minimax/tts.py - 396-569 lines + protected async run(): Promise { + const requestId = shortuuid(); + let currentTraceId = requestId; + const taskStarted = new Future(); + + const wsUrl = + (this.#opts.baseUrl.startsWith('http') + ? this.#opts.baseUrl.replace(/^http/, 'ws') + : this.#opts.baseUrl) + '/ws/v1/t2a_v2'; + + const ws = new WebSocket(wsUrl, { + headers: { Authorization: `Bearer ${this.#opts.apiKey}` }, + }); + + const inputTask = async () => { + for await (const data of this.input) { + if (this.abortController.signal.aborted) break; + if (data === SynthesizeStream.FLUSH_SENTINEL) { + this.#tokenStream.flush(); + continue; + } + this.#tokenStream.pushText(data); + } + this.#tokenStream.endInput(); + }; + + const sendTask = async () => { + const startMsg = toMiniMaxPayload(this.#opts); + startMsg.event = 'task_start'; + ws.send(JSON.stringify(startMsg)); + + let taskStartTimeout: ReturnType | undefined; + try { + await Promise.race([ + taskStarted.await, + new Promise((_, reject) => { + taskStartTimeout = setTimeout( + () => reject(new APITimeoutError({ message: 'task_start timed out' })), + this.connOptions.timeoutMs, + ); + }), + ]); + } finally { + if (taskStartTimeout) clearTimeout(taskStartTimeout); + } + + for await (const sentence of this.#tokenStream) { + if (this.abortController.signal.aborted) break; + ws.send(JSON.stringify({ event: 'task_continue', text: sentence.token })); + } + + ws.send(JSON.stringify({ event: 'task_finish' })); + }; + + const recvTask = async () => { + const bstream = new AudioByteStream(this.#opts.sampleRate, NUM_CHANNELS); + let lastFrame: AudioFrame | undefined; + let segmentId = requestId; + + const sendLastFrame = (final: boolean) => { + if (lastFrame) { + this.queue.put({ requestId, segmentId, frame: lastFrame, final }); + lastFrame = undefined; + } + }; + + for await (const rawMsg of iterateWebSocket(ws, this.abortController.signal)) { + let data: Record; + try { + data = JSON.parse(rawMsg); + } catch (e) { + this.#logger.warn({ err: e, rawMsg }, 'failed to parse MiniMax WS message'); + continue; + } + + const trace = + (data.trace_id as string | undefined) ?? + ((data.base_resp as Record | undefined)?.trace_id as string | undefined); + if (trace) currentTraceId = trace; + + const baseResp = (data.base_resp as Record | undefined) ?? {}; + const statusCode = (baseResp.status_code as number | undefined) ?? 0; + if (statusCode !== 0) { + throw new APIStatusError({ + message: `MiniMax error [${statusCode}]: ${baseResp.status_msg ?? 'Unknown error'} (trace_id: ${currentTraceId})`, + options: { + statusCode, + requestId: currentTraceId, + body: data, + // See ChunkedStream for rationale: MiniMax app-level codes are + // not HTTP codes, so we opt out of the default retry heuristic. + retryable: false, + }, + }); + } + + const event = data.event as string | undefined; + if (event === 'connected_success') { + continue; + } + if (event === 'task_started') { + segmentId = (data.session_id as string | undefined) ?? requestId; + if (!taskStarted.done) taskStarted.resolve(); + continue; + } + if (event === 'task_continued') { + const audioHex = (data.data as { audio?: string } | undefined)?.audio; + if (audioHex) { + const audio = hexToBuffer(audioHex); + for (const frame of bstream.write(audio)) { + sendLastFrame(false); + lastFrame = frame; + } + } + if (data.is_final) { + for (const frame of bstream.flush()) { + sendLastFrame(false); + lastFrame = frame; + } + } + continue; + } + if (event === 'task_finished') { + for (const frame of bstream.flush()) { + sendLastFrame(false); + lastFrame = frame; + } + sendLastFrame(true); + this.queue.put(SynthesizeStream.END_OF_STREAM); + break; + } + if (event === 'task_failed') { + throw new APIError( + `MiniMax task failed (trace_id: ${currentTraceId}): ${JSON.stringify(data)}`, + ); + } + this.#logger.warn({ data }, 'unexpected MiniMax WS event'); + } + }; + + try { + await waitForWebSocketOpen(ws, this.connOptions.timeoutMs, this.abortController.signal); + await Promise.all([inputTask(), sendTask(), recvTask()]); + } catch (e) { + if (this.abortController.signal.aborted) return; + if (e instanceof APIError) throw e; + const err = e as Error; + throw new APIConnectionError({ + message: `MiniMax WebSocket connection failed: ${err.message} (trace_id: ${currentTraceId})`, + }); + } finally { + try { + if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) { + ws.close(); + } + } catch { + // ignore + } + // Do NOT close #tokenStream here - the base class retries run() on + // retryable errors, and #tokenStream is created once in the constructor. + // Closing it here would make every retry push text into a closed stream, + // silently losing user input. It is closed in close() instead. + } + } + + close(): void { + this.#tokenStream.close(); + super.close(); + } +} + +const waitForWebSocketOpen = async ( + ws: WebSocket, + timeoutMs: number, + signal: AbortSignal, +): Promise => { + if (signal.aborted) throw new Error('aborted'); + await new Promise((resolve, reject) => { + const cleanup = () => { + ws.off('open', onOpen); + ws.off('error', onError); + ws.off('close', onClose); + if (timeout) clearTimeout(timeout); + signal.removeEventListener('abort', onAbort); + }; + const onOpen = () => { + cleanup(); + resolve(); + }; + const onError = (err: Error) => { + cleanup(); + reject(err); + }; + const onClose = (code: number, reason: Buffer) => { + cleanup(); + reject(new Error(`WebSocket closed before open (code=${code}, reason=${reason.toString()})`)); + }; + const onAbort = () => { + cleanup(); + reject(new Error('aborted')); + }; + const timeout = + timeoutMs > 0 + ? setTimeout(() => { + cleanup(); + reject(new APITimeoutError({ message: 'MiniMax WebSocket connect timeout' })); + }, timeoutMs) + : undefined; + + ws.on('open', onOpen); + ws.on('error', onError); + ws.on('close', onClose); + signal.addEventListener('abort', onAbort, { once: true }); + }); +}; + +async function* iterateWebSocket( + ws: WebSocket, + signal: AbortSignal, +): AsyncGenerator { + const queue: string[] = []; + let waiter: (() => void) | undefined; + let closed = false; + let error: Error | undefined; + + const wake = () => { + if (waiter) { + const w = waiter; + waiter = undefined; + w(); + } + }; + + ws.on('message', (data) => { + queue.push(data.toString()); + wake(); + }); + ws.on('close', () => { + closed = true; + wake(); + }); + ws.on('error', (err) => { + error = err; + closed = true; + wake(); + }); + signal.addEventListener( + 'abort', + () => { + closed = true; + wake(); + }, + { once: true }, + ); + + while (!closed || queue.length > 0) { + if (queue.length > 0) { + yield queue.shift()!; + continue; + } + if (closed) break; + await new Promise((resolve) => { + waiter = resolve; + }); + } + if (error) throw error; +} diff --git a/plugins/minimax/tsconfig.json b/plugins/minimax/tsconfig.json new file mode 100644 index 000000000..86ff2c3a0 --- /dev/null +++ b/plugins/minimax/tsconfig.json @@ -0,0 +1,15 @@ +{ + "extends": "../../tsconfig.json", + "include": ["./src"], + "compilerOptions": { + "rootDir": "./src", + "declarationDir": "./dist", + "outDir": "./dist" + }, + "typedocOptions": { + "name": "plugins/agents-plugin-minimax", + "entryPointStrategy": "resolve", + "readme": "none", + "entryPoints": ["src/index.ts"] + } +} diff --git a/plugins/minimax/tsup.config.ts b/plugins/minimax/tsup.config.ts new file mode 100644 index 000000000..8ca20961f --- /dev/null +++ b/plugins/minimax/tsup.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'tsup'; + +import defaults from '../../tsup.config'; + +export default defineConfig({ + ...defaults, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ae1210ae1..5170f6d8c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -770,6 +770,34 @@ importers: specifier: ^5.0.0 version: 5.4.5 + plugins/minimax: + dependencies: + ws: + specifier: 'catalog:' + version: 8.20.0 + devDependencies: + '@livekit/agents': + specifier: workspace:* + version: link:../../agents + '@livekit/agents-plugins-test': + specifier: workspace:* + version: link:../test + '@livekit/rtc-node': + specifier: 'catalog:' + version: 0.13.25 + '@microsoft/api-extractor': + specifier: ^7.35.0 + version: 7.43.7(@types/node@22.19.1) + '@types/ws': + specifier: 'catalog:' + version: 8.5.10 + tsup: + specifier: ^8.3.5 + version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@22.19.1))(postcss@8.5.9)(tsx@4.21.0)(typescript@5.9.3) + typescript: + specifier: ^5.0.0 + version: 5.9.3 + plugins/mistral: dependencies: '@mistralai/mistralai': diff --git a/turbo.json b/turbo.json index 113bda809..5fe813239 100644 --- a/turbo.json +++ b/turbo.json @@ -72,6 +72,8 @@ "TRUGEN_API_KEY", "TRUGEN_API_URL", "TRUGEN_AVATAR_ID", + "MINIMAX_API_KEY", + "MINIMAX_BASE_URL", "MISTRAL_API_KEY", "VITEST" ],