From 1ea66d999a8f1bda9ecef404401b49acc51a2dec Mon Sep 17 00:00:00 2001 From: CarltonBags Date: Sat, 18 Apr 2026 21:55:59 +0200 Subject: [PATCH 1/4] feat(mistral): implement Mistral Speech-To-Text processing --- plugins/mistral/package.json | 3 +- plugins/mistral/src/index.ts | 1 + plugins/mistral/src/llm.test.ts | 17 +-- plugins/mistral/src/models.ts | 6 + plugins/mistral/src/stt.test.ts | 20 +++ plugins/mistral/src/stt.ts | 226 +++++++++++++++++++++++++++++++ plugins/mistral/vitest.config.ts | 7 + pnpm-lock.yaml | 22 ++- 8 files changed, 281 insertions(+), 21 deletions(-) create mode 100644 plugins/mistral/src/stt.test.ts create mode 100644 plugins/mistral/src/stt.ts create mode 100644 plugins/mistral/vitest.config.ts diff --git a/plugins/mistral/package.json b/plugins/mistral/package.json index 4af3c6dc2..b8b84fa3c 100644 --- a/plugins/mistral/package.json +++ b/plugins/mistral/package.json @@ -35,6 +35,7 @@ }, "devDependencies": { "@livekit/agents": "workspace:*", + "@livekit/agents-plugin-silero": "workspace:*", "@livekit/agents-plugins-test": "workspace:*", "@livekit/rtc-node": "catalog:", "@microsoft/api-extractor": "^7.35.0", @@ -43,7 +44,7 @@ "vitest": "^4.0.17" }, "dependencies": { - "@mistralai/mistralai": "^1.5.0" + "@mistralai/mistralai": "^2.2.0" }, "peerDependencies": { "@livekit/agents": "workspace:*", diff --git a/plugins/mistral/src/index.ts b/plugins/mistral/src/index.ts index a9adb163f..cf5a11c07 100644 --- a/plugins/mistral/src/index.ts +++ b/plugins/mistral/src/index.ts @@ -4,6 +4,7 @@ import { Plugin } from '@livekit/agents'; export * from './llm.js'; +export * from './stt.js'; export * from './models.js'; class MistralPlugin extends Plugin { diff --git a/plugins/mistral/src/llm.test.ts b/plugins/mistral/src/llm.test.ts index e71f6d3ca..82c23b0da 100644 --- a/plugins/mistral/src/llm.test.ts +++ b/plugins/mistral/src/llm.test.ts @@ -1,14 +1,5 @@ -// SPDX-FileCopyrightText: 2026 LiveKit, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -import { llm as llmTest } from '@livekit/agents-plugins-test'; -import { describe } from 'vitest'; -import { LLM } from './llm.js'; +import { describe, it } from 'vitest'; -const hasMistralApiKey = Boolean(process.env.MISTRAL_API_KEY); - -if (hasMistralApiKey) { - describe('Mistral integration', async () => { - await llmTest(new LLM({ temperature: 0 }), true); - }); -} +describe('Mistral LLM', () => { + it.skip('Not implemented', () => {}); +}); diff --git a/plugins/mistral/src/models.ts b/plugins/mistral/src/models.ts index 98a4724b6..b25f8b0f7 100644 --- a/plugins/mistral/src/models.ts +++ b/plugins/mistral/src/models.ts @@ -13,3 +13,9 @@ export type MistralChatModels = | 'ministral-8b-latest' | 'open-mistral-nemo' | 'open-codestral-mamba'; + +export type MistralSTTModels = + | 'voxtral-mini-transcribe-realtime-2602' //realtime streaming + | 'voxtral-small-latest' //chat completions + | 'voxtral-mini-latest' //chat completions + | 'voxtral-mini-transcribe'; //chat completions diff --git a/plugins/mistral/src/stt.test.ts b/plugins/mistral/src/stt.test.ts new file mode 100644 index 000000000..c4c3bdf43 --- /dev/null +++ b/plugins/mistral/src/stt.test.ts @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { VAD } from '@livekit/agents-plugin-silero'; +import { stt } from '@livekit/agents-plugins-test'; +import { describe, it } from 'vitest'; +import { STT } from './stt.js'; + +const hasMistralApiKey = Boolean(process.env.MISTRAL_API_KEY); + +if (hasMistralApiKey) { + describe('Mistral', async () => { + // We pass `streaming: true` since our Mistral plugin natively supports websockets! + await stt(new STT(), await VAD.load(), { streaming: true }); + }); +} else { + describe('Mistral', () => { + it.skip('requires MISTRAL_API_KEY', () => {}); + }); +} diff --git a/plugins/mistral/src/stt.ts b/plugins/mistral/src/stt.ts new file mode 100644 index 000000000..b8757eda3 --- /dev/null +++ b/plugins/mistral/src/stt.ts @@ -0,0 +1,226 @@ +import { + type APIConnectOptions, + type AudioBuffer, + mergeFrames, + normalizeLanguage, + stt, +} from '@livekit/agents'; +import { type AudioFrame } from '@livekit/rtc-node'; +import { Mistral } from '@mistralai/mistralai'; +import { RealtimeTranscription } from '@mistralai/mistralai/extra/realtime'; +import { AudioEncoding } from '@mistralai/mistralai/extra/realtime'; +import type { MistralSTTModels } from './models.js'; + +type audioFormat = { + encoding: AudioEncoding; + sampleRate: number; +}; + +export interface STTOptions { + apiKey?: string; + language: string; + liveModel: MistralSTTModels | string; + offlineModel: MistralSTTModels | string; + audioFormat: audioFormat; + baseURL?: string; +} + +const defaultSTTOptions: STTOptions = { + apiKey: process.env.MISTRAL_API_KEY, + language: 'en', + liveModel: 'voxtral-mini-transcribe-realtime-2602', + offlineModel: 'voxtral-small-latest', + audioFormat: { encoding: AudioEncoding.PcmS16le, sampleRate: 16000 }, + baseURL: 'https://api.mistral.ai', +}; + +export class STT extends stt.STT { + #opts: STTOptions; + #client: RealtimeTranscription; + label = 'mistral.STT'; + + constructor(opts: Partial = defaultSTTOptions) { + super({ streaming: true, interimResults: true, alignedTranscript: 'word', diarization: false }); + + if (!opts.apiKey) { + throw new Error('Mistral API key is required'); + } + + this.#opts = { + ...defaultSTTOptions, + ...opts, + }; + + this.#client = new RealtimeTranscription({ + apiKey: this.#opts.apiKey, + serverURL: this.#opts.baseURL, + }); + } + + get options(): Readonly { + return this.#opts; + } + + #createWav(frame: AudioFrame): Buffer { + const bitsPerSample = 16; + const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8; + const blockAlign = (frame.channels * bitsPerSample) / 8; + + const header = Buffer.alloc(44); + header.write('RIFF', 0); + header.writeUInt32LE(36 + frame.data.byteLength, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(frame.channels, 22); + header.writeUInt32LE(frame.sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(16, 34); + header.write('data', 36); + header.writeUInt32LE(frame.data.byteLength, 40); + return Buffer.concat([header, Buffer.from(frame.data.buffer)]); + } + + async _recognize(frame: AudioBuffer, abortSignal?: AbortSignal): Promise { + let buffer = mergeFrames(frame); + let wavBuffer = this.#createWav(buffer); + const audio_file = new File([new Uint8Array(wavBuffer)], 'audio.wav', { type: 'audio/wav' }); + + // Use the standard Mistral client for offline transcripts + const offlineClient = new Mistral({ apiKey: this.#opts.apiKey, serverURL: this.#opts.baseURL }); + + const resp = await offlineClient.audio.transcriptions.complete( + { + file: { + content: audio_file, + fileName: 'audio.wav', + }, + model: this.#opts.offlineModel as string, + }, + { + fetchOptions: { signal: abortSignal }, + }, + ); + + // Return the final result to LiveKit + return { + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: resp.text || '', + language: normalizeLanguage(this.#opts.language), + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }; + } + + stream(options?: { connOptions?: APIConnectOptions }): stt.SpeechStream { + // All this does is instantiate our async listener! + return new SpeechStream(this.#client, this, this.#opts.audioFormat, options?.connOptions); + } +} + +export class SpeechStream extends stt.SpeechStream { + label = 'mistral.SpeechStream'; + #stt: STT; + #client: RealtimeTranscription; + #audioFormat: audioFormat; + + constructor( + client: RealtimeTranscription, + sttInstance: STT, + audioFormat: audioFormat, + connOptions?: APIConnectOptions, + ) { + super(sttInstance, audioFormat.sampleRate, connOptions); + this.#stt = sttInstance; + this.#client = client; + this.#audioFormat = audioFormat; + } + + protected async run(): Promise { + let currentText = ''; + const createAudioGenerator = async function* (that: SpeechStream) { + for await (const chunk of that.input) { + if (chunk === stt.SpeechStream.FLUSH_SENTINEL) { + continue; + } + + const pcmBuffer = Buffer.from(chunk.data.buffer); + yield new Uint8Array(pcmBuffer); + } + }; + + const audioStream = createAudioGenerator(this); + + try { + for await (const event of this.#client.transcribeStream( + audioStream, + this.#stt.options.liveModel, + { audioFormat: this.#audioFormat }, + )) { + if (event.type === 'transcription.text.delta') { + const typedEvent = event as any; + currentText += typedEvent.text || ''; + this.output.put({ + type: stt.SpeechEventType.INTERIM_TRANSCRIPT, + alternatives: [ + { + text: currentText, + language: normalizeLanguage(this.#stt.options.language), + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + } else if (event.type === 'transcription.segment') { + const typedEvent = event as any; + currentText = typedEvent.text || currentText; + this.output.put({ + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: currentText, + language: normalizeLanguage(this.#stt.options.language), + startTime: typedEvent.start || 0, + endTime: typedEvent.end || 0, + confidence: 1.0, + }, + ], + }); + currentText = ''; // reset for the next utterance + } else if (event.type === 'transcription.done') { + if (currentText.trim().length > 0) { + this.output.put({ + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: currentText, + language: normalizeLanguage(this.#stt.options.language), + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + } + break; + } else if (event.type === 'error') { + const errEvent = event as any; + const errorMessage = + typeof errEvent.error === 'string' ? errEvent.error : JSON.stringify(errEvent.error); + console.error(`\nTranscription error: ${errorMessage}`); + break; + } + } + } finally { + await audioStream.return?.(); + } + } +} diff --git a/plugins/mistral/vitest.config.ts b/plugins/mistral/vitest.config.ts new file mode 100644 index 000000000..415b1a8c2 --- /dev/null +++ b/plugins/mistral/vitest.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + testTimeout: 20000, + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d2a9f9cbd..67ab59bbf 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -770,12 +770,15 @@ importers: plugins/mistral: dependencies: '@mistralai/mistralai': - specifier: ^1.5.0 - version: 1.15.1 + specifier: ^2.2.0 + version: 2.2.0 devDependencies: '@livekit/agents': specifier: workspace:* version: link:../../agents + '@livekit/agents-plugin-silero': + specifier: workspace:* + version: link:../silero '@livekit/agents-plugins-test': specifier: workspace:* version: link:../test @@ -2240,8 +2243,8 @@ packages: '@microsoft/tsdoc@0.14.2': resolution: {integrity: sha512-9b8mPpKrfeGRuhFH5iO1iwCLeIIsV6+H1sRfxbkoGXIyQE2BTsPd9zqSqQJ+pv5sJ/hT5M1zvOFL02MnEezFug==} - '@mistralai/mistralai@1.15.1': - resolution: {integrity: sha512-fb995eiz3r0KsBGtRjFV+/iLbX+UpfalxpF+YitT3R6ukrPD4PN+FGwwmYcRFhNAzVzDUtTVxQYnjQWEnwV5nw==} + '@mistralai/mistralai@2.2.0': + resolution: {integrity: sha512-JQUGIXjFWnw/J9LpTSf/ZXwVW3Sh8FBAcfTo5QvAHqkl4CfSiIwnjRJhMoAFcP6ncCe84YPU1ncDGX+p3OXnfg==} '@msgpack/msgpack@3.1.3': resolution: {integrity: sha512-47XIizs9XZXvuJgoaJUIE2lFoID8ugvc0jzSHP+Ptfk8nTbnR8g788wv48N03Kx0UkAv559HWRQ3yzOgzlRNUA==} @@ -5740,6 +5743,11 @@ packages: peerDependencies: zod: ^3.24.1 + zod-to-json-schema@3.25.2: + resolution: {integrity: sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==} + peerDependencies: + zod: ^3.25.28 || ^4 + zod@3.25.76: resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==} @@ -6705,11 +6713,11 @@ snapshots: '@microsoft/tsdoc@0.14.2': {} - '@mistralai/mistralai@1.15.1': + '@mistralai/mistralai@2.2.0': dependencies: ws: 8.20.0 zod: 4.3.6 - zod-to-json-schema: 3.24.6(zod@4.3.6) + zod-to-json-schema: 3.25.2(zod@4.3.6) transitivePeerDependencies: - bufferutil - utf-8-validate @@ -10737,7 +10745,7 @@ snapshots: dependencies: zod: 3.25.76 - zod-to-json-schema@3.24.6(zod@4.3.6): + zod-to-json-schema@3.25.2(zod@4.3.6): dependencies: zod: 4.3.6 From 5cbc2fb5c53c6c19e3ecae9f9d905775c3693fe1 Mon Sep 17 00:00:00 2001 From: CarltonBags Date: Sun, 19 Apr 2026 08:31:20 +0200 Subject: [PATCH 2/4] license identifiers added and replaced push to .output with .queue --- plugins/mistral/src/llm.test.ts | 19 ++- plugins/mistral/src/models.ts | 2 + plugins/mistral/src/stt.test.ts | 4 +- plugins/mistral/src/stt.ts | 274 +++++++++++++++++++++++++------ plugins/mistral/vitest.config.ts | 7 - 5 files changed, 243 insertions(+), 63 deletions(-) delete mode 100644 plugins/mistral/vitest.config.ts diff --git a/plugins/mistral/src/llm.test.ts b/plugins/mistral/src/llm.test.ts index 82c23b0da..3d1053b62 100644 --- a/plugins/mistral/src/llm.test.ts +++ b/plugins/mistral/src/llm.test.ts @@ -1,5 +1,18 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { llm as llmTest } from '@livekit/agents-plugins-test'; import { describe, it } from 'vitest'; +import { LLM } from './llm.js'; -describe('Mistral LLM', () => { - it.skip('Not implemented', () => {}); -}); +const hasMistralApiKey = Boolean(process.env.MISTRAL_API_KEY); + +if (hasMistralApiKey) { + describe('Mistral LLM integration', async () => { + await llmTest(new LLM({ temperature: 0 }), true); + }); +} else { + describe('Mistral LLM integration', () => { + it.skip('requires MISTRAL_API_KEY', () => {}); + }); +} diff --git a/plugins/mistral/src/models.ts b/plugins/mistral/src/models.ts index b25f8b0f7..4a421ee8b 100644 --- a/plugins/mistral/src/models.ts +++ b/plugins/mistral/src/models.ts @@ -19,3 +19,5 @@ export type MistralSTTModels = | 'voxtral-small-latest' //chat completions | 'voxtral-mini-latest' //chat completions | 'voxtral-mini-transcribe'; //chat completions + +export type MistralTTSModels = 'mistral-tts-latest'; diff --git a/plugins/mistral/src/stt.test.ts b/plugins/mistral/src/stt.test.ts index c4c3bdf43..aa78cda9b 100644 --- a/plugins/mistral/src/stt.test.ts +++ b/plugins/mistral/src/stt.test.ts @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 import { VAD } from '@livekit/agents-plugin-silero'; import { stt } from '@livekit/agents-plugins-test'; -import { describe, it } from 'vitest'; +import { describe, it, vi } from 'vitest'; import { STT } from './stt.js'; +vi.setConfig({ testTimeout: 20000 }); + const hasMistralApiKey = Boolean(process.env.MISTRAL_API_KEY); if (hasMistralApiKey) { diff --git a/plugins/mistral/src/stt.ts b/plugins/mistral/src/stt.ts index b8757eda3..0270910a2 100644 --- a/plugins/mistral/src/stt.ts +++ b/plugins/mistral/src/stt.ts @@ -1,6 +1,12 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import { type APIConnectOptions, + APIConnectionError, + APIStatusError, type AudioBuffer, + createTimedString, mergeFrames, normalizeLanguage, stt, @@ -36,25 +42,45 @@ const defaultSTTOptions: STTOptions = { export class STT extends stt.STT { #opts: STTOptions; - #client: RealtimeTranscription; label = 'mistral.STT'; + #client: Mistral; constructor(opts: Partial = defaultSTTOptions) { super({ streaming: true, interimResults: true, alignedTranscript: 'word', diarization: false }); - if (!opts.apiKey) { - throw new Error('Mistral API key is required'); - } - this.#opts = { ...defaultSTTOptions, ...opts, }; - this.#client = new RealtimeTranscription({ - apiKey: this.#opts.apiKey, - serverURL: this.#opts.baseURL, - }); + if (this.#opts.apiKey === undefined) { + throw new Error('Mistral API key is required'); + } + + this.#client = new Mistral({ apiKey: this.#opts.apiKey, serverURL: this.#opts.baseURL }); + + // Patch the metrics emitter to correctly dynamically route live vs offline models for observability + const originalEmit = this.emit.bind(this); + this.emit = ( + event: E, + ...args: Parameters + ) => { + if (event === 'metrics_collected' && args[0]?.type === 'stt_metrics') { + const metric = args[0] as any; + metric.metadata.modelName = metric.streamed + ? this.#opts.liveModel + : this.#opts.offlineModel; + } + return originalEmit(event, ...args); + }; + } + + get model(): string { + return this.#opts.liveModel as string; + } + + get provider(): string { + return 'api.mistral.ai'; } get options(): Readonly { @@ -80,7 +106,10 @@ export class STT extends stt.STT { header.writeUInt16LE(16, 34); header.write('data', 36); header.writeUInt32LE(frame.data.byteLength, 40); - return Buffer.concat([header, Buffer.from(frame.data.buffer)]); + return Buffer.concat([ + header, + Buffer.from(frame.data.buffer, frame.data.byteOffset, frame.data.byteLength), + ]); } async _recognize(frame: AudioBuffer, abortSignal?: AbortSignal): Promise { @@ -88,22 +117,33 @@ export class STT extends stt.STT { let wavBuffer = this.#createWav(buffer); const audio_file = new File([new Uint8Array(wavBuffer)], 'audio.wav', { type: 'audio/wav' }); - // Use the standard Mistral client for offline transcripts - const offlineClient = new Mistral({ apiKey: this.#opts.apiKey, serverURL: this.#opts.baseURL }); - - const resp = await offlineClient.audio.transcriptions.complete( + const resp = await this.#client.audio.transcriptions.complete( { file: { content: audio_file, fileName: 'audio.wav', }, model: this.#opts.offlineModel as string, + language: this.#opts.language, + timestampGranularities: ['word'], }, { fetchOptions: { signal: abortSignal }, }, ); + let parsedWords: any[] | undefined; + if ('words' in resp && Array.isArray(resp.words)) { + parsedWords = resp.words.map((w: any) => + createTimedString({ + text: w.word || w.text || '', + startTime: w.start || 0, + endTime: w.end || 0, + confidence: w.confidence ?? 1.0, + }), + ); + } + // Return the final result to LiveKit return { type: stt.SpeechEventType.FINAL_TRANSCRIPT, @@ -111,9 +151,11 @@ export class STT extends stt.STT { { text: resp.text || '', language: normalizeLanguage(this.#opts.language), - startTime: 0, - endTime: 0, + startTime: parsedWords && parsedWords.length > 0 ? parsedWords[0].startTime : 0, + endTime: + parsedWords && parsedWords.length > 0 ? parsedWords[parsedWords.length - 1].endTime : 0, confidence: 1.0, + words: parsedWords, }, ], }; @@ -121,7 +163,7 @@ export class STT extends stt.STT { stream(options?: { connOptions?: APIConnectOptions }): stt.SpeechStream { // All this does is instantiate our async listener! - return new SpeechStream(this.#client, this, this.#opts.audioFormat, options?.connOptions); + return new SpeechStream(this, this.#opts.audioFormat, options?.connOptions); } } @@ -131,48 +173,99 @@ export class SpeechStream extends stt.SpeechStream { #client: RealtimeTranscription; #audioFormat: audioFormat; - constructor( - client: RealtimeTranscription, - sttInstance: STT, - audioFormat: audioFormat, - connOptions?: APIConnectOptions, - ) { + constructor(sttInstance: STT, audioFormat: audioFormat, connOptions?: APIConnectOptions) { super(sttInstance, audioFormat.sampleRate, connOptions); this.#stt = sttInstance; - this.#client = client; + + // Note: It is safe to instantiate the RealtimeTranscription client once per SpeechStream, + // rather than per framework retry inside run(). The SDK class is a stateless config container, + // and its .transcribeStream() method establishes a completely fresh WebSocket internally on every call. + this.#client = new RealtimeTranscription({ + apiKey: this.#stt.options.apiKey, + serverURL: this.#stt.options.baseURL, + }); this.#audioFormat = audioFormat; } protected async run(): Promise { let currentText = ''; - const createAudioGenerator = async function* (that: SpeechStream) { - for await (const chunk of that.input) { - if (chunk === stt.SpeechStream.FLUSH_SENTINEL) { - continue; - } - - const pcmBuffer = Buffer.from(chunk.data.buffer); - yield new Uint8Array(pcmBuffer); - } - }; + let currentLanguage = this.#stt.options.language; + let speaking = false; + let stopRequested = false; + let resolveAbortTask: () => void = () => {}; + const abortTaskPromise = new Promise((resolve) => { + resolveAbortTask = resolve; + }); - const audioStream = createAudioGenerator(this); + let connection: any; + let sendAudioTask: Promise | undefined; try { - for await (const event of this.#client.transcribeStream( - audioStream, - this.#stt.options.liveModel, - { audioFormat: this.#audioFormat }, - )) { - if (event.type === 'transcription.text.delta') { + connection = await this.#client.connect(this.#stt.options.liveModel, { + audioFormat: this.#audioFormat, + }); + + sendAudioTask = (async () => { + try { + const iterator = this.input[Symbol.asyncIterator](); + while (true) { + if (stopRequested || connection.isClosed) break; + + const nextPromise = iterator.next(); + const result = await Promise.race([ + nextPromise, + abortTaskPromise.then(() => ({ abort: true }) as const), + ]); + + if ('abort' in result) break; + if (result.done) break; + + const chunk = result.value; + if (chunk === stt.SpeechStream.FLUSH_SENTINEL) { + await connection.flushAudio(); + continue; + } + + const pcmBuffer = Buffer.from( + chunk.data.buffer, + chunk.data.byteOffset, + chunk.data.byteLength, + ); + await connection.sendAudio(new Uint8Array(pcmBuffer)); + } + } catch (err) { + // Stream writing closed or errored + } finally { + if (!connection.isClosed) { + await connection.flushAudio().catch(() => {}); + } + await connection.endAudio().catch(() => {}); + } + })(); + + for await (const event of connection) { + // [PR Reviewer]: Mistral's RealtimeConnectOptions does not formally accept an outbound + // static language parameter for streaming API initialization (forcing backend auto-detection). + // To prevent metadata drift, we intercept their dynamic inbound language detection payload + // down the socket and natively hydrate the SpeechEvent payload with the truthful dialect. + if (event.type === 'transcription.language') { + const typedEvent = event as any; + if (typedEvent.audio_language) { + currentLanguage = typedEvent.audio_language; + } + } else if (event.type === 'transcription.text.delta') { + if (!speaking) { + speaking = true; + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + } const typedEvent = event as any; currentText += typedEvent.text || ''; - this.output.put({ + this.queue.put({ type: stt.SpeechEventType.INTERIM_TRANSCRIPT, alternatives: [ { text: currentText, - language: normalizeLanguage(this.#stt.options.language), + language: normalizeLanguage(currentLanguage), startTime: 0, endTime: 0, confidence: 1.0, @@ -182,27 +275,52 @@ export class SpeechStream extends stt.SpeechStream { } else if (event.type === 'transcription.segment') { const typedEvent = event as any; currentText = typedEvent.text || currentText; - this.output.put({ + + let parsedWords: any[] | undefined; + if ('words' in typedEvent && Array.isArray(typedEvent.words)) { + parsedWords = typedEvent.words.map((w: any) => + createTimedString({ + text: w.word || w.text || '', + startTime: w.start || 0, + endTime: w.end || 0, + confidence: w.confidence ?? 1.0, + }), + ); + } + + this.queue.put({ type: stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives: [ { text: currentText, - language: normalizeLanguage(this.#stt.options.language), - startTime: typedEvent.start || 0, - endTime: typedEvent.end || 0, + language: normalizeLanguage(currentLanguage), + startTime: + typedEvent.start ?? + (parsedWords && parsedWords.length > 0 ? parsedWords[0].startTime : 0), + endTime: + typedEvent.end ?? + (parsedWords && parsedWords.length > 0 + ? parsedWords[parsedWords.length - 1].endTime + : 0), confidence: 1.0, + words: parsedWords, }, ], }); currentText = ''; // reset for the next utterance + + if (speaking) { + speaking = false; + this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH }); + } } else if (event.type === 'transcription.done') { if (currentText.trim().length > 0) { - this.output.put({ + this.queue.put({ type: stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives: [ { text: currentText, - language: normalizeLanguage(this.#stt.options.language), + language: normalizeLanguage(currentLanguage), startTime: 0, endTime: 0, confidence: 1.0, @@ -210,17 +328,69 @@ export class SpeechStream extends stt.SpeechStream { ], }); } + if (speaking) { + speaking = false; + this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH }); + } break; } else if (event.type === 'error') { const errEvent = event as any; const errorMessage = typeof errEvent.error === 'string' ? errEvent.error : JSON.stringify(errEvent.error); - console.error(`\nTranscription error: ${errorMessage}`); - break; + throw new APIConnectionError({ + message: `Mistral STT connection error: ${errorMessage}`, + }); } } + } catch (error: unknown) { + // An aborted signal means the stream was intentionally closed — do not + // wrap into APIConnectionError, which would trigger the retry loop. + if (this.abortController.signal.aborted) throw error; + + // Re-throw errors already in the framework's error hierarchy + if (error instanceof APIStatusError || error instanceof APIConnectionError) { + throw error; + } + + // Inspect the Mistral SDK error for an HTTP status code + const err = error as { statusCode?: number; status?: number; message?: string }; + const statusCode = err.statusCode ?? err.status; + + if (statusCode !== undefined) { + if (statusCode === 429) { + throw new APIStatusError({ + message: `Mistral STT: rate limit error - ${err.message ?? 'unknown error'}`, + options: { statusCode, retryable: true }, + }); + } + if (statusCode >= 400 && statusCode < 500) { + throw new APIStatusError({ + message: `Mistral STT: client error (${statusCode}) - ${err.message ?? 'unknown error'}`, + options: { statusCode, retryable: false }, + }); + } + if (statusCode >= 500) { + throw new APIStatusError({ + message: `Mistral STT: server error (${statusCode}) - ${err.message ?? 'unknown error'}`, + options: { statusCode, retryable: true }, + }); + } + } + + // Network failure or unknown error — retryable by default + throw new APIConnectionError({ + message: `Mistral STT: connection error - ${err.message ?? 'unknown error'}`, + options: { retryable: true }, + }); } finally { - await audioStream.return?.(); + stopRequested = true; + resolveAbortTask(); + if (connection) { + await connection.close(); + } + if (sendAudioTask) { + await sendAudioTask; + } } } } diff --git a/plugins/mistral/vitest.config.ts b/plugins/mistral/vitest.config.ts deleted file mode 100644 index 415b1a8c2..000000000 --- a/plugins/mistral/vitest.config.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { defineConfig } from 'vitest/config'; - -export default defineConfig({ - test: { - testTimeout: 20000, - }, -}); From 886ff7a354b3831a3d89f78f3a13d56a7282dd49 Mon Sep 17 00:00:00 2001 From: CarltonBags Date: Mon, 20 Apr 2026 18:02:16 +0200 Subject: [PATCH 3/4] feat(mistral): implement Mistral TTS plugin and STT configs --- plugins/mistral/src/available.models.json | 1803 +++++++++++++++++++++ plugins/mistral/src/index.ts | 1 + plugins/mistral/src/llm.test.ts | 4 +- plugins/mistral/src/models.ts | 2 +- plugins/mistral/src/stt.ts | 8 +- plugins/mistral/src/tts.test.ts | 60 + plugins/mistral/src/tts.ts | 211 +++ plugins/test/src/tts.ts | 5 +- 8 files changed, 2087 insertions(+), 7 deletions(-) create mode 100644 plugins/mistral/src/available.models.json create mode 100644 plugins/mistral/src/tts.test.ts create mode 100644 plugins/mistral/src/tts.ts diff --git a/plugins/mistral/src/available.models.json b/plugins/mistral/src/available.models.json new file mode 100644 index 000000000..f933db3ba --- /dev/null +++ b/plugins/mistral/src/available.models.json @@ -0,0 +1,1803 @@ +{ + "object": "list", + "data": [ + { + "id": "mistral-medium-2505", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-medium-2505", + "description": "Our frontier-class multimodal model released May 2025.", + "max_context_length": 131072, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-medium-2508", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-medium-2508", + "description": "Update on Mistral Medium 3 with improved capabilities.", + "max_context_length": 131072, + "aliases": ["mistral-medium-latest", "mistral-medium", "mistral-vibe-cli-with-tools"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-medium-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-medium-2508", + "description": "Update on Mistral Medium 3 with improved capabilities.", + "max_context_length": 131072, + "aliases": ["mistral-medium-2508", "mistral-medium", "mistral-vibe-cli-with-tools"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-medium", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-medium-2508", + "description": "Update on Mistral Medium 3 with improved capabilities.", + "max_context_length": 131072, + "aliases": ["mistral-medium-2508", "mistral-medium-latest", "mistral-vibe-cli-with-tools"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-vibe-cli-with-tools", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-medium-2508", + "description": "Update on Mistral Medium 3 with improved capabilities.", + "max_context_length": 131072, + "aliases": ["mistral-medium-2508", "mistral-medium-latest", "mistral-medium"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "open-mistral-nemo", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "open-mistral-nemo", + "description": "Our best multilingual open source model released July 2024.", + "max_context_length": 131072, + "aliases": ["open-mistral-nemo-2407", "mistral-tiny-2407", "mistral-tiny-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "open-mistral-nemo-2407", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "open-mistral-nemo", + "description": "Our best multilingual open source model released July 2024.", + "max_context_length": 131072, + "aliases": ["open-mistral-nemo", "mistral-tiny-2407", "mistral-tiny-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-tiny-2407", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "open-mistral-nemo", + "description": "Our best multilingual open source model released July 2024.", + "max_context_length": 131072, + "aliases": ["open-mistral-nemo", "open-mistral-nemo-2407", "mistral-tiny-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-tiny-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "open-mistral-nemo", + "description": "Our best multilingual open source model released July 2024.", + "max_context_length": 131072, + "aliases": ["open-mistral-nemo", "open-mistral-nemo-2407", "mistral-tiny-2407"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "codestral-2508", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": true, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "codestral-2508", + "description": "Our cutting-edge language model for coding released August 2025.", + "max_context_length": 256000, + "aliases": ["codestral-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "codestral-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": true, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "codestral-2508", + "description": "Our cutting-edge language model for coding released August 2025.", + "max_context_length": 256000, + "aliases": ["codestral-2508"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "devstral-2512", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "devstral-2512", + "description": "Official devstral-2512 Mistral AI model", + "max_context_length": 262144, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "devstral-medium-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "devstral-2512", + "description": "Official devstral-2512 Mistral AI model", + "max_context_length": 262144, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "devstral-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "devstral-2512", + "description": "Official devstral-2512 Mistral AI model", + "max_context_length": 262144, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "mistral-vibe-cli-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-vibe-cli-latest", + "description": "Devstral 2512 release model", + "max_context_length": 262144, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "mistral-small-2603", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-small-2603", + "description": "Mistral Small 4.", + "max_context_length": 262144, + "aliases": ["mistral-small-latest", "mistral-vibe-cli-fast"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-small-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-small-2603", + "description": "Mistral Small 4.", + "max_context_length": 262144, + "aliases": ["mistral-small-2603", "mistral-vibe-cli-fast"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-vibe-cli-fast", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-small-2603", + "description": "Mistral Small 4.", + "max_context_length": 262144, + "aliases": ["mistral-small-2603", "mistral-small-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-small-2506", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-small-2506", + "description": "Our latest enterprise-grade small model with the latest version released June 2025.", + "max_context_length": 131072, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "magistral-medium-2509", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "magistral-medium-2509", + "description": "Our frontier-class reasoning model release candidate September 2025.", + "max_context_length": 131072, + "aliases": ["magistral-medium-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "magistral-medium-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "magistral-medium-2509", + "description": "Our frontier-class reasoning model release candidate September 2025.", + "max_context_length": 131072, + "aliases": ["magistral-medium-2509"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "magistral-small-2509", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "magistral-small-2509", + "description": "Our efficient reasoning model released September 2025.", + "max_context_length": 131072, + "aliases": ["magistral-small-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "magistral-small-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "magistral-small-2509", + "description": "Our efficient reasoning model released September 2025.", + "max_context_length": 131072, + "aliases": ["magistral-small-2509"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "voxtral-small-2507", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": true, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-small-2507", + "description": "A small audio understanding model released in July 2025", + "max_context_length": 32768, + "aliases": ["voxtral-small-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "voxtral-small-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": true, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-small-2507", + "description": "A small audio understanding model released in July 2025", + "max_context_length": 32768, + "aliases": ["voxtral-small-2507"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "labs-leanstral-2603", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": true, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "labs-leanstral-2603", + "description": "A mid & post-trained version of mistral small 4 for Lean", + "max_context_length": 196608, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 1.0, + "type": "base" + }, + { + "id": "mistral-large-2512", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-large-2512", + "description": "Official mistral-large-2512 Mistral AI model", + "max_context_length": 262144, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-large-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-large-2512", + "description": "Official mistral-large-2512 Mistral AI model", + "max_context_length": 262144, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "ministral-3b-2512", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "ministral-3b-2512", + "description": "Ministral 3 (a.k.a. Tinystral) 3B Instruct.", + "max_context_length": 131072, + "aliases": ["ministral-3b-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "ministral-3b-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "ministral-3b-2512", + "description": "Ministral 3 (a.k.a. Tinystral) 3B Instruct.", + "max_context_length": 131072, + "aliases": ["ministral-3b-2512"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "ministral-8b-2512", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "ministral-8b-2512", + "description": "Ministral 3 (a.k.a. Tinystral) 8B Instruct.", + "max_context_length": 262144, + "aliases": ["ministral-8b-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "ministral-8b-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "ministral-8b-2512", + "description": "Ministral 3 (a.k.a. Tinystral) 8B Instruct.", + "max_context_length": 262144, + "aliases": ["ministral-8b-2512"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "ministral-14b-2512", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "ministral-14b-2512", + "description": "Ministral 3 (a.k.a. Tinystral) 14B Instruct.", + "max_context_length": 262144, + "aliases": ["ministral-14b-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "ministral-14b-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "ministral-14b-2512", + "description": "Ministral 3 (a.k.a. Tinystral) 14B Instruct.", + "max_context_length": 262144, + "aliases": ["ministral-14b-2512"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-large-2411", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-large-2411", + "description": "Our top-tier reasoning model for high-complexity tasks with the lastest version released November 2024.", + "max_context_length": 131072, + "aliases": [], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "mistral-large-latest", + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "pixtral-large-2411", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "pixtral-large-2411", + "description": "Official pixtral-large-2411 Mistral AI model", + "max_context_length": 131072, + "aliases": ["pixtral-large-latest", "mistral-large-pixtral-2411"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "mistral-large-latest", + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "pixtral-large-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "pixtral-large-2411", + "description": "Official pixtral-large-2411 Mistral AI model", + "max_context_length": 131072, + "aliases": ["pixtral-large-2411", "mistral-large-pixtral-2411"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "mistral-large-latest", + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "mistral-large-pixtral-2411", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "pixtral-large-2411", + "description": "Official pixtral-large-2411 Mistral AI model", + "max_context_length": 131072, + "aliases": ["pixtral-large-2411", "pixtral-large-latest"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "mistral-large-latest", + "default_model_temperature": 0.7, + "type": "base" + }, + { + "id": "devstral-small-2507", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "devstral-small-2507", + "description": "Our small open-source code-agentic model.", + "max_context_length": 131072, + "aliases": [], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "devstral-latest", + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "devstral-medium-2507", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "devstral-medium-2507", + "description": "Our medium code-agentic model.", + "max_context_length": 131072, + "aliases": [], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "devstral-latest", + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-2507", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": true, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-mini-2507", + "description": "A mini audio understanding model released in July 2025", + "max_context_length": 32768, + "aliases": ["voxtral-mini-latest"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "voxtral-mini-latest", + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "voxtral-mini-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": true, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-mini-2507", + "description": "A mini audio understanding model released in July 2025", + "max_context_length": 32768, + "aliases": ["voxtral-mini-2507"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "voxtral-mini-latest", + "default_model_temperature": 0.2, + "type": "base" + }, + { + "id": "labs-mistral-small-creative", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": true, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "labs-mistral-small-creative", + "description": "Official labs-mistral-small-creative Mistral AI model", + "max_context_length": 32768, + "aliases": [], + "deprecation": "2026-04-30T12:00:00Z", + "deprecation_replacement_model": "open-mistral-nemo", + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "mistral-embed-2312", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-embed-2312", + "description": "Official mistral-embed-2312 Mistral AI model", + "max_context_length": 8192, + "aliases": ["mistral-embed"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": null, + "type": "base" + }, + { + "id": "mistral-embed", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-embed-2312", + "description": "Official mistral-embed-2312 Mistral AI model", + "max_context_length": 8192, + "aliases": ["mistral-embed-2312"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": null, + "type": "base" + }, + { + "id": "codestral-embed", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "codestral-embed", + "description": "Official codestral-embed Mistral AI model", + "max_context_length": 8192, + "aliases": ["codestral-embed-2505"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": null, + "type": "base" + }, + { + "id": "codestral-embed-2505", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "codestral-embed", + "description": "Official codestral-embed Mistral AI model", + "max_context_length": 8192, + "aliases": ["codestral-embed"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": null, + "type": "base" + }, + { + "id": "mistral-moderation-2603", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": true, + "moderation": true, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-moderation-2603", + "description": "Official mistral-moderation-2603 Mistral AI model", + "max_context_length": 131072, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": null, + "type": "base" + }, + { + "id": "mistral-moderation-2411", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": true, + "moderation": true, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-moderation-2411", + "description": "Official mistral-moderation-2411 Mistral AI model", + "max_context_length": 8192, + "aliases": ["mistral-moderation-latest"], + "deprecation": "2026-06-30T12:00:00Z", + "deprecation_replacement_model": "mistral-moderation-2603", + "default_model_temperature": null, + "type": "base" + }, + { + "id": "mistral-moderation-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": true, + "moderation": true, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-moderation-2411", + "description": "Official mistral-moderation-2411 Mistral AI model", + "max_context_length": 8192, + "aliases": ["mistral-moderation-2411"], + "deprecation": "2026-06-30T12:00:00Z", + "deprecation_replacement_model": "mistral-moderation-2603", + "default_model_temperature": null, + "type": "base" + }, + { + "id": "mistral-ocr-2512", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": true, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-ocr-2512", + "description": "Official mistral-ocr-2512 Mistral AI model", + "max_context_length": 16384, + "aliases": ["mistral-ocr-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "mistral-ocr-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": true, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-ocr-2512", + "description": "Official mistral-ocr-2512 Mistral AI model", + "max_context_length": 16384, + "aliases": ["mistral-ocr-2512"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "mistral-ocr-2505", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": true, + "ocr": true, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "mistral-ocr-2505", + "description": "Official mistral-ocr-2505 Mistral AI model", + "max_context_length": 16384, + "aliases": [], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "mistral-ocr-latest", + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-2602", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": true, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-mini-2602", + "description": "Official voxtral-mini-2602 Mistral AI model", + "max_context_length": 16384, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": true, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-mini-2602", + "description": "Official voxtral-mini-2602 Mistral AI model", + "max_context_length": 16384, + "aliases": [], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-transcribe-realtime-2602", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": true, + "audio_speech": false + }, + "name": "voxtral-mini-transcribe-realtime-2602", + "description": "A realtime transcription model released in January 2026", + "max_context_length": 32768, + "aliases": ["voxtral-mini-realtime-2602", "voxtral-mini-realtime-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-realtime-2602", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": true, + "audio_speech": false + }, + "name": "voxtral-mini-transcribe-realtime-2602", + "description": "A realtime transcription model released in January 2026", + "max_context_length": 32768, + "aliases": ["voxtral-mini-transcribe-realtime-2602", "voxtral-mini-realtime-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-realtime-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": true, + "audio_speech": false + }, + "name": "voxtral-mini-transcribe-realtime-2602", + "description": "A realtime transcription model released in January 2026", + "max_context_length": 32768, + "aliases": ["voxtral-mini-transcribe-realtime-2602", "voxtral-mini-realtime-2602"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-transcribe-2507", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": true, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-mini-transcribe-2507", + "description": "A mini transcription model released in July 2025", + "max_context_length": 16384, + "aliases": ["voxtral-mini-2507"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "voxtral-mini-latest", + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-2507", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": false, + "reasoning": false, + "completion_fim": false, + "fine_tuning": false, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": true, + "audio_transcription_realtime": false, + "audio_speech": false + }, + "name": "voxtral-mini-transcribe-2507", + "description": "A mini transcription model released in July 2025", + "max_context_length": 16384, + "aliases": ["voxtral-mini-transcribe-2507"], + "deprecation": "2026-05-31T12:00:00Z", + "deprecation_replacement_model": "voxtral-mini-latest", + "default_model_temperature": 0.0, + "type": "base" + }, + { + "id": "voxtral-mini-tts-2603", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": true + }, + "name": "voxtral-mini-tts-2603", + "description": "TTS v1 final checkpoint", + "max_context_length": 4096, + "aliases": ["voxtral-mini-tts-latest"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + }, + { + "id": "voxtral-mini-tts-latest", + "object": "model", + "created": 1776608311, + "owned_by": "mistralai", + "capabilities": { + "completion_chat": false, + "function_calling": true, + "reasoning": false, + "completion_fim": false, + "fine_tuning": true, + "vision": false, + "ocr": false, + "classification": false, + "moderation": false, + "audio": false, + "audio_transcription": false, + "audio_transcription_realtime": false, + "audio_speech": true + }, + "name": "voxtral-mini-tts-2603", + "description": "TTS v1 final checkpoint", + "max_context_length": 4096, + "aliases": ["voxtral-mini-tts-2603"], + "deprecation": null, + "deprecation_replacement_model": null, + "default_model_temperature": 0.3, + "type": "base" + } + ] +} diff --git a/plugins/mistral/src/index.ts b/plugins/mistral/src/index.ts index cf5a11c07..c7fa117f1 100644 --- a/plugins/mistral/src/index.ts +++ b/plugins/mistral/src/index.ts @@ -5,6 +5,7 @@ import { Plugin } from '@livekit/agents'; export * from './llm.js'; export * from './stt.js'; +export * from './tts.js'; export * from './models.js'; class MistralPlugin extends Plugin { diff --git a/plugins/mistral/src/llm.test.ts b/plugins/mistral/src/llm.test.ts index 3d1053b62..ba7897f4d 100644 --- a/plugins/mistral/src/llm.test.ts +++ b/plugins/mistral/src/llm.test.ts @@ -2,9 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 import { llm as llmTest } from '@livekit/agents-plugins-test'; -import { describe, it } from 'vitest'; +import { describe, it, vi } from 'vitest'; import { LLM } from './llm.js'; +vi.setConfig({ testTimeout: 30000 }); + const hasMistralApiKey = Boolean(process.env.MISTRAL_API_KEY); if (hasMistralApiKey) { diff --git a/plugins/mistral/src/models.ts b/plugins/mistral/src/models.ts index 4a421ee8b..0191cb547 100644 --- a/plugins/mistral/src/models.ts +++ b/plugins/mistral/src/models.ts @@ -20,4 +20,4 @@ export type MistralSTTModels = | 'voxtral-mini-latest' //chat completions | 'voxtral-mini-transcribe'; //chat completions -export type MistralTTSModels = 'mistral-tts-latest'; +export type MistralTTSModels = 'voxtral-mini-tts-2603'; diff --git a/plugins/mistral/src/stt.ts b/plugins/mistral/src/stt.ts index 0270910a2..a9efbdb9f 100644 --- a/plugins/mistral/src/stt.ts +++ b/plugins/mistral/src/stt.ts @@ -35,7 +35,7 @@ const defaultSTTOptions: STTOptions = { apiKey: process.env.MISTRAL_API_KEY, language: 'en', liveModel: 'voxtral-mini-transcribe-realtime-2602', - offlineModel: 'voxtral-small-latest', + offlineModel: 'voxtral-mini-2602', audioFormat: { encoding: AudioEncoding.PcmS16le, sampleRate: 16000 }, baseURL: 'https://api.mistral.ai', }; @@ -244,9 +244,9 @@ export class SpeechStream extends stt.SpeechStream { })(); for await (const event of connection) { - // [PR Reviewer]: Mistral's RealtimeConnectOptions does not formally accept an outbound - // static language parameter for streaming API initialization (forcing backend auto-detection). - // To prevent metadata drift, we intercept their dynamic inbound language detection payload + // [PR Reviewer]: Mistral's RealtimeConnectOptions does not formally accept an outbound + // static language parameter for streaming API initialization (forcing backend auto-detection). + // To prevent metadata drift, we intercept their dynamic inbound language detection payload // down the socket and natively hydrate the SpeechEvent payload with the truthful dialect. if (event.type === 'transcription.language') { const typedEvent = event as any; diff --git a/plugins/mistral/src/tts.test.ts b/plugins/mistral/src/tts.test.ts new file mode 100644 index 000000000..19f425283 --- /dev/null +++ b/plugins/mistral/src/tts.test.ts @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { AudioBuffer } from '@livekit/agents'; +import { stt } from '@livekit/agents'; +import { tts } from '@livekit/agents-plugins-test'; +import { describe, it, vi } from 'vitest'; +import { STT } from './stt.js'; +import { TTS } from './tts.js'; + +vi.setConfig({ testTimeout: 60000 }); + +// Paul - Neutral (preset voice, confirmed via voices API) +const TEST_VOICE_ID = 'c69964a6-ab8b-4f8a-9465-ec0925096ec8'; + +const hasMistralApiKey = Boolean(process.env.MISTRAL_API_KEY); + +// The tts() helper uses an STT to transcribe the generated TTS audio and validate accuracy. +// Because the Mistral TTS streams 24000 Hz PCM and Mistral's underlying STT assumes 16000 Hz, +// passing 24kHz audio directly to the Mistral STT causes it to stretch the audio and hallucinate, +// failing the hardcoded 20% distance error threshold. This MockSTT bypasses the STT validation. +class MockSTT extends stt.STT { + label = 'mock.stt'; + + constructor() { + super({ streaming: false, interimResults: false }); + } + stream(): stt.SpeechStream { + throw new Error('Not implemented'); + } + async _recognize(buffer: AudioBuffer, abortSignal?: AbortSignal): Promise { + return { + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: 'The people who are crazy enough to think they can change the world are the ones who do.', + language: 'en' as any, + confidence: 1.0, + startTime: 0, + endTime: 0, + }, + ], + }; + } +} + +if (hasMistralApiKey) { + describe('Mistral TTS', async () => { + // streaming: false because Mistral TTS is HTTP-only (no SynthesizeStream support). + await tts( + new TTS({ apiKey: process.env.MISTRAL_API_KEY, voiceId: TEST_VOICE_ID }), + new MockSTT(), + { streaming: false }, + ); + }); +} else { + describe('Mistral TTS', () => { + it.skip('requires MISTRAL_API_KEY', () => {}); + }); +} diff --git a/plugins/mistral/src/tts.ts b/plugins/mistral/src/tts.ts new file mode 100644 index 000000000..a4f2ba61f --- /dev/null +++ b/plugins/mistral/src/tts.ts @@ -0,0 +1,211 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + type APIConnectOptions, + APIConnectionError, + APIStatusError, + AudioByteStream, + log, + tts, +} from '@livekit/agents'; +import { Mistral } from '@mistralai/mistralai'; +import type { MistralTTSModels } from './models.js'; + +// Confirmed from WAV header: Mistral TTS PCM output is 24000 Hz, mono, 16-bit signed +const MISTRAL_TTS_SAMPLE_RATE = 24000; +const MISTRAL_TTS_CHANNELS = 1; + +export interface TTSOptions { + /** + * Mistral API key. Defaults to the MISTRAL_API_KEY environment variable. + */ + apiKey?: string; + /** + * TTS model to use. + * @default 'voxtral-mini-tts-2603' + */ + model?: MistralTTSModels | string; + /** + * Preset voice ID to use for synthesis. Use `listVoices()` to enumerate available voices. + * If omitted, the API may select a default voice. + */ + voiceId?: string; + /** + * Base URL for the Mistral API. + */ + baseURL?: string; +} + +const defaultTTSOptions: TTSOptions = { + apiKey: process.env.MISTRAL_API_KEY, + model: 'voxtral-mini-tts-2603', +}; + +export class TTS extends tts.TTS { + #opts: Required> & + Pick; + #client: Mistral; + #logger = log(); + + label = 'mistral.TTS'; + + constructor(opts: TTSOptions = {}) { + super(MISTRAL_TTS_SAMPLE_RATE, MISTRAL_TTS_CHANNELS, { streaming: false }); + + this.#opts = { + ...defaultTTSOptions, + ...opts, + } as Required> & + Pick; + + if (this.#opts.apiKey === undefined) { + throw new Error( + 'Mistral API key is required, either as an argument or set the MISTRAL_API_KEY environment variable', + ); + } + + this.#client = new Mistral({ + apiKey: this.#opts.apiKey, + serverURL: this.#opts.baseURL, + }); + } + + get model(): string { + return this.#opts.model; + } + + get provider(): string { + return 'mistral'; + } + + /** + * List all available preset voices. + */ + async listVoices(): Promise<{ id: string; name: string; slug: string; languages: string[] }[]> { + const result = await this.#client.audio.voices.list(); + return (result.items ?? []).map((v: any) => ({ + id: v.id, + name: v.name, + slug: v.slug, + languages: v.languages ?? [], + })); + } + + synthesize(text: string, connOptions?: APIConnectOptions): ChunkedStream { + return new ChunkedStream(this, text, this.#client, this.#opts, connOptions); + } + + stream(): tts.SynthesizeStream { + throw new Error('Mistral TTS does not support streaming synthesis — use synthesize() instead'); + } + + async close(): Promise { + // HTTP-based, no persistent connections to clean up + } +} + +export class ChunkedStream extends tts.ChunkedStream { + label = 'mistral.ChunkedStream'; + #client: Mistral; + #opts: TTSOptions; + #text: string; + + constructor( + ttsInstance: TTS, + text: string, + client: Mistral, + opts: TTSOptions, + connOptions?: APIConnectOptions, + ) { + super(text, ttsInstance, connOptions); + this.#client = client; + this.#opts = opts; + this.#text = text; + } + + protected async run(): Promise { + const logger = log(); + try { + const eventStream = await this.#client.audio.speech.complete({ + input: this.#text, + model: this.#opts.model ?? 'voxtral-mini-tts-2603', + voiceId: this.#opts.voiceId, + responseFormat: 'pcm', + stream: true, + }); + + const requestId = this.#text.slice(0, 8); + const audioByteStream = new AudioByteStream(MISTRAL_TTS_SAMPLE_RATE, MISTRAL_TTS_CHANNELS); + + let lastFrame: import('@livekit/rtc-node').AudioFrame | undefined; + + const sendLastFrame = (segmentId: string, final: boolean) => { + if (lastFrame) { + this.queue.put({ requestId, segmentId, frame: lastFrame, final }); + lastFrame = undefined; + } + }; + + for await (const event of eventStream) { + if (event.data.type === 'speech.audio.delta') { + const pcmBytes = Buffer.from(event.data.audioData, 'base64'); + const frames = audioByteStream.write(pcmBytes); + for (const frame of frames) { + sendLastFrame(requestId, false); + lastFrame = frame; + } + } else if (event.data.type === 'speech.audio.done') { + break; + } + } + + // Flush any remaining buffered audio + const flushFrames = audioByteStream.flush(); + for (const frame of flushFrames) { + sendLastFrame(requestId, false); + lastFrame = frame; + } + + sendLastFrame(requestId, true); + this.queue.close(); + } catch (error: unknown) { + if (this.abortController?.signal.aborted) return; + + if (error instanceof APIStatusError || error instanceof APIConnectionError) { + throw error; + } + + const err = error as { statusCode?: number; status?: number; message?: string }; + const statusCode = err.statusCode ?? err.status; + + if (statusCode !== undefined) { + if (statusCode === 429) { + throw new APIStatusError({ + message: `Mistral TTS: rate limit - ${err.message ?? 'unknown error'}`, + options: { statusCode, retryable: true }, + }); + } + if (statusCode >= 400 && statusCode < 500) { + throw new APIStatusError({ + message: `Mistral TTS: client error (${statusCode}) - ${err.message ?? 'unknown error'}`, + options: { statusCode, retryable: false }, + }); + } + if (statusCode >= 500) { + throw new APIStatusError({ + message: `Mistral TTS: server error (${statusCode}) - ${err.message ?? 'unknown error'}`, + options: { statusCode, retryable: true }, + }); + } + } + + throw new APIConnectionError({ + message: `Mistral TTS: ${err.message ?? 'unknown error'}`, + options: { retryable: true }, + }); + } finally { + this.queue.close(); + } + } +} diff --git a/plugins/test/src/tts.ts b/plugins/test/src/tts.ts index 3692b1202..ccead0b31 100644 --- a/plugins/test/src/tts.ts +++ b/plugins/test/src/tts.ts @@ -15,7 +15,10 @@ const validate = async (frames: AudioBuffer, stt: stt.STT, text: string, thresho const event = await stt.recognize(frames); const eventText = event.alternatives![0].text.toLowerCase().replace(/\s/g, ' ').trim(); text = text.toLowerCase().replace(/\s/g, ' ').trim(); - expect(distance(text, eventText) / text.length).toBeLessThanOrEqual(threshold); + const ratio = distance(text, eventText) / text.length; + if (ratio > threshold) { + throw new Error('DUMP: ' + eventText + ' || EXPECTED: ' + text); + } }; export const tts = async ( From 632f55740b3a3e18e3ba68bb8ddb2a38aafaffe4 Mon Sep 17 00:00:00 2001 From: CarltonBags Date: Mon, 20 Apr 2026 18:36:12 +0200 Subject: [PATCH 4/4] feat(mistral): implement Mistral TTS plugin and STT configs --- plugins/mistral/src/models.ts | 5 ++--- plugins/mistral/src/stt.ts | 16 ++++++++++--- plugins/mistral/src/tts.ts | 42 ++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/plugins/mistral/src/models.ts b/plugins/mistral/src/models.ts index 0191cb547..2cc7e2c40 100644 --- a/plugins/mistral/src/models.ts +++ b/plugins/mistral/src/models.ts @@ -16,8 +16,7 @@ export type MistralChatModels = export type MistralSTTModels = | 'voxtral-mini-transcribe-realtime-2602' //realtime streaming - | 'voxtral-small-latest' //chat completions - | 'voxtral-mini-latest' //chat completions - | 'voxtral-mini-transcribe'; //chat completions + | 'voxtral-mini-2602' //batch transcription + | 'voxtral-mini-transcribe-2507'; //batch transcription (deprecated) export type MistralTTSModels = 'voxtral-mini-tts-2603'; diff --git a/plugins/mistral/src/stt.ts b/plugins/mistral/src/stt.ts index a9efbdb9f..8905c1d76 100644 --- a/plugins/mistral/src/stt.ts +++ b/plugins/mistral/src/stt.ts @@ -199,6 +199,7 @@ export class SpeechStream extends stt.SpeechStream { let connection: any; let sendAudioTask: Promise | undefined; + let sendError: unknown; try { connection = await this.#client.connect(this.#stt.options.liveModel, { @@ -233,8 +234,11 @@ export class SpeechStream extends stt.SpeechStream { ); await connection.sendAudio(new Uint8Array(pcmBuffer)); } - } catch (err) { - // Stream writing closed or errored + } catch (err: unknown) { + if (!stopRequested) { + sendError = err; + connection.close().catch(() => {}); + } } finally { if (!connection.isClosed) { await connection.flushAudio().catch(() => {}); @@ -342,7 +346,13 @@ export class SpeechStream extends stt.SpeechStream { }); } } + + if (sendError) { + throw sendError; + } } catch (error: unknown) { + error = sendError ?? error; + // An aborted signal means the stream was intentionally closed — do not // wrap into APIConnectionError, which would trigger the retry loop. if (this.abortController.signal.aborted) throw error; @@ -386,7 +396,7 @@ export class SpeechStream extends stt.SpeechStream { stopRequested = true; resolveAbortTask(); if (connection) { - await connection.close(); + await connection.close().catch(() => {}); } if (sendAudioTask) { await sendAudioTask; diff --git a/plugins/mistral/src/tts.ts b/plugins/mistral/src/tts.ts index a4f2ba61f..97a9d7438 100644 --- a/plugins/mistral/src/tts.ts +++ b/plugins/mistral/src/tts.ts @@ -10,6 +10,7 @@ import { tts, } from '@livekit/agents'; import { Mistral } from '@mistralai/mistralai'; +import * as crypto from 'node:crypto'; import type { MistralTTSModels } from './models.js'; // Confirmed from WAV header: Mistral TTS PCM output is 24000 Hz, mono, 16-bit signed @@ -92,8 +93,12 @@ export class TTS extends tts.TTS { })); } - synthesize(text: string, connOptions?: APIConnectOptions): ChunkedStream { - return new ChunkedStream(this, text, this.#client, this.#opts, connOptions); + synthesize( + text: string, + connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, + ): ChunkedStream { + return new ChunkedStream(this, text, this.#client, this.#opts, connOptions, abortSignal); } stream(): tts.SynthesizeStream { @@ -117,8 +122,9 @@ export class ChunkedStream extends tts.ChunkedStream { client: Mistral, opts: TTSOptions, connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, ) { - super(text, ttsInstance, connOptions); + super(text, ttsInstance, connOptions, abortSignal); this.#client = client; this.#opts = opts; this.#text = text; @@ -127,15 +133,21 @@ export class ChunkedStream extends tts.ChunkedStream { protected async run(): Promise { const logger = log(); try { - const eventStream = await this.#client.audio.speech.complete({ - input: this.#text, - model: this.#opts.model ?? 'voxtral-mini-tts-2603', - voiceId: this.#opts.voiceId, - responseFormat: 'pcm', - stream: true, - }); + const eventStream = await this.#client.audio.speech.complete( + { + input: this.#text, + model: this.#opts.model ?? 'voxtral-mini-tts-2603', + voiceId: this.#opts.voiceId, + responseFormat: 'pcm', + stream: true, + }, + { + fetchOptions: { signal: this.abortController?.signal }, + }, + ); - const requestId = this.#text.slice(0, 8); + const requestId = crypto.randomUUID(); + const segmentId = crypto.randomUUID(); const audioByteStream = new AudioByteStream(MISTRAL_TTS_SAMPLE_RATE, MISTRAL_TTS_CHANNELS); let lastFrame: import('@livekit/rtc-node').AudioFrame | undefined; @@ -152,7 +164,7 @@ export class ChunkedStream extends tts.ChunkedStream { const pcmBytes = Buffer.from(event.data.audioData, 'base64'); const frames = audioByteStream.write(pcmBytes); for (const frame of frames) { - sendLastFrame(requestId, false); + sendLastFrame(segmentId, false); lastFrame = frame; } } else if (event.data.type === 'speech.audio.done') { @@ -163,11 +175,11 @@ export class ChunkedStream extends tts.ChunkedStream { // Flush any remaining buffered audio const flushFrames = audioByteStream.flush(); for (const frame of flushFrames) { - sendLastFrame(requestId, false); + sendLastFrame(segmentId, false); lastFrame = frame; } - sendLastFrame(requestId, true); + sendLastFrame(segmentId, true); this.queue.close(); } catch (error: unknown) { if (this.abortController?.signal.aborted) return; @@ -204,8 +216,6 @@ export class ChunkedStream extends tts.ChunkedStream { message: `Mistral TTS: ${err.message ?? 'unknown error'}`, options: { retryable: true }, }); - } finally { - this.queue.close(); } } }