diff --git a/.changeset/rosetta-issue-99.md b/.changeset/rosetta-issue-99.md new file mode 100644 index 000000000..201e04e81 --- /dev/null +++ b/.changeset/rosetta-issue-99.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents": minor +--- + +feat(voice): add dynamic endpointing to the Node.js SDK diff --git a/agents/src/utils.ts b/agents/src/utils.ts index 82c623a6c..4d04234c2 100644 --- a/agents/src/utils.ts +++ b/agents/src/utils.ts @@ -350,45 +350,119 @@ export class AsyncIterableQueue implements AsyncIterableIterator { } } +type ExpFilterConfig = { + alpha?: number; + initial?: number; + min?: number; + max?: number; +}; + /** @internal */ +// Ref: source livekit-agents/livekit/agents/utils/exp_filter.py - 5-64 export class ExpFilter { - #alpha: number; - #max?: number; - #filtered?: number = undefined; + #alphaValue: number; + #maxValue?: number; + #minValue?: number; + #filteredValue?: number; + + constructor(alpha: number, max?: number); + constructor(alpha: number, config?: ExpFilterConfig); + constructor(alpha: number, maxOrConfig?: number | ExpFilterConfig) { + this.assertAlpha(alpha); + this.#alphaValue = alpha; + + if (typeof maxOrConfig === 'number') { + this.#maxValue = maxOrConfig; + return; + } - constructor(alpha: number, max?: number) { - this.#alpha = alpha; - this.#max = max; + this.#maxValue = maxOrConfig?.max; + this.#minValue = maxOrConfig?.min; + this.#filteredValue = maxOrConfig?.initial; } - reset(alpha?: number) { - if (alpha) { - this.#alpha = alpha; + reset(alpha?: number): void; + reset(config?: ExpFilterConfig): void; + reset(alphaOrConfig?: number | ExpFilterConfig): void { + if (typeof alphaOrConfig === 'number') { + this.assertAlpha(alphaOrConfig); + this.#alphaValue = alphaOrConfig; + return; + } + + if (alphaOrConfig?.alpha !== undefined) { + this.assertAlpha(alphaOrConfig.alpha); + this.#alphaValue = alphaOrConfig.alpha; + } + if (alphaOrConfig?.initial !== undefined) { + this.#filteredValue = alphaOrConfig.initial; + } + if (alphaOrConfig?.min !== undefined) { + this.#minValue = alphaOrConfig.min; + } + if (alphaOrConfig?.max !== undefined) { + this.#maxValue = alphaOrConfig.max; } - this.#filtered = undefined; } - apply(exp: number, sample: number): number { - if (this.#filtered) { - const a = this.#alpha ** exp; - this.#filtered = a * this.#filtered + (1 - a) * sample; + apply(exp: number, sample: number): number; + apply(exp: number, sample?: number): number { + const nextSample = sample ?? this.#filteredValue; + if (nextSample === undefined) { + throw new Error('sample or initial value must be given.'); + } + + if (this.#filteredValue === undefined) { + this.#filteredValue = nextSample; } else { - this.#filtered = sample; + const alpha = this.#alphaValue ** exp; + this.#filteredValue = alpha * this.#filteredValue + (1 - alpha) * nextSample; } - if (this.#max && this.#filtered > this.#max) { - this.#filtered = this.#max; + if (this.#maxValue !== undefined && this.#filteredValue > this.#maxValue) { + this.#filteredValue = this.#maxValue; + } + if (this.#minValue !== undefined && this.#filteredValue < this.#minValue) { + this.#filteredValue = this.#minValue; } - return this.#filtered; + return this.#filteredValue; + } + + updateBase(alpha: number): void { + this.assertAlpha(alpha); + this.#alphaValue = alpha; } get filtered(): number | undefined { - return this.#filtered; + return this.#filteredValue; + } + + get value(): number | undefined { + return this.#filteredValue; + } + + get alpha(): number { + return this.#alphaValue; } set alpha(alpha: number) { - this.#alpha = alpha; + this.assertAlpha(alpha); + this.#alphaValue = alpha; + } + + get min(): number | undefined { + return this.#minValue; + } + + get max(): number | undefined { + return this.#maxValue; + } + + private assertAlpha(alpha: number): void { + if (!(alpha > 0 && alpha <= 1)) { + throw new Error('alpha must be in (0, 1].'); + } } } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index b0ddf9d30..b75940577 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -88,6 +88,7 @@ import { } from './generation.js'; import type { TimedString } from './io.js'; import { SpeechHandle } from './speech_handle.js'; +import { type EndpointingOptions, createEndpointing } from './turn_config/endpointing.js'; import { setParticipantSpanAttributes } from './utils.js'; export const agentActivityStorage = new AsyncLocalStorage(); @@ -469,6 +470,7 @@ export class AgentActivity implements RecognitionHooks { this.vad.on('metrics_collected', this.onMetricsCollected); } + // Ref: source livekit-agents/livekit/agents/voice/agent_activity.py - 768-784 this.audioRecognition = new AudioRecognition({ recognitionHooks: this, // Disable stt node if stt is not provided @@ -483,6 +485,17 @@ export class AgentActivity implements RecognitionHooks { maxEndpointingDelay: this.agent.turnHandling?.endpointing?.maxDelay ?? this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, + endpointing: createEndpointing({ + mode: + this.agent.turnHandling?.endpointing?.mode ?? + this.agentSession.sessionOptions.turnHandling.endpointing.mode, + minDelay: + this.agent.turnHandling?.endpointing?.minDelay ?? + this.agentSession.sessionOptions.turnHandling.endpointing.minDelay, + maxDelay: + this.agent.turnHandling?.endpointing?.maxDelay ?? + this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, + }), rootSpanContext: this.agentSession.rootSpanContext, sttModel: this.stt?.label, sttProvider: this.getSttProvider(), @@ -661,20 +674,6 @@ export class AgentActivity implements RecognitionHooks { return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling; } - // get minEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.minDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay - // ); - // } - - // get maxEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.maxDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay - // ); - // } - get toolCtx(): ToolContext { return this.agent.toolCtx; } @@ -721,6 +720,15 @@ export class AgentActivity implements RecognitionHooks { }: { toolChoice?: ToolChoice | null; turnDetection?: TurnDetectionMode; + }): void; + updateOptions({ + toolChoice, + turnDetection, + endpointing, + }: { + toolChoice?: ToolChoice | null; + turnDetection?: TurnDetectionMode; + endpointing?: EndpointingOptions; }): void { if (toolChoice !== undefined) { this.toolChoice = toolChoice; @@ -742,8 +750,12 @@ export class AgentActivity implements RecognitionHooks { } } + // Ref: source livekit-agents/livekit/agents/voice/agent_activity.py - 477-482 if (this.audioRecognition) { - this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode }); + this.audioRecognition.updateOptions({ + endpointing: endpointing ? createEndpointing(endpointing) : undefined, + turnDetection: this.turnDetectionMode, + }); } } @@ -921,13 +933,10 @@ export class AgentActivity implements RecognitionHooks { this.logger.info('onInputSpeechStarted'); if (!this.vad) { + // Ref: source livekit-agents/livekit/agents/voice/agent_activity.py - 1490-1498 this.agentSession._updateUserState('speaking'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfOverlapSpeech( - 0, - Date.now(), - this.agentSession._userSpeakingSpan, - ); + if (this.audioRecognition) { + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); } } @@ -947,8 +956,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan); + // Ref: source livekit-agents/livekit/agents/voice/agent_activity.py - 1508-1516 + if (this.audioRecognition) { + this.audioRecognition.onEndOfSpeech(Date.now(), this.agentSession._userSpeakingSpan); } this.agentSession._updateUserState('listening'); } @@ -1832,13 +1842,17 @@ export class AgentActivity implements RecognitionHooks { let replyTtsGenData: _TTSGenerationData | null = null; const onFirstFrame = (startedSpeakingAt?: number) => { - replyStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + // Ref: source livekit-agents/livekit/agents/voice/agent_activity.py - 2183-2197 + const agentSpeechStartedAt = startedSpeakingAt ?? Date.now(); + replyStartedSpeakingAt = agentSpeechStartedAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(agentSpeechStartedAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -1924,10 +1938,12 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); } - this.restoreInterruptionByAudioActivity(); + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); + } } } @@ -2108,13 +2124,17 @@ export class AgentActivity implements RecognitionHooks { let agentStartedSpeakingAt: number | undefined; const onFirstFrame = (startedSpeakingAt?: number) => { - agentStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + // Ref: source livekit-agents/livekit/agents/voice/agent_activity.py - 2183-2197 + const agentSpeechStartedAt = startedSpeakingAt ?? Date.now(); + agentStartedSpeakingAt = agentSpeechStartedAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(agentSpeechStartedAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -2271,8 +2291,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { this.restoreInterruptionByAudioActivity(); } } @@ -2314,11 +2336,11 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - { - this.audioRecognition.onEndOfAgentSpeech(Date.now()); - this.restoreInterruptionByAudioActivity(); - } + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); } } diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index e77532a5b..f1666887a 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -36,6 +36,7 @@ import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.j import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; import type { STTNode } from './io.js'; +import { type BaseEndpointing, createEndpointing } from './turn_config/endpointing.js'; import { setParticipantSpanAttributes } from './utils.js'; export interface EndOfTurnInfo { @@ -139,9 +140,13 @@ export interface AudioRecognitionOptions { turnDetectionMode?: TurnDetectionMode; interruptionDetection?: AdaptiveInterruptionDetector; /** Minimum endpointing delay in milliseconds. */ + /** @deprecated Use `endpointing` instead. */ minEndpointingDelay: number; /** Maximum endpointing delay in milliseconds. */ + /** @deprecated Use `endpointing` instead. */ maxEndpointingDelay: number; + /** Endpointing strategy. */ + endpointing?: BaseEndpointing; /** Root span context for tracing. */ rootSpanContext?: Context; /** STT model name for tracing */ @@ -172,6 +177,7 @@ export class AudioRecognition { private turnDetectionMode?: TurnDetectionMode; private minEndpointingDelay: number; private maxEndpointingDelay: number; + private endpointing: BaseEndpointing; private lastLanguage?: LanguageCode; private rootSpanContext?: Context; private sttModel?: string; @@ -215,6 +221,7 @@ export class AudioRecognition { private transcriptBuffer: SpeechEvent[]; private isInterruptionEnabled: boolean; private isAgentSpeaking: boolean; + private overlapInterruptionDetected?: boolean; private interruptionStreamChannel?: StreamChannel; private closed = false; @@ -226,6 +233,14 @@ export class AudioRecognition { this.turnDetectionMode = opts.turnDetectionMode; this.minEndpointingDelay = opts.minEndpointingDelay; this.maxEndpointingDelay = opts.maxEndpointingDelay; + this.endpointing = + 'endpointing' in opts && opts.endpointing !== undefined + ? opts.endpointing + : createEndpointing({ + mode: 'fixed', + minDelay: opts.minEndpointingDelay, + maxDelay: opts.maxEndpointingDelay, + }); this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; this.sttModel = opts.sttModel; @@ -237,6 +252,7 @@ export class AudioRecognition { this.transcriptBuffer = []; this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad); this.isAgentSpeaking = false; + this.overlapInterruptionDetected = undefined; if (opts.interruptionDetection) { const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee(); @@ -275,7 +291,15 @@ export class AudioRecognition { } /** @internal */ - updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void { + updateOptions(options: { + endpointing?: BaseEndpointing; + turnDetection: TurnDetectionMode | undefined; + }): void { + if (options.endpointing !== undefined) { + this.minEndpointingDelay = options.endpointing.minDelay; + this.maxEndpointingDelay = options.endpointing.maxDelay; + this.endpointing = options.endpointing; + } this.turnDetectionMode = options.turnDetection; } @@ -311,12 +335,21 @@ export class AudioRecognition { this.interruptionStreamChannel = undefined; } - async onStartOfAgentSpeech() { + // Ref: source livekit-agents/livekit/agents/voice/audio_recognition.py - 238-243 + async onStartOfAgentSpeech(): Promise; + async onStartOfAgentSpeech(startedAt: number): Promise; + async onStartOfAgentSpeech(startedAt = Date.now()) { this.isAgentSpeaking = true; + this.endpointing.onStartOfAgentSpeech(startedAt); return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); } + // Ref: source livekit-agents/livekit/agents/voice/audio_recognition.py - 245-270 async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (this.isAgentSpeaking) { + this.endpointing.onEndOfAgentSpeech(Date.now()); + } + if (!this.isInterruptionEnabled) { this.isAgentSpeaking = false; return; @@ -344,6 +377,31 @@ export class AudioRecognition { this.isAgentSpeaking = false; } + // Ref: source livekit-agents/livekit/agents/voice/audio_recognition.py - 272-289 + onStartOfSpeech(startedAt: number, speechDuration = 0, userSpeakingSpan?: Span): void { + this.endpointing.onStartOfSpeech(startedAt, this.isAgentSpeaking); + this.overlapInterruptionDetected = this.isInterruptionEnabled ? false : undefined; + + if (!this.isInterruptionEnabled || !this.isAgentSpeaking) { + return; + } + + void this.onStartOfOverlapSpeech(speechDuration, startedAt, userSpeakingSpan); + } + + // Ref: source livekit-agents/livekit/agents/voice/audio_recognition.py - 291-305 + onEndOfSpeech(endedAt: number, userSpeakingSpan?: Span, interruption?: boolean): void { + if (this.speaking) { + this.endpointing.onEndOfSpeech( + endedAt, + interruption !== undefined && !interruption && this.isAgentSpeaking, + ); + } + + this.overlapInterruptionDetected = undefined; + void this.onEndOfOverlapSpeech(endedAt, userSpeakingSpan); + } + /** Start interruption inference when agent is speaking and overlap speech starts. */ async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) { if (this.isAgentSpeaking) { @@ -359,7 +417,7 @@ export class AudioRecognition { /** End interruption inference when overlap speech ends. */ async onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span) { - if (!this.isInterruptionEnabled) { + if (!this.isInterruptionEnabled || !this.isAgentSpeaking) { return; } if (userSpeakingSpan && userSpeakingSpan.isRecording()) { @@ -704,13 +762,16 @@ export class AudioRecognition { case SpeechEventType.START_OF_SPEECH: if (this.turnDetectionMode !== 'stt') break; { - const span = this.ensureUserTurnSpan(Date.now()); + const startedAt = Date.now(); + this.onStartOfSpeech(startedAt); + + const span = this.ensureUserTurnSpan(startedAt); const ctx = this.userTurnContext(span); otelContext.with(ctx, () => { this.hooks.onStartOfSpeech({ type: VADEventType.START_OF_SPEECH, samplesIndex: 0, - timestamp: Date.now(), + timestamp: startedAt, speechDuration: 0, silenceDuration: 0, frames: [], @@ -730,13 +791,14 @@ export class AudioRecognition { case SpeechEventType.END_OF_SPEECH: if (this.turnDetectionMode !== 'stt') break; { + const endedAt = Date.now(); const span = this.ensureUserTurnSpan(); const ctx = this.userTurnContext(span); otelContext.with(ctx, () => { this.hooks.onEndOfSpeech({ type: VADEventType.END_OF_SPEECH, samplesIndex: 0, - timestamp: Date.now(), + timestamp: endedAt, speechDuration: 0, silenceDuration: 0, frames: [], @@ -747,6 +809,7 @@ export class AudioRecognition { rawAccumulatedSpeech: 0, }); }); + this.onEndOfSpeech(endedAt, undefined, this.overlapInterruptionDetected); } // STT EOT changes user state from speaking to listening without updating VAD internal states. // VAD EOS will also skip updating user state from listening (STT enforced) to listening (VAD detected) @@ -770,6 +833,7 @@ export class AudioRecognition { } private onOverlapSpeechEvent(ev: OverlappingSpeechEvent) { + this.overlapInterruptionDetected = ev.isInterruption; if (ev.isInterruption) { this.hooks.onInterruption(ev); } @@ -805,7 +869,7 @@ export class AudioRecognition { speechStartTime: number | undefined, ) => async (controller: AbortController) => { - let endpointingDelay = this.minEndpointingDelay; + let endpointingDelay = this.endpointing.minDelay; const userTurnSpan = this.ensureUserTurnSpan(); const userTurnCtx = this.userTurnContext(userTurnSpan); @@ -831,7 +895,7 @@ export class AudioRecognition { ); if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) { - endpointingDelay = this.maxEndpointingDelay; + endpointingDelay = this.endpointing.maxDelay; } } catch (error) { this.logger.error(error, 'Error predicting end of turn'); @@ -1018,6 +1082,7 @@ export class AudioRecognition { this.logger.debug('VAD task: START_OF_SPEECH'); { const startTime = Date.now() - ev.speechDuration; + this.onStartOfSpeech(startTime, ev.speechDuration); const span = this.ensureUserTurnSpan(startTime); const ctx = this.userTurnContext(span); otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev)); @@ -1047,9 +1112,11 @@ export class AudioRecognition { case VADEventType.END_OF_SPEECH: this.logger.debug('VAD task: END_OF_SPEECH'); { + const endedAt = Date.now() - ev.silenceDuration - ev.inferenceDuration; const span = this.ensureUserTurnSpan(); const ctx = this.userTurnContext(span); otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev)); + this.onEndOfSpeech(endedAt, undefined, this.overlapInterruptionDetected); } // when VAD fires END_OF_SPEECH, it already waited for the silence_duration diff --git a/agents/src/voice/audio_recognition_endpointing.test.ts b/agents/src/voice/audio_recognition_endpointing.test.ts new file mode 100644 index 000000000..0a81f9463 --- /dev/null +++ b/agents/src/voice/audio_recognition_endpointing.test.ts @@ -0,0 +1,263 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import { initializeLogger } from '../log.js'; +import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; +import { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js'; +import { AudioRecognition, type RecognitionHooks } from './audio_recognition.js'; +import { DynamicEndpointing, createEndpointing } from './turn_config/endpointing.js'; + +function createHooks() { + const hooks: RecognitionHooks = { + onInterruption: vi.fn(), + onStartOfSpeech: vi.fn(), + onVADInferenceDone: vi.fn(), + onEndOfSpeech: vi.fn(), + onInterimTranscript: vi.fn(), + onFinalTranscript: vi.fn(), + onEndOfTurn: vi.fn(async () => true), + onPreemptiveGeneration: vi.fn(), + retrieveChatCtx: () => ChatContext.empty(), + }; + + return hooks; +} + +async function flushTasks() { + await Promise.resolve(); + await Promise.resolve(); +} + +class FakeVADStream extends (Object as unknown as { new (): VADStream }) { + private events: VADEvent[]; + private index = 0; + + constructor(events: VADEvent[]) { + super(); + this.events = events; + } + + updateInputStream() {} + + detachInputStream() {} + + close() {} + + [Symbol.asyncIterator]() { + return this; + } + + async next(): Promise> { + if (this.index >= this.events.length) { + return { done: true, value: undefined }; + } + + const value = this.events[this.index++]!; + return { done: false, value }; + } +} + +class FakeVAD extends VAD { + label = 'fake-vad'; + + constructor(private events: VADEvent[]) { + super({ updateInterval: 1 }); + } + + stream(): VADStream { + return new FakeVADStream(this.events); + } +} + +describe('AudioRecognition dynamic endpointing integration', () => { + initializeLogger({ pretty: false, level: 'silent' }); + + it('uses learned dynamic delay for STT-driven end-of-turn scheduling', async () => { + vi.useFakeTimers(); + + const hooks = createHooks(); + const recognition = new AudioRecognition({ + recognitionHooks: hooks, + minEndpointingDelay: 300, + maxEndpointingDelay: 1000, + endpointing: new DynamicEndpointing(300, 1000, 0.5), + turnDetectionMode: 'stt', + }); + const onSTTEvent = (recognition as any).onSTTEvent.bind(recognition) as ( + ev: SpeechEvent, + ) => Promise; + + try { + vi.setSystemTime(100000); + await onSTTEvent({ type: SpeechEventType.START_OF_SPEECH }); + + vi.setSystemTime(100500); + await onSTTEvent({ type: SpeechEventType.END_OF_SPEECH }); + + vi.setSystemTime(100900); + await onSTTEvent({ type: SpeechEventType.START_OF_SPEECH }); + + vi.setSystemTime(101200); + await onSTTEvent({ type: SpeechEventType.END_OF_SPEECH }); + + expect((recognition as any).endpointing.minDelay).toBeCloseTo(350, 5); + + await vi.advanceTimersByTimeAsync(349); + expect(hooks.onEndOfTurn).not.toHaveBeenCalled(); + + await vi.advanceTimersByTimeAsync(1); + expect(hooks.onEndOfTurn).toHaveBeenCalledTimes(1); + } finally { + await recognition.close(); + vi.useRealTimers(); + } + }); + + it('updates dynamic endpointing from the VAD runtime path', async () => { + vi.useFakeTimers(); + + const hooks = createHooks(); + const recognition = new AudioRecognition({ + recognitionHooks: hooks, + vad: new FakeVAD([ + { + type: VADEventType.START_OF_SPEECH, + samplesIndex: 0, + timestamp: 0, + speechDuration: 2000, + silenceDuration: 0, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: true, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 0, + }, + { + type: VADEventType.END_OF_SPEECH, + samplesIndex: 0, + timestamp: 0, + speechDuration: 0, + silenceDuration: 1500, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: false, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 0, + }, + { + type: VADEventType.START_OF_SPEECH, + samplesIndex: 0, + timestamp: 0, + speechDuration: 1100, + silenceDuration: 0, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: true, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 0, + }, + { + type: VADEventType.END_OF_SPEECH, + samplesIndex: 0, + timestamp: 0, + speechDuration: 0, + silenceDuration: 800, + frames: [], + probability: 0, + inferenceDuration: 0, + speaking: false, + rawAccumulatedSilence: 0, + rawAccumulatedSpeech: 0, + }, + ]), + minEndpointingDelay: 300, + maxEndpointingDelay: 1000, + endpointing: new DynamicEndpointing(300, 1000, 0.5), + turnDetectionMode: 'vad', + }); + + try { + vi.setSystemTime(102000); + await recognition.start(); + await flushTasks(); + + expect((recognition as any).endpointing.minDelay).toBeCloseTo(350, 5); + + await vi.advanceTimersByTimeAsync(349); + expect(hooks.onEndOfTurn).not.toHaveBeenCalled(); + + await vi.advanceTimersByTimeAsync(1); + expect(hooks.onEndOfTurn).toHaveBeenCalledTimes(1); + } finally { + await recognition.close(); + vi.useRealTimers(); + } + }); + + it('passes false interruption results through to endpointing ignore logic', async () => { + vi.useFakeTimers(); + + const recognition = new AudioRecognition({ + recognitionHooks: createHooks(), + minEndpointingDelay: 300, + maxEndpointingDelay: 1000, + endpointing: new DynamicEndpointing(300, 1000, 0.5), + turnDetectionMode: 'stt', + }); + const endpointing = (recognition as any).endpointing as DynamicEndpointing; + const onSTTEvent = (recognition as any).onSTTEvent.bind(recognition) as ( + ev: SpeechEvent, + ) => Promise; + + try { + vi.setSystemTime(100500); + await recognition.onStartOfAgentSpeech(100500); + + vi.setSystemTime(101500); + await onSTTEvent({ type: SpeechEventType.START_OF_SPEECH }); + (recognition as any).onOverlapSpeechEvent({ isInterruption: false }); + + const previousMin = endpointing.minDelay; + const previousMax = endpointing.maxDelay; + + vi.setSystemTime(101800); + await onSTTEvent({ type: SpeechEventType.END_OF_SPEECH }); + + expect(endpointing.minDelay).toBe(previousMin); + expect(endpointing.maxDelay).toBe(previousMax); + expect((endpointing as any).utteranceStartedAt).toBeUndefined(); + expect((endpointing as any).utteranceEndedAt).toBeUndefined(); + } finally { + await recognition.close(); + vi.useRealTimers(); + } + }); + + it('replaces endpointing state on updateOptions', () => { + const endpointing = new DynamicEndpointing(300, 1000, 0.5); + endpointing.onEndOfSpeech(100000); + endpointing.onStartOfSpeech(100400); + endpointing.onEndOfSpeech(100600); + expect(endpointing.minDelay).toBeCloseTo(350, 5); + + const recognition = new AudioRecognition({ + recognitionHooks: createHooks(), + minEndpointingDelay: 300, + maxEndpointingDelay: 1000, + endpointing, + turnDetectionMode: 'stt', + }); + + recognition.updateOptions({ + endpointing: createEndpointing({ mode: 'dynamic', minDelay: 300, maxDelay: 1000 }), + turnDetection: 'stt', + }); + + expect((recognition as any).endpointing.minDelay).toBe(300); + }); +}); diff --git a/agents/src/voice/audio_recognition_handoff.test.ts b/agents/src/voice/audio_recognition_handoff.test.ts index 76311ec12..c8e5df453 100644 --- a/agents/src/voice/audio_recognition_handoff.test.ts +++ b/agents/src/voice/audio_recognition_handoff.test.ts @@ -8,6 +8,7 @@ import { initializeLogger } from '../log.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { AudioRecognition, type RecognitionHooks, STTPipeline } from './audio_recognition.js'; import type { STTNode } from './io.js'; +import { createEndpointing } from './turn_config/endpointing.js'; function createHooks() { const hooks: RecognitionHooks = { @@ -47,6 +48,7 @@ function createRecognition(sttNode: STTNode, hooks = createHooks()) { stt: sttNode, minEndpointingDelay: 0, maxEndpointingDelay: 0, + endpointing: createEndpointing({ mode: 'fixed', minDelay: 0, maxDelay: 0 }), }), }; } diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts index cfe92a821..46283609e 100644 --- a/agents/src/voice/audio_recognition_span.test.ts +++ b/agents/src/voice/audio_recognition_span.test.ts @@ -23,6 +23,7 @@ import { type _TurnDetector, } from './audio_recognition.js'; import type { STTNode } from './io.js'; +import { createEndpointing } from './turn_config/endpointing.js'; function setupInMemoryTracing() { const exporter = new InMemorySpanExporter(); @@ -147,6 +148,7 @@ describe('AudioRecognition user_turn span parity', () => { turnDetectionMode: 'stt', minEndpointingDelay: 0, maxEndpointingDelay: 0, + endpointing: createEndpointing({ mode: 'fixed', minDelay: 0, maxDelay: 0 }), sttModel: 'deepgram-nova2', sttProvider: 'deepgram', getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }), @@ -256,6 +258,7 @@ describe('AudioRecognition user_turn span parity', () => { turnDetectionMode: 'vad', minEndpointingDelay: 0, maxEndpointingDelay: 0, + endpointing: createEndpointing({ mode: 'fixed', minDelay: 0, maxDelay: 0 }), sttModel: 'stt-model', sttProvider: 'stt-provider', getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }), diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index b9b3a62e7..b60649425 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -26,4 +26,5 @@ export { type TimedString } from './io.js'; export * from './report.js'; export * from './room_io/index.js'; export { RunContext } from './run_context.js'; +export * from './turn_config/endpointing.js'; export * as testing from './testing/index.js'; diff --git a/agents/src/voice/turn_config/endpointing.test.ts b/agents/src/voice/turn_config/endpointing.test.ts new file mode 100644 index 000000000..5c1e1335f --- /dev/null +++ b/agents/src/voice/turn_config/endpointing.test.ts @@ -0,0 +1,448 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { ExpFilter } from '../../utils.js'; +import { DynamicEndpointing } from './endpointing.js'; + +describe('ExpFilter', () => { + it('initialization with valid alpha', () => { + const ema = new ExpFilter(0.5); + expect(ema.value).toBeUndefined(); + + const emaWithInitial = new ExpFilter(0.5, { initial: 10 }); + expect(emaWithInitial.value).toBe(10); + + expect(new ExpFilter(1.0).value).toBeUndefined(); + }); + + it('initialization with invalid alpha', () => { + expect(() => new ExpFilter(0.0)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(-0.5)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(1.5)).toThrow(/alpha must be in/); + }); + + it('update with no initial value', () => { + const ema = new ExpFilter(0.5); + const result = ema.apply(1.0, 10.0); + expect(result).toBe(10); + expect(ema.value).toBe(10); + }); + + it('update with initial value', () => { + const ema = new ExpFilter(0.5, { initial: 10.0 }); + const result = ema.apply(1.0, 20.0); + expect(result).toBe(15); + expect(ema.value).toBe(15); + }); + + it('update multiple times', () => { + const ema = new ExpFilter(0.5, { initial: 10.0 }); + ema.apply(1.0, 20.0); + ema.apply(1.0, 20.0); + expect(ema.value).toBe(17.5); + }); + + it('reset', () => { + const ema = new ExpFilter(0.5, { initial: 10.0 }); + expect(ema.value).toBe(10); + ema.reset(); + expect(ema.value).toBe(10); + + const emaWithInitial = new ExpFilter(0.5, { initial: 10.0 }); + emaWithInitial.reset({ initial: 5.0 }); + expect(emaWithInitial.value).toBe(5); + }); +}); + +describe('DynamicEndpointing', () => { + it('initialization', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('initialization with custom alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.2); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('initialization uses updated default alpha', () => { + const ep = new DynamicEndpointing(300, 1000); + expect((ep as any).utterancePause.alpha).toBeCloseTo(0.9, 5); + expect((ep as any).turnPause.alpha).toBeCloseTo(0.9, 5); + }); + + it('empty delays', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.betweenUtteranceDelay).toBe(0); + expect(ep.betweenTurnDelay).toBe(0); + expect(ep.immediateInterruptionDelay).toEqual([0, 0]); + }); + + it('on utterance ended', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + expect((ep as any).utteranceEndedAt).toBe(100000); + + const ep2 = new DynamicEndpointing(300, 1000); + ep2.onEndOfSpeech(99900); + expect((ep2 as any).utteranceEndedAt).toBe(99900); + }); + + it('on utterance started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfSpeech(100000); + expect((ep as any).utteranceStartedAt).toBe(100000); + }); + + it('on agent speech started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + expect((ep as any).agentSpeechStartedAt).toBe(100000); + }); + + it('between utterance delay calculation', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100500); + expect(ep.betweenUtteranceDelay).toBeCloseTo(500, 5); + }); + + it('between turn delay calculation', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100800); + expect(ep.betweenTurnDelay).toBeCloseTo(800, 5); + }); + + it('pause between utterances updates min delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400); + ep.onEndOfSpeech(100500, false); + + const expected = 0.5 * 400 + 0.5 * initialMin; + expect(ep.minDelay).toBeCloseTo(expected, 5); + }); + + it('new turn updates max delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100600); + ep.onStartOfSpeech(101500); + ep.onEndOfSpeech(102000, false); + + expect(ep.maxDelay).toBeCloseTo(0.5 * 600 + 0.5 * 1000, 5); + }); + + it('interruption updates min delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + expect((ep as any).agentSpeechStartedAt).toBeDefined(); + ep.onStartOfSpeech(100250, true); + expect(ep.overlapping).toBe(true); + + ep.onEndOfSpeech(100500); + + expect(ep.overlapping).toBe(false); + expect((ep as any).agentSpeechStartedAt).toBeUndefined(); + expect(ep.minDelay).toBeCloseTo(300, 5); + }); + + it('update options', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ minDelay: 500 }); + expect(ep.minDelay).toBe(500); + expect((ep as any).configuredMinDelay).toBe(500); + + const ep2 = new DynamicEndpointing(300, 1000); + ep2.updateOptions({ maxDelay: 2000 }); + expect(ep2.maxDelay).toBe(2000); + expect((ep2 as any).configuredMaxDelay).toBe(2000); + + const ep3 = new DynamicEndpointing(300, 1000); + ep3.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect(ep3.minDelay).toBe(500); + expect(ep3.maxDelay).toBe(2000); + + const ep4 = new DynamicEndpointing(300, 1000); + ep4.updateOptions(); + expect(ep4.minDelay).toBe(300); + expect(ep4.maxDelay).toBe(1000); + }); + + it('max delay clamped to configured max', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(102000); + ep.onStartOfSpeech(105000); + expect(ep.maxDelay).toBe(1000); + }); + + it('max delay clamped to min delay', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100100); + ep.onStartOfSpeech(100500); + expect(ep.maxDelay).toBeGreaterThanOrEqual((ep as any).configuredMinDelay); + }); + + it('non interruption clears agent speech', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + expect((ep as any).agentSpeechStartedAt).toBeDefined(); + + ep.onStartOfSpeech(102000); + ep.onEndOfSpeech(103000, false); + expect((ep as any).agentSpeechStartedAt).toBeUndefined(); + }); + + it('consecutive interruptions only track first', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect(ep.overlapping).toBe(true); + const previous = [ep.minDelay, ep.maxDelay]; + + ep.onStartOfSpeech(100350); + + expect(ep.overlapping).toBe(true); + expect([ep.minDelay, ep.maxDelay]).toEqual(previous); + }); + + it('delayed interruption updates max delay without crashing', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5); + }); + + it('interruption adjusts stale utterance end time', () => { + const ep = new DynamicEndpointing(60, 1000, 1.0); + ep.onEndOfSpeech(99000); + ep.onStartOfSpeech(100000); + + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect((ep as any).utteranceEndedAt).toBeCloseTo(100199, 5); + expect(ep.minDelay).toBeCloseTo(60, 5); + expect(ep.maxDelay).toBeCloseTo(1000, 5); + }); + + it('update options preserves filter alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.updateOptions({ minDelay: 600, maxDelay: 2000 }); + expect((ep as any).utterancePause.alpha).toBeCloseTo(0.5, 5); + expect((ep as any).turnPause.alpha).toBeCloseTo(0.5, 5); + }); + + it('update options updates filter clamp bounds', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect((ep as any).utterancePause.min).toBe(500); + expect((ep as any).turnPause.max).toBe(2000); + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100200); + expect(ep.minDelay).toBeCloseTo(500, 5); + + ep.onEndOfSpeech(101000); + ep.onStartOfAgentSpeech(102800); + ep.onStartOfSpeech(103500); + expect(ep.maxDelay).toBeGreaterThan(1000); + expect(ep.maxDelay).toBeLessThanOrEqual(2000); + }); + + it('should ignore skips filter update', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(101500, true); + + const previousMin = ep.minDelay; + const previousMax = ep.maxDelay; + + ep.onEndOfSpeech(101800, true); + + expect(ep.minDelay).toBe(previousMin); + expect(ep.maxDelay).toBe(previousMax); + expect((ep as any).utteranceStartedAt).toBeUndefined(); + expect((ep as any).utteranceEndedAt).toBeUndefined(); + expect(ep.overlapping).toBe(false); + expect((ep as any).speaking).toBe(false); + }); + + it('should ignore without overlapping still updates', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400, false); + ep.onEndOfSpeech(100600, true); + + const expected = 0.5 * 400 + 0.5 * initialMin; + expect(ep.minDelay).toBeCloseTo(expected, 5); + }); + + it('should ignore grace period overrides', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(100600, true); + ep.onEndOfSpeech(100800, true); + + expect((ep as any).utteranceEndedAt).toBe(100800); + expect((ep as any).speaking).toBe(false); + }); + + it('should ignore outside grace period', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(101000, true); + + const previousMin = ep.minDelay; + const previousMax = ep.maxDelay; + ep.onEndOfSpeech(101500, true); + + expect(ep.minDelay).toBe(previousMin); + expect(ep.maxDelay).toBe(previousMax); + expect((ep as any).utteranceStartedAt).toBeUndefined(); + expect((ep as any).utteranceEndedAt).toBeUndefined(); + }); + + it('on end of agent speech clears state', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + ep.onStartOfSpeech(100100, true); + expect(ep.overlapping).toBe(true); + expect((ep as any).agentSpeechStartedAt).toBe(100000); + + ep.onEndOfAgentSpeech(101000); + + expect((ep as any).agentSpeechEndedAt).toBe(101000); + expect((ep as any).agentSpeechStartedAt).toBe(100000); + expect(ep.overlapping).toBe(false); + }); + + it('overlapping inferred from agent speech', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800, false); + ep.onEndOfSpeech(102000); + + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5); + }); + + it('speaking flag set and cleared', () => { + const ep = new DynamicEndpointing(300, 1000); + expect((ep as any).speaking).toBe(false); + ep.onStartOfSpeech(100000); + expect((ep as any).speaking).toBe(true); + ep.onEndOfSpeech(100500); + expect((ep as any).speaking).toBe(false); + }); + + it.each([ + ['no_agent/no_overlap/no_ignore', 'none', false, false, false, true, false], + ['no_agent/no_overlap/ignore', 'none', false, true, false, true, false], + ['agent_ended/no_overlap/no_ignore', 'ended', false, false, false, false, true], + ['agent_ended/no_overlap/ignore', 'ended', false, true, false, false, true], + ['agent_active/no_overlap/no_ignore', 'active', false, false, false, false, true], + ['agent_active/no_overlap/ignore', 'active', false, true, false, false, true], + ['agent_active/overlap/no_ignore', 'active', true, false, false, true, false], + ['agent_active/overlap/ignore/outside_grace', 'active', true, true, false, false, false], + ['agent_active/overlap/ignore/inside_grace', 'active', true, true, true, true, false], + ])( + 'all overlapping and should_ignore combos: %s', + ( + label, + agentSpeech, + overlapping, + shouldIgnore, + withinGrace, + expectMinChange, + expectMaxChange, + ) => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onStartOfSpeech(99000); + ep.onEndOfSpeech(100000); + + let userStart = 100400; + if (agentSpeech === 'ended') { + ep.onStartOfAgentSpeech(100500); + ep.onEndOfAgentSpeech(101000); + userStart = 101500; + } else if (agentSpeech === 'active') { + if (withinGrace) { + ep.onStartOfAgentSpeech(100150); + userStart = 100350; + } else if (overlapping && shouldIgnore) { + ep.onStartOfAgentSpeech(100200); + userStart = 101500; + } else if (overlapping) { + ep.onStartOfAgentSpeech(100150); + userStart = 100400; + } else { + ep.onStartOfAgentSpeech(100900); + userStart = 101800; + } + } + + ep.onStartOfSpeech(userStart, overlapping); + + const previousMin = ep.minDelay; + const previousMax = ep.maxDelay; + + ep.onEndOfSpeech(userStart + 500, shouldIgnore); + + const minChanged = ep.minDelay !== previousMin; + const maxChanged = ep.maxDelay !== previousMax; + + expect(minChanged, `[${label}] min_delay change`).toBe(expectMinChange); + expect(maxChanged, `[${label}] max_delay change`).toBe(expectMaxChange); + expect((ep as any).speaking, `[${label}] speaking`).toBe(false); + expect(ep.overlapping, `[${label}] overlapping`).toBe(false); + }, + ); + + it('full conversation sequence', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onStartOfSpeech(100000); + ep.onEndOfSpeech(101000); + + ep.onStartOfAgentSpeech(101500); + + ep.onStartOfSpeech(102500, true); + const minBeforeBackchannel = ep.minDelay; + const maxBeforeBackchannel = ep.maxDelay; + ep.onEndOfSpeech(102800, true); + + expect(ep.minDelay).toBe(minBeforeBackchannel); + expect(ep.maxDelay).toBe(maxBeforeBackchannel); + + ep.onEndOfAgentSpeech(103000); + + ep.onStartOfSpeech(103500); + ep.onEndOfSpeech(104000); + + expect((ep as any).speaking).toBe(false); + expect((ep as any).agentSpeechStartedAt).toBeUndefined(); + }); +}); diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index f2603e00f..665bcae76 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -1,6 +1,14 @@ // SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import { log } from '../../log.js'; +import { ExpFilter } from '../../utils.js'; + +const logger = log(); + +// Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 7-7 +const AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD = 250; // 0.25s -> 250ms + /** * Configuration for endpointing, which determines when the user's turn is complete. */ @@ -31,3 +39,274 @@ export const defaultEndpointingOptions = { minDelay: 500, maxDelay: 3000, } as const satisfies EndpointingOptions; + +// Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 10-46 +export class BaseEndpointing { + protected configuredMinDelay: number; + protected configuredMaxDelay: number; + protected isOverlapping = false; + + constructor(minDelay: number, maxDelay: number) { + this.configuredMinDelay = minDelay; + this.configuredMaxDelay = maxDelay; + } + + updateOptions({ minDelay, maxDelay }: { minDelay?: number; maxDelay?: number } = {}): void { + this.configuredMinDelay = minDelay ?? this.configuredMinDelay; + this.configuredMaxDelay = maxDelay ?? this.configuredMaxDelay; + } + + get minDelay(): number { + return this.configuredMinDelay; + } + + get maxDelay(): number { + return this.configuredMaxDelay; + } + + get overlapping(): boolean { + return this.isOverlapping; + } + + onStartOfSpeech(_startedAt: number, overlapping = false): void { + this.isOverlapping = overlapping; + } + + onEndOfSpeech(_endedAt: number, _shouldIgnore = false): void { + this.isOverlapping = false; + } + + onStartOfAgentSpeech(_startedAt: number): void {} + + onEndOfAgentSpeech(_endedAt: number): void {} +} + +// Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 49-302 +export class DynamicEndpointing extends BaseEndpointing { + private utterancePause: ExpFilter; + private turnPause: ExpFilter; + + private utteranceStartedAt?: number; + private utteranceEndedAt?: number; + private agentSpeechStartedAt?: number; + private agentSpeechEndedAt?: number; + private speaking = false; + + constructor(minDelay: number, maxDelay: number, alpha = 0.9) { + super(minDelay, maxDelay); + + this.utterancePause = new ExpFilter(alpha, { + initial: minDelay, + min: minDelay, + max: maxDelay, + }); + this.turnPause = new ExpFilter(alpha, { + initial: maxDelay, + min: minDelay, + max: maxDelay, + }); + } + + get minDelay(): number { + return this.utterancePause.value ?? this.configuredMinDelay; + } + + get maxDelay(): number { + return Math.max(this.turnPause.value ?? this.configuredMaxDelay, this.minDelay); + } + + get betweenUtteranceDelay(): number { + if (this.utteranceEndedAt === undefined || this.utteranceStartedAt === undefined) { + return 0; + } + + return Math.max(0, this.utteranceStartedAt - this.utteranceEndedAt); + } + + get betweenTurnDelay(): number { + if (this.agentSpeechStartedAt === undefined || this.utteranceEndedAt === undefined) { + return 0; + } + + return Math.max(0, this.agentSpeechStartedAt - this.utteranceEndedAt); + } + + get immediateInterruptionDelay(): [number, number] { + if (this.utteranceStartedAt === undefined || this.agentSpeechStartedAt === undefined) { + return [0, 0]; + } + + return [this.betweenTurnDelay, Math.abs(this.betweenUtteranceDelay - this.betweenTurnDelay)]; + } + + onStartOfAgentSpeech(startedAt: number): void { + this.agentSpeechStartedAt = startedAt; + this.agentSpeechEndedAt = undefined; + this.isOverlapping = false; + } + + onEndOfAgentSpeech(endedAt: number): void { + // Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 144-153 + if ( + this.agentSpeechStartedAt !== undefined && + (this.agentSpeechEndedAt === undefined || this.agentSpeechEndedAt < this.agentSpeechStartedAt) + ) { + this.agentSpeechEndedAt = endedAt; + } + this.isOverlapping = false; + } + + // Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 155-177 + onStartOfSpeech(startedAt: number, overlapping = false): void { + if (this.isOverlapping) { + return; + } + + if ( + this.utteranceStartedAt !== undefined && + this.utteranceEndedAt !== undefined && + this.agentSpeechStartedAt !== undefined && + this.utteranceEndedAt < this.utteranceStartedAt && + overlapping + ) { + this.utteranceEndedAt = this.agentSpeechStartedAt - 1; + logger.trace({ utteranceEndedAt: this.utteranceEndedAt }, 'utterance ended at adjusted'); + } + + this.utteranceStartedAt = startedAt; + this.isOverlapping = overlapping; + this.speaking = true; + } + + // Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 179-286 + onEndOfSpeech(endedAt: number, shouldIgnore = false): void { + if (shouldIgnore && this.isOverlapping) { + if ( + this.utteranceStartedAt !== undefined && + this.agentSpeechStartedAt !== undefined && + Math.abs(this.utteranceStartedAt - this.agentSpeechStartedAt) < + AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD + ) { + logger.trace( + { + delay: Math.abs(this.utteranceStartedAt - this.agentSpeechStartedAt), + gracePeriod: AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD, + }, + 'ignoring shouldIgnore=true within grace period', + ); + } else { + this.isOverlapping = false; + this.speaking = false; + this.utteranceStartedAt = undefined; + this.utteranceEndedAt = undefined; + return; + } + } + + if ( + this.isOverlapping || + (this.agentSpeechStartedAt !== undefined && this.agentSpeechEndedAt === undefined) + ) { + const [turnDelay, interruptionDelay] = this.immediateInterruptionDelay; + const utterancePause = this.betweenUtteranceDelay; + + if ( + interruptionDelay > 0 && + interruptionDelay <= this.minDelay && + turnDelay > 0 && + turnDelay <= this.maxDelay && + utterancePause > 0 + ) { + const previousValue = this.minDelay; + this.utterancePause.apply(1, utterancePause); + logger.debug( + { + reason: 'immediate interruption', + pause: utterancePause, + interruptionDelay, + turnDelay, + maxDelay: this.maxDelay, + minDelay: this.minDelay, + }, + `min endpointing delay updated: ${previousValue} -> ${this.minDelay}`, + ); + } else if (this.betweenTurnDelay > 0) { + const previousValue = this.maxDelay; + this.turnPause.apply(1, this.betweenTurnDelay); + logger.debug( + { + reason: 'new turn (interruption)', + pause: this.betweenTurnDelay, + maxDelay: this.maxDelay, + minDelay: this.minDelay, + betweenUtteranceDelay: this.betweenUtteranceDelay, + betweenTurnDelay: this.betweenTurnDelay, + }, + `max endpointing delay updated: ${previousValue} -> ${this.maxDelay}`, + ); + } + } else if (this.betweenTurnDelay > 0) { + const previousValue = this.maxDelay; + this.turnPause.apply(1, this.betweenTurnDelay); + logger.debug( + { + reason: 'new turn', + pause: this.betweenTurnDelay, + maxDelay: this.maxDelay, + minDelay: this.minDelay, + }, + `max endpointing delay updated due to pause: ${previousValue} -> ${this.maxDelay}`, + ); + } else if ( + this.betweenUtteranceDelay > 0 && + this.agentSpeechEndedAt === undefined && + this.agentSpeechStartedAt === undefined + ) { + const previousValue = this.minDelay; + this.utterancePause.apply(1, this.betweenUtteranceDelay); + logger.debug( + { + reason: 'pause between utterances', + pause: this.betweenUtteranceDelay, + maxDelay: this.maxDelay, + minDelay: this.minDelay, + }, + `min endpointing delay updated: ${previousValue} -> ${this.minDelay}`, + ); + } + + this.utteranceEndedAt = endedAt; + this.agentSpeechStartedAt = undefined; + this.agentSpeechEndedAt = undefined; + this.speaking = false; + this.isOverlapping = false; + } + + // Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 288-302 + override updateOptions({ + minDelay, + maxDelay, + }: { minDelay?: number; maxDelay?: number } = {}): void { + if (minDelay !== undefined) { + this.configuredMinDelay = minDelay; + this.utterancePause.reset({ initial: minDelay, min: minDelay }); + this.turnPause.reset({ min: minDelay }); + } + + if (maxDelay !== undefined) { + this.configuredMaxDelay = maxDelay; + this.turnPause.reset({ initial: maxDelay, max: maxDelay }); + this.utterancePause.reset({ max: maxDelay }); + } + } +} + +// Ref: source livekit-agents/livekit/agents/voice/endpointing.py - 305-316 +export function createEndpointing(options: EndpointingOptions): BaseEndpointing { + switch (options.mode) { + case 'dynamic': + return new DynamicEndpointing(options.minDelay, options.maxDelay); + default: + return new BaseEndpointing(options.minDelay, options.maxDelay); + } +}