


























@@ -57,6 +57,12 @@ function lastConnectParams(): MockGoogleLiveConnectParams {
5757return params;
5858}
/**
 * Test helper: returns the `audio` payload from a recorded
 * `session.sendRealtimeInput` mock call.
 *
 * @param index - Position in `mock.calls` to inspect; defaults to 0.
 * @returns The `audio` property of that call's first argument, typed as an
 *   object with optional `data` and `mimeType` fields of unknown type.
 */
595960+function sentAudio(index = 0): { data?: unknown; mimeType?: unknown } {
// Optional chaining yields undefined when the call or its first argument is absent.
61+const audio = session.sendRealtimeInput.mock.calls[index]?.[0]?.audio;
// Asserts that the payload exists before it is cast and returned to the caller.
62+expect(audio).toBeDefined();
63+return audio as { data?: unknown; mimeType?: unknown };
64+}
65+6066describe("buildGoogleRealtimeVoiceProvider", () => {
6167beforeEach(() => {
6268envSnapshot = Object.fromEntries(ENV_KEYS.map((key) => [key, process.env[key]]));
@@ -204,61 +210,68 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
204210await bridge.connect();
205211206212expect(connectMock).toHaveBeenCalledTimes(1);
207-expect(lastConnectParams()).toMatchObject({
208-model: "gemini-live-2.5-flash-preview",
209-config: {
210-responseModalities: ["AUDIO"],
211-temperature: 0.3,
212-systemInstruction: "Speak briefly.",
213-speechConfig: {
214-voiceConfig: {
215-prebuiltVoiceConfig: {
216-voiceName: "Kore",
217-},
218-},
219-},
220-outputAudioTranscription: {},
221-realtimeInputConfig: {
222-activityHandling: "NO_INTERRUPTION",
223-automaticActivityDetection: {
224-startOfSpeechSensitivity: "START_SENSITIVITY_LOW",
225-endOfSpeechSensitivity: "END_SENSITIVITY_LOW",
226-},
227-turnCoverage: "TURN_INCLUDES_ONLY_ACTIVITY",
228-},
229-sessionResumption: {},
230-contextWindowCompression: { slidingWindow: {} },
231-tools: [
232-{
233-functionDeclarations: [
234-{
235-name: "lookup",
236-description: "Look something up",
237-parametersJsonSchema: {
238-type: "object",
239-properties: {
240-query: { type: "string" },
241-},
242-required: ["query"],
243-},
244-},
245-{
246-name: "openclaw_agent_consult",
247-description: "Ask OpenClaw",
248-parametersJsonSchema: {
249-type: "object",
250-properties: {
251-question: { type: "string" },
252-},
253-required: ["question"],
254-},
255-behavior: "NON_BLOCKING",
256-},
257-],
258-},
259-],
213+const params = lastConnectParams();
214+expect(params.model).toBe("gemini-live-2.5-flash-preview");
215+const config = params.config as {
216+contextWindowCompression?: unknown;
217+outputAudioTranscription?: unknown;
218+realtimeInputConfig?: {
219+activityHandling?: string;
220+automaticActivityDetection?: {
221+endOfSpeechSensitivity?: string;
222+startOfSpeechSensitivity?: string;
223+};
224+turnCoverage?: string;
225+};
226+responseModalities?: string[];
227+sessionResumption?: unknown;
228+speechConfig?: { voiceConfig?: { prebuiltVoiceConfig?: { voiceName?: string } } };
229+systemInstruction?: string;
230+temperature?: number;
231+tools?: Array<{
232+functionDeclarations?: Array<{
233+behavior?: string;
234+description?: string;
235+name?: string;
236+parametersJsonSchema?: unknown;
237+}>;
238+}>;
239+};
240+expect(config.responseModalities).toEqual(["AUDIO"]);
241+expect(config.temperature).toBe(0.3);
242+expect(config.systemInstruction).toBe("Speak briefly.");
243+expect(config.speechConfig?.voiceConfig?.prebuiltVoiceConfig?.voiceName).toBe("Kore");
244+expect(config.outputAudioTranscription).toEqual({});
245+expect(config.realtimeInputConfig?.activityHandling).toBe("NO_INTERRUPTION");
246+expect(config.realtimeInputConfig?.automaticActivityDetection?.startOfSpeechSensitivity).toBe(
247+"START_SENSITIVITY_LOW",
248+);
249+expect(config.realtimeInputConfig?.automaticActivityDetection?.endOfSpeechSensitivity).toBe(
250+"END_SENSITIVITY_LOW",
251+);
252+expect(config.realtimeInputConfig?.turnCoverage).toBe("TURN_INCLUDES_ONLY_ACTIVITY");
253+expect(config.sessionResumption).toEqual({});
254+expect(config.contextWindowCompression).toEqual({ slidingWindow: {} });
255+const declarations = config.tools?.[0]?.functionDeclarations ?? [];
256+expect(declarations[0]?.name).toBe("lookup");
257+expect(declarations[0]?.description).toBe("Look something up");
258+expect(declarations[0]?.parametersJsonSchema).toEqual({
259+type: "object",
260+properties: {
261+query: { type: "string" },
260262},
263+required: ["query"],
264+});
265+expect(declarations[1]?.name).toBe("openclaw_agent_consult");
266+expect(declarations[1]?.description).toBe("Ask OpenClaw");
267+expect(declarations[1]?.parametersJsonSchema).toEqual({
268+type: "object",
269+properties: {
270+question: { type: "string" },
271+},
272+required: ["question"],
261273});
274+expect(declarations[1]?.behavior).toBe("NON_BLOCKING");
262275});
263276264277it("omits zero temperature for native audio responses", async () => {
@@ -305,58 +318,67 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
305318});
306319307320expect(createTokenMock).toHaveBeenCalledTimes(1);
308-expect(createTokenMock.mock.calls[0]?.[0]).toMatchObject({
309-config: {
310-uses: 1,
311-liveConnectConstraints: {
312-model: "gemini-live-2.5-flash-preview",
313-config: {
314-responseModalities: ["AUDIO"],
315-temperature: 0.4,
316-systemInstruction: "Speak briefly.",
317-speechConfig: {
318-voiceConfig: {
319-prebuiltVoiceConfig: {
320-voiceName: "Puck",
321-},
322-},
323-},
324-tools: [
325-{
326-functionDeclarations: [
327-{
328-name: "openclaw_agent_consult",
329-behavior: "NON_BLOCKING",
330-},
331-],
332-},
333-],
334-},
335-},
336-},
337-});
338-expect(session).toMatchObject({
339-provider: "google",
340-transport: "provider-websocket",
341-protocol: "google-live-bidi",
342-clientSecret: "auth_tokens/browser-session",
343-websocketUrl:
344-"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained",
321+const tokenConfig = createTokenMock.mock.calls[0]?.[0] as {
322+config?: {
323+liveConnectConstraints?: {
324+config?: {
325+responseModalities?: string[];
326+speechConfig?: { voiceConfig?: { prebuiltVoiceConfig?: { voiceName?: string } } };
327+systemInstruction?: string;
328+temperature?: number;
329+tools?: Array<{ functionDeclarations?: Array<{ behavior?: string; name?: string }> }>;
330+};
331+model?: string;
332+};
333+uses?: number;
334+};
335+};
336+const liveConstraints = tokenConfig.config?.liveConnectConstraints;
337+expect(tokenConfig.config?.uses).toBe(1);
338+expect(liveConstraints?.model).toBe("gemini-live-2.5-flash-preview");
339+expect(liveConstraints?.config?.responseModalities).toEqual(["AUDIO"]);
340+expect(liveConstraints?.config?.temperature).toBe(0.4);
341+expect(liveConstraints?.config?.systemInstruction).toBe("Speak briefly.");
342+expect(liveConstraints?.config?.speechConfig?.voiceConfig?.prebuiltVoiceConfig?.voiceName).toBe(
343+"Puck",
344+);
345+expect(liveConstraints?.config?.tools?.[0]?.functionDeclarations?.[0]?.name).toBe(
346+"openclaw_agent_consult",
347+);
348+expect(liveConstraints?.config?.tools?.[0]?.functionDeclarations?.[0]?.behavior).toBe(
349+"NON_BLOCKING",
350+);
351+expect(session?.provider).toBe("google");
352+expect(session?.transport).toBe("provider-websocket");
353+const websocketSession = session as {
345354audio: {
346-inputEncoding: "pcm16",
347-inputSampleRateHz: 16000,
348-outputEncoding: "pcm16",
349-outputSampleRateHz: 24000,
350-},
355+inputEncoding: string;
356+inputSampleRateHz: number;
357+outputEncoding: string;
358+outputSampleRateHz: number;
359+};
360+clientSecret: string;
351361initialMessage: {
352-setup: {
353-model: "models/gemini-live-2.5-flash-preview",
354-generationConfig: {
355-responseModalities: ["AUDIO"],
356-},
357-},
358-},
359-});
362+setup: { generationConfig: { responseModalities: string[] }; model: string };
363+};
364+protocol: string;
365+websocketUrl: string;
366+};
367+expect(websocketSession.protocol).toBe("google-live-bidi");
368+expect(websocketSession.clientSecret).toBe("auth_tokens/browser-session");
369+expect(websocketSession.websocketUrl).toBe(
370+"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained",
371+);
372+expect(websocketSession.audio.inputEncoding).toBe("pcm16");
373+expect(websocketSession.audio.inputSampleRateHz).toBe(16000);
374+expect(websocketSession.audio.outputEncoding).toBe("pcm16");
375+expect(websocketSession.audio.outputSampleRateHz).toBe(24000);
376+expect(websocketSession.initialMessage.setup.model).toBe(
377+"models/gemini-live-2.5-flash-preview",
378+);
379+expect(websocketSession.initialMessage.setup.generationConfig.responseModalities).toEqual([
380+"AUDIO",
381+]);
360382});
361383362384it("can opt out of Google Live session resumption and context compression", async () => {
@@ -421,11 +443,8 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
421443});
422444423445expect(onClose).not.toHaveBeenCalled();
424-expect(onError).toHaveBeenCalledWith(
425-expect.objectContaining({
426-message: expect.stringContaining("reconnecting 1/3"),
427-}),
428-);
446+const error = onError.mock.calls[0]?.[0] as { message?: string };
447+expect(error.message).toContain("reconnecting 1/3");
429448430449await vi.advanceTimersByTimeAsync(250);
431450@@ -457,10 +476,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
457476458477expect(onReady).toHaveBeenCalledTimes(1);
459478expect(session.sendRealtimeInput).toHaveBeenCalledTimes(1);
460-expect(session.sendRealtimeInput.mock.calls[0]?.[0].audio).toMatchObject({
461-data: expect.any(String),
462-mimeType: "audio/pcm;rate=16000",
463-});
479+const audio = sentAudio();
480+expect(typeof audio.data).toBe("string");
481+expect(audio.mimeType).toBe("audio/pcm;rate=16000");
464482});
465483466484it("marks the Google audio stream complete after sustained telephony silence", async () => {
@@ -509,13 +527,10 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
509527510528bridge.sendAudio(Buffer.from([0xff, 0x00]));
511529512-expect(session.sendRealtimeInput).toHaveBeenCalledWith({
513-audio: {
514-data: expect.any(String),
515-mimeType: "audio/pcm;rate=16000",
516-},
517-});
518-const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64");
530+const audio = sentAudio();
531+expect(typeof audio.data).toBe("string");
532+expect(audio.mimeType).toBe("audio/pcm;rate=16000");
533+const sent = Buffer.from(audio.data as string, "base64");
519534expect(Array.from({ length: sent.length / 2 }, (_, i) => sent.readInt16LE(i * 2))).toEqual([
5205350, -16062, -32124, -32124,
521536]);
@@ -536,13 +551,10 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
536551537552bridge.sendAudio(Buffer.alloc(480));
538553539-expect(session.sendRealtimeInput).toHaveBeenCalledWith({
540-audio: {
541-data: expect.any(String),
542-mimeType: "audio/pcm;rate=16000",
543-},
544-});
545-const sent = Buffer.from(session.sendRealtimeInput.mock.calls[0]?.[0].audio.data, "base64");
554+const audio = sentAudio();
555+expect(typeof audio.data).toBe("string");
556+expect(audio.mimeType).toBe("audio/pcm;rate=16000");
557+const sent = Buffer.from(audio.data as string, "base64");
546558expect(sent).toHaveLength(320);
547559});
548560@@ -559,13 +571,10 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
559571560572await bridge.connect();
561573562-expect(lastConnectParams().config).toMatchObject({
563-realtimeInputConfig: {
564-automaticActivityDetection: {
565-disabled: true,
566-},
567-},
568-});
574+const config = lastConnectParams().config as {
575+realtimeInputConfig?: { automaticActivityDetection?: { disabled?: boolean } };
576+};
577+expect(config.realtimeInputConfig?.automaticActivityDetection?.disabled).toBe(true);
569578});
570579571580it("sends text prompts as ordered client turns", async () => {
@@ -777,11 +786,9 @@ describe("buildGoogleRealtimeVoiceProvider", () => {
777786bridge.submitToolResult("missing-call", { result: "ok" });
778787779788expect(session.sendToolResponse).not.toHaveBeenCalled();
780-expect(onError).toHaveBeenCalledWith(
781-expect.objectContaining({
782-message:
783-"Google Live function response is missing a matching function call for missing-call",
784-}),
789+const error = onError.mock.calls[0]?.[0] as { message?: string };
790+expect(error.message).toBe(
791+"Google Live function response is missing a matching function call for missing-call",
785792);
786793});
787794
此内容由惯性聚合(RSS阅读器)自动聚合整理,仅供阅读参考。原文版权归原作者所有。