fix(tts): preserve legacy tool voice hints · openclaw/openclaw@60f9358
steipete
·
2026-04-26
·
via Recent Commits to openclaw:main
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -25,6 +25,9 @@ Docs: https://docs.openclaw.ai
|
25 | 25 | |
26 | 26 | ### Fixes |
27 | 27 | |
| 28 | +- Agents/TTS: preserve legacy `[[audio_as_voice]]` hints on trusted tool-result |
| 29 | + `MEDIA:` payloads so generated audio still delivers as a voice note. (#46535) |
| 30 | + Thanks @azade-c. |
28 | 31 | - Telegram/STT: frame inbound voice-note transcripts as machine-generated, |
29 | 32 | untrusted text in agent context while preserving raw transcript mention |
30 | 33 | detection. Closes #33360. Thanks @smartchainark. |
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -14,6 +14,7 @@ Assistant output can carry a small set of delivery/render directives:
|
14 | 14 | - `[embed ...]` for Control UI rich rendering |
15 | 15 | |
16 | 16 | These directives are separate. `MEDIA:` and reply/voice tags remain delivery metadata; `[embed ...]` is the web-only rich render path. |
| 17 | +Trusted tool-result media uses the same `MEDIA:` / `[[audio_as_voice]]` parser before delivery, so legacy tool outputs can still mark an audio attachment as a voice note. |
17 | 18 | |
18 | 19 | When block streaming is enabled, `MEDIA:` remains single-delivery metadata for a |
19 | 20 | turn. If the same media URL is sent in a streamed block and repeated in the final |
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -165,6 +165,34 @@ describe("handleToolExecutionEnd media emission", () => {
|
165 | 165 | expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/screenshot.png"]); |
166 | 166 | }); |
167 | 167 | |
| 168 | +it("preserves legacy audio_as_voice when queuing trusted MEDIA tool output", async () => { |
| 169 | +const onToolResult = vi.fn(); |
| 170 | +const ctx = createMockContext({ |
| 171 | +shouldEmitToolOutput: false, |
| 172 | + onToolResult, |
| 173 | +builtinToolNames: new Set(["tts"]), |
| 174 | +}); |
| 175 | + |
| 176 | +await handleToolExecutionEnd(ctx, { |
| 177 | +type: "tool_execution_end", |
| 178 | +toolName: "tts", |
| 179 | +toolCallId: "tc-1", |
| 180 | +isError: false, |
| 181 | +result: { |
| 182 | +content: [ |
| 183 | +{ |
| 184 | +type: "text", |
| 185 | +text: "Generated audio reply.\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus", |
| 186 | +}, |
| 187 | +], |
| 188 | +}, |
| 189 | +}); |
| 190 | + |
| 191 | +expect(onToolResult).not.toHaveBeenCalled(); |
| 192 | +expect(ctx.state.pendingToolMediaUrls).toEqual(["/tmp/reply.opus"]); |
| 193 | +expect(ctx.state.pendingToolAudioAsVoice).toBe(true); |
| 194 | +}); |
| 195 | + |
168 | 196 | it("does NOT emit local media for untrusted tools", async () => { |
169 | 197 | const onToolResult = vi.fn(); |
170 | 198 | const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult }); |
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -51,6 +51,33 @@ describe("extractToolResultMediaPaths", () => {
|
51 | 51 | }); |
52 | 52 | }); |
53 | 53 | |
| 54 | +it("extracts audioAsVoice from legacy MEDIA text", () => { |
| 55 | +expect( |
| 56 | +extractToolResultMediaArtifact({ |
| 57 | +content: [ |
| 58 | +{ type: "text", text: "Generated audio\n[[audio_as_voice]]\nMEDIA:/tmp/reply.opus" }, |
| 59 | +], |
| 60 | +}), |
| 61 | +).toEqual({ |
| 62 | +mediaUrls: ["/tmp/reply.opus"], |
| 63 | +audioAsVoice: true, |
| 64 | +}); |
| 65 | +}); |
| 66 | + |
| 67 | +it("keeps legacy audioAsVoice when the tag and MEDIA path are in separate text blocks", () => { |
| 68 | +expect( |
| 69 | +extractToolResultMediaArtifact({ |
| 70 | +content: [ |
| 71 | +{ type: "text", text: "[[audio_as_voice]]" }, |
| 72 | +{ type: "text", text: "MEDIA:/tmp/reply.opus" }, |
| 73 | +], |
| 74 | +}), |
| 75 | +).toEqual({ |
| 76 | +mediaUrls: ["/tmp/reply.opus"], |
| 77 | +audioAsVoice: true, |
| 78 | +}); |
| 79 | +}); |
| 80 | + |
54 | 81 | it("extracts structured media trust markers", () => { |
55 | 82 | expect( |
56 | 83 | extractToolResultMediaArtifact({ |
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -307,6 +307,7 @@ export function extractToolResultMediaArtifact(
|
307 | 307 | // parser so directive matching and validation stay in sync with outbound |
308 | 308 | // reply parsing. |
309 | 309 | const paths: string[] = []; |
| 310 | +let audioAsVoice = false; |
310 | 311 | let hasImageContent = false; |
311 | 312 | for (const item of content) { |
312 | 313 | if (!item || typeof item !== "object") { |
@@ -319,14 +320,20 @@ export function extractToolResultMediaArtifact(
|
319 | 320 | } |
320 | 321 | if (entry.type === "text" && typeof entry.text === "string") { |
321 | 322 | const parsed = splitMediaFromOutput(entry.text); |
| 323 | +if (parsed.audioAsVoice) { |
| 324 | +audioAsVoice = true; |
| 325 | +} |
322 | 326 | if (parsed.mediaUrls?.length) { |
323 | 327 | paths.push(...parsed.mediaUrls); |
324 | 328 | } |
325 | 329 | } |
326 | 330 | } |
327 | 331 | |
328 | 332 | if (paths.length > 0) { |
329 | | -return { mediaUrls: paths }; |
| 333 | +return { |
| 334 | +mediaUrls: paths, |
| 335 | + ...(audioAsVoice ? { audioAsVoice: true } : {}), |
| 336 | +}; |
330 | 337 | } |
331 | 338 | |
332 | 339 | // Fall back to legacy details.path when image content exists but no |
|
此内容由惯性聚合(RSS阅读器)自动聚合整理,仅供阅读参考。 原文来自 — 版权归原作者所有。