fix(telegram): guard UTF-16 surrogate pairs in outbound chunkers (#93… · openclaw/openclaw@df87b40

Recent Commits to openclaw:main

Nas01010101 · 2026-06-17 · via Recent Commits to openclaw:main

File tree

packages/markdown-core/src

Original file line number	Diff line number	Diff line change
`@@ -424,4 +424,47 @@ describe("markdownToTelegramHtml", () => {`
`424`	`424`	`it("fails loudly when tag overhead leaves no room for text", () => {`
`425`	`425`	`expect(() => splitTelegramHtmlChunks("<b><i><u>x</u></i></b>", 10)).toThrow(/tag overhead/i);`
`426`	`426`	`});`
	`427`	`+`
	`428`	`+it("does not split an astral char across the chunk boundary", () => {`
	`429`	`+// Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10.`
	`430`	+const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
	`431`	`+const chunks = splitTelegramHtmlChunks(input, 10);`
	`432`	`+expect(chunks.length).toBeGreaterThan(1);`
	`433`	`+expect(chunks.join("")).toBe(input);`
	`434`	`+for (const chunk of chunks) {`
	`435`	`+expect(containsLoneSurrogate(chunk)).toBe(false);`
	`436`	`+}`
	`437`	`+});`
	`438`	`+`
	`439`	`+it("keeps an astral char whole when a positive limit starts on its pair", () => {`
	`440`	`+expect(splitTelegramHtmlChunks("A😀B", 1)).toEqual(["A", "😀", "B"]);`
	`441`	`+});`
	`442`	`+`
	`443`	`+it("keeps astral chars whole in rendered Markdown chunks", () => {`
	`444`	`+const chunks = markdownToTelegramChunks("A😀B", 1);`
	`445`	`+`
	`446`	`+expect(chunks.map((chunk) => chunk.text)).toEqual(["A", "😀", "B"]);`
	`447`	`+for (const chunk of chunks) {`
	`448`	`+expect(containsLoneSurrogate(chunk.html)).toBe(false);`
	`449`	`+expect(containsLoneSurrogate(chunk.text)).toBe(false);`
	`450`	`+}`
	`451`	`+});`
`427`	`452`	`});`
	`453`	`+`
	`454`	`+function containsLoneSurrogate(text: string): boolean {`
	`455`	`+for (let index = 0; index < text.length; index += 1) {`
	`456`	`+const code = text.charCodeAt(index);`
	`457`	`+const isHigh = code >= 0xd800 && code <= 0xdbff;`
	`458`	`+const isLow = code >= 0xdc00 && code <= 0xdfff;`
	`459`	`+if (isHigh) {`
	`460`	`+const next = text.charCodeAt(index + 1);`
	`461`	`+if (!(next >= 0xdc00 && next <= 0xdfff)) {`
	`462`	`+return true;`
	`463`	`+}`
	`464`	`+index += 1;`
	`465`	`+} else if (isLow) {`
	`466`	`+return true;`
	`467`	`+}`
	`468`	`+}`
	`469`	`+return false;`
	`470`	`+}`

Original file line number	Diff line number	Diff line change
`@@ -1070,11 +1070,30 @@ function findTelegramHtmlEntityEnd(text: string, start: number): number {`
`1070`	`1070`	`return text[index] === ";" ? index : -1;`
`1071`	`1071`	`}`
`1072`	`1072`
	`1073`	`+// Never return a split index that lands between a UTF-16 surrogate pair, or`
	`1074`	`+// both chunks would carry a lone surrogate that re-encodes to U+FFFD. If the`
	`1075`	`+// pair starts the segment, keep it whole so chunking still advances.`
	`1076`	`+function clampToSurrogateBoundary(text: string, index: number): number {`
	`1077`	`+const high = text.charCodeAt(index - 1);`
	`1078`	`+const low = text.charCodeAt(index);`
	`1079`	`+const splitsPair =`
	`1080`	`+index > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff;`
	`1081`	`+if (!splitsPair) {`
	`1082`	`+return index;`
	`1083`	`+}`
	`1084`	`+return index > 1 ? index - 1 : index + 1;`
	`1085`	`+}`
	`1086`	`+`
/**
 * Picks the index at which to cut `text` so the leading piece respects
 * `maxLength`.
 *
 * Returns `text.length` when the whole text already fits. Otherwise the limit
 * is floored and raised to at least 1, a candidate index is obtained from
 * `findTelegramHtmlEntitySafeSplitIndex`, and that candidate is passed through
 * `clampToSurrogateBoundary` before being returned.
 *
 * @param text Text to split.
 * @param maxLength Maximum length of the leading piece, in UTF-16 code units.
 * @returns The split index.
 */
function findTelegramHtmlSafeSplitIndex(text: string, maxLength: number): number {
  const fitsWhole = text.length <= maxLength;
  if (fitsWhole) {
    return text.length;
  }
  const unitBudget = Math.max(1, Math.floor(maxLength));
  const entitySafeIndex = findTelegramHtmlEntitySafeSplitIndex(text, unitBudget);
  return clampToSurrogateBoundary(text, entitySafeIndex);
}
	`1095`	`+`
	`1096`	`+function findTelegramHtmlEntitySafeSplitIndex(text: string, normalizedMaxLength: number): number {`
`1078`	`1097`	`const lastAmpersand = text.lastIndexOf("&", normalizedMaxLength - 1);`
`1079`	`1098`	`if (lastAmpersand === -1) {`
`1080`	`1099`	`return normalizedMaxLength;`

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,57 @@`
	`1`	`+// Telegram tests cover plain-text chunk-splitting behavior.`
	`2`	`+import { describe, expect, it } from "vitest";`
	`3`	`+import { splitTelegramPlainTextChunksForTests } from "./send.js";`
	`4`	`+`
	`5`	`+function containsLoneSurrogate(text: string): boolean {`
	`6`	`+for (let index = 0; index < text.length; index += 1) {`
	`7`	`+const code = text.charCodeAt(index);`
	`8`	`+const isHigh = code >= 0xd800 && code <= 0xdbff;`
	`9`	`+const isLow = code >= 0xdc00 && code <= 0xdfff;`
	`10`	`+if (isHigh) {`
	`11`	`+const next = text.charCodeAt(index + 1);`
	`12`	`+if (!(next >= 0xdc00 && next <= 0xdfff)) {`
	`13`	`+return true;`
	`14`	`+}`
	`15`	`+index += 1;`
	`16`	`+} else if (isLow) {`
	`17`	`+return true;`
	`18`	`+}`
	`19`	`+}`
	`20`	`+return false;`
	`21`	`+}`
	`22`	`+`
	`23`	`+describe("splitTelegramPlainTextChunks", () => {`
	`24`	`+it("does not split an astral char across the chunk boundary", () => {`
	`25`	`+// Emoji surrogate pair straddles index 10 (limit): high at 9, low at 10.`
	`26`	+const input = `${"A".repeat(9)}😀${"B".repeat(20)}`;
	`27`	`+const chunks = splitTelegramPlainTextChunksForTests(input, 10);`
	`28`	`+expect(chunks.length).toBeGreaterThan(1);`
	`29`	`+expect(chunks.join("")).toBe(input);`
	`30`	`+for (const chunk of chunks) {`
	`31`	`+expect(containsLoneSurrogate(chunk)).toBe(false);`
	`32`	`+}`
	`33`	`+});`
	`34`	`+`
	`35`	`+it("does not hang when limit=1 and text starts with an astral char", () => {`
	`36`	`+// Regression: with limit=1 the clamp would return start (no advance),`
	`37`	`+// causing the while-loop to spin forever. The surrogate pair must be`
	`38`	`+// emitted as a unit (2 code units) so the loop always advances.`
	`39`	`+const input = "😀X";`
	`40`	`+const chunks = splitTelegramPlainTextChunksForTests(input, 1);`
	`41`	`+expect(chunks.join("")).toBe(input);`
	`42`	`+for (const chunk of chunks) {`
	`43`	`+expect(containsLoneSurrogate(chunk)).toBe(false);`
	`44`	`+}`
	`45`	`+});`
	`46`	`+`
	`47`	`+it("does not hang when limit=1 and an astral char appears mid-string at a chunk boundary", () => {`
	`48`	`+// 'A' + emoji: with limit=1, second iteration starts at index 1 (high`
	`49`	`+// surrogate) — same stall condition as above, now mid-string.`
	`50`	`+const input = "A😀B";`
	`51`	`+const chunks = splitTelegramPlainTextChunksForTests(input, 1);`
	`52`	`+expect(chunks.join("")).toBe(input);`
	`53`	`+for (const chunk of chunks) {`
	`54`	`+expect(containsLoneSurrogate(chunk)).toBe(false);`
	`55`	`+}`
	`56`	`+});`
	`57`	`+});`

Original file line number	Diff line number	Diff line change
`@@ -179,14 +179,40 @@ function resolveTelegramMessageIdOrThrow(`
`179`	`179`	throw new Error(`Telegram ${context} returned no message_id`);
`180`	`180`	`}`
`181`	`181`
	`182`	`+// Pull a chunk end back off a UTF-16 surrogate pair so neither chunk carries a`
	`183`	`+// lone surrogate that re-encodes to U+FFFD. Mirrors the guard in`
	`184`	+// bot/native-quote.ts `truncateUtf16Safe`; shared by both plain-text splitters.
	`185`	`+//`
	`186`	+// `start` is the beginning of the current chunk — the return value is
	`187`	+// guaranteed to be > start, so callers that loop on `start = end` always
	`188`	+// advance. When clamping would land on `start` (i.e. the surrogate pair begins
	`189`	+// exactly at `start`), we emit both surrogates together (end = start + 2)
	`190`	`+// rather than emitting a lone surrogate or stalling.`
	`191`	`+function surrogateSafeChunkEnd(text: string, end: number, start: number): number {`
	`192`	`+const high = text.charCodeAt(end - 1);`
	`193`	`+const low = text.charCodeAt(end);`
	`194`	`+const splitsPair = end > 0 && high >= 0xd800 && high <= 0xdbff && low >= 0xdc00 && low <= 0xdfff;`
	`195`	`+if (!splitsPair) {`
	`196`	`+return end;`
	`197`	`+}`
	`198`	`+const clamped = end - 1;`
	`199`	`+// Guard: never return an index that would stall the loop. If clamped equals`
	`200`	`+// start the surrogate pair's high unit is the very first char of this chunk;`
	`201`	`+// emit both surrogates together instead of splitting or stalling.`
	`202`	`+return clamped > start ? clamped : start + 2;`
	`203`	`+}`
	`204`	`+`
/**
 * Cuts `text` into consecutive chunks of at most `limit` UTF-16 code units,
 * with each chunk end routed through `surrogateSafeChunkEnd`.
 *
 * @param text Text to split; an empty string yields an empty array.
 * @param limit Requested chunk size; floored and raised to at least 1.
 * @returns The chunks in order.
 */
function splitTelegramPlainTextChunks(text: string, limit: number): string[] {
  if (!text) {
    return [];
  }
  const unitsPerChunk = Math.max(1, Math.floor(limit));
  const pieces: string[] = [];
  // The cursor advances to wherever the adjusted boundary lands, which may be
  // one unit short of (or one unit past) the nominal chunk size.
  for (let cursor = 0; cursor < text.length; ) {
    const boundary = surrogateSafeChunkEnd(text, cursor + unitsPerChunk, cursor);
    pieces.push(text.slice(cursor, boundary));
    cursor = boundary;
  }
  return pieces;
}
`@@ -209,12 +235,19 @@ function splitTelegramPlainTextFallback(text: string, chunkCount: number, limit:`
`209`	`235`	`remainingChunks === 1`
`210`	`236`	`? remainingChars`
`211`	`237`	`: Math.min(normalizedLimit, Math.ceil(remainingChars / remainingChunks));`
`212`		`-chunks.push(text.slice(offset, offset + nextChunkLength));`
`213`		`-offset += nextChunkLength;`
	`238`	`+const end = surrogateSafeChunkEnd(text, offset + nextChunkLength, offset);`
	`239`	`+chunks.push(text.slice(offset, end));`
	`240`	`+offset = end;`
`214`	`241`	`}`
`215`	`242`	`return chunks;`
`216`	`243`	`}`
`217`	`244`
	`245`	`+// Test-only handle: the plain-text splitter is internal, but its surrogate-safe`
	`246`	`+// chunk boundary needs direct behavior coverage.`
	`247`	`+export function splitTelegramPlainTextChunksForTests(text: string, limit: number): string[] {`
	`248`	`+return splitTelegramPlainTextChunks(text, limit);`
	`249`	`+}`
	`250`	`+`
`218`	`251`	`function logTelegramOutboundSendOk(params: TelegramOutboundSuccessLogParams): void {`
`219`	`252`	`const parts = [`
`220`	`253`	`"telegram outbound send ok",`

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,17 @@ describe("telegramPlugin outbound", () => {`
`43`	`43`	`expect(telegramOutbound.chunker?.(text, 4000)).toEqual([text]);`
`44`	`44`	`});`
`45`	`45`
	`46`	`+it("keeps astral characters whole at positive configured chunk limits", () => {`
	`47`	`+clearTelegramRuntime();`
	`48`	`+`
	`49`	`+expect(telegramOutbound.chunker?.("A😀B", 1)).toEqual(["A", "😀", "B"]);`
	`50`	`+expect(telegramOutbound.chunker?.("A😀B", 1, { formatting: { parseMode: "HTML" } })).toEqual([`
	`51`	`+"A",`
	`52`	`+"😀",`
	`53`	`+"B",`
	`54`	`+]);`
	`55`	`+});`
	`56`	`+`
`46`	`57`	`it("preserves markdown tables for the configured delivery renderer", () => {`
`47`	`58`	`clearTelegramRuntime();`
`48`	`59`	`const text = ["\| Name \| Value \|", "\|------\|-------\|", "\| A \| 1 \|"].join("\n");`

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,23 @@ function scanParenAwareBreakpoints(text: string): { lastNewline: number; lastWhi`
`42`	`42`	`return { lastNewline, lastWhitespace };`
`43`	`43`	`}`
`44`	`44`
	`45`	`+/**`
	`46`	`+ * Keeps UTF-16 chunk boundaries from separating a supplementary-plane character.`
	`47`	`+ * A one-unit positive limit still needs to emit an entire surrogate pair.`
	`48`	`+ */`
	`49`	`+export function avoidTrailingHighSurrogateBreak(text: string, start: number, end: number): number {`
	`50`	`+if (`
	`51`	`+end >= text.length \|\|`
	`52`	`+text.charCodeAt(end - 1) < 0xd800 \|\|`
	`53`	`+text.charCodeAt(end - 1) > 0xdbff \|\|`
	`54`	`+text.charCodeAt(end) < 0xdc00 \|\|`
	`55`	`+text.charCodeAt(end) > 0xdfff`
	`56`	`+) {`
	`57`	`+return end;`
	`58`	`+}`
	`59`	`+return end - 1 > start ? end - 1 : end + 1;`
	`60`	`+}`
	`61`	`+`
`45`	`62`	`/**`
`46`	`63`	`* Splits plain text into size-bounded chunks at readable boundaries.`
`47`	`64`	`*`
`@@ -66,7 +83,11 @@ export function chunkText(text: string, limit: number): string[] {`
`66`	`83`	`// Prefer block boundaries, then spaces, then a hard size cut when no`
`67`	`84`	`// readable breakpoint exists inside this window.`
`68`	`85`	`const breakOffset = lastNewline > 0 ? lastNewline : lastWhitespace;`
`69`		`-const end = breakOffset > 0 ? cursor + breakOffset : windowEnd;`
	`86`	`+const end = avoidTrailingHighSurrogateBreak(`
	`87`	`+text,`
	`88`	`+cursor,`
	`89`	`+breakOffset > 0 ? cursor + breakOffset : windowEnd,`
	`90`	`+);`
`70`	`91`	`chunks.push(text.slice(cursor, end));`
`71`	`92`	`cursor = end;`
`72`	`93`	`while (cursor < text.length && /\s/.test(text[cursor] ?? "")) {`

Original file line number	Diff line number	Diff line change
`@@ -85,6 +85,28 @@ describe("renderMarkdownIRChunksWithinLimit", () => {`
`85`	`85`	`expect(chunks.every((chunk) => chunk.rendered.length <= 1)).toBe(true);`
`86`	`86`	`});`
`87`	`87`
	`88`	`+it("keeps astral characters whole when a positive limit reaches their pair", () => {`
	`89`	`+const chunks = renderMarkdownIRChunksWithinLimit({`
	`90`	`+ir: markdownToIR("A😀B"),`
	`91`	`+limit: 1,`
	`92`	`+renderChunk: (chunk) => chunk.text,`
	`93`	`+measureRendered: (rendered) => rendered.length,`
	`94`	`+});`
	`95`	`+`
	`96`	`+expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀", "B"]);`
	`97`	`+});`
	`98`	`+`
	`99`	`+it("keeps astral characters whole when rendered size requires a retry split", () => {`
	`100`	`+const chunks = renderMarkdownIRChunksWithinLimit({`
	`101`	`+ir: markdownToIR("A😀"),`
	`102`	`+limit: 3,`
	`103`	`+renderChunk: (chunk) => (chunk.text === "A😀" ? "too long" : chunk.text),`
	`104`	`+measureRendered: (rendered) => rendered.length,`
	`105`	`+});`
	`106`	`+`
	`107`	`+expect(chunks.map((chunk) => chunk.source.text)).toEqual(["A", "😀"]);`
	`108`	`+});`
	`109`	`+`
`88`	`110`	`it("treats Infinity as no size cap and returns a single chunk", () => {`
`89`	`111`	`const text = "one two three four five six seven eight nine ten";`
`90`	`112`	`const ir = markdownToIR(text);`

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+import { avoidTrailingHighSurrogateBreak } from "./chunk-text.js";`
`1`	`2`	`// Markdown Core module implements render aware chunking behavior.`
`2`	`3`	`import {`
`3`	`4`	`chunkMarkdownIR,`
`@@ -127,10 +128,11 @@ function findLargestChunkTextLengthWithinRenderedLimit<TRendered>(`
`127`	`128`	`// Rendered length is not guaranteed to be monotonic after escaping/link or`
`128`	`129`	`// file-reference rewriting, so test exact candidates from longest to shortest.`
`129`	`130`	`for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {`
`130`		`-const candidate = sliceMarkdownIR(chunk, 0, candidateLength);`
	`131`	`+const safeCandidateLength = avoidTrailingHighSurrogateBreak(chunk.text, 0, candidateLength);`
	`132`	`+const candidate = sliceMarkdownIR(chunk, 0, safeCandidateLength);`
`131`	`133`	`const rendered = options.renderChunk(candidate);`
`132`	`134`	`if (options.measureRendered(rendered) <= renderedLimit) {`
`133`		`-return candidateLength;`
	`135`	`+return safeCandidateLength;`
`134`	`136`	`}`
`135`	`137`	`}`
`136`	`138`	`return 0;`
`@@ -215,7 +217,7 @@ function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: n`
`215`	`217`	`if (lastAnyWhitespaceBreak > start) {`
`216`	`218`	`return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);`
`217`	`219`	`}`
`218`		`-return maxEnd;`
	`220`	`+return avoidTrailingHighSurrogateBreak(text, start, maxEnd);`
`219`	`221`	`}`
`220`	`222`
`221`	`223`	`function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {`

Original file line number	Diff line number	Diff line change
`@@ -604,6 +604,10 @@ describe("chunkMarkdownTextWithMode", () => {`
`604`	`604`	`expect(chunks.every((chunk) => !/[\uD800-\uDBFF]$/u.test(chunk))).toBe(true);`
`605`	`605`	`expect(chunks.every((chunk) => !/^[\uDC00-\uDFFF]/u.test(chunk))).toBe(true);`
`606`	`606`	`});`
	`607`	`+`
	`608`	`+it("keeps an astral character whole when a positive hard limit starts on its pair", () => {`
	`609`	`+expect(chunkMarkdownTextWithMode("A😀B", 1, "length")).toEqual(["A", "😀", "B"]);`
	`610`	`+});`
`607`	`611`	`});`
`608`	`612`
`609`	`613`	`describe("resolveChunkMode", () => {`

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ export function avoidTrailingHighSurrogateBreak(text: string, start: number, end`
`16`	`16`	`return end;`
`17`	`17`	`}`
`18`	`18`	`const adjusted = end - 1;`
`19`		`-return adjusted > start ? adjusted : end;`
	`19`	`+return adjusted > start ? adjusted : end + 1;`
`20`	`20`	`}`
`21`	`21`
`22`	`22`	`export function chunkTextByBreakResolver(`

此内容由惯性聚合(RSS阅读器)自动聚合整理，仅供阅读参考。原文来自 Recent Commits to openclaw:main，版权归原作者所有。

推荐订阅源

Recent Commits to openclaw:main

File tree