












@@ -4766,6 +4766,248 @@ describe("openai transport stream", () => {
47664766expect(params).not.toHaveProperty("max_completion_tokens");
47674767});
it("clamps max_completion_tokens to the remaining context budget for proxy-like endpoints when prompt + output would exceed contextWindow (covers #83086)", () => {
  // StepFun-style catalog entry: the configured output limit is the same size
  // as the context window, and the prompt is large enough that prompt + output
  // cannot both fit.
  const contextWindow = 262_144;
  // 200_000 ASCII chars / 4 * 1.25 = 62_500 estimated input tokens, so the
  // expected cap is 262_144 - 62_500 - 1 = 199_643.
  const promptText = "x".repeat(200_000);
  const expectedInputTokens = Math.ceil((promptText.length / 4) * 1.25);

  const model = {
    id: "step-router-v1",
    name: "StepFun step-router-v1",
    api: "openai-completions",
    provider: "stepfun-plan",
    baseUrl: "https://api.stepfun.com/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow,
    maxTokens: contextWindow,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: promptText, messages: [], tools: [] } as never,
    undefined,
  );

  // The cap must be present, equal to the remaining budget, and strictly
  // smaller than the full window.
  const outputCap = params.max_completion_tokens;
  expect(typeof outputCap).toBe("number");
  expect(outputCap as number).toBe(contextWindow - expectedInputTokens - 1);
  expect(outputCap as number).toBeLessThan(contextWindow);
});
it("uses CJK-aware input estimates when clamping proxy-like completions output budgets", () => {
  const contextWindow = 10_000;
  // Four CJK characters repeated 1_000 times: 4_000 characters in total.
  const promptText = "你好世界".repeat(1_000);
  // Those 4_000 CJK chars are weighted as 16_000 adjusted chars, and
  // 16_000 / 4 * 1.25 = 5_000 estimated input tokens.
  const expectedInputTokens = 5_000;

  const model = {
    id: "kimi-k2.6",
    name: "Kimi K2.6",
    api: "openai-completions",
    provider: "dashscope",
    baseUrl: "https://dashscope.aliyuncs.com/compatible-mode/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow,
    maxTokens: contextWindow,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: promptText, messages: [], tools: [] } as never,
    undefined,
  );

  expect(params.max_completion_tokens).toBe(contextWindow - expectedInputTokens - 1);
});
it("rounds proxy-like completions input estimates after summing message content", () => {
  const contextWindow = 10_000;
  const messageCount = 4_000;
  // 4_000 single-character user messages: 4_000 chars in total. Summed first,
  // 4_000 / 4 * 1.25 = 1_250 tokens; rounding every message up on its own
  // would produce a larger estimate than the one asserted below.
  const expectedInputTokens = 1_250;
  const messages = new Array(messageCount).fill(null).map(() => ({
    role: "user",
    content: "x",
  }));

  const model = {
    id: "qwen3-5-122b-a10b-nvfp4",
    name: "qwen3-5-122b-a10b-nvfp4",
    api: "openai-completions",
    provider: "vllm",
    baseUrl: "http://localhost:8000/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow,
    maxTokens: contextWindow,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: undefined, messages, tools: [] } as never,
    undefined,
  );

  expect(params.max_completion_tokens).toBe(contextWindow - expectedInputTokens - 1);
});
it("estimates proxy-like completions input from the final outbound messages after compat transforms", () => {
  const contextWindow = 10_000;
  const modelId = "qwen3-5-122b-a10b-nvfp4";
  const userText = "ok";
  // Only the user text is counted in the expected estimate. The 20_000-char
  // aborted assistant turn is expected not to contribute (presumably it is
  // removed by the compat transforms named in the test title).
  const expectedInputTokens = Math.ceil((userText.length / 4) * 1.25);

  const abortedAssistantTurn = {
    role: "assistant",
    content: [{ type: "text", text: "x".repeat(20_000) }],
    api: "openai-completions",
    provider: "vllm",
    model: modelId,
    usage: {
      input: 0,
      output: 0,
      cacheRead: 0,
      cacheWrite: 0,
      totalTokens: 0,
      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
    },
    stopReason: "aborted",
    timestamp: 2,
  };

  const model = {
    id: modelId,
    name: modelId,
    api: "openai-completions",
    provider: "vllm",
    baseUrl: "http://localhost:8000/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow,
    maxTokens: contextWindow,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    {
      messages: [{ role: "user", content: userText, timestamp: 1 }, abortedAssistantTurn],
      tools: [],
    } as never,
    undefined,
  );

  expect(params.max_completion_tokens).toBe(contextWindow - expectedInputTokens - 1);
});
it("clamps proxy-like completions output budgets against contextTokens before contextWindow", () => {
  // contextTokens is much smaller than contextWindow; the asserted cap is
  // derived from contextTokens, not from contextWindow.
  const contextTokens = 4_096;
  // "system" is 6 chars: ceil(6 / 4 * 1.25) = 2 estimated input tokens.
  const expectedInputTokens = 2;

  // The double cast is needed because this literal carries a `contextTokens`
  // field and is not checked against Model<"openai-completions">.
  const model = {
    id: "qwen3-5-122b-a10b-nvfp4",
    name: "qwen3-5-122b-a10b-nvfp4",
    api: "openai-completions",
    provider: "vllm",
    baseUrl: "http://localhost:8000/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 131_072,
    contextTokens,
    maxTokens: 200_000,
  } as unknown as Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: "system", messages: [], tools: [] } as never,
    undefined,
  );

  expect(params.max_completion_tokens).toBe(contextTokens - expectedInputTokens - 1);
});
it("clamps max_completion_tokens for proxy-like endpoints when configured maxTokens >= contextWindow and prompt is small", () => {
  // Misconfiguration scenario: the prompt is tiny, yet the configured
  // maxTokens is larger than the model's contextWindow. The cap is expected
  // to end up just below the window.
  const contextWindow = 131_072;
  const lowerBound = 131_000;

  const model = {
    id: "qwen3-5-122b-a10b-nvfp4",
    name: "qwen3-5-122b-a10b-nvfp4",
    api: "openai-completions",
    provider: "vllm",
    baseUrl: "http://localhost:8000/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow,
    maxTokens: 200_000,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: "system", messages: [], tools: [] } as never,
    undefined,
  );

  const outputCap = params.max_completion_tokens;
  expect(typeof outputCap).toBe("number");
  expect(outputCap as number).toBeLessThan(contextWindow);
  // With a small prompt the cap is roughly contextWindow - 1 - a tiny input
  // estimate, so it must stay at or above the lower bound.
  expect(outputCap as number).toBeGreaterThanOrEqual(lowerBound);
});
it("does not clamp max_completion_tokens for proxy-like endpoints when maxTokens fits the context window", () => {
  // The configured output limit is far below the context window, so the
  // value is expected to pass through unchanged.
  const configuredMaxTokens = 8_192;

  const model = {
    id: "qwen3-5-122b-a10b-nvfp4",
    name: "qwen3-5-122b-a10b-nvfp4",
    api: "openai-completions",
    provider: "vllm",
    baseUrl: "http://localhost:8000/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 131_072,
    maxTokens: configuredMaxTokens,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: "system", messages: [], tools: [] } as never,
    undefined,
  );

  expect(params.max_completion_tokens).toBe(configuredMaxTokens);
});
it("preserves the configured maxTokens for native openai-completions endpoints even when it equals or exceeds contextWindow", () => {
  // Native OpenAI endpoint: the configured maxTokens is twice the context
  // window, and the test expects it to be sent as configured.
  const configuredMaxTokens = 200_000;

  const model = {
    id: "gpt-5.4",
    name: "GPT-5.4",
    api: "openai-completions",
    provider: "openai",
    baseUrl: "https://api.openai.com/v1",
    reasoning: false,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 100_000,
    maxTokens: configuredMaxTokens,
  } satisfies Model<"openai-completions">;

  const params = buildOpenAICompletionsParams(
    model,
    { systemPrompt: "system", messages: [], tools: [] } as never,
    undefined,
  );

  expect(params.max_completion_tokens).toBe(configuredMaxTokens);
});
5010+47695011it("omits strict tool shaping for Z.ai default-route completions providers", () => {
47705012const params = buildOpenAICompletionsParams(
47715013{
此內容由慣性聚合(RSS閱讀器)自動聚合整理,僅供閱讀參考。 原文來自 — 版權歸原作者所有。