// lib/model-caps.js — sizes generation defaults to the selected model.
//
// "Auto" mode (generation.auto, on by default) asks the provider for the
// model's real limits and budgets maxTokens / maxSourceChars to fit:
//   Ollama     POST /api/show            -> model_info["<arch>.context_length"]
//   LM Studio  GET  /api/v0/models       -> loaded_context_length / max_context_length
//   Anthropic  GET  /v1/models/{id}      -> max_input_tokens, max_tokens
//   Google     GET  /v1beta/models/{id}  -> inputTokenLimit, outputTokenLimit
//   OpenAI     (no limits API)           -> pattern table below
// Lookups are cached in memory and every failure falls back to safe defaults,
// so generation never breaks because a limits lookup did.

// Extra request header sent alongside the API key on Anthropic lookups.
const VERSION_HEADER = { "anthropic-version": "2023-06-01" };

// English prose averages ~4 chars/token; 3.5 leaves margin for dense text.
const CHARS_PER_TOKEN = 3.5;

// Cache lifetimes in milliseconds. Successful lookups are kept for five
// minutes; fallback results only for thirty seconds so a provider that comes
// back (or a corrected setting) is picked up quickly.
const CAPS_TTL_OK = 5 * 60 * 1000;
const CAPS_TTL_FAIL = 30 * 1000;

// "provider|baseUrl|model|apiKeyLength" -> { caps, at }
const capsCache = new Map();

// Providers that run on the user's own machine; they get tighter budgets.
const LOCAL_PROVIDERS = new Set(["ollama", "lmstudio"]);

// Used when the provider can't be asked (server down, no key, unknown model).
const FALLBACK_CAPS = {
  ollama: { contextTokens: 8192, maxOutputTokens: null },
  lmstudio: { contextTokens: 8192, maxOutputTokens: null },
  openai: { contextTokens: 128000, maxOutputTokens: 16384 },
  anthropic: { contextTokens: 200000, maxOutputTokens: 8192 },
  google: { contextTokens: 1000000, maxOutputTokens: 8192 },
};

// Every limits lookup is aborted after this many milliseconds.
const FETCH_TIMEOUT_MS = 5000;

/**
 * Normalizes a provider base URL: trims whitespace and strips trailing slashes.
 *
 * @param {string|undefined} url - User-configured base URL (may be empty).
 * @param {string|undefined} fallback - Default used when `url` is empty.
 * @returns {string|undefined} The cleaned URL, or `fallback` unchanged when
 *   the trimmed value is empty.
 */
function cleanBase(url, fallback) {
  const trimmed = (url || fallback || "").trim();
  if (!trimmed) return fallback;
  return trimmed.replace(/\/+$/, "");
}

/**
 * Restricts `n` to the inclusive range [lo, hi].
 *
 * @param {number} n
 * @param {number} lo
 * @param {number} hi
 * @returns {number}
 */
function clamp(n, lo, hi) {
  return Math.min(hi, Math.max(lo, n));
}

/**
 * Fetches `url` and parses the response body as JSON.
 *
 * @param {string} url
 * @param {object} [init] - Extra `fetch` options. Any `signal` in it is
 *   replaced by the timeout signal.
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} "HTTP <status>" on a non-2xx response; fetch/abort errors
 *   propagate unchanged.
 */
async function fetchJson(url, init = {}) {
  const res = await fetch(url, {
    ...init,
    signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
  });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return res.json();
}

/**
 * Reads the context window of an Ollama model from POST /api/show.
 *
 * @param {{ baseUrl?: string, model: string }} cfg
 * @returns {Promise<{ contextTokens: number, maxOutputTokens: null }>}
 * @throws {Error} When the request fails or no context length is reported.
 */
async function ollamaCaps(cfg) {
  const base = cleanBase(cfg.baseUrl, "http://localhost:11434");
  const data = await fetchJson(`${base}/api/show`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: cfg.model }),
  });

  const info = data?.model_info || {};
  const arch = info["general.architecture"];
  let ctx = Number(arch ? info[`${arch}.context_length`] : 0) || 0;
  if (!ctx) {
    // Architecture key missing or unhelpful: take any "*.context_length" entry.
    const key = Object.keys(info).find((k) => k.endsWith(".context_length"));
    ctx = Number(key ? info[key] : 0) || 0;
  }

  // A num_ctx in the Modelfile is a deliberate (often memory-driven) cap — honor it.
  const numCtxMatch = String(data?.parameters || "").match(/^num_ctx\s+(\d+)/m);
  const numCtx = Number(numCtxMatch?.[1] || 0);
  if (numCtx) ctx = ctx ? Math.min(ctx, numCtx) : numCtx;

  if (!ctx) throw new Error("no context_length in /api/show response");
  return { contextTokens: ctx, maxOutputTokens: null };
}

/**
 * Reads the context window of an LM Studio model from GET /api/v0/models.
 *
 * @param {{ baseUrl?: string, model: string }} cfg
 * @returns {Promise<{ contextTokens: number, maxOutputTokens: null }>}
 * @throws {Error} When the request fails or the model is not listed.
 */
async function lmstudioCaps(cfg) {
  const base = cleanBase(cfg.baseUrl, "http://localhost:1234");
  const data = await fetchJson(`${base}/api/v0/models`);
  const m = (data?.data || []).find((x) => x.id === cfg.model);
  if (!m) throw new Error("model not in /api/v0/models");

  // loaded_context_length is what the server actually honors; a not-yet-loaded
  // model JIT-loads at its configured default, so stay conservative there.
  const ctx =
    Number(m.loaded_context_length) ||
    Math.min(Number(m.max_context_length) || 8192, 8192);
  return { contextTokens: ctx, maxOutputTokens: null };
}

/**
 * Reads a model's limits from Anthropic's GET /v1/models/{id}.
 * Fields missing from the response fall back to 200000 / 8192.
 *
 * @param {{ apiKey?: string, model: string }} cfg
 * @returns {Promise<{ contextTokens: number, maxOutputTokens: number }>}
 * @throws {Error} "no API key" when `cfg.apiKey` is empty, or on request failure.
 */
async function anthropicCaps(cfg) {
  if (!cfg.apiKey) throw new Error("no API key");
  const data = await fetchJson(
    `https://api.anthropic.com/v1/models/${encodeURIComponent(cfg.model)}`,
    { headers: { "x-api-key": cfg.apiKey, ...VERSION_HEADER } }
  );
  return {
    contextTokens: Number(data?.max_input_tokens) || 200000,
    maxOutputTokens: Number(data?.max_tokens) || 8192,
  };
}

/**
 * Reads a model's limits from Google's GET /v1beta/models/{id}.
 * Fields missing from the response fall back to 1000000 / 8192.
 *
 * @param {{ apiKey?: string, model: string }} cfg
 * @returns {Promise<{ contextTokens: number, maxOutputTokens: number }>}
 * @throws {Error} "no API key" when `cfg.apiKey` is empty, or on request failure.
 */
async function googleCaps(cfg) {
  if (!cfg.apiKey) throw new Error("no API key");
  // Accept ids stored in resource-name form ("models/gemini-…"); without this
  // the slash gets percent-encoded and the path becomes "models/models%2F…".
  const modelId = String(cfg.model).replace(/^models\//, "");
  const data = await fetchJson(
    `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(modelId)}?key=${encodeURIComponent(cfg.apiKey)}`
  );
  return {
    contextTokens: Number(data?.inputTokenLimit) || 1000000,
    maxOutputTokens: Number(data?.outputTokenLimit) || 8192,
  };
}

// OpenAI's models API doesn't report limits, so match on the model name.
// Ordered name rules for OpenAI models: the first matching rule wins, so the
// more specific names ("gpt-4o", "gpt-4-turbo", "gpt-4-32k") must stay ahead
// of the generic "gpt-4".
const OPENAI_RULES = [
  { matches: (m) => /^o\d/.test(m), contextTokens: 200000, maxOutputTokens: 100000 },
  { matches: (m) => m.includes("gpt-5"), contextTokens: 272000, maxOutputTokens: 128000 },
  { matches: (m) => m.includes("gpt-4.1"), contextTokens: 1000000, maxOutputTokens: 32768 },
  {
    matches: (m) => m.includes("gpt-4o") || m.includes("chatgpt-4o"),
    contextTokens: 128000,
    maxOutputTokens: 16384,
  },
  { matches: (m) => m.includes("gpt-4-turbo"), contextTokens: 128000, maxOutputTokens: 4096 },
  { matches: (m) => m.includes("gpt-4-32k"), contextTokens: 32768, maxOutputTokens: 8192 },
  { matches: (m) => m.includes("gpt-4"), contextTokens: 8192, maxOutputTokens: 4096 },
  { matches: (m) => m.includes("gpt-3.5"), contextTokens: 16385, maxOutputTokens: 4096 },
];

/**
 * Looks up limits for an OpenAI model by (case-insensitive) name pattern.
 * Unrecognized names get 128000 / 16384.
 *
 * @param {{ model?: string }} cfg
 * @returns {{ contextTokens: number, maxOutputTokens: number }}
 */
function openaiCaps(cfg) {
  const name = String(cfg.model || "").toLowerCase();
  const rule = OPENAI_RULES.find((r) => r.matches(name));
  if (!rule) return { contextTokens: 128000, maxOutputTokens: 16384 };
  return { contextTokens: rule.contextTokens, maxOutputTokens: rule.maxOutputTokens };
}

/**
 * getModelCaps(settings) -> { contextTokens, maxOutputTokens|null, source }
 * source: "live" (asked the provider) | "catalog" (pattern table) | "fallback"
 *
 * Results are cached in `capsCache`; fallback results expire after
 * CAPS_TTL_FAIL, all others after CAPS_TTL_OK. A failed provider lookup is
 * deliberately swallowed and replaced by the fallback so callers never see it.
 *
 * @param {{ provider: string, providers?: Object<string, object> }} settings
 * @returns {Promise<{ contextTokens: number, maxOutputTokens: number|null, source: string }>}
 */
export async function getModelCaps(settings) {
  const provider = settings.provider;
  const cfg = settings.providers?.[provider] || {};
  const fallback = {
    ...(FALLBACK_CAPS[provider] || FALLBACK_CAPS.openai),
    source: "fallback",
  };
  if (!cfg.model) return fallback;

  // Key length is included so a cached "no key" fallback doesn't mask a freshly added key.
  const key = `${provider}|${cfg.baseUrl || ""}|${cfg.model}|${(cfg.apiKey || "").length}`;
  const hit = capsCache.get(key);
  if (hit) {
    const ttl = hit.caps.source === "fallback" ? CAPS_TTL_FAIL : CAPS_TTL_OK;
    if (Date.now() - hit.at < ttl) return hit.caps;
  }

  let caps;
  try {
    if (provider === "ollama") caps = { ...(await ollamaCaps(cfg)), source: "live" };
    else if (provider === "lmstudio") caps = { ...(await lmstudioCaps(cfg)), source: "live" };
    else if (provider === "anthropic") caps = { ...(await anthropicCaps(cfg)), source: "live" };
    else if (provider === "google") caps = { ...(await googleCaps(cfg)), source: "live" };
    else if (provider === "openai") caps = { ...openaiCaps(cfg), source: "catalog" };
    else caps = fallback;
  } catch {
    // Best effort by design: any lookup failure degrades to the safe defaults.
    caps = fallback;
  }

  capsCache.set(key, { caps, at: Date.now() });
  return caps;
}

/**
 * Derives the response budget and source-text budget from a model's limits.
 *
 * @param {{ contextTokens: number, maxOutputTokens: number|null }} caps
 * @param {string} provider - Selects the local or cloud budgeting policy.
 * @returns {{ maxTokens: number, maxSourceChars: number }} `maxSourceChars` is
 *   rounded down to a multiple of 1000 and clamped to [4000, sourceCap].
 */
function autoDefaults(caps, provider) {
  const ctx = caps.contextTokens;
  let maxTokens;
  let overhead;
  let sourceCap;

  if (LOCAL_PROVIDERS.has(provider)) {
    // Local models: spend about a quarter of the window on the response
    // (never fewer than 2000 or more than 8000 tokens) and keep the source
    // moderate — small models lose accuracy when drowned in text, and a
    // bigger window costs RAM on the user's machine.
    maxTokens = clamp(Math.floor(ctx / 4), 2000, 8000);
    overhead = 3500; // system prompt + instructions + question list during verification
    sourceCap = 32000;
  } else {
    // Cloud models: generous response budget (long assignments with rubrics)
    // and room for much longer source material.
    maxTokens = Math.min(caps.maxOutputTokens || 16000, 16000);
    overhead = 3000;
    sourceCap = 120000;
  }

  // NOTE(review): the 1200-token / 4000-char floors apply even when the window
  // is smaller than maxTokens + overhead, so very small windows can be
  // over-budgeted — confirm this is intended.
  const sourceTokens = Math.max(1200, ctx - maxTokens - overhead);
  const maxSourceChars = clamp(
    Math.floor((sourceTokens * CHARS_PER_TOKEN) / 1000) * 1000,
    4000,
    sourceCap
  );
  return { maxTokens, maxSourceChars };
}

// resolveGeneration(settings) -> { auto, temperature, maxTokens, maxSourceChars, caps }
// In auto mode the limits are computed from the model's capabilities; in
// manual mode the user's stored values pass through. Never throws.
/**
 * Resolves the effective generation parameters for the current settings.
 *
 * Auto mode (anything other than `generation.auto === false`) derives
 * `maxTokens` / `maxSourceChars` from the model's capabilities. Manual mode
 * passes the stored values through, defaulting to 8000 / 24000 when unset.
 * `temperature` defaults to 0.3 in both modes.
 *
 * @param {object} [settings] - App settings; a missing value is treated as `{}`
 *   so the documented "never throws" contract also holds for nullish input.
 * @returns {Promise<{ auto: boolean, temperature: number, maxTokens: number,
 *   maxSourceChars: number, caps: object }>}
 */
export async function resolveGeneration(settings) {
  const safeSettings = settings ?? {};
  const gen = safeSettings.generation || {};
  const caps = await getModelCaps(safeSettings);
  const auto = gen.auto !== false;
  const temperature = gen.temperature ?? 0.3;

  if (!auto) {
    return {
      auto,
      temperature,
      maxTokens: gen.maxTokens ?? 8000,
      maxSourceChars: gen.maxSourceChars ?? 24000,
      caps,
    };
  }

  return {
    auto,
    temperature,
    ...autoDefaults(caps, safeSettings.provider),
    caps,
  };
}