Self-contained Dockerized build for end users. Run via docker compose; see README.md for setup. Source-only, no sample data or build artifacts. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
196 lines
7.9 KiB
JavaScript
196 lines
7.9 KiB
JavaScript
// lib/model-caps.js — sizes generation defaults to the selected model.
//
// "Auto" mode (generation.auto, on by default) asks the provider for the
// model's real limits and budgets maxTokens / maxSourceChars to fit:
//   Ollama     POST /api/show           -> model_info.<arch>.context_length
//   LM Studio  GET  /api/v0/models      -> loaded_context_length / max_context_length
//   Anthropic  GET  /v1/models/{id}     -> max_input_tokens, max_tokens
//   Google     GET  /v1beta/models/{id} -> inputTokenLimit, outputTokenLimit
//   OpenAI     (no limits API)          -> pattern table below
// Lookups are cached in memory and every failure falls back to safe defaults,
// so generation never breaks because a limits lookup did.
|
|
|
|
// Version header sent with every Anthropic request made by this module.
const VERSION_HEADER = Object.freeze({ "anthropic-version": "2023-06-01" });

// English prose averages ~4 chars/token; 3.5 leaves margin for dense text.
const CHARS_PER_TOKEN = 3.5;

// Cache lifetimes in milliseconds. Fallback results expire much sooner than
// successful lookups so a provider that becomes reachable is re-asked quickly.
const CAPS_TTL_OK = 5 * 60 * 1000;
const CAPS_TTL_FAIL = 30 * 1000;
// "provider|baseUrl|model|apiKeyLength" -> { caps, at }
const capsCache = new Map();

// Providers that get the smaller, local-model budget in auto mode.
const LOCAL_PROVIDERS = new Set(["ollama", "lmstudio"]);

// Used when the provider can't be asked (server down, no key, unknown model).
// The table and its rows are frozen: they are shared by every lookup, so an
// accidental write must fail loudly instead of changing later results.
const FALLBACK_CAPS = Object.freeze({
  ollama: Object.freeze({ contextTokens: 8192, maxOutputTokens: null }),
  lmstudio: Object.freeze({ contextTokens: 8192, maxOutputTokens: null }),
  openai: Object.freeze({ contextTokens: 128000, maxOutputTokens: 16384 }),
  anthropic: Object.freeze({ contextTokens: 200000, maxOutputTokens: 8192 }),
  google: Object.freeze({ contextTokens: 1000000, maxOutputTokens: 8192 }),
});
|
|
|
|
// Normalizes a provider base URL: trims whitespace and strips trailing
// slashes so callers can append "/path" directly.
//   url      - user-configured base URL (may be empty, blank, or missing)
//   fallback - default base URL used when `url` has nothing usable
// Returns the cleaned `url`, else the cleaned `fallback`, else `fallback`
// unchanged (so a missing fallback is still reported as-is).
function cleanBase(url, fallback) {
  // Non-strings normalize to "" instead of throwing on .trim().
  const normalize = (value) =>
    typeof value === "string" ? value.trim().replace(/\/+$/, "") : "";
  // A blank or slash-only url counts as "not set", and the fallback gets the
  // same cleanup as a user-supplied value.
  return normalize(url) || normalize(fallback) || fallback;
}
|
|
|
|
// Restricts `n` to the range [lo, hi]. The upper bound is applied last, so
// `hi` wins if the bounds are given in the wrong order; NaN passes through.
function clamp(n, lo, hi) {
  const atLeastLo = Math.max(lo, n);
  return Math.min(hi, atLeastLo);
}
|
|
|
|
// Fetches `url` and returns the parsed JSON body.
//   url  - request URL
//   init - fetch options; any `signal` in it is replaced by a 5 s timeout
// Rejects with Error("HTTP <status>") on a non-2xx response, and with
// whatever fetch / res.json() raise (network failure, timeout, invalid JSON).
async function fetchJson(url, init = {}) {
  const res = await fetch(url, { ...init, signal: AbortSignal.timeout(5000) });
  if (!res.ok) {
    // Discard the unread error body so the underlying connection is released
    // promptly; a failure to cancel must not mask the HTTP error itself.
    await res.body?.cancel().catch(() => {});
    throw new Error(`HTTP ${res.status}`);
  }
  return res.json();
}
|
|
|
|
// Reads a model's context window from Ollama's POST /api/show.
//   cfg.baseUrl - server base URL (defaults to http://localhost:11434)
//   cfg.model   - model name sent in the request body
// Resolves to { contextTokens, maxOutputTokens: null }; rejects when the
// request fails or the response yields no usable context length.
async function ollamaCaps(cfg) {
  const toCount = (value) => Number(value) || 0;

  const endpoint = `${cleanBase(cfg.baseUrl, "http://localhost:11434")}/api/show`;
  const shown = await fetchJson(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: cfg.model }),
  });

  // Prefer "<architecture>.context_length"; otherwise accept the first key
  // that ends in ".context_length".
  const modelInfo = shown?.model_info || {};
  const architecture = modelInfo["general.architecture"];
  let modelWindow = toCount(architecture ? modelInfo[`${architecture}.context_length`] : 0);
  if (!modelWindow) {
    const anyLengthKey = Object.keys(modelInfo).find((name) => name.endsWith(".context_length"));
    modelWindow = toCount(anyLengthKey ? modelInfo[anyLengthKey] : 0);
  }

  // A num_ctx in the Modelfile is a deliberate (often memory-driven) cap — honor it.
  const numCtxLine = String(shown?.parameters || "").match(/^num_ctx\s+(\d+)/m);
  const configuredWindow = Number(numCtxLine?.[1] || 0);

  let contextTokens = modelWindow;
  if (configuredWindow) {
    contextTokens = modelWindow ? Math.min(modelWindow, configuredWindow) : configuredWindow;
  }
  if (!contextTokens) throw new Error("no context_length in /api/show response");

  return { contextTokens, maxOutputTokens: null };
}
|
|
|
|
// Reads a model's context window from LM Studio's GET /api/v0/models.
//   cfg.baseUrl - server base URL (defaults to http://localhost:1234)
//   cfg.model   - id to look up in the returned model list
// Resolves to { contextTokens, maxOutputTokens: null }; rejects when the
// request fails or the model is not listed.
async function lmstudioCaps(cfg) {
  const root = cleanBase(cfg.baseUrl, "http://localhost:1234");
  const listing = await fetchJson(`${root}/api/v0/models`);

  const models = listing?.data || [];
  const entry = models.find((candidate) => candidate.id === cfg.model);
  if (!entry) throw new Error("model not in /api/v0/models");

  // loaded_context_length is what the server actually honors.
  const loadedWindow = Number(entry.loaded_context_length);
  if (loadedWindow) {
    return { contextTokens: loadedWindow, maxOutputTokens: null };
  }

  // A not-yet-loaded model JIT-loads at its configured default, so stay
  // conservative: never assume more than 8192 tokens.
  const advertisedMax = Number(entry.max_context_length) || 8192;
  return { contextTokens: Math.min(advertisedMax, 8192), maxOutputTokens: null };
}
|
|
|
|
// Asks Anthropic's GET /v1/models/{id} for a model's token limits.
//   cfg.apiKey - sent as the x-api-key header; required
//   cfg.model  - model id, URL-encoded into the request path
// Resolves to { contextTokens, maxOutputTokens }, using 200000 / 8192 for
// whichever limit the response does not provide. Rejects when no key is
// configured or the request fails.
async function anthropicCaps(cfg) {
  const { apiKey, model } = cfg;
  if (!apiKey) throw new Error("no API key");

  const endpoint = "https://api.anthropic.com/v1/models/" + encodeURIComponent(model);
  const headers = { "x-api-key": apiKey, ...VERSION_HEADER };
  const details = await fetchJson(endpoint, { headers });

  const contextTokens = Number(details?.max_input_tokens) || 200000;
  const maxOutputTokens = Number(details?.max_tokens) || 8192;
  return { contextTokens, maxOutputTokens };
}
|
|
|
|
// Asks Google's GET /v1beta/models/{id} for a model's token limits.
//   cfg.apiKey - API key; required
//   cfg.model  - model id, with or without the "models/" resource prefix
// Resolves to { contextTokens, maxOutputTokens }, using 1000000 / 8192 for
// whichever limit the response does not provide. Rejects when no key is
// configured or the request fails.
async function googleCaps(cfg) {
  if (!cfg.apiKey) throw new Error("no API key");
  // Accept the full resource name ("models/<id>") as well as the bare id;
  // encoding the slash of the prefix would produce an invalid path.
  const modelId = String(cfg.model).replace(/^models\//, "");
  // The key goes in a header rather than the query string so it never ends
  // up inside a URL (error messages, proxy and access logs).
  const data = await fetchJson(
    `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(modelId)}`,
    { headers: { "x-goog-api-key": cfg.apiKey } }
  );
  return {
    contextTokens: Number(data?.inputTokenLimit) || 1000000,
    maxOutputTokens: Number(data?.outputTokenLimit) || 8192,
  };
}
|
|
|
|
// OpenAI's models API doesn't report limits, so the model name is matched
// (case-insensitively) against an ordered rule list; the first hit wins, and
// the last rule is the catch-all default.
//   cfg.model - model name; a missing name gets the default limits
// Returns { contextTokens, maxOutputTokens }.
function openaiCaps(cfg) {
  const name = String(cfg.model || "").toLowerCase();
  const mentions = (...fragments) => fragments.some((fragment) => name.includes(fragment));

  // [matched, contextTokens, maxOutputTokens] — specific names must precede
  // the broader ones they contain (e.g. "gpt-4-turbo" before "gpt-4").
  const rules = [
    [/^o\d/.test(name), 200000, 100000],
    [mentions("gpt-5"), 272000, 128000],
    [mentions("gpt-4.1"), 1000000, 32768],
    [mentions("gpt-4o", "chatgpt-4o"), 128000, 16384],
    [mentions("gpt-4-turbo"), 128000, 4096],
    [mentions("gpt-4-32k"), 32768, 8192],
    [mentions("gpt-4"), 8192, 4096],
    [mentions("gpt-3.5"), 16385, 4096],
    [true, 128000, 16384],
  ];

  const [, contextTokens, maxOutputTokens] = rules.find(([matched]) => matched);
  return { contextTokens, maxOutputTokens };
}
|
|
|
|
// getModelCaps(settings) -> { contextTokens, maxOutputTokens|null, source }
//   source: "live" (asked the provider) | "catalog" (pattern table) | "fallback"
//
//   settings.provider             - provider name
//   settings.providers[provider]  - that provider's config (model, baseUrl, apiKey)
//
// Results are cached per provider/baseUrl/model/key-length; fallback results
// are kept for CAPS_TTL_FAIL, everything else for CAPS_TTL_OK. Missing
// settings, a missing model, an unknown provider, or a failed lookup all
// resolve to fallback caps instead of rejecting.
export async function getModelCaps(settings) {
  const provider = settings?.provider;
  const cfg = settings?.providers?.[provider] || {};

  // Own-property check: a provider named like an Object.prototype member
  // ("toString", "constructor", ...) must not pull in an inherited value.
  const fallbackLimits = Object.hasOwn(FALLBACK_CAPS, provider)
    ? FALLBACK_CAPS[provider]
    : FALLBACK_CAPS.openai;
  const fallback = { ...fallbackLimits, source: "fallback" };
  if (!cfg.model) return fallback;

  // Key length is included so a cached "no key" fallback doesn't mask a freshly added key.
  const key = `${provider}|${cfg.baseUrl || ""}|${cfg.model}|${(cfg.apiKey || "").length}`;
  const hit = capsCache.get(key);
  if (hit) {
    const ttl = hit.caps.source === "fallback" ? CAPS_TTL_FAIL : CAPS_TTL_OK;
    if (Date.now() - hit.at < ttl) return hit.caps;
  }

  let caps;
  try {
    switch (provider) {
      case "ollama":
        caps = { ...(await ollamaCaps(cfg)), source: "live" };
        break;
      case "lmstudio":
        caps = { ...(await lmstudioCaps(cfg)), source: "live" };
        break;
      case "anthropic":
        caps = { ...(await anthropicCaps(cfg)), source: "live" };
        break;
      case "google":
        caps = { ...(await googleCaps(cfg)), source: "live" };
        break;
      case "openai":
        caps = { ...openaiCaps(cfg), source: "catalog" };
        break;
      default:
        caps = fallback;
    }
  } catch {
    // Deliberate best-effort: a failed limits lookup must never break generation.
    caps = fallback;
  }
  capsCache.set(key, { caps, at: Date.now() });
  return caps;
}
|
|
|
|
// Derives auto-mode budgets from a model's capabilities.
//   caps     - { contextTokens, maxOutputTokens|null }
//   provider - provider name; members of LOCAL_PROVIDERS get the local profile
// Returns { maxTokens, maxSourceChars }: the response token budget and the
// source-text character budget (rounded down to a multiple of 1000).
function autoDefaults(caps, provider) {
  const windowTokens = caps.contextTokens;

  const profile = LOCAL_PROVIDERS.has(provider)
    ? {
        // Local models: spend at most a quarter of the window on the response
        // and keep the source moderate — small models lose accuracy when
        // drowned in text, and a bigger window costs RAM on the user's machine.
        maxTokens: clamp(Math.floor(windowTokens / 4), 2000, 8000),
        overhead: 3500, // system prompt + instructions + question list during verification
        sourceCap: 32000,
      }
    : {
        // Cloud models: generous response budget (long assignments with
        // rubrics) and room for much longer source material.
        maxTokens: Math.min(caps.maxOutputTokens || 16000, 16000),
        overhead: 3000,
        sourceCap: 120000,
      };
  const { maxTokens, overhead, sourceCap } = profile;

  // Whatever the response and prompt overhead leave over goes to the source,
  // with a floor of 1200 tokens.
  const sourceTokens = Math.max(1200, windowTokens - maxTokens - overhead);
  const roundedChars = Math.floor((sourceTokens * CHARS_PER_TOKEN) / 1000) * 1000;

  return { maxTokens, maxSourceChars: clamp(roundedChars, 4000, sourceCap) };
}
|
|
|
|
// resolveGeneration(settings) -> { auto, temperature, maxTokens, maxSourceChars, caps }
//
//   settings.generation - { auto, temperature, maxTokens, maxSourceChars }, all optional
//   settings.provider   - provider name, used to pick the auto-mode profile
//
// In auto mode (anything but generation.auto === false) the limits are
// computed from the model's capabilities; in manual mode the user's stored
// values pass through, defaulting to 8000 tokens / 24000 chars. Temperature
// defaults to 0.3 in both modes. Never throws — missing settings are treated
// as an empty settings object.
export async function resolveGeneration(settings) {
  // Guard null/undefined so the "never throws" contract also holds for callers
  // that have no settings yet.
  const safeSettings = settings ?? {};
  const gen = safeSettings.generation || {};
  const caps = await getModelCaps(safeSettings);
  const auto = gen.auto !== false;
  const temperature = gen.temperature ?? 0.3;

  if (auto) {
    return { auto, temperature, ...autoDefaults(caps, safeSettings.provider), caps };
  }
  return {
    auto,
    temperature,
    maxTokens: gen.maxTokens ?? 8000,
    maxSourceChars: gen.maxSourceChars ?? 24000,
    caps,
  };
}
|