bizzle 5a51a0f112 Mr. Drew's Assignment Creator — Docker share build
Self-contained Dockerized build for end users. Run via docker compose;
see README.md for setup. Source-only, no sample data or build artifacts.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-21 19:58:36 -04:00

196 lines
7.9 KiB
JavaScript

// lib/model-caps.js — sizes generation defaults to the selected model.
//
// "Auto" mode (generation.auto, on by default) asks the provider for the
// model's real limits and budgets maxTokens / maxSourceChars to fit:
//   Ollama     POST /api/show            -> model_info.<arch>.context_length
//   LM Studio  GET  /api/v0/models       -> loaded_context_length / max_context_length
//   Anthropic  GET  /v1/models/{id}      -> max_input_tokens, max_tokens
//   Google     GET  /v1beta/models/{id}  -> inputTokenLimit, outputTokenLimit
//   OpenAI     (no limits API)           -> pattern table below
// Lookups are cached in memory and every failure falls back to safe defaults,
// so generation never breaks because a limits lookup did.
// Header merged into the Anthropic model lookup request.
const VERSION_HEADER = { "anthropic-version": "2023-06-01" };

// Chars-per-token estimate used to turn a token budget into a character
// budget. English prose averages ~4; 3.5 leaves margin for dense text.
const CHARS_PER_TOKEN = 3.5;

// Cache lifetimes in milliseconds: real answers are reused for five minutes,
// fallback answers for thirty seconds so a recovered provider is retried soon.
const CAPS_TTL_OK = 300_000;
const CAPS_TTL_FAIL = 30_000;

// "provider|baseUrl|model|apiKeyLength" -> { caps, at }
const capsCache = new Map();

// Providers that run on the user's own machine.
const LOCAL_PROVIDERS = new Set(["ollama", "lmstudio"]);

// Limits assumed when the provider can't be asked (server down, no key,
// unknown model). Rows are [provider, contextTokens, maxOutputTokens].
const FALLBACK_CAPS = Object.fromEntries(
  [
    ["ollama", 8192, null],
    ["lmstudio", 8192, null],
    ["openai", 128000, 16384],
    ["anthropic", 200000, 8192],
    ["google", 1000000, 8192],
  ].map(([provider, contextTokens, maxOutputTokens]) => [
    provider,
    { contextTokens, maxOutputTokens },
  ])
);
// cleanBase(url, fallback) -> normalized base URL
// Trims surrounding whitespace and strips trailing slashes so a path that
// starts with "/" can be appended directly. The fallback is normalized the
// same way and is used whenever `url` is missing, not a string, or blank once
// cleaned (e.g. "   " or "/"). If neither input yields a usable value the
// fallback is returned unchanged.
function cleanBase(url, fallback) {
  // Non-strings normalize to "" instead of throwing on .trim().
  const normalize = (value) =>
    typeof value === "string" ? value.trim().replace(/\/+$/, "") : "";
  return normalize(url) || normalize(fallback) || fallback;
}
// clamp(n, lo, hi) -> n raised to at least lo, then capped at hi.
// The cap is applied last, so hi wins if the bounds are ever inverted.
function clamp(n, lo, hi) {
  const raised = Math.max(lo, n);
  return Math.min(hi, raised);
}
// fetchJson(url, init) -> parsed JSON body
// Issues the request with a 5-second abort timeout (replacing any signal the
// caller put in `init`) and rejects with Error("HTTP <status>") when the
// response is not ok.
async function fetchJson(url, init = {}) {
  const options = { ...init, signal: AbortSignal.timeout(5000) };
  const response = await fetch(url, options);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`HTTP ${response.status}`);
}
// ollamaCaps(cfg) -> { contextTokens, maxOutputTokens: null }
// POSTs cfg.model to <base>/api/show (base defaults to http://localhost:11434)
// and reads the context window from model_info. Throws when the response
// carries no usable context length.
async function ollamaCaps(cfg) {
  const toCount = (value) => Number(value) || 0;

  const base = cleanBase(cfg.baseUrl, "http://localhost:11434");
  const shown = await fetchJson(`${base}/api/show`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: cfg.model }),
  });

  // Preferred source: "<architecture>.context_length"; otherwise take the
  // first key that ends in ".context_length".
  const modelInfo = shown?.model_info || {};
  const arch = modelInfo["general.architecture"];
  let contextTokens = arch ? toCount(modelInfo[`${arch}.context_length`]) : 0;
  if (!contextTokens) {
    const anyKey = Object.keys(modelInfo).find((name) => name.endsWith(".context_length"));
    if (anyKey) contextTokens = toCount(modelInfo[anyKey]);
  }

  // A num_ctx line in the reported parameters is a deliberate (often
  // memory-driven) cap, so it can only lower the window, never raise it.
  const numCtxLine = /^num_ctx\s+(\d+)/m.exec(String(shown?.parameters || ""));
  const numCtx = numCtxLine ? Number(numCtxLine[1]) : 0;
  if (numCtx) {
    contextTokens = contextTokens ? Math.min(contextTokens, numCtx) : numCtx;
  }

  if (!contextTokens) throw new Error("no context_length in /api/show response");
  return { contextTokens, maxOutputTokens: null };
}
// lmstudioCaps(cfg) -> { contextTokens, maxOutputTokens: null }
// GETs <base>/api/v0/models (base defaults to http://localhost:1234) and looks
// up the entry whose id equals cfg.model. Throws when the model isn't listed.
async function lmstudioCaps(cfg) {
  const base = cleanBase(cfg.baseUrl, "http://localhost:1234");
  const listing = await fetchJson(`${base}/api/v0/models`);
  const models = listing?.data || [];
  const entry = models.find((candidate) => candidate.id === cfg.model);
  if (!entry) throw new Error("model not in /api/v0/models");

  // loaded_context_length is what the server actually honors, so use it as-is.
  const loaded = Number(entry.loaded_context_length);
  if (loaded) {
    return { contextTokens: loaded, maxOutputTokens: null };
  }

  // A not-yet-loaded model JIT-loads at its configured default, so stay
  // conservative: never assume more than 8192 even if the model supports it.
  const advertised = Number(entry.max_context_length) || 8192;
  return { contextTokens: Math.min(advertised, 8192), maxOutputTokens: null };
}
// anthropicCaps(cfg) -> { contextTokens, maxOutputTokens }
// Requires cfg.apiKey (throws "no API key" otherwise). Fetches the model
// record for cfg.model and reads max_input_tokens / max_tokens from it,
// substituting 200000 / 8192 for any value that is missing or not a number.
async function anthropicCaps(cfg) {
  const { apiKey, model } = cfg;
  if (!apiKey) throw new Error("no API key");

  const url = `https://api.anthropic.com/v1/models/${encodeURIComponent(model)}`;
  const headers = { "x-api-key": apiKey, ...VERSION_HEADER };
  const record = await fetchJson(url, { headers });

  const contextTokens = Number(record?.max_input_tokens) || 200000;
  const maxOutputTokens = Number(record?.max_tokens) || 8192;
  return { contextTokens, maxOutputTokens };
}
// googleCaps(cfg) -> { contextTokens, maxOutputTokens }
// Requires cfg.apiKey (throws "no API key" otherwise). Fetches the model
// record for cfg.model and reads inputTokenLimit / outputTokenLimit from it,
// substituting 1000000 / 8192 for any value that is missing or not a number.
async function googleCaps(cfg) {
  if (!cfg.apiKey) throw new Error("no API key");

  // Accept both "gemini-x" and the resource form "models/gemini-x"; without
  // this the slash would be percent-encoded into "models/models%2Fgemini-x".
  const modelId = String(cfg.model).replace(/^models\//, "");

  // The key travels in a header rather than the query string so it cannot end
  // up in request logs, proxy logs, or error output that echoes the URL.
  const data = await fetchJson(
    `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(modelId)}`,
    { headers: { "x-goog-api-key": cfg.apiKey } }
  );

  return {
    contextTokens: Number(data?.inputTokenLimit) || 1000000,
    maxOutputTokens: Number(data?.outputTokenLimit) || 8192,
  };
}
// openaiCaps(cfg) -> { contextTokens, maxOutputTokens }
// OpenAI's models API doesn't report limits, so the limits are looked up from
// the (lower-cased) model name. Rules are tried top to bottom and the first
// match wins, which is why specific names sit above the generic "gpt-4".
function openaiCaps(cfg) {
  const name = String(cfg.model || "").toLowerCase();
  const mentions = (...fragments) => fragments.some((fragment) => name.includes(fragment));

  // [matcher, contextTokens, maxOutputTokens]
  const rules = [
    [() => /^o\d/.test(name), 200000, 100000],
    [() => mentions("gpt-5"), 272000, 128000],
    [() => mentions("gpt-4.1"), 1000000, 32768],
    [() => mentions("gpt-4o", "chatgpt-4o"), 128000, 16384],
    [() => mentions("gpt-4-turbo"), 128000, 4096],
    [() => mentions("gpt-4-32k"), 32768, 8192],
    [() => mentions("gpt-4"), 8192, 4096],
    [() => mentions("gpt-3.5"), 16385, 4096],
  ];

  for (const [matches, contextTokens, maxOutputTokens] of rules) {
    if (matches()) return { contextTokens, maxOutputTokens };
  }
  // Unrecognized name: assume the limits used for the gpt-4o family.
  return { contextTokens: 128000, maxOutputTokens: 16384 };
}
// getModelCaps(settings) -> { contextTokens, maxOutputTokens|null, source }
//   source: "live" (asked the provider) | "catalog" (pattern table) | "fallback"
// Reads the active provider's config from settings.providers[settings.provider].
// With no model configured the provider's fallback is returned without
// touching the cache. Otherwise the answer is looked up, cached, and any
// lookup error is turned into the fallback.
export async function getModelCaps(settings) {
  const { provider } = settings;
  const cfg = settings.providers?.[provider] || {};
  const fallback = { ...(FALLBACK_CAPS[provider] || FALLBACK_CAPS.openai), source: "fallback" };
  if (!cfg.model) return fallback;

  // Key length is part of the cache key so a cached "no key" fallback doesn't
  // mask a freshly added key.
  const keyLength = (cfg.apiKey || "").length;
  const cacheKey = `${provider}|${cfg.baseUrl || ""}|${cfg.model}|${keyLength}`;

  // Fallback entries expire sooner than real answers.
  const cached = capsCache.get(cacheKey);
  if (cached) {
    const ttl = cached.caps.source === "fallback" ? CAPS_TTL_FAIL : CAPS_TTL_OK;
    if (Date.now() - cached.at < ttl) return cached.caps;
  }

  const askProvider = async (lookup) => ({ ...(await lookup(cfg)), source: "live" });

  let caps = fallback; // also the answer for a provider with no lookup
  try {
    switch (provider) {
      case "ollama":
        caps = await askProvider(ollamaCaps);
        break;
      case "lmstudio":
        caps = await askProvider(lmstudioCaps);
        break;
      case "anthropic":
        caps = await askProvider(anthropicCaps);
        break;
      case "google":
        caps = await askProvider(googleCaps);
        break;
      case "openai":
        caps = { ...openaiCaps(cfg), source: "catalog" };
        break;
      default:
        break;
    }
  } catch {
    // Best effort by design: a failed lookup must not break generation.
    caps = fallback;
  }

  capsCache.set(cacheKey, { caps, at: Date.now() });
  return caps;
}
// autoDefaults(caps, provider) -> { maxTokens, maxSourceChars }
// Splits the context window into a response budget (maxTokens), a fixed prompt
// overhead, and whatever is left for source text, then converts the source
// share to characters rounded down to a whole thousand.
function autoDefaults(caps, provider) {
  const windowTokens = caps.contextTokens;

  const budget = LOCAL_PROVIDERS.has(provider)
    ? {
        // Local models: spend at most a quarter of the window on the response
        // and keep the source moderate — small models lose accuracy when
        // drowned in text, and a bigger window costs RAM on the user's machine.
        maxTokens: clamp(Math.floor(windowTokens / 4), 2000, 8000),
        overhead: 3500, // system prompt + instructions + question list during verification
        sourceCap: 32000,
      }
    : {
        // Cloud models: generous response budget (long assignments with
        // rubrics) and room for much longer source material.
        maxTokens: Math.min(caps.maxOutputTokens || 16000, 16000),
        overhead: 3000,
        sourceCap: 120000,
      };
  const { maxTokens, overhead, sourceCap } = budget;

  // The source share never drops below 1200 tokens / 4000 chars, even when
  // the window is too small to really hold it.
  const sourceTokens = Math.max(1200, windowTokens - maxTokens - overhead);
  const wholeThousands = Math.floor((sourceTokens * CHARS_PER_TOKEN) / 1000);
  const maxSourceChars = clamp(wholeThousands * 1000, 4000, sourceCap);

  return { maxTokens, maxSourceChars };
}
// resolveGeneration(settings) -> { auto, temperature, maxTokens, maxSourceChars, caps }
// In auto mode (the default; off only when generation.auto === false) the
// limits are computed from the model's capabilities. In manual mode the
// user's stored values pass through, defaulting to 8000 tokens / 24000 chars.
// Temperature defaults to 0.3 in both modes. Missing settings (null or
// undefined) are treated as an empty object so the call still resolves with
// defaults instead of rejecting with a TypeError.
export async function resolveGeneration(settings) {
  const safeSettings = settings ?? {};
  const gen = safeSettings.generation || {};
  // Capabilities are looked up in both modes because they are part of the result.
  const caps = await getModelCaps(safeSettings);
  const auto = gen.auto !== false;
  const temperature = gen.temperature ?? 0.3;
  if (!auto) {
    return {
      auto,
      temperature,
      maxTokens: gen.maxTokens ?? 8000,
      maxSourceChars: gen.maxSourceChars ?? 24000,
      caps,
    };
  }
  return { auto, temperature, ...autoDefaults(caps, safeSettings.provider), caps };
}