// mr-drews-assignment-creator/lib/model-caps.js

// lib/model-caps.js — sizes generation defaults to the selected model.
//
// "Auto" mode (generation.auto, on by default) asks the provider for the
// model's real limits and budgets maxTokens / maxSourceChars to fit:
//   Ollama     POST /api/show            -> model_info.<arch>.context_length
//   LM Studio  GET  /api/v0/models       -> loaded_context_length / max_context_length
//   Anthropic  GET  /v1/models/{id}      -> max_input_tokens, max_tokens
//   Google     GET  /v1beta/models/{id}  -> inputTokenLimit, outputTokenLimit
//   OpenAI     (no limits API)           -> pattern table below
// Lookups are cached in memory and every failure falls back to safe defaults,
// so generation never breaks because a limits lookup did.

// Header attached to every Anthropic models lookup.
const VERSION_HEADER = { "anthropic-version": "2023-06-01" };

// Token -> character conversion factor. English prose runs about 4 chars per
// token; budgeting at 3.5 keeps headroom for denser text.
const CHARS_PER_TOKEN = 3.5;

// Cache lifetimes in milliseconds: fallback results expire after thirty
// seconds, everything else after five minutes.
const CAPS_TTL_FAIL = 1000 * 30;
const CAPS_TTL_OK = 1000 * 60 * 5;

// In-memory lookup cache.
// Key: "provider|baseUrl|model|apiKeyLength"  Value: { caps, at }
const capsCache = new Map();

// Providers that run on the user's own machine.
const LOCAL_PROVIDERS = new Set(["ollama", "lmstudio"]);

// Used when the provider can't be asked (server down, no key, unknown model).
// Deep-frozen because every failed lookup shares these defaults: a stray write
// must not be able to change them. Readers spread an entry into a new object
// before adding fields.
const FALLBACK_CAPS = Object.freeze({
  ollama: Object.freeze({ contextTokens: 8192, maxOutputTokens: null }),
  lmstudio: Object.freeze({ contextTokens: 8192, maxOutputTokens: null }),
  openai: Object.freeze({ contextTokens: 128000, maxOutputTokens: 16384 }),
  anthropic: Object.freeze({ contextTokens: 200000, maxOutputTokens: 8192 }),
  google: Object.freeze({ contextTokens: 1000000, maxOutputTokens: 8192 }),
});

// cleanBase(url, fallback) -> base URL without surrounding whitespace or
// trailing slashes.
//
// `url` wins when it still holds something after normalizing; otherwise the
// normalized `fallback` is used. Non-string values (a malformed settings file)
// count as "not set" instead of throwing on .trim(). When neither yields
// anything, `fallback` is returned as given (possibly undefined or "").
function cleanBase(url, fallback) {
  const normalize = (value) =>
    typeof value === "string" ? value.trim().replace(/\/+$/, "") : "";
  return normalize(url) || normalize(fallback) || fallback;
}

// Bounds n to [lo, hi]: first raise it to at least lo, then cap it at hi.
function clamp(n, lo, hi) {
  const raised = Math.max(lo, n);
  return Math.min(hi, raised);
}

// fetchJson(url, init) -> parsed JSON body.
// Issues the request with a 5 second abort timeout (this replaces any signal
// supplied in `init`) and rejects with "HTTP <status>" on a non-2xx response.
async function fetchJson(url, init = {}) {
  const options = Object.assign({}, init, { signal: AbortSignal.timeout(5000) });
  const response = await fetch(url, options);
  if (response.ok) return response.json();
  throw new Error("HTTP " + response.status);
}

// ollamaCaps(cfg) -> { contextTokens, maxOutputTokens: null }
// Asks Ollama's POST /api/show for the model's context window. Rejects when the
// request fails or the response carries no usable context length.
async function ollamaCaps(cfg) {
  const base = cleanBase(cfg.baseUrl, "http://localhost:11434");
  const shown = await fetchJson(base + "/api/show", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: cfg.model }),
  });

  const modelInfo = shown?.model_info || {};
  const readLength = (infoKey) => Number(infoKey ? modelInfo[infoKey] : 0) || 0;

  // Preferred key is "<architecture>.context_length"; when that is absent or
  // unusable, accept the first key that ends in ".context_length".
  const archName = modelInfo["general.architecture"];
  const windowFromInfo =
    readLength(archName && `${archName}.context_length`) ||
    readLength(Object.keys(modelInfo).find((k) => k.endsWith(".context_length")));

  // A num_ctx in the Modelfile is a deliberate (often memory-driven) cap — honor it.
  const pinned = /^num_ctx\s+(\d+)/m.exec(String(shown?.parameters || ""));
  const numCtx = pinned ? Number(pinned[1]) : 0;

  let contextTokens;
  if (!numCtx) {
    contextTokens = windowFromInfo;
  } else if (windowFromInfo) {
    contextTokens = Math.min(windowFromInfo, numCtx);
  } else {
    contextTokens = numCtx;
  }

  if (!contextTokens) throw new Error("no context_length in /api/show response");
  return { contextTokens, maxOutputTokens: null };
}

// lmstudioCaps(cfg) -> { contextTokens, maxOutputTokens: null }
// Looks the model up in LM Studio's GET /api/v0/models listing. Rejects when
// the request fails or the model id is not in the listing.
async function lmstudioCaps(cfg) {
  const base = cleanBase(cfg.baseUrl, "http://localhost:1234");
  const listing = await fetchJson(base + "/api/v0/models");
  const models = listing?.data || [];
  const entry = models.find((candidate) => candidate.id === cfg.model);
  if (!entry) throw new Error("model not in /api/v0/models");

  // loaded_context_length is what the server actually honors, so use it when
  // it is present and non-zero.
  const loaded = Number(entry.loaded_context_length);
  if (loaded) return { contextTokens: loaded, maxOutputTokens: null };

  // A not-yet-loaded model JIT-loads at its configured default, so stay
  // conservative: never assume more than 8192 tokens here.
  const advertised = Number(entry.max_context_length) || 8192;
  return { contextTokens: Math.min(advertised, 8192), maxOutputTokens: null };
}

// anthropicCaps(cfg) -> { contextTokens, maxOutputTokens }
// Reads max_input_tokens / max_tokens from Anthropic's GET /v1/models/{id}.
// Rejects without an API key or when the request fails; a field missing from
// the response defaults to 200000 (context) or 8192 (output).
async function anthropicCaps(cfg) {
  if (!cfg.apiKey) throw new Error("no API key");

  const endpoint = "https://api.anthropic.com/v1/models/" + encodeURIComponent(cfg.model);
  const headers = { "x-api-key": cfg.apiKey, ...VERSION_HEADER };
  const details = await fetchJson(endpoint, { headers });

  const contextTokens = Number(details?.max_input_tokens) || 200000;
  const maxOutputTokens = Number(details?.max_tokens) || 8192;
  return { contextTokens, maxOutputTokens };
}

// googleCaps(cfg) -> { contextTokens, maxOutputTokens }
// Reads inputTokenLimit / outputTokenLimit from Google's
// GET /v1beta/models/{id}. Rejects without an API key or when the request
// fails; a field missing from the response defaults to 1000000 (context) or
// 8192 (output).
async function googleCaps(cfg) {
  if (!cfg.apiKey) throw new Error("no API key");

  // Accept the id with or without a leading "models/" resource prefix;
  // percent-encoding the prefixed form would produce an invalid path
  // ("models/models%2F...").
  const modelId = String(cfg.model).replace(/^models\//, "");

  // The key travels in the x-goog-api-key header rather than a ?key= query
  // parameter, so it never becomes part of a URL that may be logged.
  const data = await fetchJson(
    `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(modelId)}`,
    { headers: { "x-goog-api-key": cfg.apiKey } }
  );
  return {
    contextTokens: Number(data?.inputTokenLimit) || 1000000,
    maxOutputTokens: Number(data?.outputTokenLimit) || 8192,
  };
}

// OpenAI's models API doesn't report limits, so the model name is matched
// against an ordered family table. Order matters: more specific names
// ("gpt-4.1", "gpt-4o", "gpt-4-turbo", "gpt-4-32k") sit ahead of plain "gpt-4".
// Names that match nothing get 128000 / 16384.
function openaiCaps(cfg) {
  const name = String(cfg.model || "").toLowerCase();
  const families = [
    [/^o\d/, 200000, 100000],
    [/gpt-5/, 272000, 128000],
    [/gpt-4\.1/, 1000000, 32768],
    [/gpt-4o|chatgpt-4o/, 128000, 16384],
    [/gpt-4-turbo/, 128000, 4096],
    [/gpt-4-32k/, 32768, 8192],
    [/gpt-4/, 8192, 4096],
    [/gpt-3\.5/, 16385, 4096],
  ];
  const matched = families.find(([pattern]) => pattern.test(name));
  const [, contextTokens, maxOutputTokens] = matched ?? [null, 128000, 16384];
  return { contextTokens, maxOutputTokens };
}

// getModelCaps(settings) -> { contextTokens, maxOutputTokens|null, source }
// source: "live" (asked the provider) | "catalog" (pattern table) | "fallback"
//
// Resolves the limits of the currently selected model. Never rejects: missing
// settings, an unknown provider, a missing model name and any lookup error all
// resolve to fallback caps. Results are cached per
// provider / baseUrl / model / API-key length. The returned object may be the
// cached instance, so callers should treat it as read-only.
export async function getModelCaps(settings) {
  const provider = settings?.provider;
  const cfg = settings?.providers?.[provider] || {};

  // Own-property check: a provider named like an Object.prototype member
  // ("constructor", "toString", ...) must not pick up an inherited value and
  // produce caps without contextTokens.
  const hasOwnFallback =
    typeof provider === "string" && Object.hasOwn(FALLBACK_CAPS, provider);
  const fallback = {
    ...(hasOwnFallback ? FALLBACK_CAPS[provider] : FALLBACK_CAPS.openai),
    source: "fallback",
  };
  if (!cfg.model) return fallback;

  // Key length is included so a cached "no key" fallback doesn't mask a freshly added key.
  const key = `${provider}|${cfg.baseUrl || ""}|${cfg.model}|${(cfg.apiKey || "").length}`;
  const hit = capsCache.get(key);
  if (hit) {
    // Fallbacks expire sooner than real answers.
    const ttl = hit.caps.source === "fallback" ? CAPS_TTL_FAIL : CAPS_TTL_OK;
    if (Date.now() - hit.at < ttl) return hit.caps;
  }

  let caps;
  try {
    if (provider === "ollama") caps = { ...(await ollamaCaps(cfg)), source: "live" };
    else if (provider === "lmstudio") caps = { ...(await lmstudioCaps(cfg)), source: "live" };
    else if (provider === "anthropic") caps = { ...(await anthropicCaps(cfg)), source: "live" };
    else if (provider === "google") caps = { ...(await googleCaps(cfg)), source: "live" };
    else if (provider === "openai") caps = { ...openaiCaps(cfg), source: "catalog" };
    else caps = fallback;
  } catch {
    // Deliberate best-effort: a failed limits lookup must not break generation.
    caps = fallback;
  }
  capsCache.set(key, { caps, at: Date.now() });
  return caps;
}

// autoDefaults(caps, provider) -> { maxTokens, maxSourceChars }
// Splits the model's context window into a response budget and a source-text
// budget, using a different profile for local and cloud providers.
function autoDefaults(caps, provider) {
  const windowTokens = caps.contextTokens;
  const isLocal = LOCAL_PROVIDERS.has(provider);

  // Local models: spend at most a quarter of the window on the response and
  // keep the source moderate — small models lose accuracy when drowned in
  // text, and a bigger window costs RAM on the user's machine.
  // Cloud models: generous response budget (long assignments with rubrics)
  // and room for much longer source material.
  const maxTokens = isLocal
    ? clamp(Math.floor(windowTokens / 4), 2000, 8000)
    : Math.min(caps.maxOutputTokens || 16000, 16000);

  // Tokens reserved for everything that is neither source nor response; the
  // local figure covers system prompt + instructions + question list during
  // verification.
  const overhead = isLocal ? 3500 : 3000;
  const sourceCap = isLocal ? 32000 : 120000;

  // Whatever the window has left goes to the source, but never fewer than
  // 1200 tokens. Converted to characters, rounded down to a whole thousand,
  // then held between 4000 and the profile's cap.
  const roomForSource = Math.max(1200, windowTokens - maxTokens - overhead);
  const roundedChars = Math.floor((roomForSource * CHARS_PER_TOKEN) / 1000) * 1000;
  return { maxTokens, maxSourceChars: clamp(roundedChars, 4000, sourceCap) };
}

// resolveGeneration(settings) -> { auto, temperature, maxTokens, maxSourceChars, caps }
// In auto mode (generation.auto anything but false) the limits are computed
// from the model's capabilities; in manual mode the user's stored values pass
// through, defaulting to 8000 tokens / 24000 source chars when unset.
// Temperature defaults to 0.3 in both modes. Never throws — null or undefined
// settings are treated as an empty settings object.
export async function resolveGeneration(settings) {
  const safeSettings = settings ?? {};
  const gen = safeSettings.generation || {};
  const caps = await getModelCaps(safeSettings);
  const auto = gen.auto !== false;
  const temperature = gen.temperature ?? 0.3;
  if (!auto) {
    return {
      auto,
      temperature,
      maxTokens: gen.maxTokens ?? 8000,
      maxSourceChars: gen.maxSourceChars ?? 24000,
      caps,
    };
  }
  return { auto, temperature, ...autoDefaults(caps, safeSettings.provider), caps };
}