// mr-drews-assignment-creator/lib/html-to-text.js

// lib/html-to-text.js — turn a fetched web page into clean, LLM-friendly text.
// Zero dependencies: pragmatic tag stripping + entity decoding.

// Named HTML entities this module knows how to decode, keyed by lowercase entity name.
// The table has a null prototype so that a lookup such as ENTITIES["constructor"] yields
// undefined instead of a member inherited from Object.prototype, and it is frozen because
// it is shared, read-only data.
// Note: nbsp maps to a plain space and the curly quotes map to straight ASCII quotes.
const ENTITIES = Object.freeze(Object.assign(Object.create(null), {
  amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " ",
  mdash: "—", ndash: "–", hellip: "…", rsquo: "'", lsquo: "'",
  rdquo: '"', ldquo: '"', copy: "©", reg: "®", trade: "™",
  deg: "°", frac12: "½", frac14: "¼", times: "×", divide: "÷",
  eacute: "é", egrave: "è", agrave: "à", ccedil: "ç", uuml: "ü", ouml: "ö", auml: "ä",
}));

/**
 * Decode HTML character references in a string.
 *
 * Handles hexadecimal (`&#x41;`), decimal (`&#65;`) and named (`&amp;`) references.
 * Named references are looked up case-insensitively in ENTITIES; names that are not
 * in the table are left untouched.
 *
 * All three forms are decoded in a single pass so that the output of one reference is
 * never re-interpreted as the start of another (e.g. `&#38;lt;` decodes to the literal
 * text `&lt;`, not `<`).
 *
 * @param {string} s - Text that may contain character references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeEntities(s) {
  return s.replace(
    // Entity names may contain digits after the first letter (frac12, frac14, ...).
    /&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi,
    (match, hex, dec, name) => {
      if (hex !== undefined) return safeChar(parseInt(hex, 16));
      if (dec !== undefined) return safeChar(parseInt(dec, 10));
      const key = name.toLowerCase();
      // Own-property check: "&constructor;" must not resolve to Object.prototype.constructor.
      return Object.prototype.hasOwnProperty.call(ENTITIES, key) ? ENTITIES[key] : match;
    },
  );
}

/**
 * Turn the numeric value of a character reference into its character.
 *
 * Returns an empty string for anything that is not a usable Unicode scalar value:
 * non-integers/NaN, values outside 1..0x10FFFF, and the surrogate range
 * 0xD800..0xDFFF. String.fromCodePoint() accepts surrogates and NUL without throwing,
 * so they are rejected explicitly here to keep lone surrogates and NUL bytes out of
 * the extracted text.
 *
 * @param {number} code - Code point parsed from a numeric reference.
 * @returns {string} The character, or "" when the code point is not usable.
 */
function safeChar(code) {
  const inRange = Number.isInteger(code) && code > 0 && code <= 0x10ffff;
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  return inRange && !isSurrogate ? String.fromCodePoint(code) : "";
}

/**
 * Extract the text of the first <title> element found in an HTML string.
 *
 * Character references are decoded, runs of whitespace are collapsed to single spaces,
 * and the result is trimmed and limited to 200 UTF-16 code units.
 *
 * @param {*} html - Markup to search; coerced to a string (null/undefined count as empty).
 * @returns {string} The cleaned title, or "" when there is no <title> element.
 */
export function extractTitle(html) {
  const MAX_LENGTH = 200;
  // "(?:\s[^>]*)?" allows attributes but stops "<title" from matching longer tag names.
  const m = String(html ?? "").match(/<title(?:\s[^>]*)?>([\s\S]*?)<\/title\s*>/i);
  if (!m) return "";
  const title = decodeEntities(m[1]).replace(/\s+/g, " ").trim().slice(0, MAX_LENGTH);
  // slice() counts UTF-16 code units, so the cut can fall between the two halves of a
  // surrogate pair; drop a dangling high surrogate rather than return ill-formed text.
  return title.replace(/[\uD800-\uDBFF]$/, "");
}

export function htmlToText(html) {
  let s = String(html);

  // Remove non-content blocks entirely
  s = s.replace(/<!--[\s\S]*?-->/g, " ");
  for (const tag of ["script", "style", "noscript", "svg", "iframe", "form", "nav", "footer", "header", "aside", "template", "button", "select"]) {
    s = s.replace(new RegExp(`<${tag}[\\s\\S]*?<\\/${tag}>`, "gi"), " ");
  }

  // Preserve structure: headings, paragraphs, list items, table cells, breaks
  s = s.replace(/<\/(h[1-6])>/gi, "\n\n");
  s = s.replace(/<(h[1-6])[^>]*>/gi, "\n\n## ");
  s = s.replace(/<\/(p|div|section|article|blockquote|tr|table|ul|ol|figcaption)>/gi, "\n");
  s = s.replace(/<li[^>]*>/gi, "\n- ");
  s = s.replace(/<(td|th)[^>]*>/gi, " | ");
  s = s.replace(/<br\s*\/?>/gi, "\n");

  // Strip all remaining tags
  s = s.replace(/<[^>]+>/g, " ");

  s = decodeEntities(s);

  // Normalize whitespace
  s = s.replace(/\r/g, "");
  s = s.replace(/[ \t]+/g, " ");
  s = s.replace(/ ?\n ?/g, "\n");
  s = s.replace(/\n{3,}/g, "\n\n");

  // Drop very short junk lines (menus, single links) when the doc is large
  const lines = s.split("\n").map((l) => l.trim());
  const kept = [];
  for (const line of lines) {
    if (!line) { kept.push(""); continue; }
    if (line.length < 3 && !/^[-#\d]/.test(line)) continue;
    kept.push(line);
  }
  s = kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
  return s;
}