bizzle 5a51a0f112 Mr. Drew's Assignment Creator — Docker share build
Self-contained Dockerized build for end users. Run via docker compose;
see README.md for setup. Source-only, no sample data or build artifacts.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-21 19:58:36 -04:00

68 lines
2.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// lib/html-to-text.js — turn a fetched web page into clean, LLM-friendly text.
// Zero dependencies: pragmatic tag stripping + entity decoding.
// Named character references understood by decodeEntities, keyed by the
// lower-case entity name (without the leading "&" and trailing ";").
// The table is frozen because it is shared, read-only lookup data.
const ENTITIES = Object.freeze({
  // Null prototype: a lookup such as ENTITIES["constructor"] must yield
  // undefined rather than a member inherited from Object.prototype.
  __proto__: null,
  amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " ",
  // ndash is written as an escape so the character cannot be silently lost
  // by tooling that mangles look-alike Unicode (it was previously an empty
  // string, which made "&ndash;" vanish from the output).
  mdash: "—", ndash: "\u2013", hellip: "…", rsquo: "'", lsquo: "'",
  rdquo: '"', ldquo: '"', copy: "©", reg: "®", trade: "™",
  deg: "°", frac12: "½", frac14: "¼", times: "×", divide: "÷",
  eacute: "é", egrave: "è", agrave: "à", ccedil: "ç", uuml: "ü", ouml: "ö", auml: "ä",
});
/**
 * Decode HTML character references in a string.
 *
 * Handles hexadecimal (`&#x41;`), decimal (`&#65;`) and named (`&amp;`)
 * references. Named references are looked up case-insensitively in ENTITIES;
 * names that are not in the table are left in the text untouched.
 *
 * @param {string} s - Text that may contain character references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeEntities(s) {
  // One pass over the input: text produced by a replacement is never scanned
  // again, so "&#38;lt;" decodes to "&lt;" instead of being decoded twice.
  // Names may contain digits after the first letter (frac12, frac14).
  return s.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi,
    (match, hex, dec, name) => {
      if (hex !== undefined) return safeChar(parseInt(hex, 16));
      if (dec !== undefined) return safeChar(parseInt(dec, 10));
      const key = name.toLowerCase();
      // Own-property check: "&constructor;" must not resolve to something
      // inherited from Object.prototype.
      return Object.prototype.hasOwnProperty.call(ENTITIES, key) ? ENTITIES[key] : match;
    }
  );
}
/**
 * Convert a numeric code point into a one-character string, or "" when the
 * number cannot be represented as well-formed text.
 *
 * Rejected values: anything that is not an integer (including NaN), values
 * outside 1..0x10FFFF, and the surrogate range 0xD800..0xDFFF. NUL and lone
 * surrogates are excluded because String.fromCodePoint would accept them and
 * leave a control character or an unpaired surrogate in the output.
 *
 * @param {number} code - Candidate Unicode code point.
 * @returns {string} The character for `code`, or "" if it is not usable.
 */
function safeChar(code) {
  const inRange = Number.isInteger(code) && code > 0 && code <= 0x10ffff;
  const isSurrogate = code >= 0xd800 && code <= 0xdfff;
  return inRange && !isSurrogate ? String.fromCodePoint(code) : "";
}
/**
 * Extract the text of the first `<title>` element from an HTML document.
 *
 * Entities are decoded, runs of whitespace collapse to single spaces, and the
 * result is capped at 200 UTF-16 code units.
 *
 * @param {*} html - HTML source; coerced with String(), as htmlToText does.
 * @returns {string} The cleaned title, or "" when no `<title>` element is found.
 */
export function extractTitle(html) {
  // The lookahead ends the tag name at "title", so elements that merely start
  // with it (e.g. <titlebar>, <title-card>) are not mistaken for the title.
  const found = String(html).match(/<title(?=[\s/>])[^>]*>([\s\S]*?)<\/title>/i);
  if (!found) return "";
  const title = decodeEntities(found[1]).replace(/\s+/g, " ").trim();
  if (title.length <= 200) return title;
  // Cutting at a fixed code-unit offset can split a surrogate pair; drop the
  // dangling high surrogate so the result stays well-formed.
  return title.slice(0, 200).replace(/[\uD800-\uDBFF]$/, "");
}
// Elements whose entire content is discarded by htmlToText. The patterns are
// compiled once rather than on every call. The lookahead after the tag name
// keeps e.g. "<header" from matching "<header-bar>", and the closing tag may
// carry trailing whitespace ("</script >").
const NON_CONTENT_BLOCKS = [
  "script", "style", "noscript", "svg", "iframe", "form", "nav", "footer",
  "header", "aside", "template", "button", "select",
].map((tag) => new RegExp(`<${tag}(?=[\\s/>])[\\s\\S]*?<\\/${tag}\\s*>`, "gi"));

/**
 * Convert an HTML document into plain text using regex-based tag stripping.
 *
 * Comments and non-content elements (scripts, styles, navigation, forms, ...)
 * are removed with their content. Headings become "## " lines, list items
 * become "- " lines, table cells are prefixed with " | ", and `<br>` and
 * block-level closing tags become newlines. Every remaining tag is replaced
 * by a space, entities are decoded, and whitespace is normalised.
 *
 * @param {*} html - HTML source; coerced with String().
 * @returns {string} The extracted text, trimmed, with at most one blank line
 *   between paragraphs.
 */
export function htmlToText(html) {
  let s = String(html);

  // Remove non-content blocks entirely. The elements are processed one tag at
  // a time, in list order, so scripts and styles are gone before the
  // structural containers that may enclose them are matched.
  s = s.replace(/<!--[\s\S]*?-->/g, " ");
  for (const pattern of NON_CONTENT_BLOCKS) {
    s = s.replace(pattern, " ");
  }

  // Preserve structure: headings, paragraphs, list items, table cells, breaks.
  // Opening-tag patterns end the tag name with a lookahead so that <link> is
  // not treated as <li>, nor <thead> as <th>.
  s = s.replace(/<\/(h[1-6])>/gi, "\n\n");
  s = s.replace(/<(h[1-6])(?=[\s/>])[^>]*>/gi, "\n\n## ");
  s = s.replace(/<\/(p|div|section|article|blockquote|tr|table|ul|ol|figcaption)>/gi, "\n");
  s = s.replace(/<li(?=[\s/>])[^>]*>/gi, "\n- ");
  s = s.replace(/<(td|th)(?=[\s/>])[^>]*>/gi, " | ");
  // Accept attributes on <br> (e.g. <br class="clear">).
  s = s.replace(/<br(?=[\s/>])[^>]*>/gi, "\n");

  // Strip all remaining tags, then decode entities. Decoding comes last so
  // that escaped markup such as "&lt;b&gt;" survives as literal text.
  s = s.replace(/<[^>]+>/g, " ");
  s = decodeEntities(s);

  // Normalize whitespace
  s = s.replace(/\r/g, "");
  s = s.replace(/[ \t]+/g, " ");
  s = s.replace(/ ?\n ?/g, "\n");
  s = s.replace(/\n{3,}/g, "\n\n");

  // Drop very short junk lines (stray menu items, lone symbols): anything
  // under 3 characters is discarded unless it starts with "-", "#" or a digit.
  // Blank lines are kept here because they separate paragraphs.
  const kept = s
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line === "" || line.length >= 3 || /^[-#\d]/.test(line));

  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}