"use strict";
// Minimal, dependency-free HTML → main-text extractor.
// Inspired by what get-single-web-page-content does in mrkrsl/web-search-mcp,
// but without Playwright or @mozilla/readability — we just strip noise tags,
// pick the largest reasonable content container, and de-tag/whitespace it.
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchAndExtract = fetchAndExtract;
exports.fetchAndExtractMany = fetchAndExtractMany;
const ranking_1 = require("./ranking");
// Elements whose whole subtree is removed before any text is extracted:
// non-visible payloads (script, style, ...) and page chrome (nav, footer, ...).
// Frozen so this shared module-level list cannot be mutated by accident.
const NOISE_TAGS = Object.freeze([
    "script",
    "style",
    "noscript",
    "template",
    "svg",
    "iframe",
    "form",
    "nav",
    "aside",
    "footer",
    "header",
]);
/**
 * Replaces every `<tag ...>...</tag>` element (case-insensitive) with a single
 * space, dropping the element together with everything inside it.
 *
 * @param {string} html - Markup to clean.
 * @param {string} tag - Element name; interpolated into a RegExp unescaped, so
 *   it must be a plain tag name.
 * @returns {string} The markup with all matching elements removed.
 */
function stripTag(html, tag) {
    // The lookahead requires the tag name to END here, so stripping "nav" does
    // not swallow a custom element such as <nav-bar> (a plain \b would match
    // before the hyphen). The closing tag tolerates whitespace: `</script >`.
    const re = new RegExp(`<${tag}(?=[\\s/>])[^>]*>[\\s\\S]*?</${tag}\\s*>`, "gi");
    return html.replace(re, " ");
}
/**
 * Swaps each terminated HTML comment (`<!-- ... -->`, possibly spanning
 * several lines) for a single space. An unterminated `<!--` is left alone.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup without comments.
 */
function stripComments(html) {
    const pieces = html.split(/<!--[\s\S]*?-->/);
    return pieces.join(" ");
}
// The only named character references this extractor understands
// (case-sensitive). Note that &nbsp; becomes a regular space.
const NAMED_ENTITIES = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
]);
/**
 * Decodes the handful of named entities in NAMED_ENTITIES plus decimal
 * (`&#39;`) and hexadecimal (`&#x27;` / `&#X27;`) character references.
 *
 * Decoding is a single pass, so text produced by one reference is never
 * decoded again: `&amp;lt;` yields the literal `&lt;`, not `<`.
 * Unknown names and code points above U+10FFFF are left exactly as written.
 *
 * @param {string} s - Text that may contain character references.
 * @returns {string} The decoded text.
 */
function decodeEntities(s) {
    return s.replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-z]+));/g, (match, dec, hex, name) => {
        if (name !== undefined) {
            return NAMED_ENTITIES.get(name) ?? match;
        }
        const codePoint = dec !== undefined
            ? Number.parseInt(dec, 10)
            : Number.parseInt(hex, 16);
        // fromCodePoint (unlike fromCharCode) handles astral characters such as
        // emoji, but throws past U+10FFFF — so keep those references verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    });
}
/**
 * Returns the inner markup of the first `<tag ...>...</tag>` element
 * (case-insensitive), or `null` when there is no such element. The match is
 * non-greedy: it ends at the first closing tag after the opening one.
 *
 * @param {string} html - Markup to search.
 * @param {string} tag - Element name; interpolated into a RegExp unescaped, so
 *   it must be a plain tag name.
 * @returns {string|null} Inner markup (possibly ""), or null if not found.
 */
function extractFirst(html, tag) {
    // The lookahead pins the end of the tag name so "main" does not match a
    // custom element like <main-nav>; the closing tag may carry whitespace.
    const re = new RegExp(`<${tag}(?=[\\s/>])[^>]*>([\\s\\S]*?)</${tag}\\s*>`, "i");
    const m = re.exec(html);
    return m ? m[1] : null;
}
/**
 * Flattens an HTML fragment into plain text.
 *
 * `<br>` and the closing tags of common block elements become newlines and
 * each `<li ...>` becomes a bullet, so words from adjacent blocks are not
 * glued together. All remaining tags turn into spaces, entities are decoded,
 * runs of tabs/spaces/NBSP collapse to one space per line, and consecutive
 * blank lines collapse to a single blank line.
 *
 * @param {string} html - Markup to flatten.
 * @returns {string} Trimmed plain text.
 */
function htmlToText(html) {
    // Structural markers first, while the tags are still there to be seen.
    let marked = html.replace(/<br\s*\/?>/gi, "\n");
    marked = marked.replace(/<\/(p|div|li|tr|h[1-6]|section|article|blockquote)>/gi, "\n");
    marked = marked.replace(/<li\b[^>]*>/gi, "• ");
    const plain = decodeEntities(marked.replace(/<[^>]+>/g, " "));
    const lines = [];
    let previousWasBlank = false;
    for (const rawLine of plain.split("\n")) {
        const line = rawLine.replace(/[\t \u00a0]+/g, " ").trim();
        const isBlank = line === "";
        // Keep a blank line only when the line before it had content.
        if (!isBlank || !previousWasBlank) {
            lines.push(line);
        }
        previousWasBlank = isBlank;
    }
    return lines.join("\n").trim();
}
/**
 * Chooses the markup most likely to hold the page's main content.
 *
 * The first `<article>` wins if its inner markup is longer than 200
 * characters, then the first `<main>` under the same rule. Otherwise the whole
 * input competes against every `<div>` match, ranked by extracted text length.
 *
 * NOTE(review): the div pattern is non-greedy, so a match stops at the first
 * `</div>` and nested divs are cut short. Because the bar to beat is the text
 * length of the entire input, a div looks unlikely to ever win — confirm
 * whether this fallback is meant to behave that way.
 *
 * @param {string} html - Markup (normally the page body).
 * @returns {string} Inner markup of the chosen container, or `html` itself.
 */
function pickMainContainer(html) {
    for (const tag of ["article", "main"]) {
        const inner = extractFirst(html, tag);
        if (inner && inner.length > 200) {
            return inner;
        }
    }
    let winner = html;
    let winnerTextLen = htmlToText(html).length;
    for (const match of html.matchAll(/<div\b[^>]*>([\s\S]*?)<\/div>/gi)) {
        const candidate = match[1];
        const candidateTextLen = htmlToText(candidate).length;
        if (candidateTextLen > winnerTextLen) {
            winner = candidate;
            winnerTextLen = candidateTextLen;
        }
    }
    return winner;
}
/**
 * Shrinks `text` to at most `budget` characters (string length, i.e. UTF-16
 * code units), favouring the paragraphs most relevant to `query`.
 *
 * Text already within budget is returned unchanged. Otherwise paragraphs
 * (separated by blank lines) are scored by how many of their tokens appear in
 * the query; the first paragraph is kept when it fits, then matching
 * paragraphs are added in descending relevance, each followed by its
 * neighbours for context, while they fit. Kept paragraphs are emitted in
 * document order with "[…]" marking gaps.
 *
 * Falls back to a plain head-truncate ending in "…" when there is no query,
 * the query has no tokens, the text has fewer than two paragraphs, nothing
 * matches, or no paragraph fits the budget.
 *
 * @param {string} text - Extracted page text.
 * @param {number} budget - Maximum length of the result.
 * @param {string} [query] - Search query used to rank paragraphs.
 * @returns {string} Text no longer than `budget`.
 */
function squeezeToBudget(text, budget, query) {
    if (text.length <= budget)
        return text;
    // slice(0, budget - 1) would count from the END for budget <= 0 and return
    // almost the whole text, so a non-positive budget yields "".
    const headFallback = () => (budget > 0 ? text.slice(0, budget - 1) + "…" : "");
    if (!query)
        return headFallback();
    const { tokenize } = ranking_1;
    const qTokens = new Set(tokenize(query));
    if (qTokens.size === 0)
        return headFallback();
    // Split into paragraphs by blank lines.
    const paragraphs = text.split(/\n\s*\n+/);
    if (paragraphs.length < 2)
        return headFallback();
    const ranked = paragraphs
        .map((p, idx) => {
            let hits = 0;
            for (const t of tokenize(p)) {
                if (qTokens.has(t))
                    hits++;
            }
            const density = p.length > 0 ? (hits * 100) / p.length : 0;
            return { idx, hits, density };
        })
        .filter((s) => s.hits > 0)
        .sort((a, b) => b.density - a.density || b.hits - a.hits);
    if (ranked.length === 0) {
        // No query terms in body — keep the head so the caller still gets
        // something structured.
        return headFallback();
    }
    const sepCost = 2; // "\n\n"
    const keep = new Set();
    let used = 0;
    // Adds paragraph `i` if it exists, is not already kept, and still fits.
    const tryKeep = (i) => {
        if (i < 0 || i >= paragraphs.length || keep.has(i))
            return;
        const cost = paragraphs[i].length + sepCost;
        if (used + cost > budget)
            return;
        keep.add(i);
        used += cost;
    };
    // The first paragraph is usually the lede / intro. Its cost is charged
    // like any other, so it cannot push relevant paragraphs past the budget.
    if (paragraphs[0].length > 0)
        tryKeep(0);
    for (const { idx } of ranked) {
        // The matching paragraph is budgeted before its neighbours so context
        // never crowds out the hit itself.
        tryKeep(idx);
        tryKeep(idx - 1);
        tryKeep(idx + 1);
        if (used >= budget)
            break;
    }
    if (keep.size === 0)
        return headFallback();
    const inOrder = [...keep].sort((a, b) => a - b);
    const out = [];
    for (let i = 0; i < inOrder.length; i++) {
        if (i > 0 && inOrder[i] !== inOrder[i - 1] + 1)
            out.push("[…]");
        out.push(paragraphs[inOrder[i]]);
    }
    let joined = out.join("\n\n");
    // Gap markers are not budgeted above, so keep a final hard cap.
    if (joined.length > budget)
        joined = joined.slice(0, budget - 1) + "…";
    return joined;
}
/**
 * Fetches `url` and returns its title and main text. Never throws: every
 * failure is reported through the `error` field of the result.
 *
 * Result fields: `url`, `final_url` (after redirects), `status` (0 when the
 * request itself failed), `title`, `content`, `truncated`, `byte_length`
 * (UTF-8 size of the decoded response text) and, on failure only, `error`
 * ("HTTP <status>", "Timeout", or the thrown error's message).
 *
 * Responses that neither declare an html/xml/text-plain content type nor start
 * with `<` are returned as raw text cut to `opts.maxContentLength`.
 *
 * @param {string} url - Address to fetch.
 * @param {object} opts - Reads `timeoutMs`, `userAgent`, `maxContentLength`
 *   and the optional `query` used to rank paragraphs when squeezing.
 * @returns {Promise<object>} The result described above.
 */
async function fetchAndExtract(url, opts) {
    const ctl = new AbortController();
    const timer = setTimeout(() => ctl.abort(), opts.timeoutMs);
    // One result shape for every outcome; overrides keep the key order below
    // and `error`, when given, is appended last.
    const result = (fields) => ({
        url,
        final_url: url,
        status: 0,
        title: "",
        content: "",
        truncated: false,
        byte_length: 0,
        ...fields,
    });
    try {
        const res = await fetch(url, {
            headers: {
                "User-Agent": opts.userAgent,
                Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            },
            signal: ctl.signal,
            redirect: "follow",
        });
        const finalUrl = res.url || url;
        const status = res.status;
        if (!res.ok) {
            // Best-effort release of the unread body so the connection is not
            // held until garbage collection; a failure here changes nothing.
            res.body?.cancel().catch(() => { });
            return result({ final_url: finalUrl, status, error: `HTTP ${status}` });
        }
        const ct = res.headers.get("content-type") ?? "";
        const raw = await res.text();
        // `raw.length` counts UTF-16 code units, not bytes; measure the UTF-8
        // encoding so non-ASCII pages are not under-reported.
        const byteLen = new TextEncoder().encode(raw).length;
        if (!/html|xml|text\/plain/i.test(ct) && !/^\s*</.test(raw)) {
            // Non-HTML: return truncated raw text.
            return result({
                final_url: finalUrl,
                status,
                content: raw.slice(0, opts.maxContentLength),
                truncated: raw.length > opts.maxContentLength,
                byte_length: byteLen,
            });
        }
        let html = stripComments(raw);
        for (const tag of NOISE_TAGS)
            html = stripTag(html, tag);
        const titleHtml = extractFirst(html, "title") ?? "";
        const title = decodeEntities(titleHtml.replace(/<[^>]+>/g, " ").trim()).slice(0, 300);
        const body = extractFirst(html, "body") ?? html;
        let content = htmlToText(pickMainContainer(body));
        const truncated = content.length > opts.maxContentLength;
        if (truncated) {
            content = squeezeToBudget(content, opts.maxContentLength, opts.query);
        }
        return result({
            final_url: finalUrl,
            status,
            title,
            content,
            truncated,
            byte_length: byteLen,
        });
    }
    catch (e) {
        // The only abort source here is the timer above, so AbortError means
        // the request ran past opts.timeoutMs.
        return result({
            error: e?.name === "AbortError" ? "Timeout" : String(e?.message ?? e),
        });
    }
    finally {
        clearTimeout(timer);
    }
}
/**
 * Runs fetchAndExtract over `urls` with a bounded number of requests in
 * flight and returns the results in the same order as `urls`.
 *
 * @param {string[]} urls - Addresses to fetch.
 * @param {object} opts - Passed unchanged to every fetchAndExtract call.
 * @param {number} concurrency - Desired parallelism, clamped to 1..8. A
 *   missing or non-numeric value means 1 (sequential).
 * @returns {Promise<object[]>} One result per URL, index-aligned with `urls`.
 */
async function fetchAndExtractMany(urls, opts, concurrency) {
    // A NaN here would propagate through Math.min/Math.max, start zero
    // workers, and hand back an array of holes without fetching anything.
    const numeric = Number(concurrency);
    const requested = Number.isNaN(numeric) ? 1 : numeric;
    const workerCount = Math.min(Math.max(1, Math.min(8, requested)), urls.length);
    const out = new Array(urls.length);
    let next = 0;
    // Each worker claims the next unclaimed index until none are left; the
    // claim is synchronous, so no index is handed out twice.
    async function worker() {
        while (true) {
            const i = next++;
            if (i >= urls.length)
                return;
            out[i] = await fetchAndExtract(urls[i], opts);
        }
    }
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return out;
}
//# sourceMappingURL=extract.js.map
// multi-search

// multi-search