// Forked from altra/research
// fetch.js
"use strict";
// CommonJS prelude emitted by the TypeScript compiler: flag the module for
// ES-module interop and bind the public API onto `exports`. The referenced
// functions are hoisted declarations defined later in this file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractNamedEntities = extractNamedEntities;
exports.fetchPage = fetchPage;
exports.fetchWikipediaSummary = fetchWikipediaSummary;
// Desktop-browser User-Agent used for page fetches; some sites serve reduced
// or blocked responses to non-browser agents.
const UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " +
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
/**
 * Strip an HTML document down to readable plain text: remove non-content
 * containers (scripts, styles, nav/footer/aside/header, noscript), drop all
 * remaining tags, decode the most common HTML entities, and collapse
 * whitespace.
 *
 * Fix: the entity-decoding steps had been corrupted into no-ops (the entity
 * names themselves were decoded in the source, e.g. `.replace(/&/g, "&")`);
 * the original `&nbsp;`/`&amp;`/`&lt;`/`&gt;`/`&quot;`/`&#39;` patterns are
 * restored. `&amp;` is decoded LAST so that escaped markup such as
 * `&amp;lt;` correctly becomes `&lt;` rather than being double-decoded to `<`.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Whitespace-normalized, entity-decoded body text.
 */
function cleanHtml(html) {
    return html
        .replace(/<script[\s\S]*?<\/script>/gi, " ")
        .replace(/<style[\s\S]*?<\/style>/gi, " ")
        .replace(/<nav[\s\S]*?<\/nav>/gi, " ")
        .replace(/<footer[\s\S]*?<\/footer>/gi, " ")
        .replace(/<aside[\s\S]*?<\/aside>/gi, " ")
        .replace(/<header[\s\S]*?<\/header>/gi, " ")
        .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
        .replace(/<[^>]+>/g, " ")
        // Decode common entities; ampersand last to avoid double-decoding.
        .replace(/&nbsp;/g, " ")
        .replace(/&lt;/g, "<")
        .replace(/&gt;/g, ">")
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&amp;/g, "&")
        .replace(/\s{2,}/g, " ")
        .trim();
}
/**
 * Return the trimmed text of the first <title> element, or "" when the
 * document has no parseable title.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Title text, trimmed; empty string if absent.
 */
function extractTitle(html) {
    const match = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
    if (!match) {
        return "";
    }
    return match[1].trim();
}
/**
 * Best-effort publication-date extraction. Tries, in order: meta tags
 * (article:published_time / datePublished in either attribute order), JSON-LD
 * "datePublished", and <time datetime="...">; any hit that parses as a valid
 * date is returned as "YYYY-MM-DD". Falls back to a /YYYY/MM/DD/ segment in
 * the URL path.
 *
 * @param {string} html - Raw HTML markup.
 * @param {string} url - Page URL, used for the path-based fallback.
 * @returns {string|null} ISO date (date part only), or null when nothing matched.
 */
function extractPublishedDate(html, url) {
    const candidates = [
        /<meta[^>]+(?:property|name)="(?:article:published_time|datePublished)"[^>]+content="([^"]+)"/i,
        /<meta[^>]+content="([^"]+)"[^>]+(?:property|name)="(?:article:published_time|datePublished)"/i,
        /"datePublished"\s*:\s*"([^"]+)"/,
        /<time[^>]+datetime="([^"]+)"/i,
    ];
    for (const pattern of candidates) {
        const hit = pattern.exec(html);
        if (!hit?.[1]) {
            continue;
        }
        const parsed = new Date(hit[1]);
        if (!isNaN(parsed.getTime())) {
            return parsed.toISOString().slice(0, 10);
        }
    }
    // Fallback: a date baked into the URL path, e.g. /2021/03/09/.
    const fromUrl = /\/(20\d{2})\/(0[1-9]|1[0-2])\/(0[1-9]|[12]\d|3[01])\//.exec(url);
    return fromUrl ? `${fromUrl[1]}-${fromUrl[2]}-${fromUrl[3]}` : null;
}
/**
 * Heuristically pull candidate named entities from plain text: capitalized
 * multi-word phrases (2-4 words, e.g. "New York Times") plus single
 * capitalized words that occur at least twice, with common sentence-starting
 * stop words removed. At most 15 entities are returned, in discovery order.
 *
 * @param {string} text - Plain text to scan.
 * @returns {string[]} Up to 15 candidate entity strings.
 */
function extractNamedEntities(text) {
    const candidates = new Set();
    // Multi-word capitalized phrases.
    for (const match of text.matchAll(/\b[A-Z][a-z]+(?:\s+[A-Z][a-zA-Z0-9\-]+){1,3}\b/g)) {
        candidates.add(match[0]);
    }
    // Single capitalized words, kept only when they recur.
    const frequency = new Map();
    for (const match of text.matchAll(/\b[A-Z][a-zA-Z]{2,}\b/g)) {
        frequency.set(match[0], (frequency.get(match[0]) ?? 0) + 1);
    }
    for (const [word, count] of frequency) {
        if (count >= 2) {
            candidates.add(word);
        }
    }
    const stopWords = new Set(["The", "This", "That", "These", "Those", "There", "Their", "They", "What",
        "When", "Where", "Which", "While", "With", "Without", "Would", "Could", "Should", "About",
        "After", "Before", "Between", "During", "From", "Into", "More", "Most", "Also", "Each",
        "Such", "Some", "Other", "Than", "Then", "Been", "Being", "Have", "Having", "However",
        "Therefore", "Furthermore", "Moreover", "Although", "Because"]);
    return [...candidates].filter((entity) => !stopWords.has(entity)).slice(0, 15);
}
/**
 * Fetch a web page and derive lightweight quality signals from its markup:
 * title, publication date, stripped body text (capped at 20k chars), word
 * count, candidate named entities, presence of schema/OpenGraph markup and
 * h2/h3 headings, and a crude URL-path clickbait flag.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [timeoutMs=15000] - Abort the request after this many ms.
 * @returns {Promise<object>} Page signals keyed by url, title, bodyText,
 *   wordCount, publishedDate, namedEntities, hasSchemaMarkup, hasHeadings,
 *   isClickbait.
 * @throws {Error} `HTTP <status>` on a non-2xx response; AbortError on timeout.
 */
async function fetchPage(url, timeoutMs = 15_000) {
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    let html = "";
    try {
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                "User-Agent": UA,
                "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        html = await response.text();
    }
    finally {
        clearTimeout(timeoutHandle);
    }
    // Structural signals taken straight from the raw markup.
    const title = extractTitle(html);
    const publishedDate = extractPublishedDate(html, url);
    const hasSchemaMarkup = /<script[^>]+type="application\/ld\+json"/i.test(html) ||
        /(?:og:|twitter:)(?:title|description)/i.test(html);
    const hasHeadings = /<h[23][^>]*>/i.test(html);
    // Crude clickbait heuristic based solely on the URL path; an unparseable
    // URL simply counts as not-clickbait.
    let isClickbait = false;
    try {
        const path = new URL(url).pathname.toLowerCase();
        isClickbait = /\/(top-10|shocking|you-wont-believe|click|viral)\//.test(path);
    }
    catch {
        isClickbait = false;
    }
    // Text-level signals from the stripped body.
    const bodyText = cleanHtml(html).slice(0, 20_000);
    const wordCount = bodyText.split(/\s+/).filter(Boolean).length;
    const namedEntities = extractNamedEntities(bodyText);
    return { url, title, bodyText, wordCount, publishedDate, namedEntities, hasSchemaMarkup, hasHeadings, isClickbait };
}
/**
 * Look up a short Wikipedia summary for an entity via the REST v1 summary API.
 * Resolves to { entity, summary } on success; resolves to null on any failure
 * (non-2xx response, network error, bad JSON, or the 8-second timeout).
 *
 * @param {string} entity - Entity name; spaces become underscores in the slug.
 * @returns {Promise<{entity: string, summary: string}|null>}
 */
async function fetchWikipediaSummary(entity) {
    const slug = encodeURIComponent(entity.replace(/\s+/g, "_"));
    const endpoint = `https://en.wikipedia.org/api/rest_v1/page/summary/${slug}`;
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), 8_000);
    try {
        const response = await fetch(endpoint, {
            signal: controller.signal,
            headers: { "User-Agent": "research-plugin/1.0" },
        });
        if (!response.ok) {
            return null;
        }
        const payload = await response.json();
        return {
            // Prefer Wikipedia's canonical title over the caller's spelling.
            entity: payload.title ?? entity,
            summary: payload.extract ?? payload.description ?? "",
        };
    }
    catch {
        // Best-effort lookup: swallow network/abort/parse errors.
        return null;
    }
    finally {
        clearTimeout(timeoutHandle);
    }
}