// Forked from altra/research
// fetch.js
"use strict";
// CommonJS prelude emitted by the TypeScript compiler: flag the module for
// ES-module interop and bind the public API onto `exports`. The referenced
// functions are hoisted declarations defined later in this file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractNamedEntities = extractNamedEntities;
exports.fetchPage = fetchPage;
exports.fetchWikipediaSummary = fetchWikipediaSummary;
// Desktop-browser User-Agent used for page fetches; some sites serve reduced
// or blocked responses to non-browser agents.
const UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " +
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
/**
 * Strip an HTML document down to readable plain text: remove non-content
 * containers (scripts, styles, nav/footer/aside/header, noscript), drop all
 * remaining tags, decode the most common HTML entities, and collapse
 * whitespace.
 *
 * Fix: the entity-decoding steps had been corrupted into no-ops (the entity
 * names themselves were decoded in the source, e.g. `.replace(/&/g, "&")`);
 * the original `&nbsp;`/`&amp;`/`&lt;`/`&gt;`/`&quot;`/`&#39;` patterns are
 * restored. `&amp;` is decoded LAST so that escaped markup such as
 * `&amp;lt;` correctly becomes `&lt;` rather than being double-decoded to `<`.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Whitespace-normalized, entity-decoded body text.
 */
function cleanHtml(html) {
    return html
        .replace(/<script[\s\S]*?<\/script>/gi, " ")
        .replace(/<style[\s\S]*?<\/style>/gi, " ")
        .replace(/<nav[\s\S]*?<\/nav>/gi, " ")
        .replace(/<footer[\s\S]*?<\/footer>/gi, " ")
        .replace(/<aside[\s\S]*?<\/aside>/gi, " ")
        .replace(/<header[\s\S]*?<\/header>/gi, " ")
        .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
        .replace(/<[^>]+>/g, " ")
        // Decode common entities; ampersand last to avoid double-decoding.
        .replace(/&nbsp;/g, " ")
        .replace(/&lt;/g, "<")
        .replace(/&gt;/g, ">")
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&amp;/g, "&")
        .replace(/\s{2,}/g, " ")
        .trim();
}
/**
 * Return the trimmed text of the first <title> element, or "" when the
 * document has no parseable title.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Title text, trimmed; empty string if absent.
 */
function extractTitle(html) {
    const match = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
    if (!match) {
        return "";
    }
    return match[1].trim();
}
/**
 * Best-effort publication-date extraction. Tries, in order: meta tags
 * (article:published_time / datePublished in either attribute order), JSON-LD
 * "datePublished", and <time datetime="...">; any hit that parses as a valid
 * date is returned as "YYYY-MM-DD". Falls back to a /YYYY/MM/DD/ segment in
 * the URL path.
 *
 * @param {string} html - Raw HTML markup.
 * @param {string} url - Page URL, used for the path-based fallback.
 * @returns {string|null} ISO date (date part only), or null when nothing matched.
 */
function extractPublishedDate(html, url) {
    const candidates = [
        /<meta[^>]+(?:property|name)="(?:article:published_time|datePublished)"[^>]+content="([^"]+)"/i,
        /<meta[^>]+content="([^"]+)"[^>]+(?:property|name)="(?:article:published_time|datePublished)"/i,
        /"datePublished"\s*:\s*"([^"]+)"/,
        /<time[^>]+datetime="([^"]+)"/i,
    ];
    for (const pattern of candidates) {
        const hit = pattern.exec(html);
        if (!hit?.[1]) {
            continue;
        }
        const parsed = new Date(hit[1]);
        if (!isNaN(parsed.getTime())) {
            return parsed.toISOString().slice(0, 10);
        }
    }
    // Fallback: a date baked into the URL path, e.g. /2021/03/09/.
    const fromUrl = /\/(20\d{2})\/(0[1-9]|1[0-2])\/(0[1-9]|[12]\d|3[01])\//.exec(url);
    return fromUrl ? `${fromUrl[1]}-${fromUrl[2]}-${fromUrl[3]}` : null;
}
/**
 * Heuristically pull candidate named entities from plain text: capitalized
 * multi-word phrases (2-4 words, e.g. "New York Times") plus single
 * capitalized words that occur at least twice, with common sentence-starting
 * stop words removed. At most 15 entities are returned, in discovery order.
 *
 * @param {string} text - Plain text to scan.
 * @returns {string[]} Up to 15 candidate entity strings.
 */
function extractNamedEntities(text) {
    const candidates = new Set();
    // Multi-word capitalized phrases.
    for (const match of text.matchAll(/\b[A-Z][a-z]+(?:\s+[A-Z][a-zA-Z0-9\-]+){1,3}\b/g)) {
        candidates.add(match[0]);
    }
    // Single capitalized words, kept only when they recur.
    const frequency = new Map();
    for (const match of text.matchAll(/\b[A-Z][a-zA-Z]{2,}\b/g)) {
        frequency.set(match[0], (frequency.get(match[0]) ?? 0) + 1);
    }
    for (const [word, count] of frequency) {
        if (count >= 2) {
            candidates.add(word);
        }
    }
    const stopWords = new Set(["The", "This", "That", "These", "Those", "There", "Their", "They", "What",
        "When", "Where", "Which", "While", "With", "Without", "Would", "Could", "Should", "About",
        "After", "Before", "Between", "During", "From", "Into", "More", "Most", "Also", "Each",
        "Such", "Some", "Other", "Than", "Then", "Been", "Being", "Have", "Having", "However",
        "Therefore", "Furthermore", "Moreover", "Although", "Because"]);
    return [...candidates].filter((entity) => !stopWords.has(entity)).slice(0, 15);
}
/**
 * Fetch a web page and derive lightweight quality signals from its markup:
 * title, publication date, stripped body text (capped at 20k chars), word
 * count, candidate named entities, presence of schema/OpenGraph markup and
 * h2/h3 headings, and a crude URL-path clickbait flag.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [timeoutMs=15000] - Abort the request after this many ms.
 * @returns {Promise<object>} Page signals keyed by url, title, bodyText,
 *   wordCount, publishedDate, namedEntities, hasSchemaMarkup, hasHeadings,
 *   isClickbait.
 * @throws {Error} `HTTP <status>` on a non-2xx response; AbortError on timeout.
 */
async function fetchPage(url, timeoutMs = 15_000) {
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    let html = "";
    try {
        const response = await fetch(url, {
            signal: controller.signal,
            headers: {
                "User-Agent": UA,
                "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        html = await response.text();
    }
    finally {
        clearTimeout(timeoutHandle);
    }
    // Structural signals taken straight from the raw markup.
    const title = extractTitle(html);
    const publishedDate = extractPublishedDate(html, url);
    const hasSchemaMarkup = /<script[^>]+type="application\/ld\+json"/i.test(html) ||
        /(?:og:|twitter:)(?:title|description)/i.test(html);
    const hasHeadings = /<h[23][^>]*>/i.test(html);
    // Crude clickbait heuristic based solely on the URL path; an unparseable
    // URL simply counts as not-clickbait.
    let isClickbait = false;
    try {
        const path = new URL(url).pathname.toLowerCase();
        isClickbait = /\/(top-10|shocking|you-wont-believe|click|viral)\//.test(path);
    }
    catch {
        isClickbait = false;
    }
    // Text-level signals from the stripped body.
    const bodyText = cleanHtml(html).slice(0, 20_000);
    const wordCount = bodyText.split(/\s+/).filter(Boolean).length;
    const namedEntities = extractNamedEntities(bodyText);
    return { url, title, bodyText, wordCount, publishedDate, namedEntities, hasSchemaMarkup, hasHeadings, isClickbait };
}
/**
 * Look up a short Wikipedia summary for an entity via the REST v1 summary API.
 * Resolves to { entity, summary } on success; resolves to null on any failure
 * (non-2xx response, network error, bad JSON, or the 8-second timeout).
 *
 * @param {string} entity - Entity name; spaces become underscores in the slug.
 * @returns {Promise<{entity: string, summary: string}|null>}
 */
async function fetchWikipediaSummary(entity) {
    const slug = encodeURIComponent(entity.replace(/\s+/g, "_"));
    const endpoint = `https://en.wikipedia.org/api/rest_v1/page/summary/${slug}`;
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), 8_000);
    try {
        const response = await fetch(endpoint, {
            signal: controller.signal,
            headers: { "User-Agent": "research-plugin/1.0" },
        });
        if (!response.ok) {
            return null;
        }
        const payload = await response.json();
        return {
            // Prefer Wikipedia's canonical title over the caller's spelling.
            entity: payload.title ?? entity,
            summary: payload.extract ?? payload.description ?? "",
        };
    }
    catch {
        // Best-effort lookup: swallow network/abort/parse errors.
        return null;
    }
    finally {
        clearTimeout(timeoutHandle);
    }
}