// src/fetch.ts
// Desktop-browser User-Agent sent with page fetches; some sites serve
// stripped-down or blocked responses to unrecognized/bot agents.
const UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " +
  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
/**
 * Reduce raw HTML to readable plain text: drop boilerplate containers
 * (script/style/nav/footer/aside/header/noscript) with their contents,
 * strip all remaining tags, decode the most common HTML entities, and
 * collapse whitespace runs to single spaces.
 *
 * Fix: the entity-decoding chain had degenerated into no-ops
 * (e.g. `.replace(/&/g, "&")`); restored the intended `&amp;` etc.
 * decodes, and collapse now uses `\s+` so lone tabs/newlines also
 * become spaces.
 */
function cleanHtml(html: string): string {
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<nav[\s\S]*?<\/nav>/gi, " ")
    .replace(/<footer[\s\S]*?<\/footer>/gi, " ")
    .replace(/<aside[\s\S]*?<\/aside>/gi, " ")
    .replace(/<header[\s\S]*?<\/header>/gi, " ")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    // Decode common entities. `&amp;` is decoded LAST so that a literal
    // "&amp;lt;" in the source becomes "&lt;" rather than "<".
    .replace(/&nbsp;/gi, " ")
    .replace(/&lt;/gi, "<")
    .replace(/&gt;/gi, ">")
    .replace(/&quot;/gi, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/gi, "&")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Pull the text content of the first <title> element, trimmed.
 * Returns "" when no title tag (with non-empty text) is present.
 */
function extractTitle(html: string): string {
  const hit = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  if (!hit) {
    return "";
  }
  return hit[1].trim();
}
/**
 * Best-effort publication-date extraction, returned as "YYYY-MM-DD".
 * Tries (in order): OpenGraph/schema.org meta tags in either attribute
 * order, a JSON-LD "datePublished" value, and a <time datetime="…">
 * attribute; falls back to a /YYYY/MM/DD/ segment in the URL path.
 * Returns null when nothing parseable is found.
 */
function extractPublishedDate(html: string, url: string): string | null {
  const patterns: RegExp[] = [
    /<meta[^>]+(?:property|name)="(?:article:published_time|datePublished)"[^>]+content="([^"]+)"/i,
    /<meta[^>]+content="([^"]+)"[^>]+(?:property|name)="(?:article:published_time|datePublished)"/i,
    /"datePublished"\s*:\s*"([^"]+)"/,
    /<time[^>]+datetime="([^"]+)"/i,
  ];
  for (const pattern of patterns) {
    const raw = pattern.exec(html)?.[1];
    if (!raw) continue;
    const parsed = new Date(raw);
    // Skip values Date cannot parse and keep trying later patterns.
    if (!isNaN(parsed.getTime())) {
      return parsed.toISOString().slice(0, 10);
    }
  }
  const fromUrl = /\/(20\d{2})\/(0[1-9]|1[0-2])\/(0[1-9]|[12]\d|3[01])\//.exec(url);
  return fromUrl ? `${fromUrl[1]}-${fromUrl[2]}-${fromUrl[3]}` : null;
}
/** Result of fetching and analyzing a single web page via {@link fetchPage}. */
export interface FetchedPage {
  /** The URL that was requested (as passed to fetchPage, not post-redirect). */
  url: string;
  /** Text of the page's <title> tag, or "" when absent. */
  title: string;
  /** Tag-stripped, entity-decoded page text, truncated to 20 000 characters. */
  bodyText: string;
  /** Whitespace-separated word count of bodyText. */
  wordCount: number;
  /** Publication date as "YYYY-MM-DD", or null when none could be found. */
  publishedDate: string | null;
  /** Up to 15 capitalized phrases/repeated proper nouns found in bodyText. */
  namedEntities: string[];
  /** True when the page carries JSON-LD or og:/twitter: meta markup. */
  hasSchemaMarkup: boolean;
  /** True when the page contains at least one <h2> or <h3> element. */
  hasHeadings: boolean;
  /** True when the URL path contains a clickbait-style segment (e.g. /top-10/). */
  isClickbait: boolean;
}
/**
 * Heuristic named-entity extraction: collects multi-word capitalized
 * phrases (2–4 words) plus single capitalized words that appear at least
 * twice, drops common sentence-starter stop words, and returns at most
 * 15 entities in first-seen order.
 */
export function extractNamedEntities(text: string): string[] {
  const entities = new Set<string>();

  // Multi-word capitalized phrases, e.g. "New York Times".
  for (const hit of text.matchAll(/\b[A-Z][a-z]+(?:\s+[A-Z][a-zA-Z0-9\-]+){1,3}\b/g)) {
    entities.add(hit[0]);
  }

  // Single capitalized words, kept only when they recur (>= 2 occurrences).
  const frequency = new Map<string, number>();
  for (const hit of text.matchAll(/\b[A-Z][a-zA-Z]{2,}\b/g)) {
    frequency.set(hit[0], (frequency.get(hit[0]) ?? 0) + 1);
  }
  for (const [word, occurrences] of frequency) {
    if (occurrences >= 2) {
      entities.add(word);
    }
  }

  // Capitalized-because-sentence-initial words that are not entities.
  const stopWords = new Set(["The","This","That","These","Those","There","Their","They","What",
    "When","Where","Which","While","With","Without","Would","Could","Should","About",
    "After","Before","Between","During","From","Into","More","Most","Also","Each",
    "Such","Some","Other","Than","Then","Been","Being","Have","Having","However",
    "Therefore","Furthermore","Moreover","Although","Because"]);

  return [...entities].filter((candidate) => !stopWords.has(candidate)).slice(0, 15);
}
/**
 * Fetch a URL with a browser-like User-Agent and a hard timeout, then
 * derive page-quality signals from the raw HTML.
 *
 * @param url       Page to fetch.
 * @param timeoutMs Abort the request after this many milliseconds (default 15s).
 * @returns Parsed page content and heuristics; see {@link FetchedPage}.
 * @throws On network failure, abort/timeout, or a non-2xx HTTP status.
 */
export async function fetchPage(url: string, timeoutMs = 15_000): Promise<FetchedPage> {
  const abort = new AbortController();
  const watchdog = setTimeout(() => abort.abort(), timeoutMs);

  let html = "";
  try {
    const response = await fetch(url, {
      signal: abort.signal,
      headers: {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
      },
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    html = await response.text();
  } finally {
    // Always disarm the timer, even when fetch throws.
    clearTimeout(watchdog);
  }

  // Structural signals are read from the RAW html (tags intact).
  const hasSchemaMarkup =
    /<script[^>]+type="application\/ld\+json"/i.test(html) ||
    /(?:og:|twitter:)(?:title|description)/i.test(html);
  const hasHeadings = /<h[23][^>]*>/i.test(html);

  // Clickbait check is path-based only; an unparseable URL counts as "no".
  let isClickbait = false;
  try {
    const path = new URL(url).pathname.toLowerCase();
    isClickbait = /\/(top-10|shocking|you-wont-believe|click|viral)\//.test(path);
  } catch {
    isClickbait = false;
  }

  const bodyText = cleanHtml(html).slice(0, 20_000);
  return {
    url,
    title: extractTitle(html),
    bodyText,
    wordCount: bodyText.split(/\s+/).filter(Boolean).length,
    publishedDate: extractPublishedDate(html, url),
    namedEntities: extractNamedEntities(bodyText),
    hasSchemaMarkup,
    hasHeadings,
    isClickbait,
  };
}
/** A Wikipedia summary used to ground/verify a named entity. */
export interface WikipediaGrounding {
  /** Canonical article title when Wikipedia returned one, else the query entity. */
  entity: string;
  /** Article extract (falls back to the short description, or ""). */
  summary: string;
}
/**
 * Look up an entity on the Wikipedia REST summary endpoint (8s timeout).
 * Best-effort: any HTTP error, network failure, or timeout yields null
 * rather than throwing.
 */
export async function fetchWikipediaSummary(entity: string): Promise<WikipediaGrounding | null> {
  // Wikipedia titles use underscores for spaces.
  const title = encodeURIComponent(entity.replace(/\s+/g, "_"));
  const endpoint = `https://en.wikipedia.org/api/rest_v1/page/summary/${title}`;

  const abort = new AbortController();
  const watchdog = setTimeout(() => abort.abort(), 8_000);
  try {
    const response = await fetch(endpoint, {
      signal: abort.signal,
      headers: { "User-Agent": "research-plugin/1.0" },
    });
    if (!response.ok) {
      return null;
    }
    const payload = await response.json() as { title?: string; description?: string; extract?: string };
    return {
      entity: payload.title ?? entity,
      summary: payload.extract ?? payload.description ?? "",
    };
  } catch {
    // Network error or abort — treat as "no grounding available".
    return null;
  } finally {
    clearTimeout(watchdog);
  }
}