Project Files
src / services / ingestion / web.ts
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
/**
 * Builds a set of browser-like HTTP request headers for fetching `url`.
 *
 * The `Referer` header is set to the origin (scheme + host + port) of the
 * target URL itself; `User-Agent` and `Accept` are fixed desktop-Chrome-style
 * values.
 *
 * @param url - Absolute URL that is about to be requested.
 * @returns An object with `User-Agent`, `Accept` and `Referer` entries.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
function spoofHeaders(url: string) {
  const { origin } = new URL(url);
  const userAgent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
  const accept =
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";

  return {
    "User-Agent": userAgent,
    "Accept": accept,
    "Referer": origin,
  };
}
/**
 * Fetches a web page and extracts its main readable text using Readability.
 *
 * The page is requested with browser-like headers, parsed into a DOM with
 * jsdom, and run through Readability. The extracted plain text is truncated
 * to at most `contentLimit` UTF-16 code units, followed by a
 * `"... [Truncated]"` marker when truncation happened.
 *
 * @param url - Absolute URL of the page to fetch.
 * @param contentLimit - Maximum length of the returned text before the
 *   truncation marker is appended. Defaults to 20000.
 * @returns An object with `title` and `content`, plus `byline` and `siteName`
 *   when Readability produced an article. If Readability cannot parse the
 *   page, `title` is the URL and `content` is a fixed placeholder message.
 * @throws Error with a message prefixed by `"Web extraction failed: "` when
 *   the URL is invalid, the request fails, or the response status is not OK.
 */
export async function extractWebContent(url: string, contentLimit: number = 20000) {
  try {
    const headers = spoofHeaders(url);
    const response = await fetch(url, { headers });
    if (!response.ok) {
      // statusText is frequently empty (e.g. over HTTP/2), so always include
      // the numeric status code to keep the error diagnosable.
      const status = `${response.status} ${response.statusText}`.trim();
      throw new Error(`Failed to fetch ${url}: ${status}`);
    }

    const html = await response.text();
    const dom = new JSDOM(html, { url });
    try {
      const article = new Readability(dom.window.document).parse();
      if (!article) return { title: url, content: "Could not parse content." };

      let content = article.textContent || "";
      if (content.length > contentLimit) {
        let cut = Math.max(0, Math.floor(contentLimit));
        // Do not cut between the two halves of a surrogate pair: that would
        // leave a lone high surrogate (invalid text) at the end of the result.
        if (cut > 0) {
          const lastUnit = content.charCodeAt(cut - 1);
          if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
        }
        content = content.substring(0, cut) + "... [Truncated]";
      }

      return {
        title: article.title || url,
        content: content,
        byline: article.byline,
        siteName: article.siteName
      };
    } finally {
      // Release the jsdom window once the extracted strings are in hand.
      dom.window.close();
    }
  } catch (e: unknown) {
    // Anything can be thrown in JavaScript; only read `.message` from real Errors.
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`Web extraction failed: ${reason}`);
  }
}