Project Files
dist / services / ingestion / web.js
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractWebContent = extractWebContent;
const jsdom_1 = require("jsdom");
const readability_1 = require("@mozilla/readability");
/**
 * Builds browser-like request headers for fetching a page.
 *
 * The User-Agent and Accept values are fixed strings; the Referer is set to
 * the origin (scheme + host + port) of the target URL.
 *
 * @param {string} url - Absolute URL that is about to be requested.
 * @returns {Object<string, string>} Header name/value pairs.
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor.
 */
function spoofHeaders(url) {
    const { origin } = new URL(url);
    const browserHeaders = {};
    browserHeaders["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
    browserHeaders["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
    browserHeaders["Referer"] = origin;
    return browserHeaders;
}
/**
 * Fetches a web page and extracts its readable article text.
 *
 * The page is requested with browser-like headers, parsed with JSDOM and run
 * through Readability. The extracted text is cut to `contentLimit` UTF-16
 * code units (never splitting a surrogate pair) and suffixed with
 * "... [Truncated]" when it was longer than the limit.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {number} [contentLimit=20000] - Maximum length of the returned text
 *   before the truncation marker is appended.
 * @returns {Promise<{title: string, content: string, byline?: *, siteName?: *}>}
 *   The extracted article. When Readability yields no article, the result is
 *   `{ title: url, content: "Could not parse content." }`.
 * @throws {Error} With a message prefixed "Web extraction failed: " for any
 *   failure (bad URL, network error, non-OK HTTP status, parse error). The
 *   underlying error is attached as `cause`.
 */
async function extractWebContent(url, contentLimit = 20000) {
    let dom;
    try {
        const headers = spoofHeaders(url);
        const response = await fetch(url, { headers });
        if (!response.ok) {
            // statusText may be empty (e.g. over HTTP/2), so always include
            // the numeric status code to keep the message informative.
            const statusLine = `${response.status} ${response.statusText}`.trimEnd();
            throw new Error(`Failed to fetch ${url}: ${statusLine}`);
        }
        const html = await response.text();
        dom = new jsdom_1.JSDOM(html, { url });
        const article = new readability_1.Readability(dom.window.document).parse();
        if (!article) {
            return { title: url, content: "Could not parse content." };
        }
        let content = article.textContent || "";
        if (content.length > contentLimit) {
            let end = contentLimit;
            // If the cut would land between the two halves of a surrogate
            // pair, drop the dangling high surrogate so the output stays
            // well-formed text.
            const lastUnit = content.charCodeAt(end - 1);
            if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
                end -= 1;
            }
            content = `${content.substring(0, end)}... [Truncated]`;
        }
        return {
            title: article.title || url,
            content,
            byline: article.byline,
            siteName: article.siteName,
        };
    }
    catch (e) {
        // Keep the original error reachable through `cause` so its stack and
        // type are not lost by the wrapping.
        const reason = e instanceof Error ? e.message : String(e);
        throw new Error(`Web extraction failed: ${reason}`, { cause: e });
    }
    finally {
        // Release the DOM held by the jsdom window once extraction is done.
        if (dom) {
            dom.window.close();
        }
    }
}