Project Files
src/crawler/crawlPage.ts
import * as cheerio from "cheerio";
import { retryFetch } from "../services/retryFetch";
import { extractContent } from "./extractContent";
import { rankChunks } from "./rankChunks";
export async function crawlPage(url: string, query = "") {
  const html = await retryFetch(url);
  const $ = cheerio.load(html);

  // Strip scripts, styles, and layout chrome before extraction
  $("script, style, nav, footer, noscript").remove();

  // 1. Gather page content as ordered chunks
  const sequentialChunks = extractContent($);

  // 2. Drop exact duplicates (repeated breadcrumbs, sidebar links, etc.)
  const uniqueChunks: string[] = [];
  const seen = new Set<string>();
  for (const chunk of sequentialChunks) {
    if (!seen.has(chunk)) {
      seen.add(chunk);
      uniqueChunks.push(chunk);
    }
  }
  // 3. Assemble the response
  let finalContent: string;
  if (!query.trim()) {
    // No query: return the first 150 unique chunks in document order,
    // capped to keep the payload a manageable size
    finalContent = uniqueChunks.slice(0, 150).join("\n\n");
  } else {
    // Query given: rank chunks by relevance to the query before joining
    const ranked = await rankChunks(query, uniqueChunks);
    finalContent = ranked.join("\n\n");
  }
  return {
    title: $("title").text().trim(),
    url,
    content: finalContent
  };
}
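
retryFetch is imported from ../services/retryFetch but not shown here. A minimal sketch of what it could look like, assuming a Node 18+ runtime with a global fetch and simple exponential backoff (the signature and attempt count are assumptions, not the project's actual service):

// Hypothetical sketch of retryFetch; the real service is not shown above.
export async function retryFetch(url: string, attempts = 3): Promise<string> {
  let lastError: unknown;
  for (let i = 0; i < attempts; i++) {
    try {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
      return await res.text();
    } catch (err) {
      lastError = err;
      // Exponential backoff between attempts: 500ms, 1s, 2s, ...
      if (i < attempts - 1) {
        await new Promise((resolve) => setTimeout(resolve, 500 * 2 ** i));
      }
    }
  }
  throw lastError;
}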
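rankChunks is likewise imported but not shown; from its call site it takes (query, chunks) and resolves to chunks ordered by relevance. A lexical stand-in along these lines would satisfy that contract (the real module may rank by embeddings instead; the scoring and the result cap here are assumptions):

// Hypothetical lexical stand-in for rankChunks; the real ranker is not shown.
export async function rankChunks(query: string, chunks: string[]): Promise<string[]> {
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
  return chunks
    .map((chunk) => {
      const text = chunk.toLowerCase();
      // Score = number of distinct query terms appearing in the chunk
      const score = terms.reduce((n, t) => n + (text.includes(t) ? 1 : 0), 0);
      return { chunk, score };
    })
    .filter((c) => c.score > 0)
    .sort((a, b) => b.score - a.score)
    .slice(0, 50) // assumed cap, mirroring the bounded no-query path
    .map((c) => c.chunk);
}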
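For reference, a minimal driver showing both call modes, assuming the module resolves from a sibling file (the demo path and URL are placeholders):

// demo.ts — hypothetical driver for crawlPage
import { crawlPage } from "./crawler/crawlPage";

async function main() {
  // No query: the first 150 unique chunks in document order
  const page = await crawlPage("https://example.com/docs");
  console.log(page.title, page.url);

  // With a query: chunks ranked for relevance before joining
  const focused = await crawlPage("https://example.com/docs", "rate limits");
  console.log(focused.content.slice(0, 300));
}

main().catch(console.error);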