// Project Files
// src/services/crawlPage.ts
import * as cheerio from "cheerio";
import { retryFetch } from "../services/retryFetch";
/**
 * Result returned by `crawlPage` for a single fetched page.
 */
export interface CrawledPage {
  /** Whitespace-normalised text of the page title; `crawlPage` falls back to "Untitled" when it is empty. */
  title: string;

  /** The URL that was passed to `crawlPage`, returned unchanged. */
  url: string;

  /**
   * Document markup as serialised by cheerio inside `crawlPage`.
   * NOTE(review): `crawlPage` serialises after removing boilerplate elements,
   * so this is not the fetched markup verbatim — confirm callers expect that.
   */
  rawHtml: string;

  /** Selected text chunks joined by blank lines and capped in length by `crawlPage`. */
  content: string;
}
/**
 * Normalises whitespace in a piece of text.
 *
 * @param text - Arbitrary text, typically extracted from an HTML element.
 * @returns The text with every run of whitespace collapsed to one space and
 *          with leading/trailing whitespace removed.
 */
function clean(text: string): string {
  const collapsed = text.replace(/\s+/g, " ");
  return collapsed.trim();
}
/** Boilerplate phrases (lowercase) that mark a text block as site chrome rather than content. */
const GARBAGE_PHRASES: readonly string[] = [
  "cookie",
  "privacy policy",
  "terms of service",
  "retrieved from",
  "categories:",
  "all rights reserved",
  "subscribe to",
  "sign up for",
  "log in",
  "create account",
];

/**
 * Single pre-compiled matcher for all phrases, applied to lowercased text.
 * The lookbehind requires a phrase to start at a word boundary so that
 * "log in" no longer fires inside "blog in" or "catalog in". No trailing
 * boundary is imposed, so "cookie" still matches "cookies".
 */
const GARBAGE_PATTERN = new RegExp(
  "(?<![a-z0-9])(?:" +
    GARBAGE_PHRASES.map(phrase => phrase.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") +
    ")"
);

/**
 * Decides whether a text block looks like boilerplate (cookie banners, legal
 * footers, sign-up prompts, ...) rather than page content.
 *
 * @param text - Whitespace-normalised text of one block.
 * @returns `true` when the block is at least 20 characters long and contains
 *          one of the boilerplate phrases starting at a word boundary
 *          (case-insensitive); `false` otherwise.
 */
function isGarbage(text: string): boolean {
  // Blocks shorter than 20 characters are never flagged.
  // NOTE(review): this lets bare links such as "Log in" through — confirm intended.
  if (text.length < 20) return false;

  // Lowercase once instead of once per phrase.
  return GARBAGE_PATTERN.test(text.toLowerCase());
}
/**
 * Breaks a block of text into sentence-like chunks.
 *
 * The split happens at whitespace that directly follows `.`, `!` or `?`; the
 * punctuation stays attached to the preceding chunk.
 *
 * @param text - Text of one block.
 * @returns The whitespace-normalised chunks in original order, with empty
 *          chunks left out.
 */
function splitChunks(text: string): string[] {
  const pieces: string[] = [];
  for (const part of text.split(/(?<=[.!?])\s+/)) {
    const tidy = clean(part);
    if (tidy) {
      pieces.push(tidy);
    }
  }
  return pieces;
}
/**
 * Heuristic relevance score for a chunk of text.
 *
 * The score is the sum of:
 * - 15 points per whitespace-separated query term longer than two characters
 *   that occurs in the chunk (case-insensitive substring match);
 * - 20 points if the chunk mentions a change verb such as "added" or "fixed";
 * - 40 points if the chunk mentions patch/changelog vocabulary;
 * - a length bonus of one point per 100 characters, capped at 10.
 *
 * @param query - Search query; may be empty.
 * @param chunk - Candidate text chunk.
 * @returns The combined score (higher means more relevant).
 */
function scoreChunk(query: string, chunk: string): number {
  const haystack = chunk.toLowerCase();
  const terms = query.toLowerCase().split(/\s+/);

  const termPoints = terms.reduce(
    (sum, term) => (term.length > 2 && haystack.includes(term) ? sum + 15 : sum),
    0
  );

  const mentionsChangeVerb =
    /added|fixed|improved|changed|updated|new|released|removed/i.test(chunk);
  const verbPoints = mentionsChangeVerb ? 20 : 0;

  const mentionsPatchTopic = /patch\s+\d+|changes|fixes|notes|update/i.test(chunk);
  const topicPoints = mentionsPatchTopic ? 40 : 0;

  const lengthPoints = Math.min(chunk.length / 100, 10);

  return termPoints + verbPoints + topicPoints + lengthPoints;
}
/**
 * Drops chunks that repeat an earlier chunk, keeping the first occurrence and
 * the original order.
 *
 * Two chunks count as duplicates when their comparison keys are equal. The key
 * is the lowercased chunk with everything except letters and digits removed,
 * truncated to 120 characters, so differences in case, spacing or punctuation
 * do not defeat the comparison.
 *
 * @param chunks - Candidate chunks in document order.
 * @returns A new array without the repeated chunks.
 */
function dedupe(chunks: string[]): string[] {
  const seen = new Set<string>();
  return chunks.filter(chunk => {
    // \p{L}/\p{N} keep letters and digits of every script. An ASCII-only class
    // would reduce all non-Latin chunks to the same empty key and discard
    // every one of them after the first.
    const key = chunk.toLowerCase().replace(/[^\p{L}\p{N}]/gu, "").slice(0, 120);
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}
/**
 * Fetches a page and extracts its most relevant text.
 *
 * Steps: load the markup with cheerio, remove navigation/advertising/translation
 * widgets, collect the text of paragraph-like elements, split it into
 * sentence-like chunks, de-duplicate them, keep the highest-scoring chunks for
 * `query`, and join those in their original document order.
 *
 * @param url - Address of the page to fetch.
 * @param query - Optional search query used to rank chunks; defaults to "".
 * @returns The page title, the given URL, the serialised (post-cleanup) markup
 *          and the extracted content.
 * @throws Whatever `retryFetch` rejects with; the rejection is not caught here.
 */
export async function crawlPage(url: string, query = ""): Promise<CrawledPage> {
  const MAX_CHUNKS = 150;
  const MAX_CONTENT_CHARS = 24_000; // Expanded to handle all mixed patch blocks safely

  const html = await retryFetch(url);
  const $ = cheerio.load(html);

  // `.text()` concatenates the text of every matched element, and inline SVG
  // icons carry their own <title> nodes; restrict to the first match so those
  // labels do not get appended to the page title.
  const title = clean($("title").first().text()) || "Untitled";

  // Clean out sidebar translation bars and languages globally
  $(
    "script, style, nav, footer, header, aside, " +
      ".sidebar, .advertisement, .ad, .cookie-banner, .popup, .modal, " +
      "[role='navigation'], [role='banner'], [role='complementary'], " +
      ".mw-interlanguage-selector, #p-lang, .printfooter"
  ).remove();

  // Kill interactive translation container layout elements
  $(
    ".notranslate, [class*='translate'], [id*='translate'], " +
      ".gtrans, #google_translate_element, .skiptranslate, " +
      ".goog-te-banner-frame, #goog-gt-"
  ).remove();

  // Text of each candidate element, minus blocks that look like boilerplate.
  const blocks = $("p, li, h1, h2, h3, h4, td, div.mw-parser-output")
    .map((_, el) => clean($(el).text()))
    .get()
    .filter(text => !isGarbage(text));

  // Push chunk by chunk rather than spreading, so a very large block cannot
  // exceed the engine's limit on call arguments.
  const allChunks: string[] = [];
  for (const block of blocks) {
    for (const piece of splitChunks(block)) {
      allChunks.push(piece);
    }
  }
  const chunks = dedupe(allChunks);

  // Remember each chunk's position so document order can be restored after ranking.
  const scored = chunks.map((chunk, index) => ({
    chunk,
    index,
    score: scoreChunk(query, chunk),
  }));

  const kept = scored
    .sort((a, b) => b.score - a.score)
    .slice(0, MAX_CHUNKS)
    .sort((a, b) => a.index - b.index);

  const content = kept
    .map(item => item.chunk)
    .join("\n\n")
    .slice(0, MAX_CONTENT_CHARS);

  // NOTE(review): `$.html()` is serialised after the removals above, so
  // `rawHtml` is not the fetched markup verbatim — confirm callers expect that.
  return { title, url, rawHtml: $.html(), content };
}