// Project Files
// src/utils/relevance.ts
/**
* Relevance filtering — keeps only URLs and crawled pages that are
* plausibly related to the search query.
*/
// ─── Known off-topic domains ───────────────────────────────────────────────
// Sites that almost never appear as a relevant result for any game/software query
// but frequently surface as DDG noise.
const NOISE_DOMAINS = new Set([
"python.org", "www.python.org",
"osu.ppy.sh",
"github.com", // too broad — keep subpaths like wikis if relevant
"stackoverflow.com",
"reddit.com", // may be wanted via siteFilter but noisy otherwise
"twitter.com", "x.com",
"facebook.com",
"youtube.com",
"instagram.com",
"tiktok.com",
"amazon.com",
"ebay.com",
"wikipedia.org", // fine for general queries but noisy for version lookups
]);
// Paths/extensions that are never useful page content
const NOISE_PATH_PATTERNS = [
/\.(css|js|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|pdf|zip|exe)$/i,
/\/(login|signup|register|cart|checkout|account|profile)\b/i,
];
/**
 * Returns the host of `url` with one leading "www." removed
 * (e.g. "https://www.python.org/downloads" → "python.org").
 *
 * All other subdomains are preserved ("osu.ppy.sh" stays "osu.ppy.sh");
 * no public-suffix / eTLD+1 reduction is performed.
 *
 * @param url Absolute URL string.
 * @returns The normalised host, or "" when `url` cannot be parsed.
 */
function hostname(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not a parseable absolute URL.
    return "";
  }
  const withoutWww = parsed.hostname.replace(/^www\./, "");
  return withoutWww;
}
/**
 * Decides, without any network access, whether a URL should be skipped
 * before it is fetched.
 *
 * A URL counts as noise when any of the following holds:
 *  - it cannot be parsed, or it has no host;
 *  - its host, or any parent domain of its host, is listed in NOISE_DOMAINS
 *    (so "en.wikipedia.org" is caught by the "wikipedia.org" entry, while
 *    "netflix.com" is NOT caught by "x.com" — matching is on whole labels);
 *  - its path matches one of NOISE_PATH_PATTERNS.
 *
 * @param url   Absolute URL to classify.
 * @param query Search query. Not consulted by the current checks; kept so
 *              existing callers keep working.
 * @returns true when the URL should be skipped.
 */
export function isNoiseUrl(url: string, query: string): boolean {
  // A fully-qualified host may carry a trailing root dot ("youtube.com.");
  // drop it so it compares equal to the blocklist entries.
  const host = hostname(url).replace(/\.$/, "");
  if (!host) return true;

  // Hard-blocked domains: test the host itself and then each parent domain
  // ("a.b.example.com" → "b.example.com" → "example.com" → "com").
  const labels = host.split(".");
  for (let start = 0; start < labels.length; start++) {
    if (NOISE_DOMAINS.has(labels.slice(start).join("."))) return true;
  }

  // Noise path patterns.
  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    // hostname() already parsed this URL, so this is not expected to happen;
    // treat it like any other unparseable URL instead of silently ignoring it.
    return true;
  }
  return NOISE_PATH_PATTERNS.some(pattern => pattern.test(path));
}
/**
 * Cutoff intended for use with pageRelevanceScore(): callers are expected to
 * drop pages that score below this value (the cutoff is not applied here).
 */
export const MIN_RELEVANCE_SCORE = 8;

// Common words that would inflate false matches if counted as query terms.
const QUERY_STOP_WORDS: ReadonlySet<string> = new Set([
  "the", "and", "for", "with", "from", "that", "this", "are", "was", "but", "not", "site",
]);

// Wrapping quotes/brackets and trailing sentence punctuation that cling to a
// whitespace-separated token. Inner characters are untouched, so tokens such
// as "c++", "node.js" or "1.20.4" survive intact.
const LEADING_TOKEN_PUNCTUATION = /^["'\u201C\u201D\u2018\u2019(\[{]+/;
const TRAILING_TOKEN_PUNCTUATION = /["'\u201C\u201D\u2018\u2019)\]}.,;:!?]+$/;

/**
 * Scores a crawled page's relevance to the query.
 *
 * The query is lower-cased and split on whitespace; each token has wrapping
 * quotes/brackets and trailing punctuation trimmed, then tokens of two
 * characters or fewer and stop words are discarded. The score is the
 * percentage of the remaining tokens that occur as a case-insensitive
 * substring of `title + " " + content`.
 *
 * @param query   The search query.
 * @param title   Page title.
 * @param content Page text.
 * @returns An integer from 0 to 100. Returns 100 when the query yields no
 *          usable tokens, since relevance cannot be judged in that case.
 */
export function pageRelevanceScore(query: string, title: string, content: string): number {
  const words = query
    .toLowerCase()
    .split(/\s+/)
    // `"minecraft"` or `version?` would otherwise never be found in page text.
    .map(token => token.replace(LEADING_TOKEN_PUNCTUATION, "").replace(TRAILING_TOKEN_PUNCTUATION, ""))
    .filter(token => token.length > 2 && !QUERY_STOP_WORDS.has(token));

  if (words.length === 0) return 100; // can't judge, let it through

  const haystack = `${title} ${content}`.toLowerCase();
  const matched = words.filter(word => haystack.includes(word)).length;

  // Share of query words present in the page, as a whole-number percentage.
  return Math.round((matched / words.length) * 100);
}