Project Files

src

cache

cache.ts

pageCache.ts

crawler

crawlPage.ts

extractContent.ts

fetchHtml.ts

normalizeUrl.ts

rankChunks.ts

embeddings

cosine.ts

embed.ts

lib

crawler.ts

providers

DuckDuckGoProvider.ts

SearchProvider.ts

SearXNGProvider.ts

duckduckgo.ts

searxng.ts

services

crawlPage.ts

fetchHtml.ts

ranker.ts

retryFetch.ts

structuredExtraction.ts

tools

crawlSpecificPage.ts

crawlWebPage.ts

index.ts

searchWeb.ts

tools.ts

types

CrawlSession.ts

utils

blockedExtensions.ts

cleanText.ts

constants.ts

domainFilter.ts

formatPage.ts

relevance.ts

summarizer.ts

text.ts

url.ts

index.ts

.gitignore

CHANGELOG_FIX.md

LICENSE

manifest.json

package-lock.json

package.json

README.md

runtime.js

tsconfig.json

src / utils / relevance.ts

/**
 * Relevance filtering — keeps only URLs and crawled pages that are
 * plausibly related to the search query.
 */

// ─── Known off-topic domains ───────────────────────────────────────────────
// Hosts that almost never hold a relevant result for a game/software query
// but frequently surface as DDG noise. Entries are bare hostnames; lookups
// are done against the host produced by hostname().
const NOISE_DOMAINS: ReadonlySet<string> = new Set<string>([
    "python.org",
    "www.python.org",      // redundant in practice: hostname() strips a leading "www."
    "osu.ppy.sh",
    "github.com",          // too broad. NOTE(review): the list is host-only, so no
                           // path-level exception (e.g. wikis) is applied here
    "stackoverflow.com",
    "reddit.com",          // may be wanted via siteFilter but noisy otherwise
    "twitter.com",
    "x.com",
    "facebook.com",
    "youtube.com",
    "instagram.com",
    "tiktok.com",
    "amazon.com",
    "ebay.com",
    "wikipedia.org",       // fine for general queries but noisy for version lookups
]);

// URL path shapes that never lead to useful page content.
// Each pattern is tested against the URL's pathname only (no query string).
const NOISE_PATH_PATTERNS: ReadonlyArray<RegExp> = [
    // Static assets and binary downloads, matched by file extension at the end of the path.
    /\.(css|js|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|pdf|zip|exe)$/i,

    // Auth / commerce / account pages, matched as a whole path-segment prefix.
    /\/(login|signup|register|cart|checkout|account|profile)\b/i,
];

/**
 * Returns the host of `url` with a single leading "www." removed
 * (e.g. "https://www.python.org/x" → "python.org").
 *
 * No public-suffix (eTLD+1) reduction is performed: any other subdomain is
 * kept as-is (e.g. "osu.ppy.sh" stays "osu.ppy.sh").
 *
 * @param url Absolute URL string.
 * @returns The normalized host, or "" when `url` cannot be parsed.
 */
function hostname(url: string): string {
    let parsed: URL;
    try {
        parsed = new URL(url);
    } catch {
        // Not an absolute URL — signal "no host" to the caller.
        return "";
    }

    const rawHost = parsed.hostname;
    const prefix = "www.";
    return rawHost.startsWith(prefix) ? rawHost.slice(prefix.length) : rawHost;
}

/**
 * Returns true if the URL should be skipped before even fetching it.
 * Cheap check — no network required.
 *
 * A URL is treated as noise when any of the following holds:
 *  - it cannot be parsed or has no host;
 *  - its host is a NOISE_DOMAINS entry or a subdomain of one
 *    (e.g. "en.wikipedia.org" is covered by "wikipedia.org");
 *  - its path matches one of NOISE_PATH_PATTERNS.
 *
 * @param url   Absolute URL to test.
 * @param query Search query. Not consulted by the current checks; kept so the
 *              signature stays stable for callers.
 * @returns true when the URL should be skipped.
 */
export function isNoiseUrl(url: string, query: string): boolean {
    // A fully-qualified host may carry a trailing root dot ("github.com.");
    // drop it so it compares equal to the list entries.
    const host = hostname(url).replace(/\.$/, "");
    if (!host) return true;

    // Hard-blocked domains: test the host itself, then each parent domain
    // ("a.b.example.com" → "b.example.com" → "example.com" → "com"), so that
    // subdomains of a blocked site are blocked too.
    let candidate = host;
    while (true) {
        if (NOISE_DOMAINS.has(candidate)) return true;
        const dot = candidate.indexOf(".");
        if (dot < 0) break;
        candidate = candidate.slice(dot + 1);
    }

    // Noise path patterns
    try {
        const path = new URL(url).pathname;
        return NOISE_PATH_PATTERNS.some(p => p.test(path));
    } catch {
        // hostname() already parsed this URL successfully, so this is not
        // expected to happen; treat an unparseable URL as noise, same as above.
        return true;
    }
}

/**
 * Minimum relevance score a crawled page needs in order to be kept.
 * Compared against the 0–100 value returned by pageRelevanceScore, which
 * scores a crawled page's relevance to the query; pages scoring below this
 * threshold are dropped.
 */
export const MIN_RELEVANCE_SCORE = 8;

/** Query words too common to say anything about a page's relevance. */
const RELEVANCE_STOP_WORDS: ReadonlySet<string> = new Set<string>([
    "the", "and", "for", "with", "from", "that", "this", "are", "was", "but", "not", "site",
]);

/**
 * Splits a query into the distinct, lower-cased content words used for scoring.
 * Drops `site:` search operators, surrounding quotes/brackets/sentence
 * punctuation, words of two characters or fewer, and stop words.
 */
function relevanceQueryWords(query: string): string[] {
    const siteOperator = /^-?site:/;
    const leadingPunctuation = /^["'`\u2018\u2019\u201C\u201D(\[{<]+/;
    const trailingPunctuation = /["'`\u2018\u2019\u201C\u201D)\]}>,.;:!?]+$/;

    const words = new Set<string>();
    for (const token of query.toLowerCase().split(/\s+/)) {
        // `site:example.com` restricts where to search; it is not page content.
        if (siteOperator.test(token)) continue;

        // Only edge punctuation is removed, so tokens such as "c++", ".net"
        // or "1.20" keep their meaningful symbols.
        const word = token.replace(leadingPunctuation, "").replace(trailingPunctuation, "");
        if (word.length > 2 && !RELEVANCE_STOP_WORDS.has(word)) words.add(word);
    }
    return [...words];
}

/**
 * Scores how relevant a crawled page is to the search query.
 *
 * The score is the percentage of distinct query content words that occur
 * (case-insensitively, as substrings) in the page title or content.
 * Callers compare it against MIN_RELEVANCE_SCORE.
 *
 * @param query   The user's search query.
 * @param title   Page title.
 * @param content Extracted page text.
 * @returns An integer from 0 to 100; 100 when the query has no usable words.
 */
export function pageRelevanceScore(query: string, title: string, content: string): number {
    const words = relevanceQueryWords(query);
    if (words.length === 0) return 100; // can't judge, let it through

    const haystack = `${title} ${content}`.toLowerCase();
    const matched = words.filter(word => haystack.includes(word)).length;

    // Score 0–100 based on the share of query words found in the page.
    return Math.round((matched / words.length) * 100);
}