Project Files
dist / ranking.js
"use strict";
// Result ranking, filtering, and noise reduction. Pure functions, no I/O.
// Shared between the LM Studio plugin and the standalone MCP server.
Object.defineProperty(exports, "__esModule", { value: true });
exports.isRedirectUrl = isRedirectUrl;
exports.normalizeUrl = normalizeUrl;
exports.hostOf = hostOf;
exports.hostTrust = hostTrust;
exports.tokenize = tokenize;
exports.queryTokens = queryTokens;
exports.overlapScore = overlapScore;
exports.rankAndFilter = rankAndFilter;
// ---------- URL normalization ----------
const TRACKING_PARAM_RE = /^(utm_|fbclid$|gclid$|mc_|ref$|ref_$|source$|spm$)/i;
// Some engines return wrapper / redirect URLs that aren't real destinations.
// Drop these outright.
const REDIRECT_HOST_PREFIXES = [
"duckduckgo.com/l/",
"duckduckgo.com/y.js",
"www.google.com/url",
"www.bing.com/ck/",
"r.jina.ai/",
];
function isRedirectUrl(url) {
const lower = url.toLowerCase();
return REDIRECT_HOST_PREFIXES.some((p) => lower.includes(p));
}
function normalizeUrl(raw) {
try {
const u = new URL(raw);
u.hash = "";
u.hostname = u.hostname.toLowerCase().replace(/^www\./, "");
// Strip tracking-ish params.
const keep = [];
for (const [k, v] of u.searchParams) {
if (TRACKING_PARAM_RE.test(k))
continue;
keep.push([k, v]);
}
u.search = "";
for (const [k, v] of keep)
u.searchParams.append(k, v);
// Strip trailing slash on path (but keep "/").
if (u.pathname.length > 1 && u.pathname.endsWith("/")) {
u.pathname = u.pathname.slice(0, -1);
}
return u.toString();
}
catch {
return raw;
}
}
function hostOf(url) {
try {
return new URL(url).hostname.toLowerCase().replace(/^www\./, "");
}
catch {
return "";
}
}
// ---------- Host trust ----------
// Coarse trust score per host suffix. Higher = more reliable / authoritative.
// Used as one of several signals in scoring; not a hard filter.
const HOST_TRUST = [
[/(^|\.)wikipedia\.org$/, 0.95],
[/(^|\.)wikimedia\.org$/, 0.9],
[/(^|\.)arxiv\.org$/, 0.9],
[/(^|\.)nature\.com$/, 0.9],
[/(^|\.)science\.org$/, 0.9],
[/(^|\.)nih\.gov$/, 0.9],
[/(^|\.)ncbi\.nlm\.nih\.gov$/, 0.9],
[/\.gov$/, 0.85],
[/\.edu$/, 0.8],
[/(^|\.)github\.com$/, 0.8],
[/(^|\.)stackoverflow\.com$/, 0.8],
[/(^|\.)stackexchange\.com$/, 0.75],
[/(^|\.)mozilla\.org$/, 0.8],
[/(^|\.)developer\.mozilla\.org$/, 0.85],
[/(^|\.)readthedocs\.io$/, 0.75],
[/(^|\.)microsoft\.com$/, 0.7],
[/(^|\.)apple\.com$/, 0.7],
[/(^|\.)bbc\.(co\.uk|com)$/, 0.75],
[/(^|\.)reuters\.com$/, 0.75],
[/(^|\.)apnews\.com$/, 0.75],
[/(^|\.)nytimes\.com$/, 0.7],
[/(^|\.)theguardian\.com$/, 0.7],
[/(^|\.)reddit\.com$/, 0.55],
[/(^|\.)medium\.com$/, 0.5],
[/(^|\.)quora\.com$/, 0.45],
[/(^|\.)pinterest\.(com|.+)$/, 0.2],
[/(^|\.)answers\.com$/, 0.2],
];
function hostTrust(host) {
for (const [re, score] of HOST_TRUST)
if (re.test(host))
return score;
return 0.5; // unknown host
}
// ---------- Query/text matching ----------
const STOPWORDS = new Set([
"a", "an", "the", "of", "in", "on", "at", "for", "to", "and", "or", "but",
"is", "are", "was", "were", "be", "by", "with", "from", "as", "it", "its",
"this", "that", "these", "those", "i", "you", "we", "they", "he", "she",
"me", "him", "her", "us", "them", "my", "your", "our", "their",
"how", "what", "when", "where", "why", "who", "which", "do", "does", "did",
"can", "could", "should", "would", "will", "shall", "may", "might", "must",
"about", "into", "over", "under", "than", "then", "so", "if", "not", "no",
]);
function tokenize(s) {
return s
.toLowerCase()
.replace(/[^\p{L}\p{N}\s]/gu, " ")
.split(/\s+/)
.filter((t) => t.length >= 2 && !STOPWORDS.has(t));
}
function queryTokens(query) {
return Array.from(new Set(tokenize(query)));
}
// 0..1 — what fraction of the query's content tokens appear in `text`.
function overlapScore(qTokens, text) {
if (qTokens.length === 0)
return 0;
const tt = new Set(tokenize(text));
let hit = 0;
for (const t of qTokens)
if (tt.has(t))
hit++;
return hit / qTokens.length;
}
function trimSnippet(s, max) {
if (!s)
return "";
const cleaned = s.replace(/\s+/g, " ").trim();
if (cleaned.length <= max)
return cleaned;
return cleaned.slice(0, max - 1) + "…";
}
// Drop junk: empty title or URL, redirect wrappers, missing http(s) scheme.
function isUsable(r) {
if (!r.url || !r.title)
return false;
if (!/^https?:\/\//i.test(r.url))
return false;
if (isRedirectUrl(r.url))
return false;
if (r.title.trim().length < 2)
return false;
return true;
}
// Filter junk, dedupe by normalized URL, score, sort, top-K, trim snippets.
function rankAndFilter(results, opts) {
const qTokens = queryTokens(opts.query);
const seen = new Map();
for (const r of results) {
if (!isUsable(r))
continue;
const normUrl = normalizeUrl(r.url);
const host = hostOf(normUrl);
const trust = hostTrust(host);
const titleOverlap = overlapScore(qTokens, r.title);
const snippetOverlap = overlapScore(qTokens, r.snippet || "");
const snipLen = (r.snippet || "").length;
// Reward results that actually have a snippet (>40 chars) up to ~0.3.
const snipBonus = Math.min(0.3, snipLen / 600);
// Weighted blend. Title overlap dominates; trust is a meaningful prior;
// snippet evidence is secondary; length is a tiebreaker.
const score = 0.45 * titleOverlap +
0.30 * trust +
0.20 * snippetOverlap +
0.05 * snipBonus;
const scored = {
title: r.title.trim(),
snippet: trimSnippet(r.snippet || "", opts.snippetMaxChars),
url: normUrl,
source: r.source,
host,
score: Math.round(score * 1000) / 1000,
...(opts.includeScoreBreakdown
? {
score_breakdown: {
host_trust: trust,
title_overlap: Math.round(titleOverlap * 1000) / 1000,
snippet_overlap: Math.round(snippetOverlap * 1000) / 1000,
snippet_length_bonus: Math.round(snipBonus * 1000) / 1000,
},
}
: {}),
};
// Dedupe: keep the higher-scoring instance per normalized URL.
const existing = seen.get(normUrl);
if (!existing || scored.score > existing.score) {
seen.set(normUrl, scored);
}
}
const sorted = Array.from(seen.values()).sort((a, b) => b.score - a.score);
return sorted.slice(0, opts.topK);
}
//# sourceMappingURL=ranking.js.map