Project Files
src / retrieval / reranker.ts
import { normalizeWhitespace } from "../utils/text";
import { getTokenSet } from "./cache";
/**
 * A retrieved text chunk together with its ranking metadata.
 *
 * This is both the input and the output element type of `rerankChunks`.
 */
export interface RankedChunk {
/** Chunk body; compared against the query by token and character n-gram overlap, and its length is the final sort tie-breaker. */
text: string;
/** Relevance score. `rerankChunks` adds its own terms on top of the incoming value when it rescores. */
score: number;
/** Optional citation string; not read by the reranking code visible in this file. */
citation?: string;
/** Optional source name; `sourceBoost` checks it for query tokens (case-insensitive substring match). */
sourceName?: string;
/**
 * Value clamped to [0, 1] that `rerankChunks` writes when it rescores a chunk.
 * It is left as-is (typically absent) when `rerankChunks` receives two or
 * fewer chunks, because that path returns the input unmodified.
 */
confidence?: number;
}
/**
 * Builds the set of distinct character n-grams for a piece of text.
 *
 * Only the first 150 characters of `text` are considered. That prefix is
 * passed through `normalizeWhitespace` and lower-cased before the n-grams
 * are extracted.
 *
 * @param text Text to fingerprint.
 * @param n    N-gram length in characters; expected to be a positive integer.
 * @returns The distinct n-grams. If the cleaned prefix is shorter than `n`
 *          the set holds that prefix as its single entry. If nothing remains
 *          after cleaning, the set is empty.
 */
function getNGrams(text: string, n: number = 3): Set<string> {
  const maxChars = 150;
  const clean = normalizeWhitespace(text.substring(0, maxChars)).toLowerCase();

  // No content means no grams. Returning a set containing "" here would make
  // callers' `size === 0` guards unreachable and would let two empty inputs
  // "match" each other on the empty string.
  if (clean.length === 0) return new Set<string>();

  // Too short to slide a window over: treat the whole string as one gram.
  if (clean.length < n) return new Set([clean]);

  const grams = new Set<string>();
  for (let i = 0; i <= clean.length - n; i++) {
    grams.add(clean.substring(i, i + n));
  }
  return grams;
}
/**
 * Measures how much of the query's n-gram set also occurs in `text`.
 *
 * @param queryGrams N-grams already extracted from the query.
 * @param text       Candidate text; its n-grams are computed with `getNGrams`.
 * @returns The fraction of query n-grams found in the text, in [0, 1].
 *          Returns 0 when either n-gram set is empty.
 */
function ngramOverlap(queryGrams: Set<string>, text: string): number {
  const candidateGrams = getNGrams(text);

  const nothingToCompare = queryGrams.size === 0 || candidateGrams.size === 0;
  if (nothingToCompare) {
    return 0;
  }

  // Count query grams that the candidate also contains.
  const shared = [...queryGrams].filter((gram) => candidateGrams.has(gram)).length;
  return shared / queryGrams.size;
}
/**
 * Gives a small fixed bonus when the chunk's source name mentions a query token.
 *
 * The source name is lower-cased and whitespace-normalized, then each token
 * from `getTokenSet(query)` is tested as a plain substring of it. The first
 * hit is enough.
 *
 * @param query      The user query.
 * @param sourceName Name of the chunk's source, if known.
 * @returns 0.06 when any query token occurs in the source name. Returns 0
 *          when there is no source name, no query tokens, an empty normalized
 *          name, or no match.
 */
function sourceBoost(query: string, sourceName: string | undefined): number {
  const matchBonus = 0.06;

  if (!sourceName) {
    return 0;
  }

  const tokens = getTokenSet(query);
  if (tokens.size === 0) {
    return 0;
  }

  const normalizedName = normalizeWhitespace(sourceName.toLowerCase());
  if (!normalizedName) {
    return 0;
  }

  const mentionsToken = [...tokens].some((tok) => normalizedName.includes(tok));
  return mentionsToken ? matchBonus : 0;
}
/**
 * Rescores retrieved chunks against the query and returns them best-first.
 *
 * For every chunk the new score is the upstream score plus:
 *  - a "semantic-lite" term: 0.4 x (share of query tokens present in the
 *    chunk's token set) + 0.6 x (character n-gram overlap from `ngramOverlap`);
 *  - the source-name bonus from `sourceBoost`;
 *  - 0.08 x confidence, where confidence is
 *    0.75 x (upstream score clamped to [0, 1]) + 0.25 x semantic-lite,
 *    itself clamped to [0, 1].
 *
 * Results are sorted by score descending. Ties go to the higher confidence,
 * then to the shorter text.
 *
 * Inputs with two or fewer chunks are returned as a shallow copy without any
 * rescoring, sorting, or `confidence` assignment.
 *
 * An upstream score that is NaN or not a number is treated as 0. Otherwise
 * it would turn the chunk's score and confidence into NaN and make the sort
 * comparator return NaN, which leaves the final order unspecified.
 *
 * @param chunks Candidate chunks. Neither the array nor its elements are mutated.
 * @param query  The user query the chunks are ranked against.
 * @returns A new array of chunk copies carrying the updated `score` and
 *          `confidence` (or a plain shallow copy for two or fewer chunks).
 */
export function rerankChunks(chunks: RankedChunk[], query: string): RankedChunk[] {
  // Too few candidates to reorder: hand back an untouched copy.
  if (chunks.length <= 2) return chunks.slice();

  const queryTokens = getTokenSet(query);
  const queryGrams = getNGrams(query);

  const scored = chunks.map((chunk) => {
    // Guard against NaN / missing scores so they cannot poison the ranking.
    // +/-Infinity is deliberately left alone: it still compares consistently.
    const upstreamScore =
      typeof chunk.score === "number" && !Number.isNaN(chunk.score) ? chunk.score : 0;

    // Lexical coverage: fraction of query tokens that appear in the chunk.
    const textTokens = getTokenSet(chunk.text);
    let covered = 0;
    for (const token of queryTokens) {
      if (textTokens.has(token)) covered++;
    }
    const lexScore = queryTokens.size === 0 ? 0 : covered / queryTokens.size;

    const ngramScore = ngramOverlap(queryGrams, chunk.text);
    const semanticLiteScore = lexScore * 0.4 + ngramScore * 0.6;
    const boost = sourceBoost(query, chunk.sourceName);

    const base = Math.max(0, Math.min(1, upstreamScore));
    const confidence = Math.max(0, Math.min(1, base * 0.75 + semanticLiteScore * 0.25));

    return {
      ...chunk,
      score: upstreamScore + semanticLiteScore + boost + confidence * 0.08,
      confidence,
    };
  });

  scored.sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    const cA = a.confidence ?? 0;
    const cB = b.confidence ?? 0;
    if (cB !== cA) return cB - cA;
    // Last resort: prefer the shorter chunk.
    return a.text.length - b.text.length;
  });

  return scored;
}