// Project Files
// dist/retrieval/reranker.js
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.rerankChunks = rerankChunks;
const text_1 = require("../utils/text");
/**
 * Token cache to avoid re-tokenising the same text across chunks and calls.
 *
 * `tokenCache` maps a cache key (see `getCacheKey`) to the token array computed
 * for that text. It is bounded to `MAX_TOKEN_CACHE` entries by its users.
 */
const MAX_TOKEN_CACHE = 256;
const tokenCache = new Map();
/**
 * Returns the key under which the tokens of `input` are cached.
 *
 * The key is the full input string. A key derived from only a prefix of the
 * text would let two different texts that share that prefix (for example,
 * chunks that start with the same boilerplate header) collide and silently
 * receive each other's tokens. Using the exact string makes a false cache hit
 * impossible; memory stays bounded because the cache holds at most
 * `MAX_TOKEN_CACHE` entries.
 *
 * @param {string} input - Text that is about to be tokenised.
 * @returns {string} Cache key that is unique per distinct input text.
 */
function getCacheKey(input) {
    return input;
}
/**
 * Splits `input` into lowercase word tokens, memoised in `tokenCache`.
 *
 * The text is lower-cased, passed through `normalizeWhitespace` (helper from
 * ../utils/text; presumably collapses/trims whitespace - confirm there), split
 * on runs of characters that are not letters, combining marks, digits or
 * underscores, and tokens of two characters or fewer are dropped.
 *
 * The splitter is Unicode-aware: an ASCII-only `\W` would cut words such as
 * "café" at the accented letter and discard non-Latin scripts entirely. For
 * pure-ASCII input the result is identical to splitting on `\W+`.
 *
 * The returned array is the cached instance and is shared between callers;
 * treat it as read-only.
 *
 * @param {string} input - Text to tokenise.
 * @returns {string[]} Tokens in order of appearance (duplicates preserved).
 */
function tokens(input) {
    const key = getCacheKey(input);
    const cached = tokenCache.get(key);
    if (cached !== undefined) {
        return cached;
    }
    const value = (0, text_1.normalizeWhitespace)(input.toLowerCase())
        .split(/[^\p{L}\p{M}\p{N}_]+/u)
        .filter((token) => token.length > 2);
    if (tokenCache.size >= MAX_TOKEN_CACHE) {
        // A Map iterates in insertion order, so the first key is the entry
        // that was inserted earliest (FIFO eviction; hits do not refresh it).
        const oldest = tokenCache.keys().next().value;
        if (oldest !== undefined) {
            tokenCache.delete(oldest);
        }
    }
    tokenCache.set(key, value);
    return value;
}
/**
 * Returns the distinct tokens of `input`.
 *
 * @param {string} input - Text to tokenise via `tokens`.
 * @returns {Set<string>} Unique tokens, in first-seen order.
 */
function tokenSet(input) {
    const uniqueTokens = new Set();
    for (const token of tokens(input)) {
        uniqueTokens.add(token);
    }
    return uniqueTokens;
}
/**
 * Builds the set of character n-grams used for fuzzy (n-gram overlap) matching.
 *
 * Only the first 150 characters of `text` are considered. That prefix is
 * lower-cased and passed through `normalizeWhitespace` (helper from
 * ../utils/text; presumably collapses/trims whitespace - confirm there).
 *
 * - Empty cleaned text yields an empty set, so callers that guard on
 *   `size === 0` treat it as "nothing to match". Returning a set containing
 *   the empty string would make two empty inputs look like a perfect match.
 * - Cleaned text shorter than `n` yields a single gram: the whole text.
 * - Otherwise every contiguous substring of length `n` is collected.
 *
 * @param {string} text - Source text.
 * @param {number} [n=3] - Gram length in UTF-16 code units.
 * @returns {Set<string>} Distinct n-grams of the cleaned prefix.
 */
function getNGrams(text, n = 3) {
    const grams = new Set();
    const cleanText = (0, text_1.normalizeWhitespace)(text.substring(0, 150).toLowerCase());
    if (cleanText.length === 0) {
        return grams;
    }
    if (cleanText.length < n) {
        grams.add(cleanText);
        return grams;
    }
    for (let i = 0; i + n <= cleanText.length; i++) {
        grams.add(cleanText.substring(i, i + n));
    }
    return grams;
}
/**
 * Fraction of the query's character n-grams that also occur in `text`.
 *
 * @param {string} query - Search query.
 * @param {string} text - Candidate text.
 * @returns {number} Shared grams divided by the number of query grams;
 *   0 when either gram set is empty.
 */
function ngramOverlapScore(query, text) {
    const fromQuery = getNGrams(query);
    const fromText = getNGrams(text);
    if (fromQuery.size === 0 || fromText.size === 0) {
        return 0;
    }
    const shared = [...fromQuery].filter((gram) => fromText.has(gram)).length;
    return shared / fromQuery.size;
}
/**
 * Fraction of query tokens that appear as whole tokens in `text`.
 *
 * @param {Set<string>} queryTokens - Distinct tokens of the query.
 * @param {string} text - Candidate text; tokenised with `tokenSet`.
 * @returns {number} Matching query tokens divided by the number of query
 *   tokens; 0 when the query has no tokens.
 */
function lexicalOverlap(queryTokens, text) {
    const total = queryTokens.size;
    if (total === 0) {
        return 0;
    }
    const present = tokenSet(text);
    const hits = Array.from(queryTokens).reduce(
        (count, token) => (present.has(token) ? count + 1 : count),
        0,
    );
    return hits / total;
}
/**
 * Small score bonus when any query token occurs inside the chunk's source name.
 *
 * The comparison is a substring test against the lower-cased source name after
 * `normalizeWhitespace` (helper from ../utils/text), so a token matches even
 * when it is only part of a longer word in the name.
 *
 * @param {string} query - Search query; tokenised with `tokens`.
 * @param {string} [sourceName] - Name of the chunk's source, if any.
 * @returns {number} 0.06 when at least one query token is found, otherwise 0
 *   (including when the source name or the query tokens are empty).
 */
function sourceBoost(query, sourceName) {
    if (!sourceName) {
        return 0;
    }
    const queryWords = tokens(query);
    if (queryWords.length === 0) {
        return 0;
    }
    const normalizedSource = (0, text_1.normalizeWhitespace)(sourceName.toLowerCase());
    if (!normalizedSource) {
        return 0;
    }
    for (const word of queryWords) {
        if (normalizedSource.includes(word)) {
            return 0.06;
        }
    }
    return 0;
}
/**
 * Blends a retrieval score with a text-overlap score into a confidence value.
 *
 * The score is clamped to [0, 1] first, then combined as 75% score plus 25%
 * overlap, and the blend is clamped to [0, 1] again.
 *
 * @param {number} score - Original retrieval score of the chunk.
 * @param {number} overlap - Overlap score between query and chunk text.
 * @returns {number} Confidence in the range [0, 1].
 */
function confidenceFrom(score, overlap) {
    const clampToUnit = (value) => Math.max(0, Math.min(1, value));
    const blended = clampToUnit(score) * 0.75 + overlap * 0.25;
    return clampToUnit(blended);
}
/**
 * Re-scores retrieved chunks against the query and returns them best-first.
 *
 * For each chunk (reads `text`, `score` and `sourceName`):
 *   - an overlap score is blended from exact-token overlap (40%) and
 *     character n-gram overlap (60%);
 *   - a source-name boost and a confidence value are computed;
 *   - the new score is the original score + overlap + boost + 8% of confidence.
 *
 * Lists of two or fewer chunks are returned as a shallow copy, unchanged and
 * without a `confidence` field. The input array and its chunk objects are
 * never mutated; re-scored chunks are new objects.
 *
 * Ordering: higher score first, then higher confidence, then shorter text.
 *
 * @param {Array<object>} chunks - Retrieved chunks.
 * @param {string} query - Search query.
 * @returns {Array<object>} New array of chunks, re-scored and sorted.
 */
function rerankChunks(chunks, query) {
    if (chunks.length <= 2) {
        return [...chunks];
    }
    const queryTokens = tokenSet(query);
    const rescored = [];
    for (const chunk of chunks) {
        // Exact-word overlap is precise; n-gram overlap tolerates near-misses.
        const exactWordScore = lexicalOverlap(queryTokens, chunk.text);
        const fuzzyScore = ngramOverlapScore(query, chunk.text);
        const blendedOverlap = (exactWordScore * 0.4) + (fuzzyScore * 0.6);
        const boost = sourceBoost(query, chunk.sourceName);
        const confidence = confidenceFrom(chunk.score, blendedOverlap);
        rescored.push({
            ...chunk,
            score: chunk.score + blendedOverlap + boost + (confidence * 0.08),
            confidence,
        });
    }
    const byRelevance = (left, right) => {
        if (left.score !== right.score) {
            return right.score - left.score;
        }
        const leftConfidence = left.confidence ?? 0;
        const rightConfidence = right.confidence ?? 0;
        return rightConfidence !== leftConfidence
            ? rightConfidence - leftConfidence
            : left.text.length - right.text.length;
    };
    return rescored.sort(byRelevance);
}