// Project file: src/retrieval/hybridSearch.ts
import { NON_WORD_RE, normalizeWhitespace } from "../utils/text";
/** Maximum number of normalized queries whose token lists are memoized. */
const MAX_QUERY_TOKEN_CACHE = 128;

/**
 * Memo of normalized query string -> keyword tokens.
 * A Map iterates in insertion order, which is what eviction relies on.
 */
const queryTokenCache = new Map<string, string[]>();

/**
 * Turns a raw query into its keyword tokens, memoizing the result.
 *
 * The query is lower-cased and passed through `normalizeWhitespace`; that
 * normalized string is both the cache key and the text that gets split on
 * `NON_WORD_RE`. Pieces of two characters or fewer are discarded.
 *
 * @param query Raw query text.
 * @returns The token list. This is the cached array instance itself, so
 *   callers must treat it as read-only.
 */
function tokenizedQuery(query: string): string[] {
  const cacheKey = normalizeWhitespace(query.toLowerCase());

  const hit = queryTokenCache.get(cacheKey);
  if (hit !== undefined) {
    return hit;
  }

  const keywords: string[] = [];
  for (const piece of cacheKey.split(NON_WORD_RE)) {
    if (piece.length > 2) {
      keywords.push(piece);
    }
  }

  // At capacity: drop the earliest-inserted entry (the first key the Map yields).
  if (queryTokenCache.size >= MAX_QUERY_TOKEN_CACHE) {
    for (const earliest of queryTokenCache.keys()) {
      queryTokenCache.delete(earliest);
      break;
    }
  }

  queryTokenCache.set(cacheKey, keywords);
  return keywords;
}
/** Maximum number of texts whose lower-cased form is memoized. */
const MAX_LOWER_CACHE = 128;

/**
 * Memo of original text -> lower-cased text.
 * A Map iterates in insertion order, which is what eviction relies on.
 */
const lowerCache = new Map<string, string>();

/**
 * Returns `text` converted to lower case, memoizing the result per input.
 *
 * Once the cache holds `MAX_LOWER_CACHE` entries, the earliest-inserted entry
 * is removed before the new one is stored. A cache hit does not change an
 * entry's position in the eviction order.
 *
 * @param text Text to lower-case.
 * @returns The lower-cased text.
 */
function lowerCached(text: string): string {
  const hit = lowerCache.get(text);
  // Compare against undefined explicitly: "" is a legitimate cached value.
  if (hit !== undefined) {
    return hit;
  }

  const lowered = text.toLowerCase();

  if (lowerCache.size >= MAX_LOWER_CACHE) {
    // The first key the Map yields is the earliest-inserted one.
    for (const earliest of lowerCache.keys()) {
      lowerCache.delete(earliest);
      break;
    }
  }

  lowerCache.set(text, lowered);
  return lowered;
}
/**
 * Combines a semantic similarity score with keyword-overlap bonuses.
 *
 * Two additive bonuses are applied on top of `semanticScore`:
 * - a keyword bonus of up to 0.22, scaled by the fraction of query tokens
 *   (from `tokenizedQuery`) that occur as substrings of the lower-cased text;
 * - a flat 0.15 boost when the whole normalized query occurs as a substring
 *   of the lower-cased text.
 *
 * If the query yields no tokens, `semanticScore` is returned unchanged and
 * neither bonus is applied.
 *
 * @param query Raw query text.
 * @param text Candidate text being scored.
 * @param semanticScore Score from the semantic ranker.
 * @returns `semanticScore` plus any keyword and exact-phrase bonuses.
 */
export function hybridScore(query: string, text: string, semanticScore: number): number {
  const phrase = normalizeWhitespace(query.toLowerCase());
  const terms = tokenizedQuery(query);
  if (terms.length === 0) {
    return semanticScore;
  }

  const body = lowerCached(text);

  // Each token counts once per occurrence in the token list, not per unique token.
  const hitCount = terms.filter((term) => body.includes(term)).length;
  const keywordBonus = (hitCount / terms.length) * 0.22;

  let exactPhraseBoost = 0;
  if (phrase && body.includes(phrase)) {
    exactPhraseBoost = 0.15;
  }

  return semanticScore + keywordBonus + exactPhraseBoost;
}