Project Files
src / rag / bm25.ts
/**
* BM25 (Best Matching 25) - Keyword-based ranking algorithm
* Used for hybrid search combining with semantic embeddings
*/
export interface BM25Config {
k1?: number; // Term frequency saturation (default: 1.5)
b?: number; // Document length normalization (default: 0.75)
}
export interface BM25Document {
id: number;
terms: Map<string, number>; // term -> frequency
length: number; // total terms
}
export class BM25Index {
private k1: number;
private b: number;
private documents: Map<number, BM25Document> = new Map();
private documentFrequency: Map<string, number> = new Map(); // term -> doc count
private avgDocLength: number = 0;
private totalDocs: number = 0;
constructor(config: BM25Config = {}) {
this.k1 = config.k1 ?? 1.5;
this.b = config.b ?? 0.75;
}
/**
* Tokenize text into terms (lowercase, alphanumeric)
*/
private tokenize(text: string): string[] {
return text
.toLowerCase()
.replace(/[^\p{L}\p{N}\s]/gu, ' ') // Keep letters and numbers (Unicode-aware)
.split(/\s+/)
.filter(term => term.length >= 2); // Min 2 chars
}
/**
* Add a document to the index
*/
addDocument(id: number, content: string): void {
const tokens = this.tokenize(content);
const termFreq = new Map<string, number>();
for (const term of tokens) {
termFreq.set(term, (termFreq.get(term) || 0) + 1);
}
// Update document frequency for new terms
for (const term of termFreq.keys()) {
if (!this.documents.has(id) || !this.documents.get(id)!.terms.has(term)) {
this.documentFrequency.set(term, (this.documentFrequency.get(term) || 0) + 1);
}
}
// If document already exists, remove old term counts from DF
if (this.documents.has(id)) {
const oldDoc = this.documents.get(id)!;
for (const term of oldDoc.terms.keys()) {
if (!termFreq.has(term)) {
const df = this.documentFrequency.get(term) || 1;
if (df <= 1) {
this.documentFrequency.delete(term);
} else {
this.documentFrequency.set(term, df - 1);
}
}
}
} else {
this.totalDocs++;
}
this.documents.set(id, {
id,
terms: termFreq,
length: tokens.length,
});
// Recalculate average document length
this.recalculateAvgLength();
}
/**
* Remove a document from the index
*/
removeDocument(id: number): void {
const doc = this.documents.get(id);
if (!doc) return;
// Update document frequency
for (const term of doc.terms.keys()) {
const df = this.documentFrequency.get(term) || 1;
if (df <= 1) {
this.documentFrequency.delete(term);
} else {
this.documentFrequency.set(term, df - 1);
}
}
this.documents.delete(id);
this.totalDocs--;
this.recalculateAvgLength();
}
/**
* Remove all documents for a specific document ID (chunks)
*/
removeDocumentChunks(chunkIds: number[]): void {
for (const id of chunkIds) {
this.removeDocument(id);
}
}
private recalculateAvgLength(): void {
if (this.totalDocs === 0) {
this.avgDocLength = 0;
return;
}
let totalLength = 0;
for (const doc of this.documents.values()) {
totalLength += doc.length;
}
this.avgDocLength = totalLength / this.totalDocs;
}
/**
* Calculate IDF (Inverse Document Frequency) for a term
*/
private idf(term: string): number {
const df = this.documentFrequency.get(term) || 0;
if (df === 0) return 0;
// BM25 IDF formula
return Math.log((this.totalDocs - df + 0.5) / (df + 0.5) + 1);
}
/**
* Search the index and return scored results
*/
search(query: string, limit: number = 10): Array<{ id: number; score: number }> {
const queryTerms = this.tokenize(query);
if (queryTerms.length === 0 || this.totalDocs === 0) {
return [];
}
const scores: Map<number, number> = new Map();
for (const [docId, doc] of this.documents) {
let score = 0;
for (const term of queryTerms) {
const tf = doc.terms.get(term) || 0;
if (tf === 0) continue;
const idf = this.idf(term);
// BM25 scoring formula
const numerator = tf * (this.k1 + 1);
const denominator = tf + this.k1 * (1 - this.b + this.b * (doc.length / this.avgDocLength));
score += idf * (numerator / denominator);
}
if (score > 0) {
scores.set(docId, score);
}
}
// Sort by score descending
const results = Array.from(scores.entries())
.map(([id, score]) => ({ id, score }))
.sort((a, b) => b.score - a.score)
.slice(0, limit);
return results;
}
/**
* Get stats about the index
*/
getStats(): { documents: number; terms: number; avgLength: number } {
return {
documents: this.totalDocs,
terms: this.documentFrequency.size,
avgLength: Math.round(this.avgDocLength),
};
}
/**
* Clear the entire index
*/
clear(): void {
this.documents.clear();
this.documentFrequency.clear();
this.avgDocLength = 0;
this.totalDocs = 0;
}
/**
* Export index for persistence
*/
export(): string {
return JSON.stringify({
k1: this.k1,
b: this.b,
documents: Array.from(this.documents.entries()).map(([id, doc]) => ({
id,
terms: Array.from(doc.terms.entries()),
length: doc.length,
})),
documentFrequency: Array.from(this.documentFrequency.entries()),
});
}
/**
* Import index from persistence
*/
import(data: string): void {
try {
const parsed = JSON.parse(data);
this.k1 = parsed.k1 ?? 1.5;
this.b = parsed.b ?? 0.75;
this.documents.clear();
this.documentFrequency.clear();
for (const doc of parsed.documents) {
this.documents.set(doc.id, {
id: doc.id,
terms: new Map(doc.terms),
length: doc.length,
});
}
this.documentFrequency = new Map(parsed.documentFrequency);
this.totalDocs = this.documents.size;
this.recalculateAvgLength();
} catch (error) {
console.error('[BM25] Failed to import index:', error);
this.clear();
}
}
}