Project Files
src / search / bm25.ts
/**
* BM25 index (adapted from rag-vc/src/rag/bm25.ts).
* Uses whole-document IDs (string paths) instead of numeric chunk IDs.
*/
interface BM25Doc {
id: string;
terms: Map<string, number>;
length: number;
}
export class BM25Index {
private k1 = 1.5;
private b = 0.75;
private docs = new Map<string, BM25Doc>();
private df = new Map<string, number>(); // term → doc count
private avgLen = 0;
private tokenize(text: string): string[] {
return text
.toLowerCase()
.replace(/[^\p{L}\p{N}\s]/gu, " ")
.split(/\s+/)
.filter((t) => t.length >= 2);
}
addDocument(id: string, content: string): void {
const tokens = this.tokenize(content);
const tf = new Map<string, number>();
for (const t of tokens) tf.set(t, (tf.get(t) ?? 0) + 1);
// Remove old doc first if re-indexing
if (this.docs.has(id)) this.removeDocument(id);
for (const term of tf.keys()) {
this.df.set(term, (this.df.get(term) ?? 0) + 1);
}
this.docs.set(id, { id, terms: tf, length: tokens.length });
this.recalcAvg();
}
removeDocument(id: string): void {
const doc = this.docs.get(id);
if (!doc) return;
for (const term of doc.terms.keys()) {
const n = (this.df.get(term) ?? 1) - 1;
if (n <= 0) this.df.delete(term);
else this.df.set(term, n);
}
this.docs.delete(id);
this.recalcAvg();
}
private recalcAvg(): void {
if (this.docs.size === 0) { this.avgLen = 0; return; }
let sum = 0;
for (const d of this.docs.values()) sum += d.length;
this.avgLen = sum / this.docs.size;
}
private idf(term: string): number {
const n = this.df.get(term) ?? 0;
if (n === 0) return 0;
return Math.log((this.docs.size - n + 0.5) / (n + 0.5) + 1);
}
search(query: string, limit = 10): Array<{ id: string; score: number }> {
const terms = this.tokenize(query);
if (terms.length === 0 || this.docs.size === 0) return [];
const scores = new Map<string, number>();
for (const [docId, doc] of this.docs) {
let score = 0;
for (const term of terms) {
const tf = doc.terms.get(term) ?? 0;
if (tf === 0) continue;
const idf = this.idf(term);
score +=
idf *
((tf * (this.k1 + 1)) /
(tf + this.k1 * (1 - this.b + this.b * (doc.length / this.avgLen))));
}
if (score > 0) scores.set(docId, score);
}
return [...scores.entries()]
.map(([id, score]) => ({ id, score }))
.sort((a, b) => b.score - a.score)
.slice(0, limit);
}
clear(): void {
this.docs.clear();
this.df.clear();
this.avgLen = 0;
}
get size(): number {
return this.docs.size;
}
}