// Forked from khtsly/persistent-memory
// src/retrieval/tfidf.ts
/**
* @file retrieval/tfidf.ts
* Lightweight TF-IDF vector engine for semantic similarity search.
*
* Why TF-IDF instead of embeddings?
* - Zero external dependencies (no embedding model required)
* - Works instantly on plugin start (no model loading)
* - Surprisingly effective for short-text matching (memory entries are sentences)
* - If an embedding model IS loaded, the AI relevance scorer can augment this
*
* Optimizations:
* - Sparse vectors (Map<string, number>) instead of dense arrays
* - Inverted index for O(1) document lookup per term
* - Incremental IDF updates (no full recompute on insert)
* - Pre-computed document norms for fast cosine similarity
*/
import { STOP_WORDS, MIN_TERM_LENGTH, MAX_VOCAB_SIZE, MAX_TERMS_PER_DOC } from "../constants";
import type { TfIdfVector } from "../types";
/**
 * Split raw text into lowercase index terms.
 *
 * Strips everything except ASCII alphanumerics, whitespace, hyphens and
 * underscores, drops stop words and tokens shorter than MIN_TERM_LENGTH,
 * and caps the result at MAX_TERMS_PER_DOC entries.
 */
export function tokenize(text: string): string[] {
  const cleaned = text.toLowerCase().replace(/[^a-z0-9\s\-_]/g, " ");
  const kept: string[] = [];
  for (const token of cleaned.split(/\s+/)) {
    if (kept.length >= MAX_TERMS_PER_DOC) break; // same cutoff as slice(0, MAX_TERMS_PER_DOC)
    if (token.length < MIN_TERM_LENGTH) continue;
    if (STOP_WORDS.has(token)) continue;
    kept.push(token);
  }
  return kept;
}
/**
 * In-memory TF-IDF index.
 * Supports incremental inserts (no full reindex needed).
 */
export class TfIdfIndex {
  /** term → Set of document IDs containing this term */
  private invertedIndex = new Map<string, Set<string>>();
  /** docId → sparse augmented-TF vector (0.5 + 0.5·tf/maxTf per term) */
  private docVectors = new Map<string, TfIdfVector>();
  /**
   * docId → L2 norm of the IDF-weighted document vector.
   * Rebuilt together with the IDF cache (see search); previously this held a
   * TF-only norm that was written but never read, while search recomputed the
   * IDF-weighted norm per candidate on every query.
   */
  private docNorms = new Map<string, number>();
  /** Total number of indexed documents */
  private docCount = 0;
  /** Cached IDF values — invalidated on add/remove */
  private idfCache = new Map<string, number>();
  private idfDirty = true;

  /**
   * Add or update a document in the index.
   * An existing document with the same id is fully replaced.
   */
  addDocument(docId: string, text: string): void {
    // Remove old version if exists
    if (this.docVectors.has(docId)) {
      this.removeDocument(docId);
    }
    const terms = tokenize(text);
    if (terms.length === 0) return;
    // Compute raw term frequencies
    const tf = new Map<string, number>();
    for (const term of terms) {
      tf.set(term, (tf.get(term) ?? 0) + 1);
    }
    // Augmented TF (0.5 + 0.5·tf/maxTf) prevents long docs from dominating.
    const maxTf = Math.max(...tf.values());
    const normalizedTf: TfIdfVector = new Map();
    for (const [term, count] of tf) {
      normalizedTf.set(term, 0.5 + 0.5 * (count / maxTf));
    }
    // Update inverted index. Terms rejected by the vocabulary cap still live
    // in the doc vector but get no IDF, so they contribute nothing at query time.
    for (const term of tf.keys()) {
      let docs = this.invertedIndex.get(term);
      if (!docs) {
        if (this.invertedIndex.size >= MAX_VOCAB_SIZE) continue; // vocabulary cap
        docs = new Set();
        this.invertedIndex.set(term, docs);
      }
      docs.add(docId);
    }
    this.docVectors.set(docId, normalizedTf);
    this.docCount++;
    // Both the IDF cache and the cached document norms are now stale.
    this.idfDirty = true;
  }

  /** Remove a document from the index. No-op if the id is unknown. */
  removeDocument(docId: string): void {
    const vec = this.docVectors.get(docId);
    if (!vec) return;
    for (const term of vec.keys()) {
      const docs = this.invertedIndex.get(term);
      if (docs) {
        docs.delete(docId);
        // Drop empty postings so vocab size reflects live terms.
        if (docs.size === 0) this.invertedIndex.delete(term);
      }
    }
    this.docVectors.delete(docId);
    this.docNorms.delete(docId);
    this.docCount--;
    this.idfDirty = true;
  }

  /**
   * Search the index and return document IDs sorted by TF-IDF cosine similarity.
   *
   * @param queryText free-text query; tokenized the same way as documents
   * @param topK maximum number of results (default 10)
   * @returns up to topK [docId, score] pairs, highest score first
   */
  search(queryText: string, topK: number = 10): Array<[string, number]> {
    const queryTerms = tokenize(queryText);
    if (queryTerms.length === 0 || this.docCount === 0) return [];
    // Rebuild the IDF cache AND the per-document IDF-weighted norms if the
    // index changed since the last search. Caching norms here (instead of
    // recomputing them per candidate on every query) amortizes the cost
    // across searches against a stable index.
    if (this.idfDirty) {
      this.idfCache.clear();
      for (const [term, docs] of this.invertedIndex) {
        // Smoothed IDF; always ≥ ln 2 > 0 since docCount/docs.size ≥ 1.
        this.idfCache.set(term, Math.log(1 + this.docCount / docs.size));
      }
      this.docNorms.clear();
      for (const [docId, vec] of this.docVectors) {
        let sumSq = 0;
        for (const [term, tfWeight] of vec) {
          const idf = this.idfCache.get(term) ?? 0; // capped-out terms carry no weight
          const w = tfWeight * idf;
          sumSq += w * w;
        }
        this.docNorms.set(docId, Math.sqrt(sumSq));
      }
      this.idfDirty = false;
    }
    // Build query TF-IDF vector (same augmented-TF scheme as documents).
    const queryTf = new Map<string, number>();
    for (const term of queryTerms) {
      queryTf.set(term, (queryTf.get(term) ?? 0) + 1);
    }
    const maxQueryTf = Math.max(...queryTf.values());
    const queryVec: TfIdfVector = new Map();
    for (const [term, count] of queryTf) {
      const idf = this.idfCache.get(term);
      if (idf === undefined) continue; // term not in vocabulary
      queryVec.set(term, (0.5 + 0.5 * (count / maxQueryTf)) * idf);
    }
    if (queryVec.size === 0) return [];
    const queryNorm = this.computeNorm(queryVec);
    if (queryNorm === 0) return [];
    // Candidate set: only docs sharing at least one query term can score > 0.
    const candidates = new Set<string>();
    for (const term of queryVec.keys()) {
      const docs = this.invertedIndex.get(term);
      if (docs) for (const docId of docs) candidates.add(docId);
    }
    // Cosine similarity. The dot product iterates the (small) query vector;
    // document norms come from the cache rebuilt above.
    const scores: Array<[string, number]> = [];
    for (const docId of candidates) {
      const docVec = this.docVectors.get(docId);
      if (!docVec) continue;
      let dotProduct = 0;
      for (const [term, qw] of queryVec) {
        const docTf = docVec.get(term);
        if (docTf !== undefined) {
          // Non-null assertion is safe: queryVec only holds in-vocab terms.
          dotProduct += docTf * this.idfCache.get(term)! * qw;
        }
      }
      const docNorm = this.docNorms.get(docId) ?? 0;
      if (docNorm === 0) continue;
      const sim = dotProduct / (docNorm * queryNorm);
      if (sim > 0) scores.push([docId, sim]);
    }
    scores.sort((a, b) => b[1] - a[1]);
    return scores.slice(0, topK);
  }

  /** Compute L2 norm of a sparse vector. */
  private computeNorm(vec: TfIdfVector): number {
    let sumSq = 0;
    for (const w of vec.values()) sumSq += w * w;
    return Math.sqrt(sumSq);
  }

  /** Get index statistics. */
  get stats(): { vocabSize: number; docCount: number } {
    return { vocabSize: this.invertedIndex.size, docCount: this.docCount };
  }

  /** Clear the entire index. */
  clear(): void {
    this.invertedIndex.clear();
    this.docVectors.clear();
    this.docNorms.clear();
    this.idfCache.clear();
    this.idfDirty = true;
    this.docCount = 0;
  }
}