"use strict";
/**
* @file retrieval/tfidf.ts
* Lightweight TF-IDF vector engine for semantic similarity search.
*
* Why TF-IDF instead of embeddings?
* - Zero external dependencies (no embedding model required)
* - Works instantly on plugin start (no model loading)
* - Surprisingly effective for short-text matching (memory entries are sentences)
* - If an embedding model IS loaded, the AI relevance scorer can augment this
*
* Optimizations:
* - Sparse vectors (Map<string, number>) instead of dense arrays
* - Inverted index for O(1) document lookup per term
* - Lazy IDF rebuild (IDF table recomputed at most once per search after mutations, never per insert)
* - Pre-computed document norms for fast cosine similarity
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.TfIdfIndex = void 0;
exports.tokenize = tokenize;
const constants_1 = require("../constants");
/** Tokenize and normalize text into terms (supports Latin/accented chars). */
function tokenize(text) {
    // Lowercase, decompose accents (NFD), and drop combining marks so
    // accented forms match their ASCII base (a-acute -> a, c-cedilla -> c).
    const cleaned = text
        .toLowerCase()
        .normalize("NFD")
        .replace(/[\u0300-\u036f]/g, "")
        .replace(/[^a-z0-9\s\-_]/g, " ");
    // Collect terms up to the per-document cap, skipping short tokens
    // and stop words (same result as filter-then-slice).
    const terms = [];
    for (const token of cleaned.split(/\s+/)) {
        if (terms.length >= constants_1.MAX_TERMS_PER_DOC)
            break;
        if (token.length >= constants_1.MIN_TERM_LENGTH && !constants_1.STOP_WORDS.has(token))
            terms.push(token);
    }
    return terms;
}
/**
* In-memory TF-IDF index.
* Supports incremental inserts (no full reindex needed).
*/
class TfIdfIndex {
    constructor() {
        /** term -> Set of docIds whose vectors contain the term */
        this.invertedIndex = new Map();
        /** docId -> sparse TF vector (augmented/length-normalized weights) */
        this.docVectors = new Map();
        /** docId -> cached L2 norm of the raw TF vector (TF-only; IDF applied at query time) */
        this.docNorms = new Map();
        /** Number of documents currently indexed */
        this.docCount = 0;
        /** term -> IDF weight; rebuilt lazily whenever idfDirty is set */
        this.idfCache = new Map();
        this.idfDirty = true;
    }
    /** Insert a document, replacing any previous version under the same id. */
    addDocument(docId, text) {
        if (this.docVectors.has(docId)) {
            this.removeDocument(docId);
        }
        const tokens = tokenize(text);
        if (tokens.length === 0)
            return;
        // Raw term counts.
        const counts = new Map();
        for (const tok of tokens) {
            counts.set(tok, (counts.get(tok) ?? 0) + 1);
        }
        // Augmented TF (0.5 + 0.5 * count/peak) keeps long docs from dominating.
        let peak = 0;
        for (const c of counts.values()) {
            if (c > peak)
                peak = c;
        }
        const vector = new Map();
        for (const [term, c] of counts) {
            vector.set(term, 0.5 + 0.5 * (c / peak));
        }
        // Register the doc in each term's posting list, honoring the vocab cap.
        for (const term of counts.keys()) {
            let bucket = this.invertedIndex.get(term);
            if (bucket === undefined) {
                if (this.invertedIndex.size >= constants_1.MAX_VOCAB_SIZE)
                    continue; // vocabulary full: term stays unindexed
                bucket = new Set();
                this.invertedIndex.set(term, bucket);
            }
            bucket.add(docId);
        }
        this.docVectors.set(docId, vector);
        this.docCount += 1;
        this.idfDirty = true;
        // TF-only norm; the IDF-weighted norm is derived during search.
        this.docNorms.set(docId, this.computeNorm(vector));
    }
    /** Delete a document and prune any posting lists it emptied. */
    removeDocument(docId) {
        const vector = this.docVectors.get(docId);
        if (vector === undefined)
            return;
        for (const term of vector.keys()) {
            const bucket = this.invertedIndex.get(term);
            if (!bucket)
                continue;
            bucket.delete(docId);
            if (bucket.size === 0)
                this.invertedIndex.delete(term);
        }
        this.docVectors.delete(docId);
        this.docNorms.delete(docId);
        this.docCount -= 1;
        this.idfDirty = true;
    }
    /**
     * Rank indexed documents against a free-text query by TF-IDF cosine
     * similarity. Returns up to topK [docId, score] pairs, best first.
     */
    search(queryText, topK = 10) {
        const terms = tokenize(queryText);
        if (terms.length === 0 || this.docCount === 0)
            return [];
        // IDF table is rebuilt lazily, at most once per mutation batch.
        if (this.idfDirty) {
            this.rebuildIdfCache();
        }
        // Query vector: augmented TF times cached IDF; out-of-vocab terms drop out.
        const qCounts = new Map();
        for (const t of terms) {
            qCounts.set(t, (qCounts.get(t) ?? 0) + 1);
        }
        let qPeak = 0;
        for (const c of qCounts.values()) {
            if (c > qPeak)
                qPeak = c;
        }
        const qVec = new Map();
        for (const [term, c] of qCounts) {
            const idf = this.idfCache.get(term);
            if (!idf)
                continue;
            qVec.set(term, (0.5 + 0.5 * (c / qPeak)) * idf);
        }
        if (qVec.size === 0)
            return [];
        const qNorm = this.computeNorm(qVec);
        if (qNorm === 0)
            return [];
        // Candidate set: union of posting lists for surviving query terms.
        const candidateIds = new Set();
        for (const term of qVec.keys()) {
            const bucket = this.invertedIndex.get(term);
            if (bucket) {
                for (const id of bucket)
                    candidateIds.add(id);
            }
        }
        // Cosine similarity in TF-IDF space, one candidate at a time.
        const ranked = [];
        for (const id of candidateIds) {
            const vector = this.docVectors.get(id);
            if (!vector)
                continue;
            let dot = 0;
            let normSq = 0;
            for (const [term, tfWeight] of vector) {
                const w = tfWeight * (this.idfCache.get(term) ?? 0);
                normSq += w * w;
                const qw = qVec.get(term);
                if (qw !== undefined)
                    dot += w * qw;
            }
            if (normSq === 0)
                continue;
            const score = dot / (Math.sqrt(normSq) * qNorm);
            if (score > 0)
                ranked.push([id, score]);
        }
        return ranked.sort((a, b) => b[1] - a[1]).slice(0, topK);
    }
    /** Recompute the smoothed IDF table from current posting-list sizes. */
    rebuildIdfCache() {
        this.idfCache.clear();
        for (const [term, bucket] of this.invertedIndex) {
            this.idfCache.set(term, Math.log(1 + this.docCount / bucket.size));
        }
        this.idfDirty = false;
    }
    /** L2 norm of a sparse vector's weights. */
    computeNorm(vec) {
        let acc = 0;
        for (const value of vec.values())
            acc += value * value;
        return Math.sqrt(acc);
    }
    /** Lightweight diagnostics for logging/inspection. */
    get stats() {
        return { vocabSize: this.invertedIndex.size, docCount: this.docCount };
    }
    /** Drop all documents and reset every piece of derived state. */
    clear() {
        this.invertedIndex.clear();
        this.docVectors.clear();
        this.docNorms.clear();
        this.idfCache.clear();
        this.idfDirty = true;
        this.docCount = 0;
    }
}
exports.TfIdfIndex = TfIdfIndex;
//# sourceMappingURL=tfidf.js.map