"use strict";
/**
* @file retrieval/tfidf.ts
* Lightweight TF-IDF vector engine for semantic similarity search.
*
* Why TF-IDF instead of embeddings?
* - Zero external dependencies (no embedding model required)
* - Works instantly on plugin start (no model loading)
* - Surprisingly effective for short-text matching (memory entries are sentences)
* - If an embedding model IS loaded, the AI relevance scorer can augment this
*
* Optimizations:
* - Sparse vectors (Map<string, number>) instead of dense arrays
* - Inverted index for O(1) document lookup per term
* - Lazy IDF rebuild (IDF table recomputed at most once per search after mutations, never per insert)
* - Pre-computed document norms for fast cosine similarity
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.TfIdfIndex = void 0;
exports.tokenize = tokenize;
const constants_1 = require("../constants");
/** Tokenize and normalize text into terms (supports Latin/accented chars). */
function tokenize(text) {
    // Lowercase, decompose accents (NFD), and drop combining marks so
    // accented forms match their ASCII base (a-acute -> a, c-cedilla -> c).
    const cleaned = text
        .toLowerCase()
        .normalize("NFD")
        .replace(/[\u0300-\u036f]/g, "")
        .replace(/[^a-z0-9\s\-_]/g, " ");
    // Collect terms up to the per-document cap, skipping short tokens
    // and stop words (same result as filter-then-slice).
    const terms = [];
    for (const token of cleaned.split(/\s+/)) {
        if (terms.length >= constants_1.MAX_TERMS_PER_DOC)
            break;
        if (token.length >= constants_1.MIN_TERM_LENGTH && !constants_1.STOP_WORDS.has(token))
            terms.push(token);
    }
    return terms;
}
/**
* In-memory TF-IDF index.
* Supports incremental inserts (no full reindex needed).
*/
class TfIdfIndex {
    constructor() {
        /** term -> Set of docIds whose vectors contain the term */
        this.invertedIndex = new Map();
        /** docId -> sparse TF vector (augmented/length-normalized weights) */
        this.docVectors = new Map();
        /** docId -> cached L2 norm of the raw TF vector (TF-only; IDF applied at query time) */
        this.docNorms = new Map();
        /** Number of documents currently indexed */
        this.docCount = 0;
        /** term -> IDF weight; rebuilt lazily whenever idfDirty is set */
        this.idfCache = new Map();
        this.idfDirty = true;
    }
    /** Insert a document, replacing any previous version under the same id. */
    addDocument(docId, text) {
        if (this.docVectors.has(docId)) {
            this.removeDocument(docId);
        }
        const tokens = tokenize(text);
        if (tokens.length === 0)
            return;
        // Raw term counts.
        const counts = new Map();
        for (const tok of tokens) {
            counts.set(tok, (counts.get(tok) ?? 0) + 1);
        }
        // Augmented TF (0.5 + 0.5 * count/peak) keeps long docs from dominating.
        let peak = 0;
        for (const c of counts.values()) {
            if (c > peak)
                peak = c;
        }
        const vector = new Map();
        for (const [term, c] of counts) {
            vector.set(term, 0.5 + 0.5 * (c / peak));
        }
        // Register the doc in each term's posting list, honoring the vocab cap.
        for (const term of counts.keys()) {
            let bucket = this.invertedIndex.get(term);
            if (bucket === undefined) {
                if (this.invertedIndex.size >= constants_1.MAX_VOCAB_SIZE)
                    continue; // vocabulary full: term stays unindexed
                bucket = new Set();
                this.invertedIndex.set(term, bucket);
            }
            bucket.add(docId);
        }
        this.docVectors.set(docId, vector);
        this.docCount += 1;
        this.idfDirty = true;
        // TF-only norm; the IDF-weighted norm is derived during search.
        this.docNorms.set(docId, this.computeNorm(vector));
    }
    /** Delete a document and prune any posting lists it emptied. */
    removeDocument(docId) {
        const vector = this.docVectors.get(docId);
        if (vector === undefined)
            return;
        for (const term of vector.keys()) {
            const bucket = this.invertedIndex.get(term);
            if (!bucket)
                continue;
            bucket.delete(docId);
            if (bucket.size === 0)
                this.invertedIndex.delete(term);
        }
        this.docVectors.delete(docId);
        this.docNorms.delete(docId);
        this.docCount -= 1;
        this.idfDirty = true;
    }
    /**
     * Rank indexed documents against a free-text query by TF-IDF cosine
     * similarity. Returns up to topK [docId, score] pairs, best first.
     */
    search(queryText, topK = 10) {
        const terms = tokenize(queryText);
        if (terms.length === 0 || this.docCount === 0)
            return [];
        // IDF table is rebuilt lazily, at most once per mutation batch.
        if (this.idfDirty) {
            this.rebuildIdfCache();
        }
        // Query vector: augmented TF times cached IDF; out-of-vocab terms drop out.
        const qCounts = new Map();
        for (const t of terms) {
            qCounts.set(t, (qCounts.get(t) ?? 0) + 1);
        }
        let qPeak = 0;
        for (const c of qCounts.values()) {
            if (c > qPeak)
                qPeak = c;
        }
        const qVec = new Map();
        for (const [term, c] of qCounts) {
            const idf = this.idfCache.get(term);
            if (!idf)
                continue;
            qVec.set(term, (0.5 + 0.5 * (c / qPeak)) * idf);
        }
        if (qVec.size === 0)
            return [];
        const qNorm = this.computeNorm(qVec);
        if (qNorm === 0)
            return [];
        // Candidate set: union of posting lists for surviving query terms.
        const candidateIds = new Set();
        for (const term of qVec.keys()) {
            const bucket = this.invertedIndex.get(term);
            if (bucket) {
                for (const id of bucket)
                    candidateIds.add(id);
            }
        }
        // Cosine similarity in TF-IDF space, one candidate at a time.
        const ranked = [];
        for (const id of candidateIds) {
            const vector = this.docVectors.get(id);
            if (!vector)
                continue;
            let dot = 0;
            let normSq = 0;
            for (const [term, tfWeight] of vector) {
                const w = tfWeight * (this.idfCache.get(term) ?? 0);
                normSq += w * w;
                const qw = qVec.get(term);
                if (qw !== undefined)
                    dot += w * qw;
            }
            if (normSq === 0)
                continue;
            const score = dot / (Math.sqrt(normSq) * qNorm);
            if (score > 0)
                ranked.push([id, score]);
        }
        return ranked.sort((a, b) => b[1] - a[1]).slice(0, topK);
    }
    /** Recompute the smoothed IDF table from current posting-list sizes. */
    rebuildIdfCache() {
        this.idfCache.clear();
        for (const [term, bucket] of this.invertedIndex) {
            this.idfCache.set(term, Math.log(1 + this.docCount / bucket.size));
        }
        this.idfDirty = false;
    }
    /** L2 norm of a sparse vector's weights. */
    computeNorm(vec) {
        let acc = 0;
        for (const value of vec.values())
            acc += value * value;
        return Math.sqrt(acc);
    }
    /** Lightweight diagnostics for logging/inspection. */
    get stats() {
        return { vocabSize: this.invertedIndex.size, docCount: this.docCount };
    }
    /** Drop all documents and reset every piece of derived state. */
    clear() {
        this.invertedIndex.clear();
        this.docVectors.clear();
        this.docNorms.clear();
        this.idfCache.clear();
        this.idfDirty = true;
        this.docCount = 0;
    }
}
exports.TfIdfIndex = TfIdfIndex;
//# sourceMappingURL=tfidf.js.map