// Project Files
// src/local/store.ts
/**
* @file local/store.ts
* Enhanced RAG document store with multi-library support.
*
* Major features:
* - MULTIPLE LIBRARIES: Named libraries (like GPT4All's multi-library model)
* with metadata, tags, and priority tiers. Each library maps to a folder
* and can be tagged for automatic worker routing.
* - PROGRESSIVE SOURCE TIERS: Libraries are ranked by priority
* (proprietary > internal > reference > general) so local knowledge
* is preferred before falling back to web.
* - BM25 SCORING: Proper BM25 ranking (replaces basic TF-IDF) with
* field boosting on file names and section headings.
* - SMART CHUNKING: Section-aware splitting that respects headings,
* paragraphs, and code blocks. Configurable overlap.
* - CONTEXTUAL RETRIEVAL: Parent and sibling chunk retrieval for
* richer context windows around matched chunks.
* - PERSISTENCE: Save/load index to disk JSON so re-indexing
* is only needed when files change.
* - FILE METADATA: Tracks file type, modification date, size,
* and custom user tags per file.
* - HYBRID SEARCH: Combines BM25 keyword matching with n-gram
* fuzzy matching for better recall on partial/misspelled terms.
*
* Designed to run entirely on-device with zero external dependencies —
* no vector database, no embedding model needed.
*/
import * as fs from "node:fs";
import * as path from "node:path";
import * as crypto from "node:crypto";
// Optional dependency: pdf-parse enables PDF text extraction. It is loaded
// via require() inside try/catch so the store still works when the package
// is absent — PDF files are then skipped by readFileAsText.
let PDFParseRef: any = null;
try {
  const pdfMod = require("pdf-parse");
  // Tolerate several module shapes: named export, ESM default, or CJS root.
  PDFParseRef = pdfMod.PDFParse ?? pdfMod.default ?? pdfMod;
} catch {
  /* pdf-parse not available */
}
/** Priority tiers for progressive source retrieval (most to least preferred). */
export type LibraryPriority =
  | "proprietary"
  | "internal"
  | "reference"
  | "general";

/** Tags that map libraries to worker roles automatically (see ROLE_TAG_MAP). */
export type LibraryTag =
  | "legal"
  | "academic"
  | "technical"
  | "financial"
  | "medical"
  | "policy"
  | "reports"
  | "code"
  | "general";

/**
 * One indexed slice of a document, with everything search needs precomputed:
 * term frequencies (tokens plus stems) for BM25 scoring and a character
 * trigram set (built from the chunk's first 500 chars) for fuzzy matching.
 */
export interface DocumentChunk {
  /** Unique id, formatted "<libraryId>:<fileIndex>:<chunkIndex>". */
  readonly id: string;
  readonly libraryId: string;
  /** Absolute path of the source file. */
  readonly filePath: string;
  readonly fileName: string;
  /** Path relative to the library's root folder. */
  readonly fileRelPath: string;
  /** 0-based position of this chunk within its file. */
  readonly chunkIndex: number;
  readonly totalChunks: number;
  readonly text: string;
  /** Token count (stems included), used as the BM25 document length. */
  readonly wordCount: number;
  /** Term → frequency over tokens and their stems. */
  readonly terms: ReadonlyMap<string, number>;
  /** Detected section heading, "" when none was found. */
  readonly heading: string;
  readonly sectionDepth: number;
  /** Lowercased file extension, or "unknown" when the file has none. */
  readonly fileType: string;
  /** Trigrams of the chunk's head, used for n-gram similarity. */
  readonly ngramSet: ReadonlySet<string>;
}

/** Per-file bookkeeping recorded at index time. */
export interface FileMetadata {
  readonly filePath: string;
  readonly fileName: string;
  readonly fileRelPath: string;
  readonly fileType: string;
  readonly sizeBytes: number;
  /** ISO-8601 mtime captured when the file was indexed. */
  readonly modifiedAt: string;
  /** Chunks produced by the splitter (counted BEFORE the min-word filter). */
  readonly chunkCount: number;
  readonly wordCount: number;
  /** Heuristic tags from inferFileTags (extension + file-name keywords). */
  readonly tags: ReadonlyArray<string>;
  /** Fingerprint of path/size/mtime, consumed by checkForChanges. */
  readonly contentHash: string;
}

/** A named, indexed folder of documents. */
export interface LocalLibrary {
  readonly id: string;
  readonly name: string;
  /** Resolved absolute folder path. */
  readonly folderPath: string;
  readonly description: string;
  readonly priority: LibraryPriority;
  readonly tags: ReadonlyArray<LibraryTag>;
  readonly fileCount: number;
  readonly chunkCount: number;
  readonly totalWords: number;
  /** ISO-8601 timestamp of the last (re)index. */
  readonly indexedAt: string;
  readonly files: ReadonlyArray<FileMetadata>;
}

/** One search result: the matched chunk plus its scores and nearby context. */
export interface LocalSearchHit {
  readonly chunkId: string;
  readonly libraryId: string;
  readonly libraryName: string;
  readonly libraryPriority: LibraryPriority;
  readonly filePath: string;
  readonly fileName: string;
  readonly fileRelPath: string;
  readonly text: string;
  readonly wordCount: number;
  /** Combined score: BM25 + weighted n-gram, with optional priority boost. */
  readonly score: number;
  readonly bm25Score: number;
  readonly ngramScore: number;
  readonly chunkIndex: number;
  readonly totalChunks: number;
  readonly heading: string;
  readonly fileType: string;
  /** Tail (≤200 chars) of the previous same-file chunk, "" when absent. */
  readonly contextBefore: string;
  /** Head (≤200 chars) of the next same-file chunk, "" when absent. */
  readonly contextAfter: string;
}

/** Serialisable form for persistence. */
interface PersistedIndex {
  readonly version: number;
  readonly savedAt: string;
  readonly libraries: ReadonlyArray<PersistedLibrary>;
}

interface PersistedLibrary {
  readonly library: Omit<LocalLibrary, "files"> & { files: FileMetadata[] };
  readonly chunks: ReadonlyArray<PersistedChunk>;
}

/** DocumentChunk without ngramSet (rebuilt on load); terms as entry pairs. */
interface PersistedChunk {
  readonly id: string;
  readonly libraryId: string;
  readonly filePath: string;
  readonly fileName: string;
  readonly fileRelPath: string;
  readonly chunkIndex: number;
  readonly totalChunks: number;
  readonly text: string;
  readonly wordCount: number;
  readonly terms: ReadonlyArray<[string, number]>;
  readonly heading: string;
  readonly sectionDepth: number;
  readonly fileType: string;
}

/** Backward-compatible alias for the older "collection" naming. */
export type LocalCollection = LocalLibrary;
/** Chunks with fewer tokens than this are not indexed. */
const MIN_CHUNK_WORDS = 15;
/** Hard cap on chunks produced per file. */
const MAX_CHUNKS_PER_FILE = 300;
/** Persisted-index schema version; loadIndex rejects any other version. */
const PERSIST_VERSION = 2;
/** BM25 parameters */
const BM25_K1 = 1.4; // term-frequency saturation
const BM25_B = 0.75; // document-length normalization strength
/** N-gram size for fuzzy matching */
const NGRAM_SIZE = 3;
/** Weight of the n-gram score relative to BM25 in the hybrid total. */
const NGRAM_WEIGHT = 0.2;
/** Priority sort order (lower = more preferred) */
const PRIORITY_ORDER: Record<LibraryPriority, number> = {
  proprietary: 0,
  internal: 1,
  reference: 2,
  general: 3,
};
/** Worker role → library tag mapping for auto-routing. */
const ROLE_TAG_MAP: Readonly<Record<string, ReadonlyArray<LibraryTag>>> = {
  academic: ["academic", "technical"],
  regulatory: ["legal", "policy"],
  technical: ["technical", "code"],
  statistical: ["financial", "reports"],
  primary: ["reports", "general"],
  depth: ["technical", "academic"],
  breadth: ["general", "reports"],
  recency: ["general", "reports"],
  critical: ["academic", "policy"],
  comparative: ["reports", "general"],
};
/**
 * File extensions the indexer will ingest. Extensions are compared after
 * toLowerCase() at every call site, so the ".R" entry is redundant with
 * ".r" but kept to preserve the original set contents.
 */
const SUPPORTED_EXTENSIONS = new Set(
  (
    ".pdf .txt .md .markdown .rst .org .html .htm .xhtml .csv .tsv .json " +
    ".jsonl .xml .log .yaml .yml .ini .cfg .conf .tex .bib .py .js .ts " +
    ".java .c .cpp .h .hpp .rs .go .rb .php .swift .kt .scala .sh .bash " +
    ".zsh .ps1 .sql .r .R .css .scss .less .ipynb .toml .env .properties"
  ).split(" "),
);
/**
 * Common English function words excluded from both indexing and queries.
 * Kept in a Set for O(1) membership tests during tokenization.
 */
const STOP_WORDS = new Set(
  (
    "the a an is in of and or for to how what why when does with from that " +
    "this these those would should could which about their its are was were " +
    "been being have has had having do did doing will shall may might can " +
    "must not no nor but if then else so than too very just only also more " +
    "most some any each every all both few many much such own same other " +
    "into over after before between under above below up down out off on at " +
    "by as be it he she we they me him her us them my your his our you i"
  ).split(" "),
);

/**
 * Lowercase `text`, replace punctuation with spaces (hyphen and underscore
 * survive inside words), and return tokens longer than two characters that
 * are not stop words.
 */
function tokenize(text: string): string[] {
  const normalized = text.toLowerCase().replace(/[^a-z0-9\s\-_]/g, " ");
  const words = normalized.split(/\s+/);
  return words.filter((word) => word.length > 2 && !STOP_WORDS.has(word));
}

/**
 * Tiny heuristic stemmer: strips a handful of common English suffixes and
 * collapses a doubled final letter (e.g. "running" → "run"). Deliberately
 * not a full Porter stemmer — just enough to improve recall.
 */
function stemSimple(word: string): string {
  const deSuffixed = word
    .replace(/ies$/, "y")
    .replace(/ves$/, "f")
    .replace(/(s|ed|ing|tion|ment|ness|able|ible)$/, "");
  return deSuffixed.replace(/(.)\1+$/, "$1");
}

/**
 * Tokenize `text` and append each token's stem (when it differs from the
 * token and is longer than two characters) so queries match either form.
 */
function tokenizeWithStems(text: string): string[] {
  const expanded: string[] = [];
  for (const token of tokenize(text)) {
    expanded.push(token);
    const stem = stemSimple(token);
    if (stem !== token && stem.length > 2) {
      expanded.push(stem);
    }
  }
  return expanded;
}
/** Count occurrences of each token in `tokens`. */
function computeTermFrequencies(tokens: string[]): Map<string, number> {
  const counts = new Map<string, number>();
  for (const t of tokens) {
    const prev = counts.get(t);
    counts.set(t, prev === undefined ? 1 : prev + 1);
  }
  return counts;
}
/**
 * Build the set of NGRAM_SIZE-character grams from the lowercased
 * alphanumeric characters of `text`, used for fuzzy similarity.
 */
function buildNgramSet(text: string): Set<string> {
  const normalized = text.toLowerCase().replace(/[^a-z0-9]/g, "");
  const grams = new Set<string>();
  const lastStart = normalized.length - NGRAM_SIZE;
  for (let start = 0; start <= lastStart; start++) {
    grams.add(normalized.slice(start, start + NGRAM_SIZE));
  }
  return grams;
}
/**
 * Overlap coefficient between two n-gram sets: |A ∩ B| / min(|A|, |B|),
 * or 0 when either set is empty. Iterating the smaller set is an
 * equivalent but cheaper way to count the intersection.
 */
function ngramSimilarity(
  a: ReadonlySet<string>,
  b: ReadonlySet<string>,
): number {
  if (a.size === 0 || b.size === 0) return 0;
  const [small, large] = a.size <= b.size ? [a, b] : [b, a];
  let shared = 0;
  for (const gram of small) {
    if (large.has(gram)) shared++;
  }
  return shared / small.size;
}
/** Intermediate chunking result: text plus its detected section heading. */
interface ChunkMeta {
  text: string;
  heading: string;
  sectionDepth: number;
}

/**
 * Heading detectors tried in order against a chunk's first line. Covers
 * Markdown headings, ALL-CAPS titles, numbered sections, "Chapter" /
 * "Section" prefixes, and common paper section names.
 */
const HEADING_PATTERNS: ReadonlyArray<{ regex: RegExp; depth: number }> = [
  { regex: /^#{1}\s+(.+)$/m, depth: 1 },
  { regex: /^#{2}\s+(.+)$/m, depth: 2 },
  { regex: /^#{3,6}\s+(.+)$/m, depth: 3 },
  { regex: /^([A-Z][A-Z\s]{5,60})$/m, depth: 2 },
  { regex: /^(\d+\.[\d.]*\s+.{5,80})$/m, depth: 2 },
  { regex: /^(Chapter\s+\d+[.:]\s*.+)$/im, depth: 1 },
  { regex: /^(Section\s+\d+[.:]\s*.+)$/im, depth: 2 },
  {
    regex:
      /^(Abstract|Introduction|Conclusion|References|Methods|Results|Discussion)\b/im,
    depth: 1,
  },
];

/**
 * Inspect the first line of `text` and return its heading (trimmed, capped
 * at 120 chars) and depth, or { heading: "", depth: 0 } when the first line
 * matches no known heading pattern.
 */
function detectHeading(text: string): { heading: string; depth: number } {
  const firstLine = text.split("\n")[0]?.trim() ?? "";
  // Patterns are stateless (no /g flag), so test-then-exec is safe.
  const matched = HEADING_PATTERNS.find(({ regex }) => regex.test(firstLine));
  if (!matched) {
    return { heading: "", depth: 0 };
  }
  const captured = matched.regex.exec(firstLine);
  const headingText = (captured?.[1] ?? firstLine).trim().slice(0, 120);
  return { heading: headingText, depth: matched.depth };
}
/**
 * Section-aware text splitter. Splits on likely section boundaries
 * (Markdown headings, numbered sections, ALL-CAPS lines), accumulates
 * sections into a buffer, and flushes through pushChunks when a new heading
 * begins or the buffer exceeds ~1.5× chunkSize. Consecutive window chunks
 * overlap by 12% of chunkSize. Returns at most MAX_CHUNKS_PER_FILE chunks.
 */
function smartChunkText(text: string, chunkSize: number): ChunkMeta[] {
  const overlap = Math.round(chunkSize * 0.12);
  const chunks: ChunkMeta[] = [];
  // Lookahead split keeps each heading line attached to its own section.
  const sectionBreaks = text.split(/\n(?=#{1,6}\s|\d+\.\s|[A-Z]{5,}\n)/);
  let buffer = "";
  let currentHeading = "";
  let currentDepth = 0;
  for (const section of sectionBreaks) {
    const { heading: newHeading, depth: newDepth } = detectHeading(section);
    if (newHeading) {
      // A new section starts: flush the previous buffer (if substantial)
      // under the previous heading, then restart the buffer.
      // NOTE(review): buffers of ≤100 chars are silently discarded here.
      if (buffer.trim().length > 100) {
        pushChunks(
          buffer,
          currentHeading,
          currentDepth,
          chunkSize,
          overlap,
          chunks,
        );
      }
      buffer = section;
      currentHeading = newHeading;
      currentDepth = newDepth;
    } else {
      buffer += "\n" + section;
    }
    // Flush oversized buffers eagerly so a single huge section still chunks.
    if (buffer.length > chunkSize * 1.5) {
      pushChunks(
        buffer,
        currentHeading,
        currentDepth,
        chunkSize,
        overlap,
        chunks,
      );
      buffer = "";
    }
  }
  // Flush whatever remains after the last section.
  if (buffer.trim().length > 50) {
    pushChunks(
      buffer,
      currentHeading,
      currentDepth,
      chunkSize,
      overlap,
      chunks,
    );
  }
  // Fallback: if nothing survived the length filters, chunk the raw text.
  if (chunks.length === 0 && text.trim().length > 50) {
    pushChunks(text, "", 0, chunkSize, overlap, chunks);
  }
  return chunks.slice(0, MAX_CHUNKS_PER_FILE);
}
/**
 * Append `text` to `out` as one or more ChunkMeta entries.
 * Short texts (≤1.3× chunkSize) become a single chunk; longer texts are
 * sliced with a sliding window advancing by `chunkSize - overlap`,
 * preferring to cut at a paragraph break or sentence end found past 30%
 * of the window. Fragments of ≤50 chars are dropped. Stops once `out`
 * reaches MAX_CHUNKS_PER_FILE.
 */
function pushChunks(
  text: string,
  heading: string,
  depth: number,
  chunkSize: number,
  overlap: number,
  out: ChunkMeta[],
): void {
  if (text.length <= chunkSize * 1.3) {
    const trimmed = text.trim();
    if (trimmed.length > 50) {
      out.push({ text: trimmed, heading, sectionDepth: depth });
    }
    return;
  }
  let offset = 0;
  while (offset < text.length && out.length < MAX_CHUNKS_PER_FILE) {
    const end = Math.min(offset + chunkSize, text.length);
    let slice = text.slice(offset, end);
    if (end < text.length) {
      // Prefer ending the slice at a paragraph break or sentence end.
      const lastBreak = Math.max(
        slice.lastIndexOf("\n\n"),
        slice.lastIndexOf(". "),
        slice.lastIndexOf(".\n"),
      );
      if (lastBreak > chunkSize * 0.3) {
        slice = slice.slice(0, lastBreak + 1);
      }
    }
    const trimmed = slice.trim();
    if (trimmed.length > 50) {
      // Chunks after the first may carry a heading of their own.
      const chunkHeading =
        offset === 0 ? heading : detectHeading(trimmed).heading || heading;
      out.push({ text: trimmed, heading: chunkHeading, sectionDepth: depth });
    }
    // Advance by at least 1 to guarantee the loop terminates.
    offset += Math.max(slice.length - overlap, 1);
  }
}
/**
 * Convert an HTML document to plain text: removes <script>/<style> blocks,
 * strips all remaining tags, decodes the most common HTML entities, and
 * collapses whitespace.
 *
 * Bug fix: the previous entity replacements were no-ops (e.g. replacing
 * "&" with "&"), so entities such as &amp; were left verbatim in the
 * indexed text. The entity patterns are now real, and &amp; is decoded
 * LAST so "&amp;lt;" yields "&lt;" rather than "<".
 */
function stripHtmlTags(html: string): string {
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;/gi, " ")
    .replace(/&lt;/gi, "<")
    .replace(/&gt;/gi, ">")
    .replace(/&quot;/gi, '"')
    .replace(/&#0?39;|&apos;/gi, "'")
    .replace(/&amp;/gi, "&")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Normalize raw PDF-extracted text: unify newlines, drop "-- N of M --"
 * page markers and lines that are only a page number, rejoin words
 * hyphenated across line breaks, and collapse runs of spaces/blank lines.
 * Rules are applied in order; order matters (e.g. CRLF first).
 */
function cleanPdfText(raw: string): string {
  const rules: ReadonlyArray<[RegExp, string]> = [
    [/\r\n/g, "\n"],
    [/\r/g, "\n"],
    [/^--\s*\d+\s*of\s*\d+\s*--\s*$/gm, ""],
    [/(\w)-\n(\w)/g, "$1$2"],
    [/[ \t]+/g, " "],
    [/\n{3,}/g, "\n\n"],
    [/^\s*\d{1,4}\s*$/gm, ""],
  ];
  let text = raw;
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
/**
 * Read a file and return its plain-text content, or null when the file
 * cannot be read or parsed. Format handling:
 * - .pdf   → pdf-parse when available (otherwise null), then cleanPdfText
 * - .ipynb → notebook cells flattened to "[cell_type]\n<source>" blocks
 * - .html / .htm / .xhtml → stripHtmlTags
 * - .json  → pretty-printed (falls back to raw text on parse failure)
 * - everything else → raw UTF-8 text
 */
async function readFileAsText(filePath: string): Promise<string | null> {
  try {
    const ext = path.extname(filePath).toLowerCase();
    if (ext === ".pdf") {
      // No pdf-parse module loaded → PDFs are silently skipped.
      if (!PDFParseRef) return null;
      try {
        const buffer = fs.readFileSync(filePath);
        const data = new Uint8Array(buffer);
        const parser = new PDFParseRef({ data } as any);
        const result = await parser.getText({
          lineEnforce: true,
          lineThreshold: 5,
        });
        await parser.destroy();
        return cleanPdfText(result.text || "");
      } catch {
        return null;
      }
    }
    if (ext === ".ipynb") {
      try {
        const raw = fs.readFileSync(filePath, "utf-8");
        const notebook = JSON.parse(raw);
        // Cell source may be a string or an array of lines.
        return (notebook.cells ?? [])
          .map((c: any) => {
            const src = Array.isArray(c.source)
              ? c.source.join("")
              : (c.source ?? "");
            return `[${c.cell_type}]\n${src}`;
          })
          .join("\n\n");
      } catch {
        return null;
      }
    }
    const raw = fs.readFileSync(filePath, "utf-8");
    if (ext === ".html" || ext === ".htm" || ext === ".xhtml")
      return stripHtmlTags(raw);
    if (ext === ".json") {
      try {
        return JSON.stringify(JSON.parse(raw), null, 2);
      } catch {
        return raw;
      }
    }
    return raw;
  } catch {
    return null;
  }
}
/**
 * Recursively collect supported document files under `dirPath`.
 * Skips dot-entries, node_modules and __pycache__, limits recursion to 10
 * levels, and silently ignores unreadable directories. Extensionless files
 * with short names (<30 chars) are also accepted. Traversal order matches
 * readdir order, depth-first.
 */
function scanDirectory(dirPath: string): string[] {
  const collected: string[] = [];
  const visit = (current: string, depth: number): void => {
    if (depth > 10) return;
    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(current, { withFileTypes: true });
    } catch {
      return; // unreadable directory — skip silently
    }
    for (const entry of entries) {
      const name = entry.name;
      if (name.startsWith(".")) continue;
      if (name === "node_modules" || name === "__pycache__") continue;
      const fullPath = path.join(current, name);
      if (entry.isDirectory()) {
        visit(fullPath, depth + 1);
        continue;
      }
      if (!entry.isFile()) continue;
      const ext = path.extname(name).toLowerCase();
      if (SUPPORTED_EXTENSIONS.has(ext) || (!ext && name.length < 30)) {
        collected.push(fullPath);
      }
    }
  };
  visit(dirPath, 0);
  return collected;
}
/**
 * Cheap change-detection fingerprint for a file: SHA-256 over path, size
 * and mtime (NOT the file contents, despite the name), truncated to 16 hex
 * chars. Falls back to a random id when the file cannot be stat'ed, which
 * guarantees a "changed" comparison result.
 */
function fileContentHash(filePath: string): string {
  let stat: fs.Stats;
  try {
    stat = fs.statSync(filePath);
  } catch {
    return crypto.randomUUID().slice(0, 16);
  }
  const fingerprint = [filePath, stat.size, stat.mtimeMs].join(":");
  return crypto
    .createHash("sha256")
    .update(fingerprint)
    .digest("hex")
    .slice(0, 16);
}
/**
 * Lightweight BM25 ranking index over the chunk corpus. rebuild() scans
 * all chunks to compute document frequencies and the average document
 * length; score() then ranks a chunk against query term frequencies,
 * applying multiplicative boosts when a query term appears in the chunk's
 * heading (×1.3) or file name (×1.15).
 */
class BM25Index {
  private avgDl = 0;
  private docCount = 0;
  private readonly df = new Map<string, number>();

  /** Recompute document frequencies and average length from scratch. */
  rebuild(chunks: ReadonlyMap<string, DocumentChunk>): void {
    this.df.clear();
    this.docCount = chunks.size;
    let wordTotal = 0;
    for (const c of chunks.values()) {
      wordTotal += c.wordCount;
      for (const term of c.terms.keys()) {
        this.df.set(term, (this.df.get(term) ?? 0) + 1);
      }
    }
    // Guard against division by zero on an empty corpus.
    this.avgDl = this.docCount > 0 ? wordTotal / this.docCount : 1;
  }

  /** BM25 relevance of `chunk` for the given query term frequencies. */
  score(queryTerms: Map<string, number>, chunk: DocumentChunk): number {
    const docLen = chunk.wordCount;
    let relevance = 0;
    for (const [term, queryFreq] of queryTerms) {
      const termFreq = chunk.terms.get(term) ?? 0;
      if (termFreq === 0) continue;
      const docsWithTerm = this.df.get(term) ?? 0;
      // Non-negative IDF variant: ln(1 + (N - n + 0.5) / (n + 0.5)).
      const idf = Math.log(
        1 + (this.docCount - docsWithTerm + 0.5) / (docsWithTerm + 0.5),
      );
      const lengthNorm = 1 - BM25_B + BM25_B * (docLen / this.avgDl);
      const tfComponent =
        (termFreq * (BM25_K1 + 1)) / (termFreq + BM25_K1 * lengthNorm);
      relevance += idf * tfComponent * queryFreq;
    }
    return this.applyFieldBoosts(relevance, queryTerms, chunk);
  }

  /** Boost the score when any query term occurs in heading or file name. */
  private applyFieldBoosts(
    base: number,
    queryTerms: Map<string, number>,
    chunk: DocumentChunk,
  ): number {
    let boosted = base;
    if (chunk.heading) {
      const heading = chunk.heading.toLowerCase();
      for (const term of queryTerms.keys()) {
        if (heading.includes(term)) {
          boosted *= 1.3;
          break;
        }
      }
    }
    const name = chunk.fileName.toLowerCase();
    for (const term of queryTerms.keys()) {
      if (name.includes(term)) {
        boosted *= 1.15;
        break;
      }
    }
    return boosted;
  }
}
/**
 * In-memory document store over one or more indexed folder "libraries".
 * Search is hybrid: BM25 keyword ranking blended with character-trigram
 * fuzzy matching, with optional boosts for higher-priority libraries.
 * The whole index can be persisted to / restored from a single JSON file.
 *
 * The "collection" methods are thin aliases over the library API, kept
 * for backward compatibility with older callers.
 */
export class LocalDocumentStore {
  private readonly libraries = new Map<string, LocalLibrary>();
  // All chunks across all libraries, keyed by chunk id.
  private readonly chunks = new Map<string, DocumentChunk>();
  // libraryId → ids of the chunks belonging to that library.
  private readonly libraryChunks = new Map<string, Set<string>>();
  private readonly bm25 = new BM25Index();

  /** Alias of getLibraries() (legacy "collection" API). */
  getCollections(): ReadonlyArray<LocalLibrary> {
    return this.getLibraries();
  }

  /** Alias of getLibrary() (legacy "collection" API). */
  getCollection(id: string): LocalLibrary | undefined {
    return this.getLibrary(id);
  }

  /** Alias of hasLibraries() (legacy "collection" API). */
  hasCollections(): boolean {
    return this.libraries.size > 0;
  }

  /** Legacy wrapper: index a folder with default priority/tags ("general"). */
  async indexCollection(
    name: string,
    folderPath: string,
    chunkSize: number,
    onProgress?: (message: string) => void,
  ): Promise<LocalLibrary> {
    return this.indexLibrary(
      name,
      folderPath,
      "",
      "general",
      ["general"],
      chunkSize,
      onProgress,
    );
  }

  /** Alias of removeLibrary() (legacy "collection" API). */
  removeCollection(id: string): boolean {
    return this.removeLibrary(id);
  }

  /** All libraries, sorted most-preferred priority first. */
  getLibraries(): ReadonlyArray<LocalLibrary> {
    return Array.from(this.libraries.values()).sort(
      (a, b) => PRIORITY_ORDER[a.priority] - PRIORITY_ORDER[b.priority],
    );
  }

  getLibrary(id: string): LocalLibrary | undefined {
    return this.libraries.get(id);
  }

  /** Case-insensitive lookup by library name. */
  getLibraryByName(name: string): LocalLibrary | undefined {
    const lower = name.toLowerCase();
    for (const lib of this.libraries.values()) {
      if (lib.name.toLowerCase() === lower) return lib;
    }
    return undefined;
  }

  hasLibraries(): boolean {
    return this.libraries.size > 0;
  }

  /** Libraries carrying the given tag, in priority order. */
  findLibrariesByTag(tag: LibraryTag): ReadonlyArray<LocalLibrary> {
    return this.getLibraries().filter((lib) => lib.tags.includes(tag));
  }

  /**
   * Library ids suited to a worker role, resolved via ROLE_TAG_MAP and
   * sorted by priority. Falls back to ALL library ids when no library
   * carries a matching tag.
   */
  findLibrariesForRole(role: string): ReadonlyArray<string> {
    const roleTags = ROLE_TAG_MAP[role] ?? ["general"];
    const matching: LocalLibrary[] = [];
    for (const lib of this.libraries.values()) {
      if (lib.tags.some((t) => roleTags.includes(t))) {
        matching.push(lib);
      }
    }
    matching.sort(
      (a, b) => PRIORITY_ORDER[a.priority] - PRIORITY_ORDER[b.priority],
    );
    if (matching.length === 0) return Array.from(this.libraries.keys());
    return matching.map((lib) => lib.id);
  }

  /**
   * Index (or re-index) a folder as a library. An existing library with
   * the same resolved folder path is replaced. Each file is read via
   * readFileAsText, split with smartChunkText, tokenized (tokens + stems)
   * and added to the BM25 index.
   *
   * @param onProgress optional callback for human-readable progress messages
   * @throws Error when the folder does not exist or is not a directory
   */
  async indexLibrary(
    name: string,
    folderPath: string,
    description: string = "",
    priority: LibraryPriority = "general",
    tags: LibraryTag[] = ["general"],
    chunkSize: number = 4000,
    onProgress?: (message: string) => void,
  ): Promise<LocalLibrary> {
    const resolvedPath = path.resolve(folderPath);
    if (!fs.existsSync(resolvedPath)) {
      throw new Error(`Folder not found: ${resolvedPath}`);
    }
    if (!fs.statSync(resolvedPath).isDirectory()) {
      throw new Error(`Path is not a directory: ${resolvedPath}`);
    }
    // Re-indexing the same folder replaces the previous library.
    const existingId = Array.from(this.libraries.values()).find(
      (c) => c.folderPath === resolvedPath,
    )?.id;
    if (existingId) this.removeLibrary(existingId);
    const libraryId = crypto.randomUUID();
    const chunkIds = new Set<string>();
    const fileMetadatas: FileMetadata[] = [];
    onProgress?.(`Scanning ${resolvedPath} for documents…`);
    const files = scanDirectory(resolvedPath);
    onProgress?.(`Found ${files.length} supported files`);
    let totalWords = 0;
    let indexedFiles = 0;
    for (const filePath of files) {
      const text = await readFileAsText(filePath);
      // Skip unreadable or near-empty files.
      if (!text || text.trim().length < 50) continue;
      const textChunks = smartChunkText(text, chunkSize);
      const fileName = path.basename(filePath);
      const fileRelPath = path.relative(resolvedPath, filePath);
      const ext = path.extname(filePath).toLowerCase();
      const contentHash = fileContentHash(filePath);
      let fileStat: fs.Stats;
      try {
        fileStat = fs.statSync(filePath);
      } catch {
        continue;
      }
      let fileWordCount = 0;
      for (let ci = 0; ci < textChunks.length; ci++) {
        const chunkMeta = textChunks[ci];
        const tokens = tokenizeWithStems(chunkMeta.text);
        // Tiny chunks are not worth indexing.
        if (tokens.length < MIN_CHUNK_WORDS) continue;
        const chunkId = `${libraryId}:${indexedFiles}:${ci}`;
        const terms = computeTermFrequencies(tokens);
        // Trigrams over the chunk head only, to bound memory per chunk.
        const ngramSet = buildNgramSet(chunkMeta.text.slice(0, 500));
        const chunk: DocumentChunk = {
          id: chunkId,
          libraryId,
          filePath,
          fileName,
          fileRelPath,
          chunkIndex: ci,
          totalChunks: textChunks.length,
          text: chunkMeta.text,
          wordCount: tokens.length,
          terms,
          heading: chunkMeta.heading,
          sectionDepth: chunkMeta.sectionDepth,
          fileType: ext || "unknown",
          ngramSet,
        };
        this.chunks.set(chunkId, chunk);
        chunkIds.add(chunkId);
        totalWords += tokens.length;
        fileWordCount += tokens.length;
      }
      fileMetadatas.push({
        filePath,
        fileName,
        fileRelPath,
        fileType: ext || "unknown",
        sizeBytes: fileStat.size,
        modifiedAt: fileStat.mtime.toISOString(),
        chunkCount: textChunks.length,
        wordCount: fileWordCount,
        tags: inferFileTags(ext, fileName),
        contentHash,
      });
      indexedFiles++;
      if (indexedFiles % 50 === 0) {
        onProgress?.(`Indexed ${indexedFiles}/${files.length} files…`);
      }
    }
    // Rebuild global BM25 statistics once, after all chunks are in.
    this.bm25.rebuild(this.chunks);
    const library: LocalLibrary = {
      id: libraryId,
      name,
      folderPath: resolvedPath,
      description,
      priority,
      tags,
      fileCount: indexedFiles,
      chunkCount: chunkIds.size,
      totalWords,
      indexedAt: new Date().toISOString(),
      files: fileMetadatas,
    };
    this.libraries.set(libraryId, library);
    this.libraryChunks.set(libraryId, chunkIds);
    onProgress?.(
      `Library "${name}" ready: ${indexedFiles} files, ${chunkIds.size} chunks, ` +
        `~${totalWords.toLocaleString()} words [${priority}]`,
    );
    return library;
  }

  /** Remove a library and its chunks; returns false for an unknown id. */
  removeLibrary(id: string): boolean {
    const chunkIds = this.libraryChunks.get(id);
    if (!chunkIds) return false;
    for (const chunkId of chunkIds) this.chunks.delete(chunkId);
    this.libraryChunks.delete(id);
    this.libraries.delete(id);
    // BM25 statistics depend on the whole corpus, so rebuild after removal.
    this.bm25.rebuild(this.chunks);
    return true;
  }

  /** Update library metadata without re-indexing. */
  updateLibraryMeta(
    id: string,
    updates: {
      name?: string;
      description?: string;
      priority?: LibraryPriority;
      tags?: LibraryTag[];
    },
  ): LocalLibrary | null {
    const lib = this.libraries.get(id);
    if (!lib) return null;
    const updated: LocalLibrary = {
      ...lib,
      name: updates.name ?? lib.name,
      description: updates.description ?? lib.description,
      priority: updates.priority ?? lib.priority,
      tags: updates.tags ?? lib.tags,
    };
    this.libraries.set(id, updated);
    return updated;
  }

  /**
   * Unscored dump of up to `maxResults` chunks (optionally restricted to
   * the given libraries), in internal map order. All scores are 0.
   */
  listAll(
    maxResults: number = 100,
    collectionIds?: ReadonlyArray<string>,
  ): ReadonlyArray<LocalSearchHit> {
    const targetLibs = collectionIds ? new Set(collectionIds) : undefined;
    const results: LocalSearchHit[] = [];
    for (const chunk of this.chunks.values()) {
      if (results.length >= maxResults) break;
      if (targetLibs && !targetLibs.has(chunk.libraryId)) continue;
      const lib = this.libraries.get(chunk.libraryId);
      results.push(this.makeHit(chunk, 0, 0, 0, lib));
    }
    return results;
  }

  /** Alias of searchLibraries() (legacy "collection" API). */
  search(
    query: string,
    maxResults: number = 10,
    collectionIds?: ReadonlyArray<string>,
  ): ReadonlyArray<LocalSearchHit> {
    return this.searchLibraries(query, maxResults, collectionIds);
  }

  /**
   * Hybrid search: BM25 over tokens+stems blended with trigram similarity
   * (weighted by NGRAM_WEIGHT). Chunks with no keyword match can still
   * surface on n-gram similarity alone (threshold 0.15). When
   * `priorityBoost` is set, totals are multiplied by up to ×1.24 for
   * proprietary-tier libraries. Results are sorted by total score and
   * deduplicated by (filePath, chunkIndex).
   */
  searchLibraries(
    query: string,
    maxResults: number = 10,
    libraryIds?: ReadonlyArray<string>,
    priorityBoost: boolean = true,
  ): ReadonlyArray<LocalSearchHit> {
    const queryTokens = tokenizeWithStems(query);
    if (queryTokens.length === 0) return [];
    const queryTerms = computeTermFrequencies(queryTokens);
    const queryNgrams = buildNgramSet(query);
    const targetLibs = libraryIds ? new Set(libraryIds) : undefined;
    const scored: Array<{
      chunk: DocumentChunk;
      bm25: number;
      ngram: number;
      total: number;
    }> = [];
    for (const chunk of this.chunks.values()) {
      if (targetLibs && !targetLibs.has(chunk.libraryId)) continue;
      const bm25 = this.bm25.score(queryTerms, chunk);
      if (bm25 <= 0) {
        // No keyword match: fall back to fuzzy n-gram similarity alone.
        const ngram = ngramSimilarity(queryNgrams, chunk.ngramSet);
        if (ngram < 0.15) continue;
        scored.push({ chunk, bm25: 0, ngram, total: ngram * NGRAM_WEIGHT });
        continue;
      }
      const ngram = ngramSimilarity(queryNgrams, chunk.ngramSet);
      let total = bm25 + ngram * NGRAM_WEIGHT;
      if (priorityBoost) {
        // ×1.24 proprietary, ×1.16 internal, ×1.08 reference, ×1.0 general.
        const lib = this.libraries.get(chunk.libraryId);
        if (lib) {
          total *= 1 + (3 - PRIORITY_ORDER[lib.priority]) * 0.08;
        }
      }
      scored.push({ chunk, bm25, ngram, total });
    }
    scored.sort((a, b) => b.total - a.total);
    // Deduplicate by (file, chunk index), keeping the best-scored hit.
    const seen = new Set<string>();
    const results: LocalSearchHit[] = [];
    for (const { chunk, bm25, ngram, total } of scored) {
      if (results.length >= maxResults) break;
      const dedupeKey = `${chunk.filePath}:${chunk.chunkIndex}`;
      if (seen.has(dedupeKey)) continue;
      seen.add(dedupeKey);
      const lib = this.libraries.get(chunk.libraryId);
      results.push(this.makeHit(chunk, total, bm25, ngram, lib));
    }
    return results;
  }

  /**
   * Search the libraries appropriate for a worker role. An explicit
   * role → library-id map takes precedence over tag-based routing.
   */
  searchByRole(
    query: string,
    role: string,
    maxResults: number = 8,
    roleCollectionMap?: ReadonlyMap<string, ReadonlyArray<string>>,
  ): ReadonlyArray<LocalSearchHit> {
    const targetIds =
      roleCollectionMap?.get(role) ?? this.findLibrariesForRole(role);
    return this.searchLibraries(query, maxResults, targetIds, true);
  }

  /**
   * Progressive search: searches libraries in priority order,
   * stopping early if enough high-quality results are found.
   * Priority boosting is disabled per tier (tiers are already separated);
   * the combined result list is re-sorted by score before truncation.
   */
  searchProgressive(
    query: string,
    maxResults: number = 10,
    minResultsPerTier: number = 3,
  ): ReadonlyArray<LocalSearchHit> {
    const allResults: LocalSearchHit[] = [];
    const tiers: LibraryPriority[] = [
      "proprietary",
      "internal",
      "reference",
      "general",
    ];
    for (const tier of tiers) {
      const tierLibs = Array.from(this.libraries.values())
        .filter((lib) => lib.priority === tier)
        .map((lib) => lib.id);
      if (tierLibs.length === 0) continue;
      const remaining = maxResults - allResults.length;
      if (remaining <= 0) break;
      const tierResults = this.searchLibraries(
        query,
        Math.max(remaining, minResultsPerTier),
        tierLibs,
        false,
      );
      allResults.push(...tierResults);
      if (allResults.length >= maxResults) break;
    }
    allResults.sort((a, b) => b.score - a.score);
    return allResults.slice(0, maxResults);
  }

  /**
   * Fetch a chunk plus its same-file neighbors within `windowSize` chunk
   * indices, sorted by position. Returns null for an unknown chunk id.
   */
  getChunkWithContext(
    chunkId: string,
    windowSize: number = 1,
  ): { chunk: DocumentChunk; context: ReadonlyArray<DocumentChunk> } | null {
    const chunk = this.chunks.get(chunkId);
    if (!chunk) return null;
    const context: DocumentChunk[] = [];
    const libChunks = this.libraryChunks.get(chunk.libraryId);
    if (!libChunks) return { chunk, context };
    for (const sibId of libChunks) {
      const sib = this.chunks.get(sibId);
      if (!sib || sib.filePath !== chunk.filePath || sib.id === chunk.id)
        continue;
      if (Math.abs(sib.chunkIndex - chunk.chunkIndex) <= windowSize) {
        context.push(sib);
      }
    }
    context.sort((a, b) => a.chunkIndex - b.chunkIndex);
    return { chunk, context };
  }

  /**
   * Write the whole index (libraries + chunks) to `filePath` as JSON,
   * creating parent directories as needed. ngramSet is not persisted —
   * it is rebuilt from chunk text in loadIndex.
   */
  saveIndex(filePath: string): void {
    const persistedLibraries: PersistedLibrary[] = [];
    for (const [libId, lib] of this.libraries) {
      const libChunkIds = this.libraryChunks.get(libId);
      if (!libChunkIds) continue;
      const persistedChunks: PersistedChunk[] = [];
      for (const chunkId of libChunkIds) {
        const chunk = this.chunks.get(chunkId);
        if (!chunk) continue;
        persistedChunks.push({
          id: chunk.id,
          libraryId: chunk.libraryId,
          filePath: chunk.filePath,
          fileName: chunk.fileName,
          fileRelPath: chunk.fileRelPath,
          chunkIndex: chunk.chunkIndex,
          totalChunks: chunk.totalChunks,
          text: chunk.text,
          wordCount: chunk.wordCount,
          terms: Array.from(chunk.terms.entries()),
          heading: chunk.heading,
          sectionDepth: chunk.sectionDepth,
          fileType: chunk.fileType,
        });
      }
      persistedLibraries.push({
        library: { ...lib, files: [...lib.files] },
        chunks: persistedChunks,
      });
    }
    const index: PersistedIndex = {
      version: PERSIST_VERSION,
      savedAt: new Date().toISOString(),
      libraries: persistedLibraries,
    };
    const dir = path.dirname(filePath);
    if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
    fs.writeFileSync(filePath, JSON.stringify(index), "utf-8");
  }

  /**
   * Restore libraries from a JSON file produced by saveIndex. The whole
   * file is skipped on a version mismatch; individual libraries are
   * skipped when their source folder no longer exists. Returns counts of
   * loaded and skipped libraries.
   */
  loadIndex(filePath: string): { loaded: number; skipped: number } {
    if (!fs.existsSync(filePath)) return { loaded: 0, skipped: 0 };
    const raw = fs.readFileSync(filePath, "utf-8");
    const index: PersistedIndex = JSON.parse(raw);
    if (index.version !== PERSIST_VERSION) {
      return { loaded: 0, skipped: index.libraries?.length ?? 0 };
    }
    let loaded = 0;
    let skipped = 0;
    for (const persisted of index.libraries) {
      const lib = persisted.library;
      if (!fs.existsSync(lib.folderPath)) {
        skipped++;
        continue;
      }
      const chunkIds = new Set<string>();
      for (const pc of persisted.chunks) {
        const terms = new Map<string, number>(pc.terms);
        // ngramSet is not persisted; rebuild it from the chunk head.
        const ngramSet = buildNgramSet(pc.text.slice(0, 500));
        const chunk: DocumentChunk = { ...pc, terms, ngramSet };
        this.chunks.set(pc.id, chunk);
        chunkIds.add(pc.id);
      }
      this.libraries.set(lib.id, lib);
      this.libraryChunks.set(lib.id, chunkIds);
      loaded++;
    }
    if (loaded > 0) this.bm25.rebuild(this.chunks);
    return { loaded, skipped };
  }

  /**
   * Compare a library's indexed file list against the folder on disk.
   * Returns relative paths of files whose fingerprint changed (modified),
   * files that disappeared (deleted), and files present on disk but not
   * in the index (added).
   * NOTE(review): files scanned but skipped at index time (e.g. too short
   * to index) will be reported as "added" on every call.
   */
  checkForChanges(libraryId: string): {
    modified: string[];
    deleted: string[];
    added: string[];
  } {
    const lib = this.libraries.get(libraryId);
    if (!lib) return { modified: [], deleted: [], added: [] };
    const modified: string[] = [];
    const deleted: string[] = [];
    const indexedFiles = new Set(lib.files.map((f) => f.filePath));
    for (const file of lib.files) {
      if (!fs.existsSync(file.filePath)) {
        deleted.push(file.fileRelPath);
        continue;
      }
      const currentHash = fileContentHash(file.filePath);
      if (currentHash !== file.contentHash) modified.push(file.fileRelPath);
    }
    const currentFiles = scanDirectory(lib.folderPath);
    const added = currentFiles
      .filter((f) => !indexedFiles.has(f))
      .map((f) => path.relative(lib.folderPath, f));
    return { modified, deleted, added };
  }

  /**
   * Aggregate statistics across all libraries. `collections` mirrors
   * `libraries` for the legacy API.
   */
  getStats(): {
    libraries: number;
    totalChunks: number;
    totalWords: number;
    uniqueTerms: number;
    byPriority: Record<string, number>;
    collections: number;
  } {
    let totalWords = 0;
    const byPriority: Record<string, number> = {};
    for (const lib of this.libraries.values()) {
      totalWords += lib.totalWords;
      byPriority[lib.priority] = (byPriority[lib.priority] ?? 0) + 1;
    }
    const uniqueTerms = new Set<string>();
    for (const chunk of this.chunks.values()) {
      for (const term of chunk.terms.keys()) uniqueTerms.add(term);
    }
    return {
      libraries: this.libraries.size,
      collections: this.libraries.size,
      totalChunks: this.chunks.size,
      totalWords,
      uniqueTerms: uniqueTerms.size,
      byPriority,
    };
  }

  /**
   * Build a LocalSearchHit from a chunk and its scores, attaching up to
   * 200 chars of trailing/leading text from the adjacent same-file chunks.
   */
  private makeHit(
    chunk: DocumentChunk,
    total: number,
    bm25: number,
    ngram: number,
    lib: LocalLibrary | undefined,
  ): LocalSearchHit {
    let contextBefore = "";
    let contextAfter = "";
    const ctxResult = this.getChunkWithContext(chunk.id, 1);
    if (ctxResult) {
      const before = ctxResult.context.find(
        (c) => c.chunkIndex === chunk.chunkIndex - 1,
      );
      const after = ctxResult.context.find(
        (c) => c.chunkIndex === chunk.chunkIndex + 1,
      );
      if (before) contextBefore = before.text.slice(-200);
      if (after) contextAfter = after.text.slice(0, 200);
    }
    return {
      chunkId: chunk.id,
      libraryId: chunk.libraryId,
      libraryName: lib?.name ?? "unknown",
      libraryPriority: lib?.priority ?? "general",
      filePath: chunk.filePath,
      fileName: chunk.fileName,
      fileRelPath: chunk.fileRelPath,
      text: chunk.text,
      wordCount: chunk.wordCount,
      score: total,
      bm25Score: bm25,
      ngramScore: ngram,
      chunkIndex: chunk.chunkIndex,
      totalChunks: chunk.totalChunks,
      heading: chunk.heading,
      fileType: chunk.fileType,
      contextBefore,
      contextAfter,
    };
  }
}
/**
 * Derive simple content tags from a file's extension and name keywords.
 * Falls back to ["general"] when nothing matches.
 */
function inferFileTags(ext: string, fileName: string): string[] {
  const lowerName = fileName.toLowerCase();
  const tags: string[] = [];
  const extRules: ReadonlyArray<[ReadonlyArray<string>, string]> = [
    [[".py", ".js", ".ts", ".java", ".c", ".cpp", ".rs", ".go"], "code"],
    [[".tex", ".bib"], "academic"],
    [[".sql", ".csv", ".tsv"], "data"],
    [[".md", ".txt", ".rst"], "document"],
    [[".yaml", ".yml", ".json", ".xml", ".toml"], "config"],
  ];
  for (const [exts, tag] of extRules) {
    if (exts.includes(ext)) tags.push(tag);
  }
  if (["legal", "contract", "policy"].some((kw) => lowerName.includes(kw))) {
    tags.push("legal");
  }
  if (["report", "analysis"].some((kw) => lowerName.includes(kw))) {
    tags.push("report");
  }
  return tags.length > 0 ? tags : ["general"];
}
/** Lazily-created singleton store shared across the process. */
let globalStore: LocalDocumentStore | null = null;

/** Return the process-wide LocalDocumentStore, creating it on first use. */
export function getGlobalStore(): LocalDocumentStore {
  globalStore ??= new LocalDocumentStore();
  return globalStore;
}