// Forked from mindstudio/big-rag
// Project Files
// src/vectorstore/vectorStore.ts
import * as fs from "fs/promises";
import * as path from "path";
import { LocalIndex } from "vectra";
const MAX_ITEMS_PER_SHARD = 10000;
const SHARD_DIR_PREFIX = "shard_";
const SHARD_DIR_REGEX = /^shard_(\d+)$/;
export interface DocumentChunk {
id: string;
text: string;
vector: number[];
filePath: string;
fileName: string;
fileHash: string;
chunkIndex: number;
metadata: Record<string, any>;
}
export interface SearchResult {
text: string;
score: number;
filePath: string;
fileName: string;
chunkIndex: number;
shardName: string;
metadata: Record<string, any>;
}
export interface FilenameSearchResult {
filePath: string;
fileName: string;
matchedKeywords: string[];
}
type ChunkMetadata = {
text: string;
filePath: string;
fileName: string;
fileHash: string;
chunkIndex: number;
[key: string]: any;
};
/**
* Compute cosine similarity between two vectors.
*/
function cosineSimilarity(a: number[], b: number[]): number {
let dot = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dot += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
const denom = Math.sqrt(normA) * Math.sqrt(normB);
return denom === 0 ? 0 : dot / denom;
}
/**
 * Sharded on-disk vector store built on vectra LocalIndex directories.
 *
 * Chunks are written to the newest ("active") shard; once it reaches
 * MAX_ITEMS_PER_SHARD, writes rotate to a fresh shard_NNN directory.
 * Read paths open each shard on demand and drop the reference afterwards
 * so the parsed index data can be garbage-collected.
 *
 * All mutating operations are serialized through an internal promise queue
 * so concurrent callers cannot interleave vectra update transactions.
 */
export class VectorStore {
  /** Absolute root directory containing the shard_NNN subdirectories. */
  private dbPath: string;
  /** Shard directory names, sorted by shard number (oldest first). */
  private shardDirs: string[] = [];
  /** Shard currently accepting writes (last in shardDirs); null before initialize(). */
  private activeShard: LocalIndex | null = null;
  /** Approximate item count of the active shard; drives shard rotation. */
  private activeShardCount: number = 0;
  /** Tail of the serialized write queue; always kept resolved (see enqueue). */
  private updateMutex: Promise<void> = Promise.resolve();

  constructor(dbPath: string) {
    this.dbPath = path.resolve(dbPath);
  }

  /**
   * Queue a mutating task behind all previously queued tasks.
   *
   * Fix over the original inline chaining: the stored tail swallows
   * rejections so one failed write cannot poison the queue and silently
   * skip every later write. The caller still observes the failure through
   * the returned promise.
   */
  private enqueue<T>(task: () => Promise<T>): Promise<T> {
    const run = this.updateMutex.then(task);
    this.updateMutex = run.then(
      () => undefined,
      () => undefined,
    );
    return run;
  }

  /**
   * Open a shard by directory name (e.g. "shard_000"). Caller must not hold the reference
   * after use so GC can free the parsed index data.
   */
  private openShard(dir: string): LocalIndex {
    return new LocalIndex(path.join(this.dbPath, dir));
  }

  /**
   * Create shard_NNN for the given number, make it the active shard, and
   * reset the rotation counter. Shared by initialize() and shard rotation.
   */
  private async createShard(num: number): Promise<void> {
    const dir = `${SHARD_DIR_PREFIX}${String(num).padStart(3, "0")}`;
    const index = new LocalIndex(path.join(this.dbPath, dir));
    await index.createIndex({ version: 1 });
    this.shardDirs.push(dir);
    this.activeShard = index;
    this.activeShardCount = 0;
  }

  /**
   * Scan dbPath for shard_NNN directories and return them sorted by shard number.
   */
  private async discoverShardDirs(): Promise<string[]> {
    const entries = await fs.readdir(this.dbPath, { withFileTypes: true });
    const shardNumber = (name: string): number =>
      parseInt(name.match(SHARD_DIR_REGEX)![1], 10);
    return entries
      .filter((e) => e.isDirectory() && SHARD_DIR_REGEX.test(e.name))
      .map((e) => e.name)
      .sort((a, b) => shardNumber(a) - shardNumber(b));
  }

  /**
   * Map a stored item's metadata plus a score into a SearchResult.
   * Missing metadata fields degrade to empty strings / zero rather than throwing.
   */
  private toSearchResult(
    metadata: unknown,
    score: number,
    shardName: string,
  ): SearchResult {
    const m = metadata as ChunkMetadata;
    return {
      text: m?.text ?? "",
      score,
      filePath: m?.filePath ?? "",
      fileName: m?.fileName ?? "",
      chunkIndex: m?.chunkIndex ?? 0,
      shardName,
      metadata: (metadata as Record<string, any>) ?? {},
    };
  }

  /**
   * Return the subset of lowerKeywords found in the file's full path or
   * file name (case-insensitive substring match).
   */
  private matchKeywords(
    filePath: string,
    fileName: string,
    lowerKeywords: string[],
  ): string[] {
    const lowerPath = filePath.toLowerCase();
    const lowerName = fileName.toLowerCase();
    return lowerKeywords.filter(
      (kw) => lowerPath.includes(kw) || lowerName.includes(kw),
    );
  }

  /**
   * Initialize the vector store: discover or create shards, open the last as active.
   */
  async initialize(): Promise<void> {
    await fs.mkdir(this.dbPath, { recursive: true });
    this.shardDirs = await this.discoverShardDirs();
    if (this.shardDirs.length === 0) {
      await this.createShard(0);
    } else {
      const lastDir = this.shardDirs[this.shardDirs.length - 1];
      this.activeShard = this.openShard(lastDir);
      this.activeShardCount = (await this.activeShard.listItems()).length;
    }
    console.log("Vector store initialized successfully");
  }

  /**
   * Add document chunks to the active shard. Rotates to a new shard when full.
   * Writes are serialized behind any in-flight mutation.
   */
  async addChunks(chunks: DocumentChunk[]): Promise<void> {
    if (!this.activeShard) {
      throw new Error("Vector store not initialized");
    }
    if (chunks.length === 0) return;
    return this.enqueue(async () => {
      await this.activeShard!.beginUpdate();
      try {
        for (const chunk of chunks) {
          // Spread caller metadata FIRST so it can never clobber the
          // reserved fields every search/delete path depends on.
          const metadata: ChunkMetadata = {
            ...chunk.metadata,
            text: chunk.text,
            filePath: chunk.filePath,
            fileName: chunk.fileName,
            fileHash: chunk.fileHash,
            chunkIndex: chunk.chunkIndex,
          };
          await this.activeShard!.upsertItem({
            id: chunk.id,
            vector: chunk.vector,
            metadata,
          });
        }
        await this.activeShard!.endUpdate();
      } catch (e) {
        this.activeShard!.cancelUpdate();
        throw e;
      }
      // NOTE: upserts of existing ids are counted as additions, so the
      // counter can over-estimate and rotate a shard early — benign.
      this.activeShardCount += chunks.length;
      console.log(`Added ${chunks.length} chunks to vector store`);
      if (this.activeShardCount >= MAX_ITEMS_PER_SHARD) {
        await this.createShard(this.shardDirs.length);
      }
    });
  }

  /**
   * Search: query each shard in turn, merge results, sort by score, filter by threshold, return top limit.
   *
   * @param queryVector embedding of the query
   * @param limit maximum number of results to return
   * @param threshold minimum similarity score to keep a result
   */
  async search(
    queryVector: number[],
    limit: number = 5,
    threshold: number = 0.5,
  ): Promise<SearchResult[]> {
    // Request more candidates per shard to improve recall before threshold filtering
    const shardLimit = Math.max(limit * 3, 20);
    const merged: SearchResult[] = [];
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const results = await shard.queryItems(
        queryVector,
        "",
        shardLimit,
        undefined,
        false,
      );
      for (const r of results) {
        merged.push(this.toSearchResult(r.item.metadata, r.score, dir));
      }
    }
    // Sort by score descending for diagnostics and filtering
    merged.sort((a, b) => b.score - a.score);
    // Diagnostic logging: show top scores before threshold filtering
    if (merged.length > 0) {
      const topScores = merged
        .slice(0, Math.min(5, merged.length))
        .map((r) => `${r.score.toFixed(4)} (${path.basename(r.filePath)})`);
      const aboveThreshold = merged.filter((r) => r.score >= threshold).length;
      console.info(
        `[BigRAG] Vector search raw: top_scores=[${topScores.join(", ")}], ` +
          `above_threshold=${aboveThreshold}/${merged.length}, threshold=${threshold}`,
      );
    } else {
      console.warn("[BigRAG] Vector search returned 0 raw results from all shards.");
    }
    return merged.filter((r) => r.score >= threshold).slice(0, limit);
  }

  /**
   * Delete all chunks for a file (by hash) across all shards.
   * Serialized behind any in-flight mutation.
   */
  async deleteByFileHash(fileHash: string): Promise<void> {
    return this.enqueue(async () => {
      // Resolve the active shard dir INSIDE the queued task: a queued
      // addChunks may have rotated shards since this call was made.
      const lastDir = this.shardDirs[this.shardDirs.length - 1];
      for (const dir of this.shardDirs) {
        const shard = this.openShard(dir);
        const items = await shard.listItems();
        const toDelete = items.filter(
          (i) => (i.metadata as ChunkMetadata)?.fileHash === fileHash,
        );
        if (toDelete.length === 0) continue;
        await shard.beginUpdate();
        try {
          for (const item of toDelete) {
            await shard.deleteItem(item.id);
          }
          await shard.endUpdate();
        } catch (e) {
          // Don't leave the shard mid-transaction on failure (the original
          // addChunks did this, but the delete path did not).
          shard.cancelUpdate();
          throw e;
        }
        // Keep the rotation counter accurate if we shrank the active shard.
        if (dir === lastDir && this.activeShard) {
          this.activeShardCount = (await this.activeShard.listItems()).length;
        }
      }
      console.log(`Deleted chunks for file hash: ${fileHash}`);
    });
  }

  /**
   * Get file path -> set of file hashes currently in the store.
   */
  async getFileHashInventory(): Promise<Map<string, Set<string>>> {
    const inventory = new Map<string, Set<string>>();
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      for (const item of await shard.listItems()) {
        const m = item.metadata as ChunkMetadata;
        if (!m?.filePath || !m?.fileHash) continue;
        let set = inventory.get(m.filePath);
        if (!set) {
          set = new Set<string>();
          inventory.set(m.filePath, set);
        }
        set.add(m.fileHash);
      }
    }
    return inventory;
  }

  /**
   * Get total chunk count and unique file count (distinct file hashes).
   */
  async getStats(): Promise<{
    totalChunks: number;
    uniqueFiles: number;
  }> {
    let totalChunks = 0;
    const uniqueHashes = new Set<string>();
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const items = await shard.listItems();
      totalChunks += items.length;
      for (const item of items) {
        const h = (item.metadata as ChunkMetadata)?.fileHash;
        if (h) uniqueHashes.add(h);
      }
    }
    return { totalChunks, uniqueFiles: uniqueHashes.size };
  }

  /**
   * Check if any chunk exists for the given file hash (short-circuits on first match).
   */
  async hasFile(fileHash: string): Promise<boolean> {
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const items = await shard.listItems();
      if (items.some((i) => (i.metadata as ChunkMetadata)?.fileHash === fileHash)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Search for files whose names contain ALL the specified keywords (case-insensitive substring match).
   * Scans shard metadata to build a unique list of indexed file paths, then filters by keywords.
   * Falls back to OR semantics when the AND search matches nothing and
   * multiple keywords were given.
   */
  async searchByFilenames(keywords: string[]): Promise<FilenameSearchResult[]> {
    if (keywords.length === 0) return [];
    const lowerKeywords = keywords.map((k) => k.toLowerCase());
    // Build the unique inventory of indexed files from shard metadata.
    const seenPaths = new Map<string, string>(); // filePath -> fileName
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      for (const item of await shard.listItems()) {
        const m = item.metadata as ChunkMetadata;
        if (m?.filePath && m?.fileName && !seenPaths.has(m.filePath)) {
          seenPaths.set(m.filePath, m.fileName);
        }
      }
    }
    const results: FilenameSearchResult[] = [];
    for (const [filePath, fileName] of seenPaths) {
      const matchedKeywords = this.matchKeywords(filePath, fileName, lowerKeywords);
      // AND semantics: every keyword must appear in the path or name.
      if (matchedKeywords.length === lowerKeywords.length) {
        results.push({ filePath, fileName, matchedKeywords });
      }
    }
    // Fallback: if AND search returned nothing and we have multiple keywords, try OR
    if (results.length === 0 && lowerKeywords.length > 1) {
      console.info(
        `[BigRAG] Filename AND-search found 0 files for [${keywords.join(", ")}]. Trying OR-search...`,
      );
      for (const [filePath, fileName] of seenPaths) {
        const matchedKeywords = this.matchKeywords(filePath, fileName, lowerKeywords);
        // OR semantics: any single keyword is enough.
        if (matchedKeywords.length > 0) {
          results.push({ filePath, fileName, matchedKeywords });
        }
      }
    }
    console.log(
      `[BigRAG] Filename search for [${keywords.join(", ")}] found ${results.length} matching files`,
    );
    return results;
  }

  /**
   * Search for relevant chunks within specific files, ranked by vector similarity.
   * Used after a filename search to retrieve the most relevant content from matching files.
   */
  async searchInFiles(
    queryVector: number[],
    filePaths: string[],
    limit: number = 5,
    threshold: number = 0.3,
  ): Promise<SearchResult[]> {
    if (filePaths.length === 0) return [];
    const pathSet = new Set(filePaths);
    // Use queryItems() per shard (which has access to stored vectors),
    // then filter by file paths. Request more candidates for better recall.
    const shardQueryLimit = Math.max(limit * 5, 30);
    const candidates: SearchResult[] = [];
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const results = await shard.queryItems(
        queryVector,
        "",
        shardQueryLimit,
        undefined,
        false,
      );
      for (const r of results) {
        const m = r.item.metadata as ChunkMetadata;
        if (!m?.filePath || !pathSet.has(m.filePath)) continue;
        if (r.score >= threshold) {
          candidates.push(this.toSearchResult(r.item.metadata, r.score, dir));
        }
      }
    }
    console.info(
      `[BigRAG] searchInFiles: found ${candidates.length} candidates in ${filePaths.length} file(s), ` +
        `threshold=${threshold}`,
    );
    return candidates.sort((a, b) => b.score - a.score).slice(0, limit);
  }

  /**
   * Get up to `limit` chunks per file for the given paths, ordered by
   * chunkIndex within each file (score is fixed at 1.0 — no ranking).
   */
  async getChunksForFiles(
    filePaths: string[],
    limit: number = 5,
  ): Promise<SearchResult[]> {
    if (filePaths.length === 0) return [];
    const pathSet = new Set(filePaths);
    // Group matching chunks by file path (Map preserves insertion order).
    const byFile = new Map<string, SearchResult[]>();
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      for (const item of await shard.listItems()) {
        const m = item.metadata as ChunkMetadata;
        if (!m?.filePath || !pathSet.has(m.filePath)) continue;
        const result = this.toSearchResult(item.metadata, 1.0, dir);
        const arr = byFile.get(m.filePath);
        if (arr) {
          arr.push(result);
        } else {
          byFile.set(m.filePath, [result]);
        }
      }
    }
    // Sort chunks within each file by chunkIndex, take first `limit` per file
    const results: SearchResult[] = [];
    for (const fileChunks of byFile.values()) {
      fileChunks.sort((a, b) => a.chunkIndex - b.chunkIndex);
      results.push(...fileChunks.slice(0, limit));
    }
    return results;
  }

  /**
   * Get the number of chunks stored for a specific file path.
   */
  async getChunkCountForFile(filePath: string): Promise<number> {
    let count = 0;
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const items = await shard.listItems();
      count += items.filter(
        (i) => (i.metadata as ChunkMetadata)?.filePath === filePath,
      ).length;
    }
    return count;
  }

  /**
   * Wait for any queued writes to finish, then release the active shard reference.
   */
  async close(): Promise<void> {
    // updateMutex is always kept resolved (see enqueue), so this cannot reject.
    await this.updateMutex;
    this.activeShard = null;
  }
}