// Project Files
// src/rag/vectorStore.ts
/**
* Vector Store using SQLite with sql.js (WebAssembly)
* Stores documents, chunks, and embeddings with cosine similarity search
*/
import initSqlJs, { Database } from 'sql.js';
import path from 'path';
import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from 'fs';
import type { Document, Chunk, SearchResult } from '../types';
import { ragDebug } from '../utils/ragLogger.js';
/**
 * Settings supplied to the VectorStore constructor.
 */
export interface VectorStoreConfig {
// Filesystem path of the SQLite database file: loaded during init() when it
// already exists, and overwritten with the exported database on every save.
dbPath: string;
// Expected length of embedding vectors.
// NOTE(review): not read anywhere in the visible VectorStore code — confirm
// where (or whether) this is enforced.
embeddingDimension: number;
}
export class VectorStore {
  /**
   * Matches the `YYYY-MM-DD HH:MM:SS` text SQLite produces for
   * CURRENT_TIMESTAMP (optionally with fractional seconds).
   */
  private static readonly SQLITE_TIMESTAMP = /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?$/;

  private db: Database | null = null;
  private config: VectorStoreConfig;
  // Handle to the initialised sql.js module; only populated by init().
  private SQL: any = null;

  /**
   * Creates an unopened store. No I/O happens until init() is called.
   *
   * @param config Database location and embedding settings.
   */
  constructor(config: VectorStoreConfig) {
    this.config = config;
  }

  /**
   * Loads sql.js, opens the database file at `config.dbPath` (or creates an
   * empty database when the file does not exist) and ensures the schema is
   * present. Calling it again once a database is open is a no-op.
   */
  async init(): Promise<void> {
    if (this.db) return;

    // Initialize sql.js
    this.SQL = await initSqlJs();

    // Ensure directory exists
    const dir = path.dirname(this.config.dbPath);
    if (!existsSync(dir)) {
      mkdirSync(dir, { recursive: true });
    }

    // Load existing DB or create new
    if (existsSync(this.config.dbPath)) {
      ragDebug('VectorStore', `Loading existing DB from: ${this.config.dbPath}`);
      const buffer = readFileSync(this.config.dbPath);
      this.db = new this.SQL.Database(buffer);
    } else {
      ragDebug('VectorStore', `Creating new DB at: ${this.config.dbPath}`);
      this.db = new this.SQL.Database();
    }

    this.initSchema();

    // Log current stats
    const stats = this.getStats();
    ragDebug('VectorStore', `DB Stats after init: ${stats.documentCount} docs, ${stats.chunkCount} chunks`);
  }

  /**
   * Creates the `documents` and `chunks` tables and their indexes when they
   * are missing, then persists the database.
   *
   * @throws Error when the database has not been opened.
   */
  private initSchema(): void {
    if (!this.db) throw new Error('Database not initialized');

    this.db.run(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path TEXT UNIQUE NOT NULL,
        hash TEXT NOT NULL,
        title TEXT,
        metadata TEXT DEFAULT '{}',
        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
      )
    `);
    this.db.run(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path)`);
    this.db.run(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);

    this.db.run(`
      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        document_id INTEGER NOT NULL,
        content TEXT NOT NULL,
        embedding BLOB,
        metadata TEXT DEFAULT '{}',
        FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE
      )
    `);
    this.db.run(`CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)`);

    this.save();
  }

  /**
   * Writes the full exported database image to `config.dbPath`.
   * Does nothing when no database is open.
   */
  private save(): void {
    if (!this.db) return;
    const data = this.db.export();
    writeFileSync(this.config.dbPath, data);
  }

  /**
   * Runs `work` between BEGIN and COMMIT so a multi-statement write is applied
   * entirely or not at all. On failure the transaction is rolled back and the
   * original error is rethrown.
   *
   * @throws Error when the database has not been opened.
   */
  private runInTransaction<T>(work: (db: Database) => T): T {
    if (!this.db) throw new Error('Database not initialized');
    const db = this.db;

    db.run('BEGIN');
    try {
      const outcome = work(db);
      db.run('COMMIT');
      return outcome;
    } catch (error) {
      try {
        db.run('ROLLBACK');
      } catch {
        // The failing statement may already have ended the transaction;
        // the original error is the one worth surfacing.
      }
      throw error;
    }
  }

  /**
   * Converts an `updated_at` cell to a Date. CURRENT_TIMESTAMP values are UTC
   * but carry no zone marker, so they are rewritten to ISO-8601 with a `Z`
   * suffix; otherwise the JS engine would interpret them as local time.
   * Any other value is handed to the Date constructor unchanged.
   */
  private static parseTimestamp(value: unknown): Date {
    if (typeof value === 'string' && VectorStore.SQLITE_TIMESTAMP.test(value)) {
      return new Date(`${value.replace(' ', 'T')}Z`);
    }
    return new Date(value as string);
  }

  /**
   * Builds a Document from cells ordered as
   * `id, path, hash, title, metadata, updated_at`.
   */
  private static toDocument(cells: readonly unknown[]): Document {
    return {
      id: cells[0] as number,
      path: cells[1] as string,
      hash: cells[2] as string,
      title: cells[3] as string,
      metadata: JSON.parse((cells[4] as string) || '{}'),
      updatedAt: VectorStore.parseTimestamp(cells[5]),
    };
  }

  /**
   * Serialises exactly the bytes an embedding covers. The typed array may be
   * a view into a larger buffer, so byteOffset/byteLength must be honoured.
   */
  private static float32ToBlob(embedding: Float32Array): Uint8Array {
    return new Uint8Array(embedding.buffer, embedding.byteOffset, embedding.byteLength);
  }

  /**
   * Reinterprets a stored blob as 32-bit floats. A zero-copy view is used when
   * the blob is 4-byte aligned; otherwise the bytes are copied first, because
   * a Float32Array cannot start at an unaligned offset. Trailing bytes that do
   * not form a whole float are ignored.
   */
  private static blobToFloat32(blob: Uint8Array): Float32Array {
    const floatCount = Math.floor(blob.byteLength / 4);
    if (blob.byteOffset % 4 === 0) {
      return new Float32Array(blob.buffer, blob.byteOffset, floatCount);
    }
    const aligned = new Uint8Array(floatCount * 4);
    aligned.set(blob.subarray(0, floatCount * 4));
    return new Float32Array(aligned.buffer);
  }

  /**
   * Insert or update a document, keyed by its path.
   *
   * @returns The row ID of the existing or newly inserted document.
   * @throws Error when the database has not been opened.
   */
  upsertDocument(doc: Omit<Document, 'id' | 'updatedAt'>): number {
    if (!this.db) throw new Error('Database not initialized');

    const metadata = JSON.stringify(doc.metadata || {});

    // Check if exists
    const existing = this.db.exec(`SELECT id FROM documents WHERE path = ?`, [doc.path]);
    if (existing.length > 0 && existing[0].values.length > 0) {
      // Update - return existing ID
      const existingId = existing[0].values[0][0] as number;
      this.db.run(
        `UPDATE documents SET hash = ?, title = ?, metadata = ?, updated_at = CURRENT_TIMESTAMP WHERE path = ?`,
        [doc.hash, doc.title || null, metadata, doc.path]
      );
      this.save();
      ragDebug('VectorStore', `Updated document, returning existing ID: ${existingId}`);
      return existingId;
    }

    // Insert
    this.db.run(
      `INSERT INTO documents (path, hash, title, metadata) VALUES (?, ?, ?, ?)`,
      [doc.path, doc.hash, doc.title || null, metadata]
    );
    // Get the ID BEFORE save() - save() might affect last_insert_rowid()
    const result = this.db.exec(`SELECT last_insert_rowid() as id`);
    const newId = result[0].values[0][0] as number;
    ragDebug('VectorStore', `Inserted new document, last_insert_rowid returned: ${newId}`);
    this.save();
    return newId;
  }

  /**
   * Get document by path.
   *
   * @returns The document, or null when no row has that path.
   * @throws Error when the database has not been opened.
   */
  getDocumentByPath(docPath: string): Document | null {
    if (!this.db) throw new Error('Database not initialized');

    const result = this.db.exec(
      `SELECT id, path, hash, title, metadata, updated_at FROM documents WHERE path = ?`,
      [docPath]
    );
    if (result.length === 0 || result[0].values.length === 0) return null;
    return VectorStore.toDocument(result[0].values[0]);
  }

  /**
   * Get document by ID.
   *
   * @returns The document, or null when it does not exist or the database
   *          has not been opened.
   */
  getDocumentById(docId: number): Document | null {
    if (!this.db) return null;

    const result = this.db.exec(
      `SELECT id, path, hash, title, metadata, updated_at FROM documents WHERE id = ?`,
      [docId]
    );
    if (result.length === 0 || result[0].values.length === 0) return null;
    return VectorStore.toDocument(result[0].values[0]);
  }

  /**
   * Get all indexed document paths (empty when the database is not open).
   */
  getAllDocumentPaths(): string[] {
    if (!this.db) return [];
    const result = this.db.exec(`SELECT path FROM documents`);
    if (result.length === 0) return [];
    return result[0].values.map(row => row[0] as string);
  }

  /**
   * Get all chunks (ID and content) for BM25 index building.
   * Returns an empty array when the database is not open.
   */
  getAllChunksForBM25(): Array<{ id: number; content: string }> {
    if (!this.db) return [];
    const result = this.db.exec(`SELECT id, content FROM chunks`);
    if (result.length === 0) return [];
    return result[0].values.map(row => ({
      id: row[0] as number,
      content: row[1] as string,
    }));
  }

  /**
   * Insert chunks with embeddings. All rows are written in one transaction,
   * so a failure part-way through leaves no partial batch behind.
   *
   * @returns IDs of the inserted chunks, in input order. Empty when `chunks`
   *          is empty or the database is not open.
   */
  insertChunks(chunks: Omit<Chunk, 'id'>[]): number[] {
    if (!this.db || chunks.length === 0) return [];

    const insertedIds = this.runInTransaction(db =>
      chunks.map(chunk => {
        const embeddingBlob = chunk.embedding
          ? VectorStore.float32ToBlob(chunk.embedding)
          : null;
        // JSON.stringify(undefined) is undefined, which cannot be bound as a
        // SQL parameter; store an empty object instead.
        const metadata = JSON.stringify(chunk.metadata ?? {});

        db.run(
          `INSERT INTO chunks (document_id, content, embedding, metadata) VALUES (?, ?, ?, ?)`,
          [chunk.documentId, chunk.content, embeddingBlob, metadata]
        );
        // Read the ID straight after the insert, before anything else runs.
        const result = db.exec(`SELECT last_insert_rowid() as id`);
        return result[0].values[0][0] as number;
      })
    );

    this.save();
    return insertedIds;
  }

  /**
   * Delete all chunks for a document. No-op when the database is not open.
   */
  deleteChunksByDocumentId(documentId: number): void {
    if (!this.db) return;
    this.db.run('DELETE FROM chunks WHERE document_id = ?', [documentId]);
    this.save();
  }

  /**
   * Get chunk IDs for a document (for BM25 cleanup).
   */
  getChunkIdsByDocumentId(documentId: number): number[] {
    if (!this.db) return [];
    const result = this.db.exec(`SELECT id FROM chunks WHERE document_id = ?`, [documentId]);
    if (result.length === 0) return [];
    return result[0].values.map(row => row[0] as number);
  }

  /**
   * Get a single chunk by ID.
   *
   * @returns The chunk's document ID, content and parsed metadata, or null
   *          when it does not exist or the database is not open.
   */
  getChunkById(chunkId: number): { documentId: number; content: string; metadata: Record<string, unknown> } | null {
    if (!this.db) return null;

    const result = this.db.exec(
      `SELECT document_id, content, metadata FROM chunks WHERE id = ?`,
      [chunkId]
    );
    if (result.length === 0 || result[0].values.length === 0) return null;

    const row = result[0].values[0];
    return {
      documentId: row[0] as number,
      content: row[1] as string,
      metadata: JSON.parse((row[2] as string) || '{}'),
    };
  }

  /**
   * Delete document and all its chunks, atomically.
   * No-op when the database is not open.
   */
  deleteDocument(docPath: string): void {
    if (!this.db) return;

    this.runInTransaction(db => {
      // First get the document ID
      const result = db.exec(`SELECT id FROM documents WHERE path = ?`, [docPath]);
      if (result.length > 0 && result[0].values.length > 0) {
        const docId = result[0].values[0][0] as number;
        // Chunks are removed explicitly rather than relying on ON DELETE CASCADE.
        db.run('DELETE FROM chunks WHERE document_id = ?', [docId]);
        ragDebug('VectorStore', `Deleted chunks for document ${docId}`);
      }

      // Then delete the document
      db.run('DELETE FROM documents WHERE path = ?', [docPath]);
      ragDebug('VectorStore', `Deleted document: ${docPath}`);
    });

    this.save();
  }

  /**
   * Semantic search using cosine similarity. Every chunk that has an
   * embedding is scored against the query in-process.
   *
   * @param queryEmbedding Query vector; must have the same length as the stored embeddings.
   * @param limit Maximum number of results to return.
   * @param threshold Minimum cosine similarity a chunk needs to be included.
   * @returns Matches ordered by descending similarity.
   * @throws Error when the database is not open or a stored embedding has a
   *         different length than the query.
   */
  search(queryEmbedding: Float32Array, limit: number, threshold: number): SearchResult[] {
    if (!this.db) throw new Error('Database not initialized');

    ragDebug('VectorStore', `Search called with limit=${limit}, threshold=${threshold}`);
    ragDebug('VectorStore', `Query embedding dimension: ${queryEmbedding.length}`);

    const result = this.db.exec(`
      SELECT
        c.id,
        c.document_id,
        c.content,
        c.embedding,
        c.metadata as chunk_metadata,
        d.path,
        d.hash,
        d.title,
        d.metadata as doc_metadata,
        d.updated_at
      FROM chunks c
      JOIN documents d ON c.document_id = d.id
      WHERE c.embedding IS NOT NULL
    `);

    if (result.length === 0) {
      ragDebug('VectorStore', 'No chunks found in database');
      return [];
    }

    const rows = result[0].values;
    ragDebug('VectorStore', `Found ${rows.length} chunks in DB to compare`);

    // Score first and only materialise the rows that survive the threshold
    // and limit, so JSON/date parsing is not done for discarded chunks.
    const best = rows
      .map(row => {
        const embedding = VectorStore.blobToFloat32(row[3] as Uint8Array);
        return { row, embedding, score: this.cosineSimilarity(queryEmbedding, embedding) };
      })
      .filter(candidate => candidate.score >= threshold)
      .sort((a, b) => b.score - a.score)
      .slice(0, limit);

    const results: SearchResult[] = best.map(({ row, embedding, score }) => ({
      chunk: {
        id: row[0] as number,
        documentId: row[1] as number,
        content: row[2] as string,
        embedding,
        metadata: JSON.parse((row[4] as string) || '{}'),
      },
      document: VectorStore.toDocument([row[1], row[5], row[6], row[7], row[8], row[9]]),
      score,
      // Convert similarity to distance (0 = identical, 2 = opposite)
      distance: 1 - score,
    }));

    ragDebug('VectorStore', `Search results: ${results.length} above threshold ${threshold}`);
    if (results.length > 0) {
      ragDebug('VectorStore', `Top scores: ${results.slice(0, 3).map(r => r.score.toFixed(3)).join(', ')}`);
    }

    return results;
  }

  /**
   * Cosine similarity between two vectors.
   *
   * @returns A value in [-1, 1], or 0 when either vector has zero magnitude.
   * @throws Error when the vectors differ in length.
   */
  private cosineSimilarity(a: Float32Array, b: Float32Array): number {
    if (a.length !== b.length) {
      throw new Error('Vector dimensions must match');
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Get statistics about the vector store: row counts from the open database
   * and the size of the database file on disk. All zeros when not open.
   */
  getStats(): {
    documentCount: number;
    chunkCount: number;
    dbSizeBytes: number;
  } {
    if (!this.db) return { documentCount: 0, chunkCount: 0, dbSizeBytes: 0 };

    const docResult = this.db.exec('SELECT COUNT(*) as count FROM documents');
    const chunkResult = this.db.exec('SELECT COUNT(*) as count FROM chunks');

    return {
      documentCount: (docResult[0]?.values[0]?.[0] as number) || 0,
      chunkCount: (chunkResult[0]?.values[0]?.[0] as number) || 0,
      dbSizeBytes: existsSync(this.config.dbPath)
        ? statSync(this.config.dbPath).size
        : 0,
    };
  }

  /**
   * Persist and close the database connection. The handle is released even
   * when the final save fails; the save error still propagates.
   */
  close(): void {
    const db = this.db;
    if (!db) return;
    try {
      this.save();
    } finally {
      db.close();
      this.db = null;
    }
  }

  /**
   * Clear all data (for testing/reset). Both tables are emptied atomically.
   */
  clearAll(): void {
    if (!this.db) return;
    this.runInTransaction(db => {
      db.run('DELETE FROM chunks');
      db.run('DELETE FROM documents');
    });
    this.save();
  }
}