// Project file: src/search/docSearch.ts
/**
* DocSearch — hybrid BM25 + cosine similarity search with RRF merge.
* Returns scored metadata; never reads file content.
*/
import { BM25Index } from "./bm25.js";
import { type DocMeta } from "../db/docIndex.js";
import { type EmbeddingClient } from "../embeddings/embeddingClient.js";
/**
 * One ranked hit produced by `DocSearch.search`: document metadata plus a
 * relevance score.
 */
export interface SearchResult {
/** Document identifier, copied from the matching document's metadata. */
id: string;
/** File path recorded in the matching document's metadata. */
filePath: string;
/** Document title, copied from the matching document's metadata. */
title: string;
/** Tags recorded in the matching document's metadata. */
tags: string[];
/**
 * 0–100 relevance score (higher = better), scaled relative to the best hit of
 * the same search, so the top result always scores 100.
 */
score: number;
}
/**
 * Reciprocal Rank Fusion constant `k`. `rrfScore` adds it to the rank in the
 * denominator, so a larger value flattens the gap between adjacent ranks.
 */
const RRF_K = 60;
/**
 * Cosine similarity of two embedding vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a` to be comparable.
 * @returns `dot(a, b) / (|a| * |b|)`. Returns `0` when the vectors differ in
 *   length, when either has zero magnitude, or when the result is not finite.
 */
function cosine(a: Float32Array, b: Float32Array): number {
  // Vectors of different dimensionality are not comparable. Without this guard
  // a longer `a` reads past the end of `b` and yields NaN, while a longer `b`
  // has its tail silently ignored.
  if (a.length !== b.length) return 0;

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    const x = a[i];
    const y = b[i];
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  if (normA === 0 || normB === 0) return 0;

  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // NaN/Infinity components would otherwise leak a non-finite value into
  // callers that sort by this result.
  return Number.isFinite(similarity) ? similarity : 0;
}
/**
 * Reciprocal Rank Fusion contribution of a single ranking.
 *
 * @param rank - Zero-based position in a ranked list (0 = best).
 * @returns `1 / (RRF_K + rank + 1)`; the `+ 1` turns the zero-based rank into
 *   a one-based one.
 */
function rrfScore(rank: number): number {
  const denominator = RRF_K + rank + 1;
  return 1 / denominator;
}
/**
 * Hybrid document search: a BM25 keyword index fused with optional embedding
 * similarity via Reciprocal Rank Fusion (RRF).
 *
 * The BM25 index is maintained through `rebuild` / `addToIndex` /
 * `removeFromIndex`. Document metadata and embeddings are supplied by the
 * caller on every `search` call.
 */
export class DocSearch {
  private readonly bm25 = new BM25Index();
  private embeddingClient: EmbeddingClient | null = null;
  /**
   * Ids currently held in the BM25 index. Tracked so `search` can request
   * every keyword match from the index rather than only as many as survive
   * the tag filter.
   */
  private readonly indexedIds = new Set<string>();

  /** Set (or clear, with `null`) the client used to embed queries. */
  setEmbeddingClient(client: EmbeddingClient | null): void {
    this.embeddingClient = client;
  }

  /** Rebuild BM25 from a full list of documents + their text content. */
  rebuild(docs: Array<{ id: string; title: string; tags: string[]; bodyText: string }>): void {
    this.bm25.clear();
    this.indexedIds.clear();
    for (const d of docs) {
      this.addToIndex(d.id, d.title, d.tags, d.bodyText);
    }
  }

  /** Index one document; title, tags and body are all searchable text. */
  addToIndex(id: string, title: string, tags: string[], bodyText: string): void {
    const indexText = `${title} ${tags.join(" ")} ${bodyText}`;
    this.bm25.addDocument(id, indexText);
    this.indexedIds.add(id);
  }

  /** Remove one document from the BM25 index. */
  removeFromIndex(id: string): void {
    this.bm25.removeDocument(id);
    this.indexedIds.delete(id);
  }

  /**
   * Rank `docs` against `query`.
   *
   * @param query - Free-text query.
   * @param docs - Candidate documents with their (possibly missing) embeddings.
   * @param opts - `limit`: maximum number of results; `minScore`: minimum
   *   0–100 score a result must reach; `tags`: when non-empty, only documents
   *   carrying every listed tag are considered.
   * @returns Results ordered best-first. Scores are relative to the best hit
   *   of this search, so the top result always scores 100.
   */
  async search(
    query: string,
    docs: Array<DocMeta & { embedding: Float32Array | null }>,
    opts: { limit: number; minScore: number; tags?: string[] }
  ): Promise<SearchResult[]> {
    // A non-positive (or NaN) limit can never yield results.
    if (!(opts.limit > 0)) return [];

    // Optional hard tag filter
    const requiredTags = opts.tags ?? [];
    const candidates =
      requiredTags.length > 0
        ? docs.filter((d) => requiredTags.every((t) => d.tags.includes(t)))
        : docs;
    if (candidates.length === 0) return [];
    const candidateIds = new Set(candidates.map((d) => d.id));

    // 1. BM25 ranking, 2. semantic ranking (optional)
    const keywordRanks = this.rankByKeyword(query, candidateIds, docs.length);
    const semanticRanks = await this.rankBySimilarity(query, candidates);

    // 3. RRF merge. Keyed by id so duplicate entries collapse into one result.
    const fused = new Map<string, { doc: (typeof candidates)[number]; rrf: number }>();
    for (const doc of candidates) {
      const keywordRank = keywordRanks.get(doc.id);
      const semanticRank = semanticRanks.get(doc.id);
      const rrf =
        (keywordRank === undefined ? 0 : rrfScore(keywordRank)) +
        (semanticRank === undefined ? 0 : rrfScore(semanticRank));
      if (rrf > 0) fused.set(doc.id, { doc, rrf });
    }

    // 4. Normalise to 0–100 relative to the best hit
    const ranked = [...fused.values()].sort((x, y) => y.rrf - x.rrf);
    const best = ranked[0];
    if (best === undefined) return [];

    // 5. Apply minScore threshold and final limit
    return ranked
      .map(({ doc, rrf }) => ({ doc, score: Math.round((rrf / best.rrf) * 100) }))
      .filter((r) => r.score >= opts.minScore)
      .slice(0, opts.limit)
      .map(({ doc, score }) => ({
        id: doc.id,
        filePath: doc.filePath,
        title: doc.title,
        tags: doc.tags,
        score,
      }));
  }

  /**
   * Zero-based BM25 rank of each candidate that matches `query`. Ranks are
   * relative to the candidate set, matching how semantic ranks are computed.
   */
  private rankByKeyword(
    query: string,
    candidateIds: ReadonlySet<string>,
    docCount: number
  ): Map<string, number> {
    // Ask for every hit in the index. Capping at the filtered count would let
    // non-candidate documents push matching candidates out of the result list.
    const fetchCount = Math.max(this.indexedIds.size, docCount);
    const ranks = new Map<string, number>();
    for (const hit of this.bm25.search(query, fetchCount)) {
      if (candidateIds.has(hit.id) && !ranks.has(hit.id)) {
        ranks.set(hit.id, ranks.size);
      }
    }
    return ranks;
  }

  /**
   * Zero-based similarity rank of each candidate with a usable embedding.
   * Returns an empty map when no embedding client is set or embedding fails.
   */
  private async rankBySimilarity(
    query: string,
    candidates: ReadonlyArray<DocMeta & { embedding: Float32Array | null }>
  ): Promise<Map<string, number>> {
    const ranks = new Map<string, number>();
    const client = this.embeddingClient;
    if (client === null) return ranks;

    try {
      const queryVec = await client.embedQuery(query);
      const scored: Array<{ id: string; sim: number }> = [];
      for (const d of candidates) {
        const vec = d.embedding;
        // Skip documents with no embedding or one of a different dimension
        // than the query vector; they cannot be compared meaningfully.
        if (vec === null || vec.length !== queryVec.length) continue;
        const sim = cosine(queryVec, vec);
        // A NaN similarity would make the sort comparator inconsistent.
        if (Number.isFinite(sim)) scored.push({ id: d.id, sim });
      }
      scored.sort((x, y) => y.sim - x.sim);
      scored.forEach((s, i) => ranks.set(s.id, i));
    } catch {
      // Deliberate best-effort: if embedding the query fails, the search
      // degrades to BM25-only instead of failing.
    }
    return ranks;
  }
}