// Project Files
// src/embeddings/vectorOps.ts
/**
* Vector Operations for Semantic Search
*
* Pure math functions for embedding vector operations.
* No dependencies on external libraries - these are simple enough
* that we don't need numpy/etc.
*/
/**
 * Cosine similarity of two equal-length vectors.
 *
 * The result is nominally in the range [-1, 1]:
 * -  1 → same direction (same meaning)
 * -  0 → orthogonal (unrelated)
 * - -1 → opposite direction (opposite meaning)
 *
 * Empty vectors and vectors of zero magnitude have no direction, so 0 is
 * returned for them instead of dividing by zero.
 *
 * For text embeddings, the typically useful range is 0.5 to 1.0.
 *
 * @param a First vector
 * @param b Second vector; must have the same length as `a`
 * @returns Dot product of `a` and `b` divided by the product of their magnitudes
 * @throws Error if the two vectors differ in length
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }
  if (a.length === 0) {
    return 0;
  }

  // Accumulate the dot product and both squared magnitudes in a single pass.
  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  for (const [index, x] of a.entries()) {
    const y = b[index];
    dot += x * y;
    squaredA += x * x;
    squaredB += y * y;
  }

  const magnitude = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  if (magnitude === 0) {
    return 0;
  }
  return dot / magnitude;
}
/**
 * Euclidean (straight-line) distance between two equal-length vectors.
 * Smaller means more similar; 0 means the vectors are identical.
 *
 * @param a First vector
 * @param b Second vector; must have the same length as `a`
 * @returns Square root of the sum of squared component differences
 * @throws Error if the two vectors differ in length
 */
export function euclideanDistance(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  let squaredDistance = 0;
  for (const [index, x] of a.entries()) {
    const gap = x - b[index];
    squaredDistance += gap * gap;
  }
  return Math.sqrt(squaredDistance);
}
/**
 * Normalize a vector to unit length (magnitude = 1).
 * Useful for pre-processing before cosine similarity.
 *
 * The input is never mutated and the result is always a new array, so the
 * caller may freely modify the returned vector.
 *
 * @param v Vector to normalize
 * @returns A new array with each component divided by the magnitude of `v`.
 *          A zero-magnitude (or empty) vector has no direction and is
 *          returned as an unchanged copy.
 */
export function normalize(v: number[]): number[] {
  const magnitude = Math.sqrt(v.reduce((sum, x) => sum + x * x, 0));
  // Copy rather than return `v` itself: every other path yields a fresh
  // array, and handing back the input would let callers mutate it by accident.
  if (magnitude === 0) return v.slice();
  return v.map(x => x / magnitude);
}
/**
 * Dot product (sum of component-wise products) of two equal-length vectors.
 * When both vectors are already normalized, this equals their cosine similarity.
 *
 * @param a First vector
 * @param b Second vector; must have the same length as `a`
 * @returns Sum of `a[i] * b[i]` over all components (0 for empty vectors)
 * @throws Error if the two vectors differ in length
 */
export function dotProduct(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  let total = 0;
  for (const [index, x] of a.entries()) {
    total += x * b[index];
  }
  return total;
}
/**
 * Find the top-k items most similar to a query vector.
 * Items are ranked by cosine similarity, highest first.
 *
 * @param queryVector Vector to compare every item against
 * @param items Candidate items
 * @param getVector Extracts an item's vector; items for which it returns
 *                  `undefined` are skipped
 * @param k Maximum number of results; a non-positive or NaN value yields
 *          an empty result
 * @param minSimilarity Items scoring below this threshold are dropped
 * @returns Up to `k` `{ item, similarity }` pairs sorted by descending similarity
 * @throws Error (from cosineSimilarity) if an item's vector length differs
 *         from the query vector's length
 */
export function findTopK<T>(
  queryVector: number[],
  items: T[],
  getVector: (item: T) => number[] | undefined,
  k: number,
  minSimilarity: number = 0.0,
): Array<{ item: T; similarity: number }> {
  // Guard before slicing: slice(0, k) reads a negative k as "drop the last
  // |k| entries", which would return nearly every match instead of none.
  if (!(k > 0)) {
    return [];
  }

  const scored: Array<{ item: T; similarity: number }> = [];
  for (const item of items) {
    const vector = getVector(item);
    if (!vector) continue;
    const similarity = cosineSimilarity(queryVector, vector);
    if (similarity >= minSimilarity) {
      scored.push({ item, similarity });
    }
  }

  // Sort by similarity descending
  scored.sort((a, b) => b.similarity - a.similarity);
  return scored.slice(0, k);
}
/**
 * Cosine similarity of one query vector against many candidate vectors.
 *
 * The query's magnitude is computed once and reused for every candidate,
 * instead of being recomputed per pair as repeated cosineSimilarity calls
 * would do.
 *
 * Unlike cosineSimilarity, a candidate whose length differs from the query's
 * does not throw; it simply scores 0. Zero-magnitude vectors (query or
 * candidate) also score 0.
 *
 * @param queryVector Vector to compare against
 * @param vectors Candidate vectors
 * @returns One similarity per candidate, in the same order as `vectors`
 */
export function batchCosineSimilarity(
  queryVector: number[],
  vectors: number[][],
): number[] {
  // Magnitude of the query, shared by every comparison below.
  let querySquared = 0;
  for (const component of queryVector) {
    querySquared += component * component;
  }
  const queryNorm = Math.sqrt(querySquared);

  if (queryNorm === 0) {
    return vectors.map(() => 0);
  }

  return vectors.map(candidate => {
    if (candidate.length !== queryVector.length) {
      return 0;
    }

    let dot = 0;
    let candidateSquared = 0;
    for (const [index, component] of candidate.entries()) {
      dot += queryVector[index] * component;
      candidateSquared += component * component;
    }

    const candidateNorm = Math.sqrt(candidateSquared);
    return candidateNorm === 0 ? 0 : dot / (queryNorm * candidateNorm);
  });
}
/**
 * Convert a similarity score (0-1) to a human-readable match score (0-100).
 * Applies a non-linear scaling to make scores more intuitive.
 *
 * Similarities at or below 0.5 map to 0 and similarities at or above 1 map
 * to 100, so the result always stays within 0-100 for numeric input.
 *
 * @param similarity Similarity value, expected in the range 0-1
 * @returns Integer match score between 0 and 100 (NaN if `similarity` is NaN)
 */
export function similarityToScore(similarity: number): number {
  // Typical embedding similarities:
  // - 0.9+ = very strong match
  // - 0.8-0.9 = strong match
  // - 0.7-0.8 = moderate match
  // - 0.6-0.7 = weak match
  // - <0.6 = poor match
  // Map 0.5-1.0 range to 0-1, clamping both ends: without the upper clamp an
  // out-of-range input such as 1.5 would produce a score above 100.
  const normalized = Math.min(1, Math.max(0, (similarity - 0.5) * 2)); // 0.5->0, 1.0->1
  const boosted = Math.pow(normalized, 0.8); // Slight curve to favor high scores
  return Math.round(boosted * 100);
}
/**
 * Convert a match score (0-100) back to a similarity (0.5-1).
 * Inverse of the similarityToScore() curve, up to that function's rounding.
 *
 * Scores outside 0-100 are clamped into range first, so the result is always
 * between 0.5 and 1 for numeric input.
 *
 * @param score Match score, expected in the range 0-100
 * @returns Similarity between 0.5 (score 0) and 1 (score 100)
 */
export function scoreToSimilarity(score: number): number {
  // Reverse the similarityToScore formula:
  // score = (((similarity - 0.5) * 2) ^ 0.8) * 100
  //
  // Solving for similarity:
  // score/100 = normalized^0.8
  // normalized = (score/100)^(1/0.8) = (score/100)^1.25
  // similarity = normalized/2 + 0.5

  // Clamp first: a negative base with the fractional exponent 1.25 makes
  // Math.pow return NaN, and a score above 100 would map to a similarity > 1.
  const clamped = Math.min(100, Math.max(0, score));
  const boosted = clamped / 100;
  const normalized = Math.pow(boosted, 1.25); // Inverse of ^0.8
  return normalized / 2 + 0.5;
}
/**
 * Rough estimate of the memory needed to hold a set of embeddings.
 *
 * The model is deliberately simple: every vector component is counted as an
 * 8-byte (64-bit) float, and each vector carries roughly 32 bytes of array
 * overhead.
 *
 * @param count Number of embeddings
 * @param dimension Vector dimension (e.g., 768 or 1536)
 * @returns Estimated memory in bytes
 */
export function estimateMemoryUsage(count: number, dimension: number): number {
  const bytesPerComponent = 8; // 64-bit float
  const arrayOverheadBytes = 32; // approximate per-array bookkeeping
  const bytesPerEmbedding = dimension * bytesPerComponent + arrayOverheadBytes;
  return count * bytesPerEmbedding;
}
/**
 * Render a byte count as a short display string.
 *
 * Values under 1024 are shown as raw bytes ("512 B"); larger values are
 * shown with one decimal place in KB (below 1024 * 1024 bytes) or MB
 * (everything else), using 1024-based units.
 *
 * @param bytes Size in bytes
 * @returns Formatted size such as "1.5 KB" or "12.0 MB"
 */
export function formatMemorySize(bytes: number): string {
  const bytesPerKb = 1024;
  const bytesPerMb = bytesPerKb * bytesPerKb;

  if (bytes < bytesPerKb) {
    return `${bytes} B`;
  }

  // Pick the unit first, then format once.
  const [divisor, unit]: [number, string] =
    bytes < bytesPerMb ? [bytesPerKb, 'KB'] : [bytesPerMb, 'MB'];
  return `${(bytes / divisor).toFixed(1)} ${unit}`;
}