Project Files
src / search / fuzzyMatch.ts
/**
* Fuzzy Matching Utilities for Draw Things Index
* Implements Levenshtein distance and various fuzzy matching strategies
*/
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
// Simple Stemmer (English)
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
/**
* Simple English stemmer - handles common suffixes
* Not as good as Porter, but zero dependencies and fast
*/
function stem(word: string): string {
if (word.length < 4) return word;
// Plural forms
if (word.endsWith('ies') && word.length > 4) {
return word.slice(0, -3) + 'y'; // cities β city
}
if (word.endsWith('es') && word.length > 4) {
// boxes β box, watches β watch, but not "times"
const stem = word.slice(0, -2);
if (stem.endsWith('ch') || stem.endsWith('sh') || stem.endsWith('x') || stem.endsWith('s')) {
return stem;
}
}
if (word.endsWith('s') && !word.endsWith('ss') && word.length > 3) {
return word.slice(0, -1); // cats β cat
}
// Verb forms
if (word.endsWith('ing') && word.length > 5) {
const stem = word.slice(0, -3);
if (stem.endsWith(stem[stem.length - 1])) {
return stem.slice(0, -1); // running β run
}
return stem; // walking β walk
}
if (word.endsWith('ed') && word.length > 4) {
const stem = word.slice(0, -2);
if (stem.endsWith(stem[stem.length - 1])) {
return stem.slice(0, -1); // stopped β stop
}
return stem; // walked β walk
}
// Adjective forms
if (word.endsWith('ly') && word.length > 4) {
return word.slice(0, -2); // quickly β quick
}
if (word.endsWith('er') && word.length > 4) {
return word.slice(0, -2); // faster β fast
}
if (word.endsWith('est') && word.length > 5) {
return word.slice(0, -3); // fastest β fast
}
return word;
}
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
// Levenshtein Distance
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
/**
* Calculate Levenshtein distance between two strings
* Optimized implementation with early termination
*/
export function levenshteinDistance(a: string, b: string): number {
if (a === b) return 0;
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
// Swap to ensure a is the shorter string (memory optimization)
if (a.length > b.length) {
[a, b] = [b, a];
}
const aLen = a.length;
const bLen = b.length;
// Single row optimization
let prevRow = new Array(aLen + 1);
let currRow = new Array(aLen + 1);
// Initialize first row
for (let i = 0; i <= aLen; i++) {
prevRow[i] = i;
}
for (let j = 1; j <= bLen; j++) {
currRow[0] = j;
for (let i = 1; i <= aLen; i++) {
const cost = a[i - 1] === b[j - 1] ? 0 : 1;
currRow[i] = Math.min(
currRow[i - 1] + 1, // Insertion
prevRow[i] + 1, // Deletion
prevRow[i - 1] + cost // Substitution
);
}
// Swap rows
[prevRow, currRow] = [currRow, prevRow];
}
return prevRow[aLen];
}
/**
* Calculate similarity ratio (0-1) based on Levenshtein distance
*/
export function levenshteinSimilarity(a: string, b: string): number {
const maxLen = Math.max(a.length, b.length);
if (maxLen === 0) return 1;
return 1 - levenshteinDistance(a, b) / maxLen;
}
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
// Fuzzy Matching
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
export interface FuzzyMatchResult {
/** Match score 0-100 */
score: number;
/** Type of match */
type: 'exact' | 'fuzzy' | 'partial' | 'none';
/** Which terms matched */
matchedTerms: string[];
/** Similarity ratio for fuzzy matches */
similarity?: number;
}
export interface FuzzyMatchOptions {
/** Levenshtein similarity threshold for term matching (default: 0.85) */
fuzzyTermThreshold?: number;
/** Minimum fraction of query terms that must match (default: 0.5) */
minTermCoverage?: number;
}
const DEFAULT_FUZZY_OPTIONS: Required<FuzzyMatchOptions> = {
fuzzyTermThreshold: 0.85,
minTermCoverage: 0.5,
};
/**
* Check if a term matches as a WHOLE WORD in the target
* Uses word boundaries - "cat" won't match "intricately"
*/
function hasWholeWordMatch(term: string, targetWords: string[]): boolean {
return targetWords.some(word => word === term);
}
/**
* Check if stems match (for plural/verb forms)
* "cats" β "cat", "running" β "run"
*/
function hasStemMatch(term: string, targetWords: string[]): boolean {
const termStem = stem(term);
return targetWords.some(word => stem(word) === termStem);
}
/**
* Perform fuzzy matching of a query against a target string
* Returns detailed match information
*
* Scoring:
* - 100: Exact whole-word match for all query terms
* - 90-99: Stem match (catsβcat) for all terms
* - 60-89: Partial matches (some terms match)
* - <60: Fuzzy Levenshtein matches
*/
export function fuzzyMatch(
query: string,
target: string,
options?: FuzzyMatchOptions
): FuzzyMatchResult {
const opts = { ...DEFAULT_FUZZY_OPTIONS, ...options };
const queryLower = query.toLowerCase().trim();
const targetLower = target.toLowerCase();
// Tokenize both into words
const queryTerms = tokenize(queryLower);
const targetWords = tokenize(targetLower);
if (queryTerms.length === 0) {
return { score: 0, type: 'none', matchedTerms: [] };
}
const matchedTerms: string[] = [];
const matchTypes: ('exact' | 'stem' | 'fuzzy')[] = [];
for (const term of queryTerms) {
// Skip very short terms (too many false positives)
if (term.length < 3) continue;
// 1. Exact whole-word match (highest priority)
if (hasWholeWordMatch(term, targetWords)) {
matchedTerms.push(term);
matchTypes.push('exact');
continue;
}
// 2. Stem match (catsβcat, runningβrun)
if (hasStemMatch(term, targetWords)) {
matchedTerms.push(term);
matchTypes.push('stem');
continue;
}
// 3. Fuzzy Levenshtein match on individual words
let foundFuzzy = false;
for (const word of targetWords) {
// Only fuzzy match words of similar length
if (Math.abs(term.length - word.length) > 2) continue;
const termSimilarity = levenshteinSimilarity(term, word);
if (termSimilarity >= opts.fuzzyTermThreshold) {
matchedTerms.push(term);
matchTypes.push('fuzzy');
foundFuzzy = true;
break;
}
}
}
// No matches at all
if (matchedTerms.length === 0) {
return { score: 0, type: 'none', matchedTerms: [] };
}
// Calculate coverage and score
const significantQueryTerms = queryTerms.filter(t => t.length >= 3);
const significantQueryTermCount = significantQueryTerms.length;
const coverage = matchedTerms.length / Math.max(1, significantQueryTermCount);
// Tighten coverage for longer queries to avoid single-term false positives
// Example: 3-term query where only one common word matches ("like")
const effectiveMinCoverage = significantQueryTermCount >= 3
? Math.max(opts.minTermCoverage, 0.5)
: opts.minTermCoverage;
// Check minimum coverage
if (coverage < effectiveMinCoverage) {
return { score: 0, type: 'none', matchedTerms: [] };
}
// Calculate score based on match quality
const exactCount = matchTypes.filter(t => t === 'exact').length;
const stemCount = matchTypes.filter(t => t === 'stem').length;
const fuzzyCount = matchTypes.filter(t => t === 'fuzzy').length;
let score: number;
let type: 'exact' | 'fuzzy' | 'partial';
if (exactCount === matchedTerms.length && coverage >= 1.0) {
// All terms matched exactly
score = 100;
type = 'exact';
} else if ((exactCount + stemCount) === matchedTerms.length && coverage >= 1.0) {
// All terms matched via exact or stem
score = 95;
type = 'exact';
} else {
// Partial or fuzzy matches
// Score: base 50 + up to 30 for coverage + up to 10 for exact + 5 for stem
score = Math.round(50 + (coverage * 30) + (exactCount / matchedTerms.length * 10) + (stemCount / matchedTerms.length * 5));
score = Math.min(89, score); // Cap at 89 for partial
type = fuzzyCount > 0 ? 'fuzzy' : 'partial';
}
return {
score,
type,
matchedTerms,
};
}
/**
* Find best fuzzy match among multiple targets
*/
export function findBestMatch(
query: string,
targets: string[]
): { index: number; result: FuzzyMatchResult } | null {
let bestIndex = -1;
let bestResult: FuzzyMatchResult = { score: 0, type: 'none', matchedTerms: [] };
for (let i = 0; i < targets.length; i++) {
const result = fuzzyMatch(query, targets[i]);
if (result.score > bestResult.score) {
bestIndex = i;
bestResult = result;
}
}
if (bestIndex === -1 || bestResult.score === 0) {
return null;
}
return { index: bestIndex, result: bestResult };
}
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
// Text Processing
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
/**
* Tokenize text into searchable terms
* Removes common words and punctuation
*/
export function tokenize(text: string): string[] {
// Common words to ignore (English + German basics)
const stopWords = new Set([
'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
'like',
'ein', 'eine', 'der', 'die', 'das', 'und', 'oder', 'aber', 'mit',
'von', 'zu', 'fΓΌr', 'auf', 'ist', 'sind', 'war', 'waren',
]);
return text
.toLowerCase()
.replace(/[^\w\s-]/g, ' ') // Remove punctuation except hyphens
.split(/\s+/)
.filter(word => word.length > 2 && !stopWords.has(word));
}
/**
* Extract key phrases from a prompt
* Useful for finding the "essence" of a generation prompt
*/
export function extractKeyPhrases(prompt: string): string[] {
const phrases: string[] = [];
// Split on common prompt separators
const parts = prompt.split(/[,;|]/);
for (const part of parts) {
const trimmed = part.trim();
if (trimmed.length > 3 && trimmed.length < 100) {
phrases.push(trimmed);
}
}
return phrases;
}
/**
* Normalize a prompt for comparison
* Removes weight syntax, special tokens, etc.
*/
export function normalizePrompt(prompt: string): string {
return prompt
// Remove weight syntax like (word:1.2) or [word:0.8]
.replace(/[\(\[](.*?)(?::\d*\.?\d+)?[\)\]]/g, '$1')
// Remove BREAK tokens
.replace(/\bBREAK\b/gi, '')
// Normalize whitespace
.replace(/\s+/g, ' ')
.trim()
.toLowerCase();
}