Project Files
src / utils / language.ts
/**
 * Language label used throughout this module.
 *
 * - 'de' / 'en': the text was classified as German or English.
 * - 'mixed': German and English evidence is tied or nearly tied.
 * - 'unknown': too little evidence to decide.
 */
export type Language = 'de' | 'en' | 'mixed' | 'unknown';
/** Result of a stopword-based language guess (see `guessLanguage`). */
export type LanguageGuess = {
/** The label chosen for the text. */
language: Language;
/** How strongly the evidence favours the chosen label; 0 means no confidence. */
confidence: number; // 0..1
/** Number of tokens counted as German stopword hits. */
deHits: number;
/** Number of tokens counted as English stopword hits. */
enHits: number;
};
/**
 * Lowercase German function words used as evidence that a text is German.
 * Entries are matched against lowercased tokens, so they must stay lowercase.
 * Note: 'in' and 'an' are also listed in EN_STOPWORDS.
 */
const DE_STOPWORDS = new Set([
'der','die','das','und','ist','sind','nicht','ich','du','wir','ihr','sie','ein','eine','einen','einem','einer',
'mit','für','auf','zu','im','in','am','an','den','dem','des','als','auch','aber','oder','wenn','dass','was','wie',
'bitte','kann','können','kannst','könnt','wird','werden','bei','vom','von','aus','mehr','weniger'
]);
/**
 * Lowercase English function words used as evidence that a text is English.
 * Entries are matched against lowercased tokens, so they must stay lowercase.
 * Note: 'in' and 'an' are also listed in DE_STOPWORDS.
 */
const EN_STOPWORDS = new Set([
'the','and','is','are','not','i','you','we','they','a','an','with','for','on','to','in','of','that','this','as','also',
'be','have','has','it','can','could','should','would','please','what','how','when','if','but','or','from','more','less'
]);
/**
 * Removes content that carries no natural-language signal and returns the
 * remaining text with whitespace collapsed to single spaces and trimmed.
 *
 * Each removed span is replaced by a single space so that the words on
 * either side of it do not get glued together.
 *
 * @param input Raw text, possibly containing Markdown code and URLs.
 * @returns The cleaned text; an empty string when nothing is left.
 */
function stripCodeAndNoise(input: string): string {
  // Order matters: fenced blocks must go before the inline-code pattern,
  // otherwise the backticks of a fence would be consumed as inline code.
  const noisePatterns: readonly RegExp[] = [
    /```[\s\S]*?```/g, // fenced code blocks
    /`[^`]*`/g, // inline code
    /https?:\/\/\S+/g, // URLs
    /[A-Za-z0-9+/=]{40,}/g, // long hex/base64-ish tokens
  ];

  const withoutNoise = noisePatterns.reduce(
    (remaining, pattern) => remaining.replace(pattern, ' '),
    input,
  );

  return withoutNoise.replace(/\s+/g, ' ').trim();
}
/**
 * Splits text into lowercase word tokens.
 *
 * A token is a maximal run of Unicode letters and numbers; everything else
 * (whitespace, punctuation, symbols, combining marks) acts as a separator.
 *
 * The text is normalized to NFC first. Without that, a decomposed umlaut
 * (e.g. "u" followed by U+0308) would be split at the combining mark, so
 * "für" would become ["fu", "r"] and never match a stopword entry.
 *
 * @param text Text to tokenize.
 * @returns Lowercase tokens in order of appearance; empty array if none.
 */
function tokenizeWords(text: string): string[] {
  return text
    .normalize('NFC')
    .toLowerCase()
    .split(/[^\p{L}\p{N}]+/u)
    .filter((token) => token.length > 0);
}
/**
 * Guesses whether `text` is German, English, or a mix of both by counting
 * stopword hits.
 *
 * The text is first cleaned with `stripCodeAndNoise` and split with
 * `tokenizeWords`; each token is then looked up in DE_STOPWORDS and
 * EN_STOPWORDS.
 *
 * Tokens that are listed in BOTH sets (e.g. 'in', 'an') are skipped: they
 * cannot discriminate between the languages, and counting them for both
 * sides would inflate the hit total, dilute the confidence and could push
 * single-language text over the "both languages are strong" threshold.
 *
 * @param text Raw input text.
 * @returns The guess together with the hit counts it is based on:
 *   - 'unknown' (confidence 0) when fewer than two discriminating hits exist;
 *   - 'mixed' (confidence 0.3) when the counts are equal, or when both
 *     languages have at least two hits and differ by at most one;
 *   - otherwise 'de' or 'en' with confidence |deHits - enHits| / total hits.
 */
export function guessLanguage(text: string): LanguageGuess {
  const words = tokenizeWords(stripCodeAndNoise(text));
  if (words.length === 0) {
    return { language: 'unknown', confidence: 0, deHits: 0, enHits: 0 };
  }

  let deHits = 0;
  let enHits = 0;
  for (const w of words) {
    const isDe = DE_STOPWORDS.has(w);
    const isEn = EN_STOPWORDS.has(w);
    // Ambiguous stopword: evidence for neither language.
    if (isDe && isEn) continue;
    if (isDe) deHits++;
    else if (isEn) enHits++;
  }

  const totalHits = deHits + enHits;
  if (totalHits < 2) {
    // Not enough signal (numbers, nouns, jargon etc.)
    return { language: 'unknown', confidence: 0, deHits, enHits };
  }

  const diff = deHits - enHits;
  const absDiff = Math.abs(diff);

  const bothStrong = deHits >= 2 && enHits >= 2;
  const nearTie = absDiff <= 1;
  if (diff === 0 || (bothStrong && nearTie)) {
    return { language: 'mixed', confidence: 0.3, deHits, enHits };
  }

  // absDiff never exceeds totalHits, so the ratio is already within 0..1.
  const confidence = absDiff / totalHits;
  return { language: diff > 0 ? 'de' : 'en', confidence, deHits, enHits };
}
/**
 * Scores how much `text` looks like natural language as opposed to
 * language-neutral content such as code, logs or tables.
 *
 * Note the direction of the scale: despite the function name, 0 means
 * language-neutral (code/logs/table-ish) and 1 means natural language.
 *
 * Only the first 1200 UTF-16 code units of `text` are inspected. The score
 * starts at the ratio of letters to non-whitespace characters and is then
 * dampened by each code-like signal that is present.
 *
 * @param text Text to score.
 * @returns A value in 0..1; 0 for empty or whitespace-only input.
 */
export function computeLanguageNeutrality(text: string): number {
  const s = text.slice(0, 1200);
  const compact = s.replace(/\s+/g, '');
  if (!compact) return 0;

  // Count code points rather than UTF-16 code units: the /u regexes below
  // count one match per code point, so using `compact.length` as the total
  // would count every astral character (e.g. emoji, rare CJK) twice and
  // misreport the surplus as punctuation.
  const total = Array.from(compact).length;
  const letters = (compact.match(/\p{L}/gu) ?? []).length;
  const digits = (compact.match(/\p{N}/gu) ?? []).length;
  const punct = total - letters - digits;
  const letterRatio = letters / total;
  const punctRatio = punct / total;

  // Code-ish signals
  const hasBraces = /[{};<>\[\]()]|=>/.test(s);
  // The trailing \b applies only to the keyword alternatives. Requiring a
  // word boundary after "(" would miss frames whose location starts with a
  // non-word character, e.g. "at fn (/abs/path.js:1:2)".
  const hasStackTrace = /\bat\s+\S+\s*\(|\b(?:Exception|Traceback)\b/.test(s);
  const hasMarkdownTable = /\|\s*[-:]+\s*\|/.test(s);

  // Start from letterRatio and dampen if it looks like code/logs.
  let natural = letterRatio;
  if (hasBraces) natural *= 0.6;
  if (hasStackTrace) natural *= 0.5;
  if (hasMarkdownTable) natural *= 0.8;
  if (punctRatio > 0.35) natural *= 0.7;

  return Math.max(0, Math.min(1, natural));
}
/**
 * Compares the language of a query with the language of a document.
 *
 * @param queryLang Language label of the query.
 * @param docLang Language label of the document.
 * @returns 0 when either side is 'unknown' or 'mixed' (no statement can be
 *   made), 1 when both sides name the same language, -1 when they differ.
 */
export function languageMatchScore(queryLang: Language, docLang: Language): -1 | 0 | 1 {
  const isIndeterminate = (lang: Language): boolean => lang === 'unknown' || lang === 'mixed';

  if (isIndeterminate(queryLang) || isIndeterminate(docLang)) {
    return 0;
  }
  if (queryLang === docLang) {
    return 1;
  }
  return -1;
}