// Forked from mindstudio/big-rag
// Project Files
// src / utils / encodingDetector.ts
/**
 * Detect and convert text encoding for FB2 and other files.
 *
 * Decoding strategy, in order:
 *  1. Bytes that form strictly well-formed UTF-8 are decoded as UTF-8.
 *  2. Otherwise, if the data starts with an XML declaration that names an
 *     encoding (for example Windows-1251 or KOI8-R) and the runtime's
 *     `TextDecoder` supports it, that encoding is used.
 *  3. Otherwise the bytes are decoded as latin1, which maps every byte to the
 *     code point of the same value so that no input byte is lost.
 *
 * @param buffer - Raw file contents.
 * @returns The decoded text.
 */
export function detectAndConvertEncoding(buffer: Buffer): string {
  // The validator is strict, so a U+FFFD found in the decoded text can only be
  // a character that was really present in the input, not a decoding error.
  if (isValidUTF8(buffer)) {
    return buffer.toString("utf-8");
  }

  const declared = decodeWithDeclaredEncoding(buffer);
  if (declared !== undefined) {
    return declared;
  }

  // Fallback to latin1 (preserves bytes, works for many single-byte encodings).
  return buffer.toString("latin1");
}

/**
 * Matches the `encoding="..."` pseudo-attribute of a leading XML declaration
 * and captures the encoding label.
 */
const XML_ENCODING_DECLARATION =
  /^\s*<\?xml\b[^>]*?\bencoding\s*=\s*["']([A-Za-z][A-Za-z0-9._-]*)["']/i;

/**
 * Decodes `buffer` with the encoding named in its XML declaration, if any.
 *
 * @param buffer - Raw file contents that already failed UTF-8 validation.
 * @returns The decoded text, or `undefined` when there is no declaration, the
 *   declared encoding is a UTF variant, or the runtime cannot decode it.
 */
function decodeWithDeclaredEncoding(buffer: Buffer): string | undefined {
  // The declaration is ASCII in every ASCII-compatible encoding, so a latin1
  // view of the first bytes is enough to read it.
  const head = buffer.toString("latin1", 0, 256);
  const match = XML_ENCODING_DECLARATION.exec(head);
  if (match === null) {
    return undefined;
  }

  try {
    const decoder = new TextDecoder(match[1]);
    // A UTF label on data that is not valid UTF-8 is a mislabel; let the
    // caller use its byte-preserving fallback instead of replacement chars.
    if (decoder.encoding.startsWith("utf-")) {
      return undefined;
    }
    return decoder.decode(buffer);
  } catch {
    // Unknown label, or a runtime built without the required encoding tables.
    return undefined;
  }
}

/**
 * Checks whether `buffer` is well-formed UTF-8.
 *
 * Besides the lead/continuation byte structure, this rejects overlong
 * encodings, UTF-16 surrogate code points (U+D800..U+DFFF) and values above
 * U+10FFFF, all of which a UTF-8 decoder turns into replacement characters.
 *
 * @param buffer - Bytes to validate.
 * @returns `true` when every byte belongs to a well-formed UTF-8 sequence
 *   (an empty buffer is valid).
 */
function isValidUTF8(buffer: Buffer): boolean {
  const length = buffer.length;
  let i = 0;

  while (i < length) {
    const lead = buffer[i];

    if (lead <= 0x7f) {
      i += 1;
      continue;
    }

    // Number of continuation bytes, and the allowed range of the first one.
    let trailing: number;
    let secondMin = 0x80;
    let secondMax = 0xbf;

    if (lead >= 0xc2 && lead <= 0xdf) {
      trailing = 1;
    } else if (lead >= 0xe0 && lead <= 0xef) {
      trailing = 2;
      if (lead === 0xe0) {
        secondMin = 0xa0; // below this the 3-byte form would be overlong
      } else if (lead === 0xed) {
        secondMax = 0x9f; // above this lies the surrogate range
      }
    } else if (lead >= 0xf0 && lead <= 0xf4) {
      trailing = 3;
      if (lead === 0xf0) {
        secondMin = 0x90; // below this the 4-byte form would be overlong
      } else if (lead === 0xf4) {
        secondMax = 0x8f; // above this the value exceeds U+10FFFF
      }
    } else {
      // Stray continuation byte, overlong lead (C0/C1) or lead above F4.
      return false;
    }

    if (i + trailing >= length) {
      return false; // sequence is cut off by the end of the buffer
    }

    const second = buffer[i + 1];
    if (second < secondMin || second > secondMax) {
      return false;
    }
    for (let k = 2; k <= trailing; k++) {
      if ((buffer[i + k] & 0xc0) !== 0x80) {
        return false;
      }
    }

    i += trailing + 1;
  }

  return true;
}