Forked from mindstudio/big-rag
Project Files
src / parsers / fb2Parser.ts
import * as cheerio from "cheerio";
import * as fs from "fs";
import * as iconv from "iconv-lite";
/**
 * FB2 elements that hold a self-contained run of text (paragraph, verse line,
 * subtitle, table cell, ...). A separator is appended to each of them before
 * text extraction so adjacent blocks never fuse into a single word.
 */
const FB2_TEXT_BLOCK_SELECTOR = "p, v, subtitle, text-author, td, th, title";

/**
 * Decode the raw bytes of an FB2 file into a string.
 *
 * Strategy:
 * 1. Decode as UTF-8; if that yields no replacement characters, use it.
 * 2. Otherwise honour a UTF-16 byte-order mark, or a non-UTF-8 encoding named
 *    in the XML declaration, when iconv-lite knows that encoding.
 * 3. Otherwise assume Windows-1251, the most common legacy FB2 encoding.
 *
 * @param buffer - Raw file contents.
 * @returns The decoded document text.
 */
function decodeFb2Buffer(buffer: Buffer): string {
  const utf8 = buffer.toString("utf-8");
  if (!utf8.includes("\uFFFD")) {
    return utf8;
  }

  let encoding = "windows-1251";
  if (buffer.length >= 2 && buffer[0] === 0xff && buffer[1] === 0xfe) {
    encoding = "utf-16le";
  } else if (buffer.length >= 2 && buffer[0] === 0xfe && buffer[1] === 0xff) {
    encoding = "utf-16be";
  } else {
    // The XML declaration is ASCII in every ASCII-compatible encoding, so a
    // latin1 view of the first bytes is enough to read the declared name.
    const prolog = buffer.subarray(0, 256).toString("latin1");
    const match =
      /^\s*<\?xml[^>]*?\bencoding\s*=\s*["']([A-Za-z0-9._:-]+)["']/i.exec(
        prolog,
      );
    const declared = match ? match[1] : undefined;
    if (declared !== undefined) {
      const normalized = declared.toLowerCase().replace(/[^a-z0-9]/g, "");
      // A file that claims UTF-8 but failed UTF-8 decoding is mis-declared;
      // keep the Windows-1251 guess for it.
      if (normalized !== "utf8" && iconv.encodingExists(declared)) {
        encoding = declared;
      }
    }
  }

  try {
    return iconv.decode(buffer, encoding);
  } catch {
    // Last resort if iconv fails: a lossless byte-to-char mapping.
    return buffer.toString("latin1");
  }
}

/**
 * Parse an FB2 (FictionBook 2) e-book and extract its text content.
 *
 * FB2 is an XML format: `<FictionBook>` -> `<body>` -> `<section>` -> `<p>`.
 * The text of every `<body>` element is returned as a single line with all
 * whitespace runs collapsed to one space.
 *
 * @param filePath - Path of the FB2 file to read.
 * @returns The extracted text, or an empty string when the file has no
 *   `<body>` element or cannot be read or parsed. Failures are logged, never
 *   thrown.
 */
export async function parseFB2(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const content = decodeFb2Buffer(buffer);

    const $ = cheerio.load(content, {
      xmlMode: true,
    });

    // Remove non-text elements. <binary> holds base64-encoded images and must
    // never leak into the whole-document fallback below.
    $("script, style, noscript, binary").remove();
    $.root()
      .contents()
      .filter((_, node) => node.type === "comment")
      .remove();

    const bodyElement = $("body");
    if (bodyElement.length === 0) {
      console.warn(`[FB2 Parser] No <body> element found in ${filePath}`);
      return "";
    }

    // text() concatenates text nodes with no separator, so files written
    // without whitespace between tags would glue paragraphs together.
    $(FB2_TEXT_BLOCK_SELECTOR).append(" ");

    // Prefer the body text; fall back to the whole document if it is empty.
    const text = bodyElement.text() || $.text();

    // Collapse every whitespace run (newlines included) to a single space.
    return text.replace(/\s+/g, " ").trim();
  } catch (error) {
    console.error(`Error parsing FB2 file ${filePath}:`, error);
    return "";
  }
}