Project Files

stubs

napi-rs-canvas

index.d.ts

index.js

package.json

test-fixtures

sample.fb2

sample.html

sample.md

sample.txt

sample.xml

.gitignore

EXAMPLES.md

manifest.json

package-lock.json

package.json

QUICKSTART.md

readme_RU.md

README.md

TESTING.md

tsconfig.json

src / parsers / fb2Parser.ts

import * as cheerio from "cheerio";
import * as fs from "fs";
import * as iconv from "iconv-lite";

/**
 * FB2 elements that directly hold a run of inline text (paragraphs, verse
 * lines, table cells, ...). A separator is inserted after each of them so
 * that adjacent blocks written without whitespace in between do not fuse
 * into a single word when the text is extracted.
 */
const FB2_TEXT_BLOCKS = "p, v, subtitle, text-author, td, th, date";

/**
 * Decode the raw bytes of an FB2 file into a string.
 *
 * Resolution order:
 * 1. A UTF-16 byte-order mark: decode as UTF-16 (endianness taken from the BOM).
 * 2. A non-UTF-8 `encoding="..."` in the XML declaration that iconv-lite
 *    supports: decode with that encoding.
 * 3. Otherwise decode as UTF-8; if that yields replacement characters
 *    (i.e. the bytes are not valid UTF-8, whatever the declaration says),
 *    fall back to Windows-1251, and to latin1 if iconv fails.
 */
function decodeFB2Buffer(buffer: Buffer): string {
  const startsWithUtf16Bom =
    buffer.length >= 2 &&
    ((buffer[0] === 0xff && buffer[1] === 0xfe) ||
      (buffer[0] === 0xfe && buffer[1] === 0xff));
  if (startsWithUtf16Bom) {
    return iconv.decode(buffer, "utf-16");
  }

  // The XML declaration itself is plain ASCII in every single-byte encoding
  // and in UTF-8, so it can be read safely through a latin1 view of the head.
  const head = buffer.subarray(0, 256).toString("latin1");
  const declared = /<\?xml[^>]*?\bencoding\s*=\s*["']([^"']+)["']/i.exec(head)?.[1];

  if (declared !== undefined) {
    const normalized = declared.toLowerCase().replace(/[^a-z0-9]/g, "");
    if (normalized !== "utf8" && iconv.encodingExists(declared)) {
      return iconv.decode(buffer, declared);
    }
  }

  // No usable declaration (or it claims UTF-8): try UTF-8 first.
  const utf8 = buffer.toString("utf-8");
  if (!utf8.includes("\uFFFD")) {
    return utf8;
  }

  // Invalid UTF-8: Windows-1251 is the most common legacy encoding for FB2.
  try {
    return iconv.decode(buffer, "windows-1251");
  } catch {
    // Fallback to latin1 if iconv fails
    return buffer.toString("latin1");
  }
}

/**
 * Parse an FB2 (FictionBook 2) file and extract its text content.
 * FB2 is an XML-based format for e-books.
 *
 * The returned text is whitespace-normalized: every run of whitespace
 * (including line breaks) is collapsed to a single space and the result is
 * trimmed.
 *
 * This function is best-effort and never rejects: read or parse failures are
 * logged and reported as an empty string.
 *
 * @param filePath - Path of the FB2 file to read.
 * @returns The extracted text, or `""` when the file has no `<body>` element
 *   or could not be read/parsed.
 */
export async function parseFB2(filePath: string): Promise<string> {
  try {
    // FB2 files often use Windows-1251 or other encodings
    const buffer = await fs.promises.readFile(filePath);
    const content = decodeFB2Buffer(buffer);

    const $ = cheerio.load(content, {
      xmlMode: true,
    });

    // Remove unwanted elements. <binary> holds base64-encoded images, which
    // must never leak into the extracted text (notably via the fallback below).
    $("script, style, noscript, binary").remove();
    $.root().contents().filter(function () {
      return this.type === "comment";
    }).remove();

    // FB2 structure: <FictionBook> -> <body> -> <section> -> <p>
    // Extract text from body element(s); a book may have several (e.g. notes).
    const bodyElement = $("body");

    if (bodyElement.length === 0) {
      console.warn(`[FB2 Parser] No <body> element found in ${filePath}`);
      return "";
    }

    // .text() concatenates text nodes without any separator, so make sure
    // every text block is followed by whitespace before extracting.
    $(FB2_TEXT_BLOCKS).after(" ");

    // Prefer the body text; fall back to the whole document if it is empty.
    const text = bodyElement.text() || $.text();

    // Clean up whitespace: collapse every whitespace run to a single space.
    return text.replace(/\s+/g, " ").trim();
  } catch (error) {
    console.error(`Error parsing FB2 file ${filePath}:`, error);
    return "";
  }
}

big-rag

big-rag