Forked from mindstudio/big-rag
Project Files
src / parsers / epubParser.ts
import * as fs from "fs";
import JSZip from "jszip";
import * as cheerio from "cheerio";
/**
 * Parse an EPUB file and return its readable text (jszip + cheerio).
 *
 * EPUB is a ZIP archive containing:
 * - META-INF/container.xml → points to the .opf file
 * - .opf (Open Packaging Format) → manifest + spine
 * - XHTML content files (chapters)
 *
 * Chapters are read in spine order. When the spine is empty, every manifest
 * item whose media type mentions "html" is read instead, in manifest order.
 * Inside a chapter all whitespace is collapsed to single spaces; chapters are
 * separated from each other by a single newline.
 *
 * @param filePath - Location of the EPUB file on disk.
 * @returns The extracted text, or an empty string when the archive cannot be
 *   read or parsed. Failures are logged and never thrown to the caller.
 */
export async function parseEPUB(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const zip = await JSZip.loadAsync(buffer);

    // 1. Read container.xml to find the .opf file path
    const opfPath = await findOpfPath(zip);
    if (!opfPath) {
      console.error(`[EPUB Parser] Could not find .opf file in ${filePath}`);
      return "";
    }

    // 2. Parse the .opf file. Manifest hrefs are relative to its directory.
    const opfDir = opfPath.includes("/") ? opfPath.substring(0, opfPath.lastIndexOf("/") + 1) : "";
    const opfContent = await readZipText(zip, opfPath);
    if (!opfContent) {
      console.error(`[EPUB Parser] Could not read .opf file at ${opfPath} in ${filePath}`);
      return "";
    }
    const $opf = cheerio.load(opfContent, { xmlMode: true });

    // 3. Build manifest: id → { href, mediaType }
    const manifest = new Map<string, { href: string; mediaType: string }>();
    $opf("manifest > item").each((_, el) => {
      const item = $opf(el);
      const id = item.attr("id");
      if (id) {
        manifest.set(id, {
          href: item.attr("href") ?? "",
          mediaType: item.attr("media-type") ?? "",
        });
      }
    });

    // 4. Read spine order (list of manifest item IDs)
    const spineIds: string[] = [];
    $opf("spine > itemref").each((_, el) => {
      const idref = $opf(el).attr("idref");
      if (idref) {
        spineIds.push(idref);
      }
    });

    if (spineIds.length === 0) {
      console.warn(`[EPUB Parser] Empty spine in ${filePath}, trying all XHTML items from manifest`);
      // Fallback: read all XHTML/HTML items from manifest.
      // "application/xhtml+xml" and "text/html" both contain "html".
      for (const [id, item] of manifest) {
        if (item.mediaType.toLowerCase().includes("html")) {
          spineIds.push(id);
        }
      }
    }

    // 5. Extract text from each chapter in spine order
    const textParts: string[] = [];
    for (const itemId of spineIds) {
      const item = manifest.get(itemId);
      if (!item) {
        console.warn(`[EPUB Parser] Spine item "${itemId}" not found in manifest, skipping`);
        continue;
      }
      const chapterPath = resolveRelativePath(opfDir, item.href);
      const chapterContent = await readZipText(zip, chapterPath);
      if (!chapterContent) {
        console.warn(`[EPUB Parser] Could not read chapter "${chapterPath}" in ${filePath}, skipping`);
        continue;
      }
      const text = extractTextFromXhtml(chapterContent);
      if (text.length > 0) {
        textParts.push(text);
      }
    }

    // 6. Normalise whitespace while keeping chapter boundaries. The horizontal
    // whitespace pass must not match "\n": a plain /\s+/ here would swallow
    // the separators and make the newline pass below a no-op.
    return textParts
      .join("\n\n")
      .replace(/[^\S\n]+/g, " ")
      .replace(/ *\n[ \n]*/g, "\n")
      .trim();
  } catch (error) {
    console.error(`[EPUB Parser] Error parsing EPUB file ${filePath}:`, error);
    return "";
  }
}
/**
 * Locate the package document (.opf) declared in META-INF/container.xml.
 *
 * @param zip - The opened EPUB archive.
 * @returns The `full-path` attribute of the first `<rootfile>` element, or
 *   `null` when container.xml cannot be read or the attribute is missing/empty.
 */
async function findOpfPath(zip: JSZip): Promise<string | null> {
  const containerXml = await readZipText(zip, "META-INF/container.xml");
  if (!containerXml) {
    return null;
  }

  const $container = cheerio.load(containerXml, { xmlMode: true });
  const declaredPath = $container("rootfile").first().attr("full-path");
  return declaredPath ? declaredPath : null;
}
/**
 * Pull the human-readable text out of one XHTML chapter with cheerio.
 *
 * @param xhtml - Raw XHTML markup of the chapter.
 * @returns The chapter text with every whitespace run collapsed to a single
 *   space and the ends trimmed; empty when nothing readable is found.
 */
function extractTextFromXhtml(xhtml: string): string {
  const $doc = cheerio.load(xhtml, { xmlMode: true });

  // Drop elements that never carry reader-visible content.
  $doc("script, style, link, meta, head").remove();

  // Prefer <body>; documents without one fall back to the <html> element.
  const $body = $doc("body");
  const $source = $body.length > 0 ? $body : $doc("html");
  const rawText = $source.text() || "";

  return rawText.replace(/\s+/g, " ").trim();
}
/**
 * Read one entry of the ZIP archive as text.
 *
 * ZIP entry names are case-sensitive, but EPUB manifests sometimes reference
 * files with different casing or a slightly different directory. Lookup
 * therefore proceeds from strict to lenient:
 *   1. exact path,
 *   2. same path ignoring case,
 *   3. any file whose base name equals the requested base name, ignoring case.
 *
 * Step 3 compares whole base names rather than string suffixes, so a request
 * for "1.xhtml" can never resolve to "Chapter11.xhtml".
 *
 * @param zip - The opened archive.
 * @param path - Archive-relative path of the wanted entry.
 * @returns The entry's content as a string, or `null` when nothing matches.
 */
async function readZipText(zip: JSZip, path: string): Promise<string | null> {
  // 1. Exact match, the common case.
  const exact = zip.file(path);
  if (exact) {
    return exact.async("string");
  }

  const baseName = (p: string): string => p.split("/").pop() || p;
  const wantedPath = path.toLowerCase();
  const wantedName = baseName(wantedPath);

  // Collect file entries (directories carry no text) in archive order.
  const entryPaths: string[] = [];
  zip.forEach((entryPath, entry) => {
    if (!entry.dir) {
      entryPaths.push(entryPath);
    }
  });

  // 2. Whole path ignoring case, then 3. base name ignoring case.
  const match =
    entryPaths.find((p) => p.toLowerCase() === wantedPath) ??
    entryPaths.find((p) => baseName(p.toLowerCase()) === wantedName);
  if (match === undefined) {
    return null;
  }

  const file = zip.file(match);
  return file ? file.async("string") : null;
}
/**
 * Resolve a manifest href against the directory of the .opf file and return
 * the corresponding ZIP entry name.
 *
 * - The href is always URI-decoded (EPUBs often use %20 for spaces, etc.),
 *   including when the .opf sits at the archive root. A malformed escape
 *   sequence leaves the href as written instead of throwing.
 * - An href starting with "/" is taken relative to the archive root.
 * - "." and ".." segments are collapsed, and ".." never climbs above the
 *   archive root, because ZIP entry names contain neither.
 *
 * @param baseDir - Directory of the .opf file with a trailing "/", or "" when
 *   the .opf is at the archive root.
 * @param relativePath - The href as written in the manifest.
 * @returns A normalised archive path without a leading "/".
 */
function resolveRelativePath(baseDir: string, relativePath: string): string {
  let decoded: string;
  try {
    decoded = decodeURIComponent(relativePath);
  } catch {
    // Stray "%" in the href: use it verbatim rather than aborting the parse.
    decoded = relativePath;
  }

  const isRootRelative = relativePath.startsWith("/");
  const combined = isRootRelative || !baseDir ? decoded : baseDir + decoded;

  const segments: string[] = [];
  for (const segment of combined.split("/")) {
    if (segment === "" || segment === ".") {
      continue;
    }
    if (segment === "..") {
      segments.pop();
      continue;
    }
    segments.push(segment);
  }
  return segments.join("/");
}