import * as fs from "fs";
import JSZip from "jszip";
import * as cheerio from "cheerio";

/**
 * Parse an EPUB file and return its text content, extracted with jszip + cheerio.
 *
 * EPUB is a ZIP archive containing:
 * - META-INF/container.xml → points to the .opf file
 * - .opf (Open Packaging Format) → manifest + spine
 * - XHTML content files (chapters)
 *
 * Chapters are read in spine order. When the spine is empty, every manifest
 * item whose media type mentions "html" is read instead, in manifest order.
 * Spine entries missing from the manifest and chapters that cannot be read
 * are skipped with a warning.
 *
 * This function never throws: any failure (unreadable file, invalid archive,
 * missing .opf, ...) is logged and reported as an empty string.
 *
 * @param filePath - Path of the EPUB file on disk.
 * @returns The extracted text, with chapters separated by a single newline,
 *          or "" when nothing could be extracted.
 */
export async function parseEPUB(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const zip = await JSZip.loadAsync(buffer);

    // 1. Read container.xml to find the .opf file path
    const opfPath = await findOpfPath(zip);
    if (!opfPath) {
      console.error(`[EPUB Parser] Could not find .opf file in ${filePath}`);
      return "";
    }

    // 2. Parse the .opf file.
    // Manifest hrefs are relative to the directory holding the .opf;
    // lastIndexOf() yields -1 when there is no "/", which makes this "".
    const opfDir = opfPath.substring(0, opfPath.lastIndexOf("/") + 1);
    const opfContent = await readZipText(zip, opfPath);
    if (!opfContent) {
      console.error(`[EPUB Parser] Could not read .opf file at ${opfPath} in ${filePath}`);
      return "";
    }

    const $opf = cheerio.load(opfContent, { xmlMode: true });

    // 3. Build manifest: id → { href, mediaType }
    const manifest = new Map<string, { href: string; mediaType: string }>();
    $opf("manifest > item").each((_, el) => {
      const item = $opf(el);
      const id = item.attr("id");
      if (id) {
        manifest.set(id, {
          href: item.attr("href") || "",
          mediaType: item.attr("media-type") || "",
        });
      }
    });

    // 4. Read spine order (list of manifest item IDs)
    const spineIds: string[] = [];
    $opf("spine > itemref").each((_, el) => {
      const idref = $opf(el).attr("idref");
      if (idref) {
        spineIds.push(idref);
      }
    });

    if (spineIds.length === 0) {
      console.warn(`[EPUB Parser] Empty spine in ${filePath}, trying all XHTML items from manifest`);
      // Fallback: read all XHTML/HTML items from manifest
      // ("application/xhtml+xml" and "text/html" both contain "html").
      manifest.forEach((item, id) => {
        if (item.mediaType.toLowerCase().includes("html")) {
          spineIds.push(id);
        }
      });
    }

    // 5. Extract text from each chapter in spine order
    const textParts: string[] = [];

    for (const itemId of spineIds) {
      const item = manifest.get(itemId);
      if (!item) {
        console.warn(`[EPUB Parser] Spine item "${itemId}" not found in manifest, skipping`);
        continue;
      }

      const chapterPath = resolveRelativePath(opfDir, item.href);
      const chapterContent = await readZipText(zip, chapterPath);
      if (!chapterContent) {
        console.warn(`[EPUB Parser] Could not read chapter "${chapterPath}" in ${filePath}, skipping`);
        continue;
      }

      const text = extractTextFromXhtml(chapterContent);
      if (text.length > 0) {
        textParts.push(text);
      }
    }

    // 6. Normalize whitespace while keeping chapter boundaries.
    // Spaces/tabs are collapsed first WITHOUT touching "\n"; collapsing every
    // whitespace character up front would erase the chapter separators and
    // make the newline pass a no-op.
    return textParts
      .join("\n\n")
      .replace(/[^\S\n]+/g, " ")
      .replace(/\s*\n\s*/g, "\n")
      .trim();
  } catch (error) {
    console.error(`[EPUB Parser] Error parsing EPUB file ${filePath}:`, error);
    return "";
  }
}

/**
 * Locate the package document (.opf) declared in META-INF/container.xml.
 *
 * @param zip - The opened EPUB archive.
 * @returns The `full-path` attribute of the first `<rootfile>` element, or
 *          `null` when container.xml is missing/empty or declares no path.
 */
async function findOpfPath(zip: JSZip): Promise<string | null> {
  const containerXml = await readZipText(zip, "META-INF/container.xml");

  if (containerXml) {
    const $container = cheerio.load(containerXml, { xmlMode: true });
    const declaredPath = $container("rootfile").first().attr("full-path");
    if (declaredPath) {
      return declaredPath;
    }
  }

  // No container.xml, or no usable full-path attribute in it.
  return null;
}

/**
 * Extract readable text from XHTML content using cheerio.
 *
 * Non-content elements (scripts, styles, the document head) are removed.
 * Text is taken from `<body>` when present, otherwise from `<html>`,
 * otherwise from the whole parsed document, so bare fragments without
 * either wrapper still yield their text.
 *
 * @param xhtml - Raw XHTML markup of one content document.
 * @returns The text with every run of whitespace collapsed to a single
 *          space and leading/trailing whitespace removed.
 */
function extractTextFromXhtml(xhtml: string): string {
  const $ = cheerio.load(xhtml, { xmlMode: true });

  // Remove scripts, styles, and other non-content elements
  $("script, style, link, meta, head").remove();

  // .text() concatenates text nodes with no separator, so markup such as
  // "<p>one</p><p>two</p>" would come out as "onetwo". Pad block-level
  // elements with whitespace; surplus spaces are collapsed below.
  $(
    "p, div, br, hr, li, tr, td, th, dt, dd, h1, h2, h3, h4, h5, h6, " +
      "blockquote, pre, section, article, aside, header, footer, nav, " +
      "figure, figcaption, table, ul, ol, dl"
  )
    .prepend(" ")
    .append(" ");

  // Extract text from body, or fall back to <html>, or to the entire document
  const body = $("body");
  const html = $("html");
  let text: string;
  if (body.length > 0) {
    text = body.text();
  } else if (html.length > 0) {
    text = html.text();
  } else {
    text = $.root().text();
  }

  return text.replace(/\s+/g, " ").trim();
}

/**
 * Read a text file from the ZIP archive.
 *
 * ZIP entry names are case-sensitive, but EPUB manifests do not always match
 * the archive's casing or directory layout, so the lookup gets progressively
 * more lenient:
 *   1. the exact path;
 *   2. the all-lowercase path;
 *   3. a case-insensitive match on the full path;
 *   4. a case-insensitive match on the file name alone, in any directory.
 *
 * Step 4 only matches whole file names (the candidate must be the name itself
 * or be preceded by "/"), so "ch1.xhtml" never resolves to "prech1.xhtml".
 * Directory entries are never returned.
 *
 * @param zip - The opened archive.
 * @param path - Entry path inside the archive, using "/" separators.
 * @returns The entry's content as a string, or `null` when nothing matches.
 */
async function readZipText(zip: JSZip, path: string): Promise<string | null> {
  // Steps 1-2: direct lookups, no scan of the archive needed.
  const direct = zip.file(path) || zip.file(path.toLowerCase());
  if (direct) {
    return direct.async("string");
  }

  const wantedPath = path.toLowerCase();
  const wantedName = wantedPath.split("/").pop() || wantedPath;

  // Step 3: same full path, ignoring case.
  const samePath = zip.filter(
    (relativePath, entry) => !entry.dir && relativePath.toLowerCase() === wantedPath
  );

  // Step 4: same file name in any directory, ignoring case.
  const candidates =
    samePath.length > 0
      ? samePath
      : zip.filter((relativePath, entry) => {
          if (entry.dir) {
            return false;
          }
          const lowered = relativePath.toLowerCase();
          return lowered === wantedName || lowered.endsWith("/" + wantedName);
        });

  const match = candidates[0];
  return match ? match.async("string") : null;
}

/**
 * Resolve a manifest href against a base directory inside the archive.
 *
 * The href is treated as a URL reference:
 * - a "#fragment" suffix is dropped;
 * - percent-escapes are decoded (EPUBs often use %20 for spaces, etc.);
 *   when the href is not validly encoded (e.g. a stray "%") it is used as-is
 *   instead of throwing;
 * - an href starting with "/" is resolved from the archive root, otherwise
 *   it is appended to `baseDir`;
 * - "." and ".." segments and empty segments are collapsed.
 *
 * The result never starts with "/", matching how ZIP entries are named.
 *
 * @param baseDir - Directory the href is relative to ("" for the archive
 *                  root); a trailing "/" is optional.
 * @param relativePath - The href to resolve.
 * @returns The normalized entry path inside the archive.
 */
function resolveRelativePath(baseDir: string, relativePath: string): string {
  // Strip the fragment BEFORE decoding so an encoded "%23" stays in the name.
  const hashIndex = relativePath.indexOf("#");
  const withoutFragment = hashIndex === -1 ? relativePath : relativePath.slice(0, hashIndex);

  let decoded: string;
  try {
    decoded = decodeURIComponent(withoutFragment);
  } catch {
    // Malformed escape sequence: fall back to the raw href.
    decoded = withoutFragment;
  }

  const combined = decoded.startsWith("/") || !baseDir ? decoded : `${baseDir}/${decoded}`;

  const segments: string[] = [];
  for (const segment of combined.split("/")) {
    if (segment === "" || segment === ".") {
      continue;
    }
    if (segment === "..") {
      // Going above the archive root is clamped to the root.
      segments.pop();
      continue;
    }
    segments.push(segment);
  }
  return segments.join("/");
}
big-rag-rus