import * as fs from "fs/promises";
import JSZip from "jszip";

// ── Modern PPTX (ZIP + XML) ───────────────────────────────────────────────────
// Covers .pptx, .pptm, .ppsx, .ppsm — all are ZIP archives containing
// XML slide files under ppt/slides/slide*.xml.

/**
 * Matches archive entry names of the form `ppt/slides/slide<N>.xml` (anchored
 * at both ends) and captures the decimal number `<N>` in group 1.
 */
const SLIDE_FILE_RE = /^ppt\/slides\/slide(\d+)\.xml$/;

/**
 * Decode the five predefined XML entities and numeric character references
 * (`&#NNN;` / `&#xHHH;`) in a single pass.
 *
 * A single pass matters: decoding `&amp;` first and the other entities
 * afterwards would turn the escaped literal `&amp;lt;` into `<` instead of
 * `&lt;`. References that name a value above U+10FFFF are left untouched
 * rather than throwing.
 */
function decodeXmlEntities(text: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };
  return text.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (entity: string, body: string): string => {
      if (body.charAt(0) !== "#") {
        return named[body] ?? entity;
      }
      const codePoint =
        body.charAt(1) === "x"
          ? parseInt(body.slice(2), 16)
          : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws a RangeError above U+10FFFF.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
  );
}

/**
 * Extract the visible text from a single slide's XML.
 *
 * Text runs (`<a:t>…</a:t>`) are concatenated in document order. Every
 * explicit line break (`<a:br/>`) and every paragraph end (`</a:p>`)
 * contributes a newline, so separate paragraphs stay on separate lines.
 *
 * @param xml Raw XML of one slide part.
 * @returns The decoded text with runs of spaces collapsed to one space, three
 *   or more consecutive newlines collapsed to two, and surrounding whitespace
 *   trimmed. Returns an empty string when the slide has no text runs.
 */
function extractSlideText(xml: string): string {
  // One alternation keeps text runs and line/paragraph breaks in document
  // order. Group 1 is defined only for the <a:t> alternative.
  const tokenRe =
    /<a:t(?:\s[^>]*)?>([^<]*)<\/a:t>|<a:br(?:\s[^>]*)?\/?>|<\/a:p>/g;

  const pieces: string[] = [];
  let token: RegExpExecArray | null;
  while ((token = tokenRe.exec(xml)) !== null) {
    // An empty run yields "" (not undefined), so `??` rather than `||`.
    pieces.push(token[1] ?? "\n");
  }

  return decodeXmlEntities(pieces.join(""))
    .replace(/ {2,}/g, " ")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Read a PPTX-style archive from disk and return the text of its slides.
 *
 * Slides are the archive entries whose names match `SLIDE_FILE_RE`; they are
 * emitted in ascending order of the number in the entry name. Each slide with
 * non-empty text becomes a `[Slide N]` header followed by its text, and the
 * sections are separated by a blank line.
 *
 * @param filePath Path of the file to read.
 * @returns The combined slide text, or an empty string if reading or parsing
 *   fails (the error is logged to the console, not rethrown).
 */
export async function parsePPTX(filePath: string): Promise<string> {
  try {
    const archive = await JSZip.loadAsync(await fs.readFile(filePath));

    // Gather the slide entries together with their numeric suffix.
    const slides: { name: string; index: number }[] = [];
    for (const name of Object.keys(archive.files)) {
      const match = SLIDE_FILE_RE.exec(name);
      if (match) {
        slides.push({ name, index: parseInt(match[1], 10) });
      }
    }
    // Numeric sort so that slide10 follows slide9 rather than slide1.
    slides.sort((left, right) => left.index - right.index);

    const sections: string[] = [];
    for (const slide of slides) {
      const slideXml = await archive.files[slide.name].async("string");
      const body = extractSlideText(slideXml);
      if (body) {
        sections.push(`[Slide ${slide.index}]\n${body}`);
      }
    }

    return sections.join("\n\n").trim();
  } catch (error) {
    console.error(`Error parsing PPTX file ${filePath}:`, error);
    return "";
  }
}

// ── Legacy PPT (binary Compound Document) ────────────────────────────────────
// The binary .ppt format stores text as UTF-16LE runs inside a Compound File
// Binary container. There is no maintained pure-JS parser, so we do a best-
// effort scan for readable UTF-16LE strings (most slide text lives in that
// encoding) plus a fallback scan for single-byte ASCII runs.

/** Shortest (trimmed) run of characters that is kept as a candidate string. */
const MIN_STRING_LEN = 4;

/**
 * Best-effort recovery of readable strings from a raw binary buffer.
 *
 * Two sweeps feed one de-duplicating set (insertion order is preserved, so
 * UTF-16LE finds come before single-byte finds):
 *   1. UTF-16LE: consecutive byte pairs whose low byte is printable ASCII
 *      (32–126) and whose high byte is 0. Outside a run the scan advances one
 *      byte at a time, so runs at odd offsets are found as well.
 *   2. Single-byte: consecutive printable ASCII bytes.
 *
 * Candidates shorter than `MIN_STRING_LEN` after trimming, or containing no
 * ASCII letter, are dropped.
 *
 * @param buffer Raw file contents.
 * @returns The surviving strings joined by single spaces.
 */
function extractStringsFromPPT(buffer: Buffer): string {
  const unique = new Set<string>();
  const isPrintable = (byte: number): boolean => byte >= 32 && byte <= 126;
  const record = (candidate: string): void => {
    const trimmed = candidate.trim();
    if (trimmed.length >= MIN_STRING_LEN) unique.add(trimmed);
  };

  // Sweep 1 – UTF-16LE. [runStart, pos) is the current run; it only ever
  // grows by whole two-byte code units, so it can be decoded as one slice.
  let pos = 0;
  let runStart = 0;
  while (pos + 1 < buffer.length) {
    if (buffer[pos + 1] === 0 && isPrintable(buffer[pos])) {
      pos += 2;
      continue;
    }
    record(buffer.toString("utf16le", runStart, pos));
    pos += 1; // re-sync on the very next byte, not the next pair
    runStart = pos;
  }
  record(buffer.toString("utf16le", runStart, pos));

  // Sweep 2 – single-byte ASCII. The extra iteration at i === length flushes
  // a run that reaches the end of the buffer.
  let asciiStart = 0;
  for (let i = 0; i <= buffer.length; i++) {
    if (i < buffer.length && isPrintable(buffer[i])) continue;
    record(buffer.toString("latin1", asciiStart, i));
    asciiStart = i + 1;
  }

  // Drop pure-symbol noise: keep only candidates with at least one letter.
  const kept: string[] = [];
  unique.forEach((candidate) => {
    if (/[a-zA-Z]/.test(candidate)) kept.push(candidate);
  });

  return kept.join(" ").replace(/\s+/g, " ").trim();
}

/**
 * Read a legacy binary `.ppt` file and return the strings recovered from it
 * by `extractStringsFromPPT`.
 *
 * @param filePath Path of the file to read.
 * @returns The recovered text, or an empty string if reading or extraction
 *   fails (the error is logged to the console, not rethrown).
 */
export async function parsePPT(filePath: string): Promise<string> {
  let recovered = "";
  try {
    const bytes = await fs.readFile(filePath);
    recovered = extractStringsFromPPT(bytes);
  } catch (error) {
    console.error(`Error parsing PPT file ${filePath}:`, error);
  }
  return recovered;
}
// big-rag-modified