// src/parser.ts
import { readFile, stat } from "fs/promises";
import { homedir } from "os";
import { extname, resolve } from "path";

/**
 * Resolves a user-supplied path to a concrete location and enforces a size cap.
 *
 * Resolution rules, checked in order:
 * - `~` or `~/...` expands to the current user's home directory.
 * - A path starting with `/` is used exactly as given.
 * - Anything else is resolved against `workspacePath`, or against the process
 *   working directory when `workspacePath` is empty.
 *
 * A leading `~` that is not followed by `/` (for example `~draft.txt` or
 * `~bob/notes`) is treated as an ordinary relative name; only the bare-tilde
 * forms are expanded, so such names are never glued onto the home directory.
 *
 * @param inputPath - Path as typed by the user.
 * @param workspacePath - Base directory for relative paths; may be empty.
 * @param maxFileSizeMb - Largest accepted size, in mebibytes.
 * @returns The resolved path.
 * @throws Error when the target is larger than `maxFileSizeMb`. Rejections
 *   from `stat` (e.g. a missing file) propagate unchanged.
 */
export async function resolvePath(
  inputPath: string,
  workspacePath: string,
  maxFileSizeMb: number,
): Promise<string> {
  let fullPath: string;
  if (inputPath === "~" || inputPath.startsWith("~/")) {
    // slice(2) drops "~/"; for a bare "~" it yields "" and resolve() returns home.
    fullPath = resolve(homedir(), inputPath.slice(2));
  } else if (inputPath.startsWith("/")) {
    fullPath = inputPath;
  } else if (workspacePath) {
    fullPath = resolve(workspacePath, inputPath);
  } else {
    fullPath = resolve(inputPath);
  }

  const info = await stat(fullPath);
  const sizeMb = info.size / (1024 * 1024);
  if (sizeMb > maxFileSizeMb) {
    throw new Error(`File is ${sizeMb.toFixed(1)} MB — exceeds limit of ${maxFileSizeMb} MB`);
  }
  return fullPath;
}

/**
 * Maps a file extension (including the leading dot) to the parser family
 * that handles it.
 *
 * Matching is case-insensitive, so `.PDF` and `.pdf` are equivalent.
 * Extensions that are not listed fall back to `"txt"`.
 *
 * @param ext - Extension such as `.pdf` or `.XLSX`.
 * @returns The format key for the extension.
 */
export function detectFormat(ext: string): "pdf" | "docx" | "spreadsheet" | "html" | "json" | "pptx" | "epub" | "txt" {
  switch (ext.toLowerCase()) {
    case ".pdf":
      return "pdf";
    case ".docx":
    case ".doc":
      return "docx";
    case ".xlsx":
    case ".xls":
    case ".ods":
    case ".csv":
      return "spreadsheet";
    case ".html":
    case ".htm":
      return "html";
    case ".json":
    case ".jsonl":
      return "json";
    case ".pptx":
    case ".ppt":
      return "pptx";
    case ".epub":
      return "epub";
    default:
      return "txt";
  }
}

/**
 * Extracts the visible text of an HTML string using the htmlparser2 streaming
 * parser.
 *
 * Text inside `<script>` and `<style>` is discarded. A newline is emitted
 * whenever one of the listed block-level tags opens, runs of three or more
 * newlines are collapsed to two, and the result is trimmed.
 *
 * @param html - Markup to convert.
 * @returns The extracted text.
 */
function stripHtml(html: string): string {
  const { Parser } = require("htmlparser2");

  // Opening any of these tags starts a new line in the output.
  const lineBreakTags = new Set(["p", "div", "br", "li", "h1", "h2", "h3", "h4", "h5", "h6", "tr"]);
  // Text is dropped while any of these tags is open.
  const hiddenTags = new Set(["script", "style"]);

  const openHidden = new Set<string>();
  const chunks: string[] = [];

  const handlers = {
    onopentag(tag: string) {
      if (hiddenTags.has(tag)) {
        openHidden.add(tag);
      }
      if (lineBreakTags.has(tag)) {
        chunks.push("\n");
      }
    },
    onclosetag(tag: string) {
      // Deleting a tag that was never tracked is a no-op.
      openHidden.delete(tag);
    },
    ontext(text: string) {
      if (openHidden.size === 0) {
        chunks.push(text);
      }
    },
  };

  const sax = new Parser(handlers);
  sax.write(html);
  sax.end();

  const joined = chunks.join("");
  const collapsed = joined.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}

/**
 * Flattens a parsed JSON value into `path: value` lines.
 *
 * Object keys are joined with `.` and array indices are written as `[i]`,
 * e.g. `a.b[0].c: 1`. Empty arrays and empty objects are kept as `[]` and
 * `{}` so they do not vanish from the output, and `null`/`undefined` render
 * as `null`. A leaf at the root (no path) is emitted as the bare value.
 *
 * @param val - Value to flatten, typically the result of `JSON.parse`.
 * @param prefix - Path accumulated so far; callers normally omit it.
 * @returns One line per leaf, in key/index order.
 */
function flattenJson(val: unknown, prefix = ""): string[] {
  // Root-level leaves have no path, so avoid a dangling ": " in front of them.
  const leaf = (rendered: string): string[] => [prefix ? `${prefix}: ${rendered}` : rendered];

  if (val === null || val === undefined) return leaf("null");
  if (typeof val !== "object") return leaf(String(val));

  if (Array.isArray(val)) {
    if (val.length === 0) return leaf("[]");
    return val.flatMap((item, i) => flattenJson(item, `${prefix}[${i}]`));
  }

  const obj = val as Record<string, unknown>;
  const keys = Object.keys(obj);
  // Mirror the empty-array case; otherwise an empty object produces no line at all.
  if (keys.length === 0) return leaf("{}");
  return keys.flatMap(k => flattenJson(obj[k], prefix ? `${prefix}.${k}` : k));
}

/** Upper bound, in mebibytes, on the total decoded size read from one archive. */
const MAX_ZIP_UNCOMPRESSED_MB = 50;

/**
 * Pulls text out of selected XML/HTML entries of a zip archive.
 *
 * Entries accepted by `pathFilter` are processed in natural order, so a name
 * such as `slide10.xml` follows `slide9.xml` instead of sorting before
 * `slide2.xml`. In each entry every match of `textTagPattern` has its tags
 * removed; the matches of one entry are joined with spaces and the entries
 * are joined with blank lines. Entities are left encoded.
 *
 * @param buf - Raw archive bytes.
 * @param pathFilter - Selects which archive paths to read.
 * @param textTagPattern - Global pattern whose matches carry the text.
 * @returns The concatenated text of all selected entries.
 * @throws Error when the selected entries exceed `MAX_ZIP_UNCOMPRESSED_MB`
 *   in total once decoded.
 */
async function extractZipXmlText(
  buf: Buffer,
  pathFilter: (p: string) => boolean,
  textTagPattern: RegExp,
): Promise<string> {
  const JSZip = (await import("jszip")).default;
  const zip = await JSZip.loadAsync(buf);

  // Numeric-aware comparison keeps numbered entries in reading order.
  const paths = Object.keys(zip.files)
    .filter(pathFilter)
    .sort((a, b) => a.localeCompare(b, "en", { numeric: true }));

  const limitBytes = MAX_ZIP_UNCOMPRESSED_MB * 1024 * 1024;
  const parts: string[] = [];
  let totalBytes = 0;

  for (const p of paths) {
    const entry = zip.files[p];
    if (!entry) continue;

    // NOTE: the budget is checked after each entry has been inflated, so it
    // bounds the running total but not the size of a single entry.
    const xml = await entry.async("string");
    // Measure UTF-8 bytes; string length would count UTF-16 code units.
    totalBytes += Buffer.byteLength(xml, "utf8");
    if (totalBytes > limitBytes) {
      throw new Error(`Zip content exceeds ${MAX_ZIP_UNCOMPRESSED_MB} MB uncompressed`);
    }

    const matches = xml.match(textTagPattern) ?? [];
    const text = matches.map(m => m.replace(/<[^>]+>/g, "")).join(" ").trim();
    if (text) parts.push(text);
  }
  return parts.join("\n\n");
}

/** The five entity names predefined by XML, mapped to their characters. */
const XML_NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/** Decodes predefined XML entities and numeric character references in one pass. */
function decodeXmlEntities(text: string): string {
  // A single pass means "&amp;lt;" becomes "&lt;" and is not decoded twice.
  return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole: string, ref: string) => {
    if (!ref.startsWith("#")) {
      return XML_NAMED_ENTITIES[ref] ?? whole;
    }
    const isHex = ref.startsWith("#x");
    const codePoint = parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // Leave out-of-range references untouched rather than throwing.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}

/**
 * Reads a file and returns its text content, truncated to `maxChars`.
 *
 * The parser is chosen from the lower-cased file extension via `detectFormat`:
 * - pdf: `pdf-parse`
 * - docx: `mammoth` raw-text extraction
 * - spreadsheet: `xlsx`, each sheet rendered as CSV under a `=== Sheet ===` header
 * - html: `stripHtml`
 * - json / jsonl: flattened with `flattenJson`; unparseable input is returned raw
 * - pptx: text of `<a:t>` elements in `ppt/slides/slide*.xml`, entities decoded
 * - epub: text nodes of the (X)HTML entries, excluding toc/nav documents
 * - anything else: the file read as UTF-8
 *
 * @param filePath - Path of the file to read.
 * @param maxChars - Maximum number of characters of text to return.
 * @returns The extracted text and the detected format key.
 * @throws Error with a descriptive message when a pptx or epub archive cannot
 *   be processed; failures from the other readers propagate unchanged.
 */
export async function parseFile(
  filePath: string,
  maxChars: number,
): Promise<{ text: string; format: string }> {
  const ext = extname(filePath).toLowerCase();
  const format = detectFormat(ext);

  if (format === "pdf") {
    const pdfParse = (await import("pdf-parse")).default;
    const buf = await readFile(filePath);
    const data = await pdfParse(buf);
    return { text: data.text.slice(0, maxChars), format };
  }

  if (format === "docx") {
    const mammoth = await import("mammoth");
    const result = await mammoth.extractRawText({ path: filePath });
    return { text: result.value.slice(0, maxChars), format };
  }

  if (format === "spreadsheet") {
    const XLSX = (await import("xlsx")).default;
    const wb = XLSX.readFile(filePath);
    const parts: string[] = [];
    for (const sheetName of wb.SheetNames) {
      const ws = wb.Sheets[sheetName];
      const csv = XLSX.utils.sheet_to_csv(ws);
      parts.push(`=== Sheet: ${sheetName} ===\n${csv}`);
    }
    return { text: parts.join("\n\n").slice(0, maxChars), format };
  }

  if (format === "html") {
    const raw = await readFile(filePath, "utf-8");
    return { text: stripHtml(raw).slice(0, maxChars), format };
  }

  if (format === "json") {
    const raw = await readFile(filePath, "utf-8");
    if (ext === ".jsonl") {
      // One JSON document per non-blank line; a line that fails to parse is kept verbatim.
      const lines = raw.split("\n").filter(l => l.trim());
      const text = lines.map((l, i) => {
        try { return `--- record ${i + 1} ---\n${flattenJson(JSON.parse(l)).join("\n")}`; }
        catch { return l; }
      }).join("\n\n");
      return { text: text.slice(0, maxChars), format };
    }
    try {
      const parsed = JSON.parse(raw);
      return { text: flattenJson(parsed).join("\n").slice(0, maxChars), format };
    } catch {
      // Not valid JSON: fall back to the raw file content.
      return { text: raw.slice(0, maxChars), format };
    }
  }

  if (format === "pptx") {
    const buf = await readFile(filePath);
    try {
      // Extract text from ppt/slides/slide*.xml — <a:t> tags hold visible text
      const text = await extractZipXmlText(
        buf,
        p => /^ppt\/slides\/slide\d+\.xml$/.test(p),
        /<a:t[^>]*>[^<]*<\/a:t>/g,
      );
      // The extracted text is still XML-escaped (e.g. "Q&amp;A"); decode it here.
      return { text: decodeXmlEntities(text).slice(0, maxChars), format };
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to parse .pptx — legacy binary .ppt is not supported; re-save as .pptx. (${msg})`);
    }
  }

  if (format === "epub") {
    const buf = await readFile(filePath);
    try {
      const text = await extractZipXmlText(
        buf,
        p => /\.(html?|xhtml?)$/i.test(p) && !/\b(toc|nav)\b/i.test(p),
        // Match every text node (text between a ">" and the next "<") so that
        // text sitting next to inline child elements is kept as well.
        /(?<=>)[^<>]+(?=<)/g,
      );
      return { text: stripHtml(text).slice(0, maxChars), format };
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to parse .epub. (${msg})`);
    }
  }

  // txt / md / source / fallback
  const buf = await readFile(filePath, "utf-8");
  return { text: buf.slice(0, maxChars), format };
}
rag

rag