Project Files
src / parser.ts
import { readFile, stat } from "fs/promises";
import { homedir } from "os";
import { extname, resolve } from "path";
/**
 * Resolves a user-supplied path to a concrete path and enforces a size cap.
 *
 * Resolution rules:
 * - `~` and `~/...` expand to the current user's home directory;
 * - paths starting with `/` are used unchanged;
 * - anything else (including names that merely begin with `~`, such as
 *   `~draft.txt`) is resolved against `workspacePath`, or against the process
 *   working directory when `workspacePath` is empty.
 *
 * @param inputPath - Path as supplied by the caller.
 * @param workspacePath - Base directory for relative paths; may be empty.
 * @param maxFileSizeMb - Largest accepted size, in units of 1024 * 1024 bytes.
 * @returns The resolved path.
 * @throws The underlying `stat` error when the path cannot be stat'ed, or an
 *   `Error` when the reported size exceeds `maxFileSizeMb`.
 */
export async function resolvePath(
  inputPath: string,
  workspacePath: string,
  maxFileSizeMb: number,
): Promise<string> {
  let fullPath: string;
  if (inputPath === "~" || inputPath.startsWith("~/")) {
    // Only a bare "~" or a "~/" prefix means "my home directory". Blindly
    // replacing any leading "~" would turn "~draft.txt" into "<home>draft.txt".
    // The "." keeps the remainder relative even for inputs like "~//x".
    fullPath = resolve(homedir(), "." + inputPath.slice(1));
  } else if (inputPath.startsWith("/")) {
    fullPath = inputPath;
  } else if (workspacePath) {
    fullPath = resolve(workspacePath, inputPath);
  } else {
    fullPath = resolve(inputPath);
  }

  const info = await stat(fullPath);
  const sizeMb = info.size / (1024 * 1024);
  if (sizeMb > maxFileSizeMb) {
    throw new Error(`File is ${sizeMb.toFixed(1)} MB — exceeds limit of ${maxFileSizeMb} MB`);
  }
  return fullPath;
}
/**
 * Maps a file extension to the parser family used by `parseFile`.
 *
 * Matching is case-insensitive, so `.PDF` and `.pdf` are treated alike.
 *
 * @param ext - Extension including the leading dot, as returned by `path.extname`.
 * @returns The format tag; any unrecognised extension falls back to `"txt"`.
 */
export function detectFormat(ext: string): "pdf" | "docx" | "spreadsheet" | "html" | "json" | "pptx" | "epub" | "txt" {
  switch (ext.toLowerCase()) {
    case ".pdf":
      return "pdf";
    case ".docx":
    case ".doc":
      return "docx";
    case ".xlsx":
    case ".xls":
    case ".ods":
    case ".csv":
      return "spreadsheet";
    case ".html":
    case ".htm":
      return "html";
    case ".json":
    case ".jsonl":
      return "json";
    case ".pptx":
    case ".ppt":
      return "pptx";
    case ".epub":
      return "epub";
    default:
      return "txt";
  }
}
/**
 * Converts HTML to plain text using htmlparser2.
 *
 * - Text inside `<script>` and `<style>` is dropped.
 * - Block-level tags produce a line break both when they open and when they
 *   close, so text that follows a closed block (`<h1>T</h1>text`) is not glued
 *   to it.
 * - Cells after the first in a table row are preceded by a tab so adjacent
 *   cells stay distinguishable.
 * - Runs of three or more newlines collapse to a single blank line and the
 *   result is trimmed.
 *
 * @param html - HTML source (document or fragment).
 * @returns The extracted text.
 */
function stripHtml(html: string): string {
  // htmlparser2 is loaded here, at call time, rather than at module load.
  const { Parser } = require("htmlparser2");
  const blockTags = new Set(["p", "div", "br", "li", "h1", "h2", "h3", "h4", "h5", "h6", "tr"]);
  const parts: string[] = [];
  let inScript = false;
  let inStyle = false;
  let rowHasCell = false;
  const parser = new Parser({
    onopentag(name: string) {
      if (name === "script") inScript = true;
      if (name === "style") inStyle = true;
      if (name === "tr") rowHasCell = false;
      if (name === "td" || name === "th") {
        // Separate this cell from the previous one in the same row.
        if (rowHasCell) parts.push("\t");
        rowHasCell = true;
      }
      if (blockTags.has(name)) parts.push("\n");
    },
    onclosetag(name: string) {
      if (name === "script") inScript = false;
      if (name === "style") inStyle = false;
      // <br> has no content, so its opening break is enough; every other
      // block tag also ends the current line when it closes.
      if (name !== "br" && blockTags.has(name)) parts.push("\n");
    },
    ontext(text: string) {
      if (!inScript && !inStyle) parts.push(text);
    },
  });
  parser.write(html);
  parser.end();
  return parts.join("").replace(/\n{3,}/g, "\n\n").trim();
}
/**
 * Flattens a parsed JSON value into `path: value` lines.
 *
 * Object keys are joined with `.` and array indices are written as `[i]`,
 * e.g. `user.tags[1].k: true`. `null`/`undefined` render as `null`, and empty
 * containers render as `[]` / `{}` so they are not silently lost. A value with
 * no path (a top-level scalar or empty container) is rendered on its own,
 * without a leading `": "`.
 *
 * @param val - Any value produced by `JSON.parse`.
 * @param prefix - Path accumulated so far; callers normally omit it.
 * @returns One line per leaf value, in key/index order.
 */
function flattenJson(val: unknown, prefix = ""): string[] {
  const leaf = (rendered: string): string[] => [prefix ? `${prefix}: ${rendered}` : rendered];

  if (val === null || val === undefined) return leaf("null");
  if (typeof val !== "object") return leaf(String(val));

  if (Array.isArray(val)) {
    if (val.length === 0) return leaf("[]");
    return val.flatMap((item, i) => flattenJson(item, `${prefix}[${i}]`));
  }

  const obj = val as Record<string, unknown>;
  const keys = Object.keys(obj);
  // Without this an empty object contributes no lines at all, unlike an empty array.
  if (keys.length === 0) return leaf("{}");
  return keys.flatMap((k) => flattenJson(obj[k], prefix ? `${prefix}.${k}` : k));
}
/** Upper bound, in units of 1024 * 1024 bytes, on the total text read from one archive. */
const MAX_ZIP_UNCOMPRESSED_MB = 50;

/**
 * Pulls text out of selected XML/HTML entries of a zip container.
 *
 * Entries accepted by `pathFilter` are visited in natural (numeric-aware)
 * path order, so `slide2.xml` comes before `slide10.xml`. For each entry,
 * every match of `textTagPattern` has its tags removed; the matches are
 * joined with a space and the entries with a blank line.
 *
 * The size limit is checked after each entry has been read, so it bounds the
 * running total but a single oversized entry is still fully decompressed
 * before it is rejected.
 *
 * @param buf - Raw bytes of the zip archive.
 * @param pathFilter - Returns true for entry paths that should be read.
 * @param textTagPattern - Global regex whose matches contain the text to keep.
 * @returns The extracted text; an empty string when nothing matched.
 * @throws Error when the accumulated entry size exceeds `MAX_ZIP_UNCOMPRESSED_MB`,
 *   or whatever JSZip throws when `buf` cannot be loaded.
 */
async function extractZipXmlText(
  buf: Buffer,
  pathFilter: (p: string) => boolean,
  textTagPattern: RegExp,
): Promise<string> {
  const JSZip = (await import("jszip")).default;
  const zip = await JSZip.loadAsync(buf);
  // A plain lexicographic sort would put "slide10.xml" ahead of "slide2.xml".
  const paths = Object.keys(zip.files)
    .filter(pathFilter)
    .sort((a, b) => a.localeCompare(b, "en", { numeric: true }));
  const limitBytes = MAX_ZIP_UNCOMPRESSED_MB * 1024 * 1024;
  const parts: string[] = [];
  let totalBytes = 0;
  for (const p of paths) {
    const xml = await zip.files[p].async("string");
    // String length counts UTF-16 code units; measure encoded bytes instead.
    totalBytes += Buffer.byteLength(xml, "utf8");
    if (totalBytes > limitBytes) {
      throw new Error(`Zip content exceeds ${MAX_ZIP_UNCOMPRESSED_MB} MB uncompressed`);
    }
    const matches = xml.match(textTagPattern) ?? [];
    const text = matches.map(m => m.replace(/<[^>]+>/g, "")).join(" ").trim();
    if (text) parts.push(text);
  }
  return parts.join("\n\n");
}
/** Decodes the five predefined XML entities and numeric character references in one pass. */
function decodeXmlEntities(text: string): string {
  const named: Record<string, string> = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (match: string, body: string) => {
    if (!body.startsWith("#")) return named[body] ?? match;
    const codePoint = body[1] === "x" ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
    // Leave out-of-range references untouched instead of letting fromCodePoint throw.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}

/** Cuts `text` with `slice(0, maxChars)` but never leaves half of a surrogate pair at the end. */
function truncateText(text: string, maxChars: number): string {
  const cut = text.slice(0, maxChars);
  if (cut.length === 0 || cut.length >= text.length) return cut;
  const last = cut.charCodeAt(cut.length - 1);
  const next = text.charCodeAt(cut.length);
  const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
  return splitsPair ? cut.slice(0, -1) : cut;
}

/**
 * Extracts plain text from a file, choosing the parser from its extension.
 *
 * - pdf: `pdf-parse`; docx/doc: `mammoth` raw text; spreadsheets: `xlsx`, one
 *   CSV dump per sheet under a `=== Sheet: name ===` header.
 * - html: tags stripped via `stripHtml`.
 * - json/jsonl: flattened to `path: value` lines; content that fails to parse
 *   is returned as-is (per line for `.jsonl`).
 * - pptx: text of `<a:t>` elements from each slide, XML entities decoded.
 * - epub: every text node of the (X)HTML entries whose path does not contain
 *   `toc` or `nav` as a whole word.
 * - anything else: read as UTF-8 text.
 *
 * Third-party parsers are imported on demand inside the matching branch.
 *
 * @param filePath - Path of the file to read.
 * @param maxChars - Maximum length of the returned text, in UTF-16 code units.
 * @returns The extracted (and truncated) text plus the detected format tag.
 * @throws Error when the file cannot be read or the selected parser fails;
 *   `.doc`, pptx/ppt and epub failures are wrapped with a descriptive message.
 */
export async function parseFile(
  filePath: string,
  maxChars: number,
): Promise<{ text: string; format: string }> {
  const ext = extname(filePath).toLowerCase();
  const format = detectFormat(ext);
  const finish = (text: string): { text: string; format: string } => ({
    text: truncateText(text, maxChars),
    format,
  });

  if (format === "pdf") {
    const pdfParse = (await import("pdf-parse")).default;
    const buf = await readFile(filePath);
    const data = await pdfParse(buf);
    return finish(data.text);
  }

  if (format === "docx") {
    const mammoth = await import("mammoth");
    try {
      const result = await mammoth.extractRawText({ path: filePath });
      return finish(result.value);
    } catch (e) {
      // Only .doc gets the "legacy format" hint; .docx errors propagate unchanged.
      if (ext !== ".doc") throw e;
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to parse .doc — legacy binary .doc is not supported; re-save as .docx. (${msg})`);
    }
  }

  if (format === "spreadsheet") {
    const XLSX = (await import("xlsx")).default;
    const wb = XLSX.readFile(filePath);
    const parts: string[] = [];
    for (const sheetName of wb.SheetNames) {
      const ws = wb.Sheets[sheetName];
      const csv = XLSX.utils.sheet_to_csv(ws);
      parts.push(`=== Sheet: ${sheetName} ===\n${csv}`);
    }
    return finish(parts.join("\n\n"));
  }

  if (format === "html") {
    const raw = await readFile(filePath, "utf-8");
    return finish(stripHtml(raw));
  }

  if (format === "json") {
    const raw = await readFile(filePath, "utf-8");
    if (ext === ".jsonl") {
      const lines = raw.split("\n").filter(l => l.trim());
      const text = lines.map((l, i) => {
        // Best effort: a line that is not valid JSON is passed through verbatim.
        try { return `--- record ${i + 1} ---\n${flattenJson(JSON.parse(l)).join("\n")}`; }
        catch { return l; }
      }).join("\n\n");
      return finish(text);
    }
    try {
      const parsed: unknown = JSON.parse(raw);
      return finish(flattenJson(parsed).join("\n"));
    } catch {
      // Not valid JSON: fall back to the raw file content.
      return finish(raw);
    }
  }

  if (format === "pptx") {
    const buf = await readFile(filePath);
    try {
      // Extract text from ppt/slides/slide*.xml — <a:t> tags hold visible text
      const text = await extractZipXmlText(
        buf,
        p => /^ppt\/slides\/slide\d+\.xml$/.test(p),
        /<a:t[^>]*>[^<]*<\/a:t>/g,
      );
      // The slide XML escapes &, <, > etc.; turn them back into characters.
      return finish(decodeXmlEntities(text));
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to parse .pptx — legacy binary .ppt is not supported; re-save as .pptx. (${msg})`);
    }
  }

  if (format === "epub") {
    const buf = await readFile(filePath);
    try {
      const text = await extractZipXmlText(
        buf,
        p => /\.(html?|xhtml?)$/i.test(p) && !/\b(toc|nav)\b/i.test(p),
        // A tag followed by a text node containing at least one non-space
        // character. Matching "<x>text</x>" only would drop the text around
        // inline elements, e.g. "Hello " and " again" in
        // "<p>Hello <b>world</b> again</p>".
        /<[^>]+>[^<]*[^<\s][^<]*/g,
      );
      // Tags are already gone at this point; stripHtml decodes the entities.
      return finish(stripHtml(text));
    } catch (e) {
      const msg = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to parse .epub. (${msg})`);
    }
  }

  // txt / md / source / fallback
  const raw = await readFile(filePath, "utf-8");
  return finish(raw);
}