Project Files
parser.js
"use strict";
/**
 * TypeScript interop helper: expose property `key` of `source` on `target`
 * (optionally under the name `alias`).
 *
 * When the source property is missing, is a writable/configurable data
 * property, or is an accessor on a non-`__esModule` object, a fresh enumerable
 * getter is installed so reads on `target` always reflect the current value of
 * `source[key]`. Otherwise the source's own descriptor is reused as-is.
 * An already-present `this.__createBinding` takes precedence.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exposedAs = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            needsLiveGetter = !source.__esModule;
        }
        else {
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; },
            };
        }
        Object.defineProperty(target, exposedAs, descriptor);
    }
    : function (target, source, key, alias) {
        // Fallback when Object.create is unavailable: plain value copy.
        target[alias === undefined ? key : alias] = source[key];
    });
/**
 * TypeScript interop helper: attach `value` to `target` as its `default`
 * export. With Object.create available the property is defined as an
 * enumerable (non-writable, non-configurable) data property; otherwise it is
 * assigned directly. An already-present `this.__setModuleDefault` wins.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
})();
// TypeScript interop helper emulating `import * as ns from "mod"` for modules
// loaded with require(). An already-present `this.__importStar` is reused.
var __importStar = (this && this.__importStar) || (function () {
// Lazily-initialised key lister: on its first call it replaces itself with
// Object.getOwnPropertyNames (or a for-in/hasOwnProperty fallback when that is
// missing) and then delegates to the replacement.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged with __esModule are returned unchanged.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own property except "default" onto the namespace object...
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// ...and expose the module value itself as `result.default`.
__setModuleDefault(result, mod);
return result;
};
})();
// Flag this module's exports object with `__esModule` (the __importStar helper
// in this file returns modules carrying that flag unchanged).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. The functions are declared further down; the
// assignments work here because function declarations are hoisted.
exports.resolvePath = resolvePath;
exports.detectFormat = detectFormat;
exports.parseFile = parseFile;
const promises_1 = require("fs/promises");
const path_1 = require("path");
/**
 * Resolve a user-supplied path to a full path and enforce a size limit.
 *
 * Resolution rules:
 *  - `~` alone, or `~` followed by a path separator, is expanded to the home
 *    directory (`HOME`, falling back to `USERPROFILE`).
 *  - Any other name that merely starts with `~` (e.g. `~$report.docx`,
 *    `~user/file`) is treated literally, like every other relative path.
 *  - Absolute paths are used as given.
 *  - Relative paths are resolved against `workspacePath` when it is provided,
 *    otherwise against the current working directory.
 *
 * @param {string} inputPath - Path as typed by the caller.
 * @param {string} [workspacePath] - Base directory for relative paths.
 * @param {number} maxFileSizeMb - Maximum allowed size in MiB (1024 * 1024 bytes).
 * @returns {Promise<string>} The resolved path.
 * @throws {Error} When the file is larger than `maxFileSizeMb`; rejections
 *   from `stat` (e.g. a missing file) propagate unchanged.
 */
async function resolvePath(inputPath, workspacePath, maxFileSizeMb) {
    let candidate = inputPath;
    // Only a bare "~" or "~/..." refers to the home directory; blindly replacing
    // a leading "~" would redirect files such as "~tmp.txt" into $HOME.
    if (/^~(?=$|[\\/])/.test(inputPath)) {
        const home = process.env.HOME ?? process.env.USERPROFILE ?? "";
        candidate = home + inputPath.slice(1);
    }
    let fullPath;
    if ((0, path_1.isAbsolute)(candidate)) {
        fullPath = candidate;
    }
    else if (workspacePath) {
        fullPath = (0, path_1.resolve)(workspacePath, candidate);
    }
    else {
        fullPath = (0, path_1.resolve)(candidate);
    }
    const info = await (0, promises_1.stat)(fullPath);
    const sizeMb = info.size / (1024 * 1024);
    if (sizeMb > maxFileSizeMb) {
        throw new Error(`File is ${sizeMb.toFixed(1)} MB — exceeds limit of ${maxFileSizeMb} MB`);
    }
    return fullPath;
}
/** Maps a lower-case file extension (with leading dot) to its parser format. */
const FORMAT_BY_EXTENSION = new Map([
    [".pdf", "pdf"],
    [".docx", "docx"],
    [".doc", "docx"],
    [".xlsx", "spreadsheet"],
    [".xls", "spreadsheet"],
    [".ods", "spreadsheet"],
    [".csv", "spreadsheet"],
    [".html", "html"],
    [".htm", "html"],
    [".json", "json"],
    [".jsonl", "json"],
    [".pptx", "pptx"],
    [".ppt", "pptx"],
    [".epub", "epub"],
]);
/**
 * Pick the parser format for a file extension.
 *
 * The lookup is case-insensitive, so `.PDF` and `.pdf` are treated alike.
 *
 * @param {string} ext - Extension including the leading dot, e.g. ".pdf".
 * @returns {string} One of "pdf", "docx", "spreadsheet", "html", "json",
 *   "pptx", "epub"; anything unrecognised (including non-string input) is "txt".
 */
function detectFormat(ext) {
    const key = typeof ext === "string" ? ext.toLowerCase() : ext;
    return FORMAT_BY_EXTENSION.get(key) ?? "txt";
}
/**
 * Convert HTML markup to plain text with htmlparser2.
 *
 * Text inside <script> and <style> elements is skipped, a newline is emitted
 * when one of the listed block-level tags opens, runs of three or more
 * newlines are collapsed to two, and the result is trimmed.
 *
 * @param {string} html - Markup to convert.
 * @returns {string} The extracted text.
 */
function stripHtml(html) {
    // Required inside the function, so the module is loaded on first use.
    const { Parser } = require("htmlparser2");
    const lineBreakTags = new Set(["p", "div", "br", "li", "h1", "h2", "h3", "h4", "h5", "h6", "tr"]);
    const chunks = [];
    // One flag per element whose text content must not be emitted.
    const suppressed = { script: false, style: false };
    const setSuppressed = (tag, value) => {
        if (tag === "script" || tag === "style") {
            suppressed[tag] = value;
        }
    };
    const handlers = {
        onopentag(tag) {
            setSuppressed(tag, true);
            if (lineBreakTags.has(tag)) {
                chunks.push("\n");
            }
        },
        onclosetag(tag) {
            setSuppressed(tag, false);
        },
        ontext(text) {
            if (suppressed.script || suppressed.style) {
                return;
            }
            chunks.push(text);
        },
    };
    const parser = new Parser(handlers);
    parser.write(html);
    parser.end();
    const joined = chunks.join("");
    return joined.replace(/\n{3,}/g, "\n\n").trim();
}
/**
 * Flatten a parsed JSON value into `path: value` lines.
 *
 * Object keys are joined with `.`, array indices are written as `[i]`.
 * `null`/`undefined` are rendered as `null`, empty arrays as `[]` and empty
 * objects as `{}` so that no key silently disappears from the output.
 *
 * @param {*} val - Value to flatten.
 * @param {string} [prefix=""] - Path accumulated so far.
 * @returns {string[]} One line per leaf (or empty container).
 */
function flattenJson(val, prefix = "") {
    if (val === null || val === undefined)
        return [`${prefix}: null`];
    if (typeof val !== "object")
        return [`${prefix}: ${val}`];
    if (Array.isArray(val)) {
        if (val.length === 0)
            return [`${prefix}: []`];
        return val.flatMap((item, i) => flattenJson(item, prefix ? `${prefix}[${i}]` : `[${i}]`));
    }
    const keys = Object.keys(val);
    // Mirror the empty-array case: without this an empty object yields no
    // lines at all and its key vanishes from the flattened text.
    if (keys.length === 0)
        return [`${prefix}: {}`];
    return keys.flatMap(k => flattenJson(val[k], prefix ? `${prefix}.${k}` : k));
}
/** Upper bound on the total amount of decompressed text read from one archive. */
const MAX_ZIP_UNCOMPRESSED_MB = 50;
/**
 * Numeric-aware path comparison: "slide2.xml" sorts before "slide10.xml".
 * A plain code-unit sort would put slide10 ahead of slide2 and scramble any
 * archive with ten or more numbered parts.
 */
const ZIP_PATH_COLLATOR = new Intl.Collator("en", { numeric: true });
/**
 * Pull text out of selected XML/HTML entries of a zip archive.
 *
 * Entries accepted by `pathFilter` are visited in natural (numeric-aware)
 * path order. For each entry every match of `textTagPattern` has its tags
 * removed; the pieces are joined with a space per entry and entries are
 * joined with a blank line. Entries producing no text are skipped.
 *
 * @param {Buffer} buf - Raw archive bytes.
 * @param {(path: string) => boolean} pathFilter - Selects the entries to read.
 * @param {RegExp} textTagPattern - Global pattern matching text-bearing markup.
 * @returns {Promise<string>} The concatenated text.
 * @throws {Error} When the decompressed content read so far exceeds
 *   MAX_ZIP_UNCOMPRESSED_MB.
 */
async function extractZipXmlText(buf, pathFilter, textTagPattern) {
    const JSZip = (await Promise.resolve().then(() => __importStar(require("jszip")))).default;
    const zip = await JSZip.loadAsync(buf);
    const paths = Object.keys(zip.files).filter(pathFilter).sort(ZIP_PATH_COLLATOR.compare);
    const limit = MAX_ZIP_UNCOMPRESSED_MB * 1024 * 1024;
    const parts = [];
    // NOTE: measured in string length (UTF-16 code units) and only after each
    // entry has been decompressed, so this is a coarse guard, not an exact
    // byte budget.
    let totalBytes = 0;
    for (const p of paths) {
        const xml = await zip.files[p].async("string");
        totalBytes += xml.length;
        if (totalBytes > limit) {
            throw new Error(`Zip content exceeds ${MAX_ZIP_UNCOMPRESSED_MB} MB uncompressed`);
        }
        const matches = xml.match(textTagPattern) ?? [];
        const text = matches.map(m => m.replace(/<[^>]+>/g, "")).join(" ").trim();
        if (text)
            parts.push(text);
    }
    return parts.join("\n\n");
}
/** The five entities predefined by XML, keyed by name. */
const XML_NAMED_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'" };
/**
 * Decode predefined and numeric XML character references in a single pass
 * (so "&amp;lt;" becomes "&lt;", not "<"). Unknown or out-of-range references
 * are left untouched.
 */
function decodeXmlEntities(text) {
    return text.replace(/&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g, (whole, dec, hex, name) => {
        if (name !== undefined)
            return XML_NAMED_ENTITIES[name];
        const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
}
/**
 * Read a file and return its textual content, truncated to `maxChars`.
 *
 * The parser is chosen from the (lower-cased) file extension via
 * detectFormat(). Third-party parsers are loaded on demand, only for the
 * format that needs them.
 *
 * @param {string} filePath - Path of the file to parse.
 * @param {number} maxChars - Maximum length of the returned text
 *   (applied with String.prototype.slice).
 * @returns {Promise<{text: string, format: string}>} Extracted text and the
 *   detected format.
 * @throws {Error} When the file cannot be read or parsed; .pptx/.epub failures
 *   are wrapped with a format-specific message and the original error as `cause`.
 */
async function parseFile(filePath, maxChars) {
    const ext = (0, path_1.extname)(filePath).toLowerCase();
    const format = detectFormat(ext);
    if (format === "pdf") {
        const pdfParse = (await Promise.resolve().then(() => __importStar(require("pdf-parse")))).default;
        const buf = await (0, promises_1.readFile)(filePath);
        const data = await pdfParse(buf);
        return { text: data.text.slice(0, maxChars), format };
    }
    if (format === "docx") {
        const mammoth = await Promise.resolve().then(() => __importStar(require("mammoth")));
        const result = await mammoth.extractRawText({ path: filePath });
        return { text: result.value.slice(0, maxChars), format };
    }
    if (format === "spreadsheet") {
        const XLSX = (await Promise.resolve().then(() => __importStar(require("xlsx")))).default;
        const wb = XLSX.readFile(filePath);
        const parts = [];
        for (const sheetName of wb.SheetNames) {
            const ws = wb.Sheets[sheetName];
            const csv = XLSX.utils.sheet_to_csv(ws);
            parts.push(`=== Sheet: ${sheetName} ===\n${csv}`);
        }
        return { text: parts.join("\n\n").slice(0, maxChars), format };
    }
    if (format === "html") {
        const raw = await (0, promises_1.readFile)(filePath, "utf-8");
        return { text: stripHtml(raw).slice(0, maxChars), format };
    }
    if (format === "json") {
        // readFile does not strip a UTF-8 byte-order mark, and JSON.parse
        // rejects it — drop it so BOM-prefixed files are still flattened.
        const raw = (await (0, promises_1.readFile)(filePath, "utf-8")).replace(/^\uFEFF/, "");
        if (ext === ".jsonl") {
            // Accept both LF and CRLF line endings.
            const lines = raw.split(/\r?\n/).filter(l => l.trim());
            const text = lines.map((l, i) => {
                try {
                    return `--- record ${i + 1} ---\n${flattenJson(JSON.parse(l)).join("\n")}`;
                }
                catch {
                    // Best effort: keep an unparseable line verbatim.
                    return l;
                }
            }).join("\n\n");
            return { text: text.slice(0, maxChars), format };
        }
        try {
            const parsed = JSON.parse(raw);
            return { text: flattenJson(parsed).join("\n").slice(0, maxChars), format };
        }
        catch {
            // Best effort: invalid JSON is returned as plain text.
            return { text: raw.slice(0, maxChars), format };
        }
    }
    if (format === "pptx") {
        const buf = await (0, promises_1.readFile)(filePath);
        try {
            // Extract text from ppt/slides/slide*.xml — <a:t> tags hold visible text
            const text = await extractZipXmlText(buf, p => /^ppt\/slides\/slide\d+\.xml$/.test(p), /<a:t[^>]*>[^<]*<\/a:t>/g);
            // The slide XML escapes characters such as & and <; turn them back
            // into the characters the author typed.
            return { text: decodeXmlEntities(text).slice(0, maxChars), format };
        }
        catch (e) {
            const msg = e instanceof Error ? e.message : String(e);
            throw new Error(`Failed to parse .pptx — legacy binary .ppt is not supported; re-save as .pptx. (${msg})`, { cause: e });
        }
    }
    if (format === "epub") {
        const buf = await (0, promises_1.readFile)(filePath);
        try {
            // Match every non-blank text node (text between a ">" and the next
            // "<"). Matching only `<tag>text</tag>` pairs loses the text around
            // inline markup: in `<p>Hello <b>world</b> again</p>` only "world"
            // would survive.
            const text = await extractZipXmlText(buf, p => /\.(html?|xhtml?)$/i.test(p) && !/\b(toc|nav)\b/i.test(p), /(?<=>)[^<]*[^<\s][^<]*(?=<)/g);
            return { text: stripHtml(text).slice(0, maxChars), format };
        }
        catch (e) {
            const msg = e instanceof Error ? e.message : String(e);
            throw new Error(`Failed to parse .epub. (${msg})`, { cause: e });
        }
    }
    // txt / md / source / fallback
    const buf = await (0, promises_1.readFile)(filePath, "utf-8");
    return { text: buf.slice(0, maxChars), format };
}