Forked from mindstudio/big-rag
Project Files
src / parsers / officeParser.ts
/**
* Office document parser (.docx, .odt).
* Uses mammoth for .docx. ODT uses adm-zip when available and falls back to
* best-effort raw XML tag stripping.
*/
import * as fs from "fs/promises";
import * as path from "path";
/**
 * Signature of a loader that takes a module specifier and resolves to the
 * loaded module. The resolved module is untyped (`any`).
 */
type DynamicImporter = (specifier: string) => Promise<any>;
/**
 * Invokes native `import()` from a function assembled at runtime via
 * `new Function`, so the `import(...)` expression lives inside a string
 * rather than in the compiled source.
 * NOTE(review): presumably this keeps the TypeScript compiler from rewriting
 * `import()` into `require()` when emitting CommonJS — confirm against the
 * project's tsconfig `module` setting.
 */
const importModule = new Function("specifier", "return import(specifier)") as DynamicImporter;
/**
 * Extracts plain text from an office document, dispatching on the
 * lower-cased file extension.
 *
 * - `.docx`: loads `mammoth` dynamically and returns the `value` of its
 *   `extractRawText` result (empty string when that value is nullish).
 * - `.odt`: loads `adm-zip` dynamically, reads the archive's `content.xml`
 *   entry and strips its markup. When `adm-zip` cannot be imported, the file
 *   itself is read as UTF-8 text and stripped instead (best effort).
 *
 * @param filePath Path to the document on disk.
 * @returns The extracted text; `""` when an ODT archive has no `content.xml`.
 * @throws Error when `mammoth` cannot be imported for a `.docx` file, or when
 *   the extension is neither `.docx` nor `.odt`.
 */
export async function parseOffice(filePath: string): Promise<string> {
  const extension = path.extname(filePath).toLowerCase();

  switch (extension) {
    case ".docx": {
      // Any import failure is treated as "package not available".
      const mammothLib = await importModule("mammoth").catch(() => null);
      if (mammothLib) {
        const extracted = await mammothLib.extractRawText({ path: filePath });
        return extracted.value ?? "";
      }
      throw new Error("mammoth not installed; run: npm install mammoth");
    }

    case ".odt": {
      const admZipModule = await importModule("adm-zip").catch(() => null);
      if (!admZipModule) {
        // No zip reader available: strip tags from the raw file contents.
        const fileText = await fs.readFile(filePath, "utf-8");
        return stripXml(fileText);
      }

      // The constructor may sit on `.default` or be the module itself.
      const Archive = admZipModule.default ?? admZipModule;
      const archive = new Archive(filePath);
      const contentEntry = archive.getEntry("content.xml");
      return contentEntry ? stripXml(contentEntry.getData().toString("utf-8")) : "";
    }

    default:
      throw new Error(`Unsupported office format: ${extension}`);
  }
}
/**
 * Reduces XML markup to plain text.
 *
 * Steps, in order:
 * 1. every tag (`<...>`) is replaced with a single space;
 * 2. the five predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 *    `&apos;`) and numeric character references (`&#233;`, `&#x263A;`) are
 *    decoded;
 * 3. runs of whitespace are collapsed to one space and the result is trimmed.
 *
 * Decoding happens after tag removal so that escaped angle brackets in the
 * text (e.g. `1 &lt; 2`) are not mistaken for markup, and in a single pass so
 * that `&amp;lt;` yields the literal text `&lt;` rather than `<`.
 *
 * @param raw XML (or XML-like) text.
 * @returns The text content with markup removed and entities decoded.
 *   Unknown named entities and numeric references above U+10FFFF are left
 *   unchanged.
 */
function stripXml(raw: string): string {
  const namedEntities: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };

  return raw
    .replace(/<[^>]+>/g, " ")
    .replace(
      /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
      (match: string, body: string): string => {
        if (!body.startsWith("#")) {
          return namedEntities[body] ?? match;
        }
        const codePoint = body.startsWith("#x")
          ? parseInt(body.slice(2), 16)
          : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
      },
    )
    .replace(/\s+/g, " ")
    .trim();
}