Project Files

stubs

napi-rs-canvas

index.d.ts

index.js

package.json

test-fixtures

sample.html

sample.md

sample.txt

.gitignore

EXAMPLES.md

manifest.json

package-lock.json

package.json

QUICKSTART.md

README.md

TESTING.md

tsconfig.json

src / parsers / officeParser.ts

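/**
 * Office document parser (.docx, .odt).
 * Uses mammoth for .docx. For .odt, uses adm-zip (when installed) to read
 * content.xml from the archive and strips its XML tags. Without adm-zip, the
 * raw file is read as UTF-8 and tag-stripped as a best-effort fallback; since
 * an .odt file is a ZIP archive, that fallback is unlikely to yield clean text.
 */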
import * as fs from "fs/promises";
import * as path from "path";

/** Shape of a loader that resolves a module specifier to its loaded module namespace. */
type DynamicImporter = (specifier: string) => Promise<any>;

/**
 * Loads a module at runtime via a native `import()` expression.
 *
 * The function is built with the `Function` constructor, so the `import()` call
 * lives inside a string rather than in this file's own source text.
 * NOTE(review): presumably this keeps the TypeScript compiler from rewriting
 * `import()` into `require()` for CommonJS output — confirm against tsconfig.
 */
const importModule: DynamicImporter = new Function(
  "specifier",
  "return import(specifier)",
) as DynamicImporter;

/**
 * Extracts plain text from an office document, choosing a strategy by the
 * lower-cased file extension.
 *
 * - `.docx`: loads `mammoth` at runtime and returns the `value` of
 *   `extractRawText`, or `""` when that value is nullish.
 * - `.odt`: loads `adm-zip` at runtime, reads the `content.xml` entry and passes
 *   it through `stripXml`; returns `""` when the entry is missing. If `adm-zip`
 *   cannot be loaded, the file itself is read as UTF-8 and passed to `stripXml`.
 *
 * @param filePath Path to the document on disk.
 * @returns The extracted text.
 * @throws Error when `mammoth` cannot be loaded for a `.docx` file, or when the
 *   extension is neither `.docx` nor `.odt`.
 */
export async function parseOffice(filePath: string): Promise<string> {
  const extension = path.extname(filePath).toLowerCase();

  switch (extension) {
    case ".docx": {
      // Any failure to load the module is treated as "not installed".
      const mammothLib = await importModule("mammoth").catch(() => null);
      if (!mammothLib) {
        throw new Error("mammoth not installed; run: npm install mammoth");
      }
      const extraction = await mammothLib.extractRawText({ path: filePath });
      return extraction.value ?? "";
    }

    case ".odt": {
      const zipModule = await importModule("adm-zip").catch(() => null);

      if (zipModule) {
        // The constructor may be the module's default export or the module itself.
        const ZipCtor = zipModule.default ?? zipModule;
        const archive = new ZipCtor(filePath);
        const contentEntry = archive.getEntry("content.xml");
        return contentEntry
          ? stripXml(contentEntry.getData().toString("utf-8"))
          : "";
      }

      // Fallback without adm-zip: tag-strip the file's raw contents.
      const rawContents = await fs.readFile(filePath, "utf-8");
      return stripXml(rawContents);
    }

    default:
      throw new Error(`Unsupported office format: ${extension}`);
  }
}

/** The five entities predefined by XML, keyed by name (without `&` and `;`). */
const XML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Reduces an XML string to plain text.
 *
 * Steps, in order:
 * 1. Every `<...>` tag is replaced by a single space.
 * 2. The predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`)
 *    and numeric character references (`&#65;`, `&#x41;`) are decoded.
 *    Unrecognised or out-of-range references are left as written.
 * 3. Runs of whitespace collapse to one space and the result is trimmed.
 *
 * @param raw XML (or XML-like) input text.
 * @returns The text content with tags removed and references decoded.
 */
function stripXml(raw: string): string {
  // Tags are removed before decoding so an escaped "&lt;b&gt;" survives as the
  // literal text "<b>" instead of being mistaken for markup.
  const withoutTags = raw.replace(/<[^>]+>/g, " ");

  // A single replace pass guarantees "&amp;lt;" decodes to "&lt;", not "<".
  const decoded = withoutTags.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z]+);/g,
    (reference: string, body: string): string => {
      if (body.startsWith("#")) {
        const isHex = body.startsWith("#x");
        const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
      }
      return XML_NAMED_ENTITIES.get(body) ?? reference;
    },
  );

  return decoded.replace(/\s+/g, " ").trim();
}

big-rag