Forked from mindstudio/big-rag
Project Files
src / parsers / docxParser.ts
import * as fs from "fs";
import mammoth from "mammoth";
/**
 * Parse a DOCX file into plain text using mammoth.
 *
 * Reads the file into memory, extracts its raw text with
 * `mammoth.extractRawText`, and normalizes whitespace: every run of
 * whitespace characters (spaces, tabs, and line breaks alike) is collapsed
 * into a single space, and leading/trailing whitespace is trimmed. The
 * returned text therefore contains no line breaks.
 *
 * This is a best-effort parser: any failure while reading or extracting is
 * logged via `console.error` and an empty string is returned, so the
 * returned promise does not reject.
 *
 * @param filePath - Path to the Word document (.docx) to parse.
 * @returns The whitespace-normalized text, or `""` if parsing failed.
 */
export async function parseDocx(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const result = await mammoth.extractRawText({ buffer });
    const text = result.value ?? "";

    // `\s` matches line breaks too, so this single pass flattens the text to
    // one line. A former follow-up `.replace(/\n+/g, "\n")` could never match
    // after this step and was removed as dead code; output is unchanged.
    return text.replace(/\s+/g, " ").trim();
  } catch (error) {
    // Deliberate best-effort: report the failure with context and hand the
    // caller an empty document instead of propagating the error.
    console.error(`Error parsing DOCX file ${filePath}:`, error);
    return "";
  }
}