// Project Files
// src/parsers/docxParser.ts
import mammoth from "mammoth";
import WordExtractor from "word-extractor";
/**
 * Extract the text of a Word `.docx` file with mammoth's `extractRawText`
 * and tidy its whitespace.
 *
 * Whitespace clean-up, applied in this order:
 * 1. `\r\n` and lone `\r` become `\n`.
 * 2. Runs of three or more newlines shrink to exactly two.
 * 3. Runs of two or more space characters shrink to one (tabs are left alone).
 * 4. Leading and trailing whitespace is trimmed.
 *
 * @param filePath - Path handed to mammoth as the `path` input option.
 * @returns The cleaned-up text, or an empty string when extraction or
 *   clean-up throws; the failure is logged with `console.error`, never rethrown.
 */
export async function parseDOCX(filePath: string): Promise<string> {
  try {
    const extraction = await mammoth.extractRawText({ path: filePath });

    // Unify line endings first so the blank-line rule only has to look at "\n".
    const unixNewlines = extraction.value
      .replace(/\r\n/g, "\n")
      .replace(/\r/g, "\n");
    const cappedBlankLines = unixNewlines.replace(/\n{3,}/g, "\n\n");
    const singleSpaced = cappedBlankLines.replace(/ {2,}/g, " ");

    return singleSpaced.trim();
  } catch (error) {
    // Best-effort contract: report the problem and hand back an empty result.
    console.error(`Error parsing DOCX file ${filePath}:`, error);
    return "";
  }
}
/**
 * Extract the body text of a legacy Word binary `.doc` file with
 * word-extractor and tidy its whitespace.
 *
 * Whitespace clean-up, applied in this order:
 * 1. `\r\n` and lone `\r` become `\n`.
 * 2. Runs of three or more newlines shrink to exactly two.
 * 3. Runs of two or more space characters shrink to one (tabs are left alone).
 * 4. Leading and trailing whitespace is trimmed.
 *
 * @param filePath - Path passed to `WordExtractor.extract`.
 * @returns The cleaned-up body text, or an empty string when extraction or
 *   clean-up throws; the failure is logged with `console.error`, never rethrown.
 */
export async function parseDOC(filePath: string): Promise<string> {
  try {
    const wordDoc = await new WordExtractor().extract(filePath);
    const bodyText = wordDoc.getBody();

    // Unify line endings first so the blank-line rule only has to look at "\n".
    const unixNewlines = bodyText.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
    const cappedBlankLines = unixNewlines.replace(/\n{3,}/g, "\n\n");
    const singleSpaced = cappedBlankLines.replace(/ {2,}/g, " ");

    return singleSpaced.trim();
  } catch (error) {
    // Best-effort contract: report the problem and hand back an empty result.
    console.error(`Error parsing DOC file ${filePath}:`, error);
    return "";
  }
}