// Forked from mindstudio/big-rag
// Project Files
// src/parsers/xmlParser.ts
import * as cheerio from "cheerio";
import * as fs from "fs";
/**
 * Parse an XML file and extract its text content.
 *
 * The file is read as UTF-8 and loaded with cheerio in XML mode. `script`,
 * `style` and `noscript` elements are removed, the text of everything that
 * remains is concatenated, and every run of whitespace (line breaks included)
 * is collapsed to a single space.
 *
 * @param filePath - Path of the XML file to read.
 * @returns The extracted, whitespace-normalised text, or an empty string when
 *   reading or parsing throws (the error is logged, not rethrown).
 */
export async function parseXML(filePath: string): Promise<string> {
  try {
    const content = await fs.promises.readFile(filePath, "utf-8");
    const $ = cheerio.load(content, { xmlMode: true });

    // Drop elements whose bodies are code or styling rather than document text.
    $("script, style, noscript").remove();

    // No explicit comment stripping: text extraction only gathers text nodes,
    // so comment nodes never contribute to the result at any depth.
    // NOTE(review): text of adjacent elements is concatenated as-is, so markup
    // with no whitespace between tags may produce joined words — confirm
    // whether that is acceptable for downstream consumers.
    const text = $.root().text();

    // A single pass is enough: `\s` already matches newlines, so after this
    // replacement there are no line breaks left to collapse separately.
    return text.replace(/\s+/g, " ").trim();
  } catch (error) {
    // Deliberate best-effort: one unreadable file yields empty text instead of
    // failing the caller.
    console.error(`Error parsing XML file ${filePath}:`, error);
    return "";
  }
}