Forked from mindstudio/big-rag
Project Files
src / parsers / xlsxParser.ts
import * as fs from "fs";
import * as XLSX from "xlsx";
/**
 * Parse an XLSX/XLS spreadsheet file using SheetJS and return its text content.
 *
 * Every sheet listed in the workbook is visited in order. Each non-empty row
 * becomes exactly one output line, with its non-empty cells joined by " | ".
 * Whitespace inside a cell (including embedded line breaks) is collapsed to a
 * single space so that a newline in the result always marks a row boundary.
 *
 * @param filePath - Path of the spreadsheet file to read.
 * @returns The extracted text, one row per line. Returns an empty string when
 *   the workbook contains no text or when reading/parsing fails (the error is
 *   logged via `console.error` rather than thrown).
 */
export async function parseXlsx(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const workbook = XLSX.read(buffer, { type: "buffer" });
    const lines: string[] = [];

    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];
      // A sheet without a "!ref" range has no cells to extract.
      if (!sheet || !sheet["!ref"]) continue;

      // header: 1 yields an array of arrays; cell values may be strings,
      // numbers, booleans, etc., so they are treated as unknown here.
      const rows: unknown[][] = XLSX.utils.sheet_to_json(sheet, {
        header: 1,
        defval: "",
        blankrows: false,
      });

      for (const row of rows) {
        const cells = row
          // Normalise whitespace per cell. Doing this on the joined output
          // would also swallow the "\n" row separators, because \s matches
          // newlines.
          .map((cell) =>
            (cell == null ? "" : String(cell)).replace(/\s+/g, " ").trim()
          )
          .filter((cell) => cell.length > 0);

        if (cells.length > 0) {
          lines.push(cells.join(" | "));
        }
      }
    }

    return lines.join("\n");
  } catch (error) {
    // Best-effort parser: report the failure and let the caller continue
    // with empty content instead of aborting.
    console.error(`Error parsing spreadsheet file ${filePath}:`, error);
    return "";
  }
}