Forked from mindstudio/big-rag
Project Files
src / parsers / pptxParser.ts
import * as fs from "fs";
import JSZip from "jszip";
/**
 * Parse a PPTX file by extracting the text of every slide.
 *
 * A PPTX is a ZIP archive; each slide lives in `ppt/slides/slide<N>.xml`.
 * Slides are processed in ascending numeric order (so `slide10` follows
 * `slide9`, not `slide1`).
 *
 * @param filePath - Path of the `.pptx` file to read.
 * @returns One line of text per non-empty slide, joined with `"\n"`. Runs of
 *   whitespace inside a slide are collapsed to a single space. Returns an
 *   empty string when the file cannot be read or parsed; the error is logged
 *   rather than thrown, so this function does not reject.
 */
export async function parsePptx(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const zip = await JSZip.loadAsync(buffer);

    // Collect slide entries and order them by their numeric suffix.
    const slideFiles: { num: number; path: string }[] = [];
    zip.forEach((relativePath, file) => {
      const match = /^ppt\/slides\/slide(\d+)\.xml$/.exec(relativePath);
      if (match && !file.dir) {
        slideFiles.push({ num: parseInt(match[1], 10), path: relativePath });
      }
    });
    slideFiles.sort((a, b) => a.num - b.num);

    const textParts: string[] = [];
    for (const slideFile of slideFiles) {
      const xmlContent = await zip.file(slideFile.path)?.async("string");
      if (!xmlContent) continue;

      // Normalise whitespace per slide, BEFORE joining. Collapsing `\s+` on
      // the joined string would also swallow the "\n" slide separators and
      // flatten the whole deck onto a single line.
      const slideText = extractTextFromXml(xmlContent)
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      if (slideText.length > 0) {
        textParts.push(slideText);
      }
    }

    return textParts.join("\n");
  } catch (error) {
    // Best-effort parser: log with context and fall back to empty text.
    console.error(`Error parsing PPTX file ${filePath}:`, error);
    return "";
  }
}
/** The five entities predefined by XML 1.0, keyed by name. */
const XML_NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decode XML predefined entities and numeric character references.
 *
 * Decoding happens in a single pass so already-decoded output is never
 * re-scanned: `&amp;lt;` becomes the literal text `&lt;`, not `<`.
 * References that are unknown or denote an invalid code point are left as-is.
 */
function decodeXmlEntities(value: string): string {
  return value.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (
      whole: string,
      dec: string | undefined,
      hex: string | undefined,
      name: string | undefined,
    ): string => {
      if (name !== undefined) {
        return XML_NAMED_ENTITIES[name] ?? whole;
      }
      const codePoint =
        dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // String.fromCodePoint throws a RangeError outside 0..0x10FFFF.
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
        return whole;
      }
      return String.fromCodePoint(codePoint);
    },
  );
}

/**
 * Extract text content from `<a:t>...</a:t>` elements in slide XML.
 *
 * @param xml - Raw XML of a single slide.
 * @returns The decoded, trimmed text of each non-empty `<a:t>` element, in
 *   document order. Elements that are empty after trimming are skipped.
 */
function extractTextFromXml(xml: string): string[] {
  const texts: string[] = [];
  // The tag name must be exactly `a:t`: after it comes either `>` or
  // whitespace + attributes. This keeps `<a:tbl>`, `<a:tc>`, `<a:txBody>`
  // etc. from being mistaken for a text run. The lookbehind rejects
  // self-closing tags, and `[\s\S]` lets the content span line breaks.
  const regex = /<a:t(?:\s[^>]*)?(?<!\/)>([\s\S]*?)<\/a:t>/g;
  let match: RegExpExecArray | null;
  while ((match = regex.exec(xml)) !== null) {
    const text = decodeXmlEntities(match[1] ?? "").trim();
    if (text.length > 0) {
      texts.push(text);
    }
  }
  return texts;
}