// Project file: src/parsers/pptxParser.ts
import * as fs from "fs/promises";
import JSZip from "jszip";
// ── Modern PPTX (ZIP + XML) ───────────────────────────────────────────────────
// Covers .pptx, .pptm, .ppsx, .ppsm — all are ZIP archives containing
// XML slide files under ppt/slides/slide*.xml.
/**
 * Matches the full ZIP entry name of a slide part (`ppt/slides/slide<N>.xml`)
 * and captures the decimal number N as group 1. The pattern is anchored at
 * both ends, so other entries under ppt/slides/ (for example
 * `ppt/slides/_rels/slide1.xml.rels`) do not match.
 */
const SLIDE_FILE_RE = /^ppt\/slides\/slide(\d+)\.xml$/;
/** Replacement text for the five entities predefined by XML, keyed by name. */
const XML_NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decode XML character/entity references in a piece of slide text.
 *
 * Handles the five predefined entities plus decimal (`&#233;`) and
 * hexadecimal (`&#xE9;`) character references. Everything is decoded in one
 * pass, so output is never re-scanned: an escaped ampersand followed by
 * `lt;` yields the literal four characters `&lt;`, not `<`.
 *
 * @param text Raw character data as it appears between XML tags.
 * @returns The text with every recognised reference replaced; unknown or
 *   out-of-range references are left untouched.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (whole: string, body: string): string => {
      if (!body.startsWith("#")) {
        return XML_NAMED_ENTITIES[body] ?? whole;
      }
      const codePoint =
        body[1] === "x" ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      // String.fromCodePoint throws above U+10FFFF; keep such references as-is.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}

/**
 * Extract the visible text from a single slide's XML.
 *
 * Text runs (`<a:t>`) are concatenated in document order. Every explicit line
 * break (`<a:br/>`) and every paragraph end (`</a:p>`) contributes one
 * newline, so separate paragraphs stay on separate lines instead of being
 * glued together. XML entities inside the runs are decoded.
 *
 * @param xml Contents of one `ppt/slides/slideN.xml` part.
 * @returns The slide text with runs of spaces collapsed to one space, runs of
 *   three or more newlines collapsed to two, and surrounding whitespace
 *   trimmed. Returns an empty string when the slide has no text runs.
 */
function extractSlideText(xml: string): string {
  // A single alternation keeps runs and break markers in document order.
  // Group 1 is defined only for the <a:t> alternative; `\s` after the tag name
  // keeps look-alike elements such as <a:tab/> or <a:tbl> from matching.
  const tokenRe =
    /<a:t(?:\s[^>]*)?>([^<]*)<\/a:t>|<a:br(?:\s[^>]*)?\/?>|<\/a:p>/g;

  const pieces: string[] = [];
  let token: RegExpExecArray | null;
  while ((token = tokenRe.exec(xml)) !== null) {
    const run: string | undefined = token[1];
    pieces.push(run !== undefined ? decodeXmlEntities(run) : "\n");
  }

  return pieces
    .join("")
    .replace(/ {2,}/g, " ")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
/**
 * Extract the text of every slide in a ZIP-based PowerPoint file.
 *
 * Archive entries whose names match SLIDE_FILE_RE are ordered by the number
 * embedded in the entry name and processed one at a time. Each slide that
 * yields non-empty text contributes a section headed `[Slide N]`, where N is
 * that same file-name number; sections are separated by a blank line.
 *
 * @param filePath Path of the presentation file to read.
 * @returns The combined slide text, or an empty string if reading or parsing
 *   fails (the error is logged to the console rather than thrown).
 */
export async function parsePPTX(filePath: string): Promise<string> {
  try {
    const archive = await JSZip.loadAsync(await fs.readFile(filePath));

    // Pair each slide entry with the number taken from its file name.
    const slides: Array<{ name: string; index: number }> = [];
    for (const name of Object.keys(archive.files)) {
      const match = SLIDE_FILE_RE.exec(name);
      if (match !== null) {
        slides.push({ name, index: parseInt(match[1], 10) });
      }
    }
    // Numeric sort so slide10 follows slide9 rather than slide1.
    slides.sort((left, right) => left.index - right.index);

    const sections: string[] = [];
    for (const slide of slides) {
      const slideXml = await archive.files[slide.name].async("string");
      const slideText = extractSlideText(slideXml);
      if (slideText.length > 0) {
        sections.push(`[Slide ${slide.index}]\n${slideText}`);
      }
    }

    return sections.join("\n\n").trim();
  } catch (error) {
    console.error(`Error parsing PPTX file ${filePath}:`, error);
    return "";
  }
}
// ── Legacy PPT (binary Compound Document) ────────────────────────────────────
// The binary .ppt format stores text as UTF-16LE runs inside a Compound File
// Binary container. There is no maintained pure-JS parser, so we do a best-
// effort scan for readable UTF-16LE strings (most slide text lives in that
// encoding) plus a fallback scan for single-byte ASCII runs.
/** Shortest trimmed run (in characters) that is kept as a candidate string. */
const MIN_STRING_LEN = 4;

/**
 * Recover readable strings from a raw buffer using two scans.
 *
 * The first scan looks for UTF-16LE runs: consecutive byte pairs whose second
 * byte is 0 and whose first byte is printable ASCII (32..126). The second scan
 * looks for runs of single printable ASCII bytes. Each run is trimmed, kept
 * only if it is at least MIN_STRING_LEN characters long, and de-duplicated.
 * Runs without any ASCII letter are discarded before the result is assembled.
 *
 * @param buffer Raw file contents.
 * @returns The surviving strings in discovery order (wide-character hits
 *   first), joined by single spaces with all whitespace runs collapsed.
 */
function extractStringsFromPPT(buffer: Buffer): string {
  const isPrintable = (byte: number): boolean => byte >= 32 && byte <= 126;

  const harvested = new Set<string>();
  const keepIfLongEnough = (run: string): void => {
    const trimmed = run.trim();
    if (trimmed.length >= MIN_STRING_LEN) {
      harvested.add(trimmed);
    }
  };

  // Scan 1: UTF-16LE. A hit consumes two bytes; a miss advances a single byte
  // so runs starting at odd offsets are still found.
  let pending = "";
  let pos = 0;
  while (pos + 1 < buffer.length) {
    if (buffer[pos + 1] === 0 && isPrintable(buffer[pos])) {
      pending += String.fromCharCode(buffer[pos]);
      pos += 2;
    } else {
      keepIfLongEnough(pending);
      pending = "";
      pos += 1;
    }
  }
  keepIfLongEnough(pending);

  // Scan 2: single-byte printable ASCII.
  pending = "";
  for (let offset = 0; offset < buffer.length; offset += 1) {
    const byte = buffer[offset];
    if (isPrintable(byte)) {
      pending += String.fromCharCode(byte);
      continue;
    }
    keepIfLongEnough(pending);
    pending = "";
  }
  keepIfLongEnough(pending);

  // Keep only candidates containing at least one letter (drops symbol noise).
  const wordy: string[] = [];
  harvested.forEach((candidate) => {
    if (/[a-zA-Z]/.test(candidate)) {
      wordy.push(candidate);
    }
  });

  return wordy.join(" ").replace(/\s+/g, " ").trim();
}
/**
 * Best-effort text extraction for a legacy binary PowerPoint file.
 *
 * Reads the whole file into memory and hands the bytes to
 * extractStringsFromPPT.
 *
 * @param filePath Path of the file to read.
 * @returns The recovered text, or an empty string if reading or scanning
 *   fails (the error is logged to the console rather than thrown).
 */
export async function parsePPT(filePath: string): Promise<string> {
  let recovered = "";
  try {
    const raw = await fs.readFile(filePath);
    recovered = extractStringsFromPPT(raw);
  } catch (error) {
    console.error(`Error parsing PPT file ${filePath}:`, error);
  }
  return recovered;
}