Project Files

stubs

napi-rs-canvas

index.d.ts

index.js

package.json

.clinerules

.gitignore

EXAMPLES.md

manifest.json

package-lock.json

package.json

QUICKSTART.md

README.md

TESTING.md

tsconfig.json

src / parsers / pptxParser.ts

import * as fs from "fs";
import JSZip from "jszip";

/**
 * Parse a PPTX file by extracting the text of every slide.
 *
 * A PPTX is a ZIP archive; each slide lives in `ppt/slides/slideN.xml`.
 * Slides are processed in ascending numeric order (so `slide10` follows
 * `slide9`, not `slide1`).
 *
 * @param filePath - Path of the `.pptx` file to read.
 * @returns One line of whitespace-normalised text per non-empty slide, joined
 *   with `"\n"`. Returns an empty string when the file cannot be read or
 *   parsed; the error is logged rather than thrown.
 */
export async function parsePptx(filePath: string): Promise<string> {
  try {
    const buffer = await fs.promises.readFile(filePath);
    const zip = await JSZip.loadAsync(buffer);

    // Collect slide entries and order them by their numeric suffix.
    const slideFiles: { num: number; path: string }[] = [];
    zip.forEach((relativePath, file) => {
      const match = /^ppt\/slides\/slide(\d+)\.xml$/.exec(relativePath);
      if (match && !file.dir) {
        slideFiles.push({ num: parseInt(match[1], 10), path: relativePath });
      }
    });
    slideFiles.sort((a, b) => a.num - b.num);

    const textParts: string[] = [];
    for (const slideFile of slideFiles) {
      const xmlContent = await zip.file(slideFile.path)?.async("string");
      if (!xmlContent) continue;

      // Text runs live in <a:t> elements of the slide's DrawingML.
      const texts = extractTextFromXml(xmlContent);
      if (texts.length > 0) {
        textParts.push(texts.join(" "));
      }
    }

    // Normalise whitespace per slide *before* joining. Collapsing `\s+` over
    // the joined string would also swallow the "\n" slide separators, because
    // `\s` matches line terminators.
    return textParts
      .map((part) => part.replace(/\s+/g, " ").trim())
      .filter((part) => part.length > 0)
      .join("\n");
  } catch (error) {
    // Best-effort parser: report the failure and hand back an empty document.
    console.error(`Error parsing PPTX file ${filePath}:`, error);
    return "";
  }
}

/** The five predefined XML entity names mapped to the characters they denote. */
const XML_NAMED_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decode predefined XML entities and numeric character references.
 *
 * Decoding happens in a single pass so that already-decoded output is never
 * scanned again (`&amp;lt;` becomes the literal text `&lt;`, not `<`).
 * References that do not denote a valid code point are left untouched.
 */
function decodeXmlEntities(raw: string): string {
  return raw.replace(
    /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (whole: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        return XML_NAMED_ENTITIES[name] ?? whole;
      }
      const codePoint =
        dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      const isValid =
        Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : whole;
    }
  );
}

/**
 * Extract text content from `<a:t>...</a:t>` elements in slide XML.
 *
 * @param xml - Raw XML of a single slide.
 * @returns The decoded, trimmed text of each non-empty `<a:t>` element, in
 *   document order.
 */
function extractTextFromXml(xml: string): string[] {
  const texts: string[] = [];
  // The tag name must be exactly `a:t`: it has to be followed by whitespace
  // (attributes) or `>`. Otherwise `<a:tbl>`, `<a:tr>`, `<a:tc>`,
  // `<a:txBody>` or `<a:tab/>` would be taken for an opening tag and markup
  // would leak into the captured text. The lookbehind rejects self-closing
  // `<a:t/>`, which has no content and no matching end tag.
  const regex = /<a:t(?:\s[^>]*)?(?<!\/)>([\s\S]*?)<\/a:t>/g;
  let match: RegExpExecArray | null;
  while ((match = regex.exec(xml)) !== null) {
    const text = decodeXmlEntities(match[1] ?? "").trim();
    if (text.length > 0) {
      texts.push(text);
    }
  }
  return texts;
}

big-rag-rus