// Project file: src/documents/parsers/pdfParser.ts
/**
* PDF Parser with Docling Integration
* Uses IBM's docling for professional document parsing with layout understanding
* Falls back to PyMuPDF if docling misses content, then pdf-parse as last resort
*/
import { readFile, stat } from 'fs/promises';
import path from 'path';
import type { ParsedDocument, ParserOptions } from '../../types';
import { executePythonScript } from '../../utils/pythonRunner';
import { ragDebug } from '../../utils/ragLogger.js';
// Lazy loader for the pdf-parse fallback: the module is imported on demand,
// i.e. only when this function is actually called.
function getPdfParse() {
  return import('pdf-parse');
}
/**
 * Settings that steer `PdfParser.parse`.
 */
export interface PdfParserConfig {
  /**
   * Parser to try first: Docling, PyMuPDF, or the plain pdf-parse extractor
   * (`'simple'`).
   */
  strategy:
    | 'docling'
    | 'pymupdf'
    | 'simple';

  /** Whether table extraction is requested. `PdfParser.parse` defaults this to `true`. */
  extractTables: boolean;

  /**
   * Whether formatting should be preserved. `PdfParser.parse` defaults this to
   * `true`; no parser visible in this file reads it.
   */
  preserveFormatting: boolean;
}
/**
 * PDF parser that chains three extractors: Docling and PyMuPDF (both run as
 * Python scripts through `executePythonScript`) and the in-process pdf-parse
 * module.
 */
export class PdfParser {
  /** File size (in KB) above which Docling output is always cross-checked against PyMuPDF. */
  private static readonly LARGE_PDF_THRESHOLD_KB = 500;

  /** PyMuPDF replaces Docling only when its text is longer than Docling's by more than this factor. */
  private static readonly PYMUPDF_PREFERENCE_RATIO = 1.1;

  /**
   * Parse a PDF with a fallback chain:
   * 1. Docling (best for tables/layout)
   * 2. PyMuPDF (robust text extraction)
   * 3. pdf-parse (simple fallback)
   *
   * The parser named by `config.strategy` runs first. When it throws, the
   * remaining parsers further down the chain are tried in order; a parser that
   * has already failed as the primary is not run a second time.
   *
   * With the Docling strategy, the result is compared with PyMuPDF when the file
   * is large or Docling produced no text, and PyMuPDF wins if it extracted
   * noticeably more content.
   *
   * @param filePath Path of the PDF file to parse.
   * @param options Parser options; `options.config` overrides individual settings.
   * @returns The parsed document from the first parser that succeeds.
   * @throws The last parser error when every parser in the chain fails.
   */
  static async parse(
    filePath: string,
    options?: ParserOptions & { config?: Partial<PdfParserConfig> }
  ): Promise<ParsedDocument> {
    const overrides = options?.config;
    // Resolve field by field so an explicit `undefined` inside `config` cannot
    // clobber a default (which a plain object spread would do).
    const config: PdfParserConfig = {
      strategy: overrides?.strategy ?? 'docling',
      extractTables: overrides?.extractTables ?? options?.extractTables ?? true,
      preserveFormatting: overrides?.preserveFormatting ?? options?.preserveFormatting ?? true,
    };
    ragDebug('PDF Parser', `Parsing with ${config.strategy} strategy: ${filePath}`);

    try {
      if (config.strategy === 'docling') {
        // Forward the resolved flag so a `config.extractTables` override reaches Docling.
        return await PdfParser.parseDoclingWithCrossCheck(filePath, {
          ...options,
          extractTables: config.extractTables,
        });
      }
      if (config.strategy === 'pymupdf') {
        return await PdfParser.parseWithPyMuPDF(filePath);
      }
      return await PdfParser.parseSimple(filePath, options);
    } catch (error) {
      ragDebug('PDF Parser', `Primary parsing failed: ${PdfParser.describeError(error)}`);

      // PyMuPDF is the first fallback, unless it was the parser that just failed.
      if (config.strategy !== 'pymupdf') {
        try {
          ragDebug('PDF Parser', 'Trying PyMuPDF fallback...');
          return await PdfParser.parseWithPyMuPDF(filePath);
        } catch (pymupdfError) {
          ragDebug('PDF Parser', `PyMuPDF fallback failed: ${PdfParser.describeError(pymupdfError)}`);
        }
      }

      if (config.strategy !== 'docling' && config.strategy !== 'pymupdf') {
        // pdf-parse was the primary parser and has already failed on this file;
        // surface that error instead of running it again.
        throw error;
      }

      ragDebug('PDF Parser', 'Trying pdf-parse fallback...');
      return await PdfParser.parseSimple(filePath, options);
    }
  }

  /**
   * Run Docling and, when the file is large or Docling returned no text,
   * compare its output with PyMuPDF's.
   *
   * Docling can miss pages silently, so PyMuPDF is preferred when its text is
   * more than `PYMUPDF_PREFERENCE_RATIO` times as long. A failing PyMuPDF check
   * is logged and the Docling result is kept.
   */
  private static async parseDoclingWithCrossCheck(
    filePath: string,
    options?: ParserOptions
  ): Promise<ParsedDocument> {
    const doclingResult = await PdfParser.parseWithDocling(filePath, options);
    const doclingLength = doclingResult.content.length;

    // stat() reports the size without loading the whole PDF into memory.
    const { size } = await stat(filePath);
    const fileSizeKB = size / 1024;
    ragDebug('PDF Parser', `Docling: ${doclingLength} chars, file: ${fileSizeKB.toFixed(0)}KB`);

    const isLarge = fileSizeKB > PdfParser.LARGE_PDF_THRESHOLD_KB;
    const isBlank = doclingResult.content.trim().length === 0;

    if (isLarge || isBlank) {
      ragDebug(
        'PDF Parser',
        isLarge
          ? 'Large PDF - comparing with PyMuPDF...'
          : 'Docling returned no text - comparing with PyMuPDF...'
      );
      try {
        const pymupdfResult = await PdfParser.parseWithPyMuPDF(filePath);
        const pymupdfLength = pymupdfResult.content.length;
        ragDebug('PDF Parser', `PyMuPDF: ${pymupdfLength} chars`);

        if (pymupdfLength > doclingLength * PdfParser.PYMUPDF_PREFERENCE_RATIO) {
          // Guard the percentage against a zero-length Docling result.
          const gain =
            doclingLength > 0
              ? `${((pymupdfLength / doclingLength - 1) * 100).toFixed(0)}% more content`
              : 'Docling returned no content';
          ragDebug('PDF Parser', `Using PyMuPDF (${gain})`);
          return pymupdfResult;
        }
      } catch (error) {
        ragDebug(
          'PDF Parser',
          `PyMuPDF check failed, keeping Docling result: ${PdfParser.describeError(error)}`
        );
      }
    }

    ragDebug('PDF Parser', `Docling parsing successful - ${doclingLength} chars`);
    return doclingResult;
  }

  /**
   * Parse PDF using docling (IBM's professional document parser).
   *
   * Runs `python/docling_parser.py` (three directories above this module) with
   * the file path and an `'true'`/`'false'` table-extraction flag.
   *
   * @throws When the script run or the script's own result reports failure.
   */
  private static async parseWithDocling(
    filePath: string,
    options?: ParserOptions
  ): Promise<ParsedDocument> {
    ragDebug('PDF Parser', 'Using docling for professional-grade parsing...');
    const scriptPath = path.join(__dirname, '..', '..', '..', 'python', 'docling_parser.py');
    // Table extraction stays on unless it was explicitly disabled.
    const extractTables = options?.extractTables !== false ? 'true' : 'false';

    const result = await executePythonScript<{
      success: boolean;
      content: string;
      metadata: {
        format: string;
        strategy: string;
        title?: string;
        page_count: number;
        has_tables: boolean;
        char_count: number;
      };
      error?: string;
    }>(scriptPath, [filePath, extractTables], {
      // Presumably milliseconds - confirm against executePythonScript.
      timeout: 120000,
    });

    if (!result.success || !result.data?.success) {
      throw new Error(result.error || result.data?.error || 'Docling parsing failed');
    }

    const data = result.data;
    return {
      content: data.content,
      metadata: {
        format: 'pdf',
        strategy: 'docling',
        title: data.metadata.title,
        pageCount: data.metadata.page_count,
        hasTables: data.metadata.has_tables,
      },
    };
  }

  /**
   * Parse PDF using PyMuPDF (fitz) - robust text extraction.
   *
   * Runs `python/pymupdf_parser.py` (three directories above this module) with
   * the file path as its only argument.
   *
   * @throws When the script run or the script's own result reports failure.
   */
  private static async parseWithPyMuPDF(filePath: string): Promise<ParsedDocument> {
    ragDebug('PDF Parser', 'Using PyMuPDF for robust text extraction...');
    const scriptPath = path.join(__dirname, '..', '..', '..', 'python', 'pymupdf_parser.py');

    const result = await executePythonScript<{
      success: boolean;
      content: string;
      metadata: {
        title?: string;
        page_count: number;
        char_count: number;
      };
      error?: string;
    }>(scriptPath, [filePath], {
      // Presumably milliseconds - confirm against executePythonScript.
      timeout: 60000,
    });

    if (!result.success || !result.data?.success) {
      throw new Error(result.error || result.data?.error || 'PyMuPDF parsing failed');
    }

    const data = result.data;
    ragDebug(
      'PDF Parser',
      `PyMuPDF extracted: ${data.metadata.char_count} chars from ${data.metadata.page_count} pages`
    );
    return {
      content: data.content,
      metadata: {
        format: 'pdf',
        strategy: 'pymupdf',
        title: data.metadata.title,
        pageCount: data.metadata.page_count,
        hasTables: false, // The PyMuPDF script result carries no table information
      },
    };
  }

  /**
   * Simple PDF parsing using pdf-parse.
   *
   * Reads the file into a buffer and extracts text in-process. `options` is
   * accepted for signature parity with the other parsers but is not used.
   */
  private static async parseSimple(
    filePath: string,
    options?: ParserOptions
  ): Promise<ParsedDocument> {
    ragDebug('PDF Parser', 'Using pdf-parse for simple extraction...');
    const pdfParse = (await getPdfParse()).default;
    const buffer = await readFile(filePath);
    const data = await pdfParse(buffer, {
      // NOTE(review): 0 is understood to mean "no page limit" in pdf-parse - confirm.
      max: 0,
    });
    return {
      content: data.text,
      metadata: {
        format: 'pdf',
        strategy: 'simple',
        title: data.info?.Title,
        author: data.info?.Author,
        pageCount: data.numpages,
        hasTables: false,
      },
    };
  }

  /** Turn any thrown value into a loggable message. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}