Project Files
python / pymupdf_parser.py
#!/usr/bin/env python3
"""
PyMuPDF-based PDF parser for robust text extraction.
Fallback for when Docling misses content due to encoding issues.
"""
import sys
import json
import traceback
def parse_pdf(file_path: str) -> dict:
    """Parse a PDF with PyMuPDF (fitz) and return its text plus basic metadata.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        On success, a dict with ``"success": True``, ``"content"`` (the
        plain text of every non-blank page, each preceded by a
        ``--- Page N ---`` marker, with surrounding whitespace stripped)
        and ``"metadata"`` containing ``title`` (``None`` when missing or
        empty), ``page_count`` and ``char_count`` (the length of
        ``content``).
        On failure, a dict with ``"success": False`` and an ``"error"``
        message. Any ``Exception`` raised while opening or reading the
        document is caught and reported this way rather than propagated.
    """
    # Imported lazily so a missing dependency is reported in the result
    # dict instead of failing when this module is imported.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        return {
            "success": False,
            "error": "PyMuPDF not installed. Run: pip install pymupdf"
        }

    try:
        doc = fitz.open(file_path)
        try:
            # Collect fragments and join once instead of growing a string
            # with += on every page.
            parts = []
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")  # Plain text extraction
                if text.strip():
                    parts.append(f"\n--- Page {page_num} ---\n")
                    parts.append(text)

            # Metadata must be read while the document is still open.
            metadata = doc.metadata or {}
            title = metadata.get("title") or None
            page_count = doc.page_count
        finally:
            # Release the document handle even when extraction fails.
            doc.close()

        content = "".join(parts).strip()
        return {
            "success": True,
            "content": content,
            "metadata": {
                "title": title,
                "page_count": page_count,
                # Measured on the returned (stripped) text so the count
                # matches what the caller actually receives.
                "char_count": len(content)
            }
        }
    except Exception as e:
        # Boundary handler: callers consume a JSON result, so report the
        # failure (with traceback for diagnosis) instead of raising.
        return {
            "success": False,
            "error": f"PyMuPDF parsing failed: {str(e)}\n{traceback.format_exc()}"
        }
def main():
    """Command-line entry point: parse the PDF named by the first argument.

    Prints exactly one JSON object to stdout. When no argument is given,
    that object is a usage error and the process exits with status 1;
    otherwise it is the dict returned by ``parse_pdf``.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        usage_error = {
            "success": False,
            "error": "Usage: pymupdf_parser.py <pdf_path>"
        }
        print(json.dumps(usage_error))
        sys.exit(1)

    # ensure_ascii=False keeps non-ASCII text as-is instead of \u escapes.
    outcome = parse_pdf(cli_args[0])
    print(json.dumps(outcome, ensure_ascii=False))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()