Project Files
dist / python / docling_parser.py
#!/usr/bin/env python3
"""
Docling-based PDF Parser
Uses IBM's docling library for professional document parsing with layout understanding
"""
import sys
import json
from pathlib import Path
from typing import Dict, Any
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    # Nothing can be parsed without docling: report the problem as a JSON
    # failure payload on stdout and stop with exit status 1.
    missing_dependency = {
        "success": False,
        "error": "docling not installed. Run: pip install docling",
        "content": "",
        "metadata": {},
    }
    print(json.dumps(missing_dependency))
    sys.exit(1)
def parse_document(file_path: str, extract_tables: bool = True) -> Dict[str, Any]:
    """
    Parse a document with docling and return its Markdown rendering.

    Args:
        file_path: Path to the document file.
        extract_tables: Accepted for interface compatibility; it is not
            consulted here (table extraction is currently always enabled).

    Returns:
        Dictionary with the keys ``success``, ``content``, ``metadata`` and
        ``error``. On success ``content`` holds the Markdown export and
        ``error`` is None; on any exception ``success`` is False, ``error``
        holds the exception text, ``content`` is empty and ``metadata`` is {}.
    """
    try:
        # Default converter options (most reliable).
        conversion = DocumentConverter().convert(file_path)
        doc = conversion.document

        # Markdown export preserves structure, tables, lists, etc.
        markdown_content = doc.export_to_markdown()

        # Prefer the document's own table collection: a bare "|" also shows
        # up in ordinary prose and code, so the text heuristic is only used
        # when the document object exposes no ``tables`` attribute.
        tables = getattr(doc, "tables", None)
        if tables is not None:
            has_tables = bool(tables)
        else:
            has_tables = "|" in markdown_content

        # Fall back to a single page when the result carries no page count.
        page_count = getattr(getattr(conversion, "input", None), "page_count", 1)

        metadata = {
            "format": "pdf",
            "strategy": "docling",
            # Use the file name (without suffix) when no title is available.
            "title": getattr(doc, "title", None) or Path(file_path).stem,
            "page_count": page_count,
            "has_tables": has_tables,
            "char_count": len(markdown_content),
        }
        return {
            "success": True,
            "content": markdown_content,
            "metadata": metadata,
            "error": None,
        }
    except Exception as e:
        # Top-level boundary: callers consume a JSON payload, so every
        # failure is reported in-band rather than raised.
        return {
            "success": False,
            "error": str(e),
            "content": "",
            "metadata": {},
        }
def main():
    """Command-line entry point: parse the file named in argv, print JSON."""

    def abort(message):
        # Emit the failure payload on stdout, then stop with exit status 1.
        payload = {
            "success": False,
            "error": message,
            "content": "",
            "metadata": {},
        }
        print(json.dumps(payload))
        sys.exit(1)

    cli_args = sys.argv[1:]
    if not cli_args:
        abort("Usage: docling_parser.py <file_path> [extract_tables]")

    file_path = cli_args[0]

    # Only the word "true" (any letter case) turns the flag on when it is
    # supplied; when omitted it defaults to on.
    if len(cli_args) > 1:
        extract_tables = cli_args[1].lower() == 'true'
    else:
        extract_tables = True

    # Reject paths that do not exist before handing them to the parser.
    if not Path(file_path).exists():
        abort(f"File not found: {file_path}")

    # Pretty-print the parse result, leaving non-ASCII text unescaped.
    outcome = parse_document(file_path, extract_tables)
    print(json.dumps(outcome, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()