Project Files
scripts/generate_document_verified.py
#!/usr/bin/env python3
"""
Generate and verify documents from Markdown-like text.
Supported formats: PDF, DOCX, TXT, MD, HTML, ODT, RTF, EPUB.
Scientific typography path:
- Pandoc is used automatically when it is installed. HTML can use MathJax/KaTeX,
DOCX gets native Word equations, and PDF uses an installed TeX engine.
- The built-in renderer remains as a dependency-light fallback for ordinary text.
This file no longer pretends that stripping \\frac into slash text is a math
renderer. v0.4.1 uses a vendored toolchain layout: Pandoc plus Tectonic can be
bundled directly under vendor/toolchain/<platform>/bin and are used before PATH.
If binaries are missing, the run is reported as a verification failure (fallback output is
still produced, but clearly flagged) instead of laundering TeX through a sad little plaintext
shredder. The chat UI's formula rendering is cute; export still needs actual executables,
because physics has consequences.
"""
from __future__ import annotations
import argparse
import html
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import traceback
import zipfile
import tarfile
import urllib.request
import uuid
import urllib.error
import platform
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from xml.sax.saxutils import escape as xml_escape
def check_deps() -> None:
import reportlab # noqa: F401
import pypdfium2 # noqa: F401
import pypdf # noqa: F401
import PIL # noqa: F401
if "--check-deps" in sys.argv:
try:
check_deps()
print("ok")
sys.exit(0)
except Exception as exc:
print(f"missing dependency: {exc}", file=sys.stderr)
sys.exit(1)
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import PageBreak, Paragraph, Preformatted, SimpleDocTemplate, Spacer, Table, TableStyle
from pypdf import PdfReader
from PIL import Image, ImageStat
import pypdfium2 as pdfium
from aidoc_render_builtin import *
from aidoc_toolchain import *
from aidoc_toolchain import _find_required_font, _missing_required_fonts
def normalize_markdown_for_pandoc(markdown: str) -> str:
"""Keep real TeX, but repair common ASCII aliases produced by LLMs."""
text = (markdown or "").replace("\r\n", "\n").replace("\r", "\n")
replacements = {
# Do not rewrite aliases that are already valid TeX commands.
# Earlier builds matched the "hbar" in "\\hbar" and turned it into
# "\\\\hbar", which made Pandoc/texmath choke on commands like
# i\\hbar, \frac and \nabla. Truly glorious self-sabotage.
r"(?<!\\)\bnabla\b": r"\\nabla",
r"(?<!\\)\bpartial\b": r"\\partial",
r"(?<!\\)\bh_bar\b": r"\\hbar",
r"(?<!\\)\bhbar\b": r"\\hbar",
r"(?<!\\)\binf\b": r"\\infty",
r"(?<!\\)\binfinity\b": r"\\infty",
r"(?<!\\)\bIntegral\b": r"\\int",
r"(?<!\\)\bintegral\b": r"\\int",
r"(?<!\\)\bSum\b": r"\\sum",
}
# Only rewrite aliases inside obvious formula-ish lines or existing math spans.
out_lines = []
for line in text.split("\n"):
probe = line.strip()
if is_formula_like_line(probe) or "$" in probe or "\\(" in probe or "\\[" in probe or MATH_ENV_RE.search(probe):
for pat, repl in replacements.items():
line = re.sub(pat, repl, line)
line = line.replace("->", r"\to") if ("$" in line or "\\" in line) else line.replace("->", "→")
out_lines.append(line)
return "\n".join(out_lines)
def strip_tex_spacing_commands(markdown: str) -> str:
"""Remove TeX row-spacing hints like ``\\[12pt]`` from model output."""
return re.sub(r"\\\\\s*\[\s*[0-9]+(?:\.[0-9]+)?\s*pt\s*\]", r"\\\\", markdown or "")
def _simplify_xarrow_for_texmath(match: re.Match[str]) -> str:
cmd = match.group(1)
below = (match.group(2) or "").strip()
above = (match.group(3) or "").strip()
if cmd == "xrightleftharpoons":
arrow = r"\rightleftharpoons"
elif cmd == "xleftarrow":
arrow = r"\leftarrow"
else:
arrow = r"\to"
if above:
arrow += "^{" + above + "}"
if below:
arrow += "_{" + below + "}"
return arrow
def downgrade_extended_arrows_for_texmath(markdown: str) -> str:
r"""Replace extended arrows unsupported by Pandoc texmath in DOCX/ODT/EPUB.
PDF/XeTeX accepts commands like ``\xrightleftharpoons`` via mathtools, but
Pandoc's native equation converter for Word/ODT/EPUB often rejects them. The
downgrade keeps arrow labels as ordinary super/subscripts on common relation
arrows. Not glamorous, but neither is getting raw TeX in a Word document.
"""
text = markdown or ""
label_atom = r"(?:[^{}]|\{[^{}]*\})*"
pattern = re.compile(
r"\\(xrightleftharpoons|xrightarrow|xleftarrow)\s*"
r"(?:\[([^\]]*)\])?\s*\{(" + label_atom + r")\}"
)
while True:
new = pattern.sub(_simplify_xarrow_for_texmath, text)
if new == text:
return new
text = new
def pandoc_source_for_format(markdown: str, fmt: str) -> str:
"""Return Markdown normalized for a specific Pandoc writer."""
text = strip_tex_spacing_commands(normalize_markdown_for_pandoc(markdown))
if fmt in {"docx", "odt", "epub", "txt", "rtf"}:
text = downgrade_extended_arrows_for_texmath(text)
return text
def has_scientific_math(markdown: str) -> bool:
text = markdown or ""
if DISPLAY_MATH_DELIM_RE.search(text) or re.search(r"(?<!\\)\$[^$]+(?<!\\)\$", text):
return True
if MATH_ENV_RE.search(text):
return True
aliases = ["nabla", "h_bar", "hbar", "\\frac", "\\sqrt", "\\sum", "\\int", "\\begin", "\\xrightarrow", "\\ce{", "\\pu{", "\\chemfig"]
return any(a in text for a in aliases)
def detect_document_profile(markdown: str) -> Tuple[str, List[str]]:
"""Classify the document for deterministic renderer selection."""
text = markdown or ""
lower = text.lower()
reasons: List[str] = []
chemistry_patterns = [
r"\\ce{", r"\\pu{", r"\\chemfig", r"\\xrightarrow", "c_6", "hno_3", "h2so4",
"\u043a\u0430\u0442\u0430\u043b\u0438\u0437", "\u0442\u0435\u043c\u043f\u0435\u0440\u0430\u0442\u0443\u0440", "\u0440\u0435\u0430\u043a\u0446\u0438", "\u043d\u0438\u0442\u0440\u043e\u0432\u0430\u043d", "\u043f\u043e\u043b\u0438\u043c\u0435\u0440\u0438\u0437\u0430\u0446", "\u044d\u043b\u0435\u043a\u0442\u0440\u043e\u0444\u0438\u043b\u044c",
]
if any(p in lower for p in chemistry_patterns):
reasons.append("chemistry notation/reaction labels detected")
return "chemistry", reasons
if any(p in text for p in [r"\nabla", "∇", r"\hbar", "ℏ", r"\partial", "∂", r"\Psi", "Ψ"]):
reasons.append("physics symbols detected")
return "physics", reasons
if MATH_ENV_RE.search(text) or any(p in text for p in [r"\begin{pmatrix}", r"\begin{bmatrix}", r"\begin{cases}", r"\begin{aligned}", r"\sum", r"\int", r"\oint", r"\frac"]):
reasons.append("advanced math environments/operators detected")
return "mathematics", reasons
if has_scientific_math(text):
reasons.append("LaTeX/math delimiters detected")
return "scientific", reasons
reasons.append("plain Markdown/text detected")
return "plain", reasons
def build_engine_plan(markdown: str, formats: List[str], requested_renderer: str) -> EnginePlan:
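# Selection sketch: an explicit "builtin" request short-circuits; under "auto", plain documents
# use the built-in renderer, and every scientific profile (including chemistry) selects MathJax
# for HTML, with Pandoc plus a TeX engine handling DOCX/ODT/EPUB/PDF when they are available.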
profile, reasons = detect_document_profile(markdown)
requested = (requested_renderer or "auto").lower()
if requested == "builtin":
return EnginePlan(profile, "builtin", False, pandoc=pandoc_executable() or "", tex_engine=available_tex_engine() or "", reasons=reasons + ["builtin explicitly requested"])
selected = requested
if requested == "auto":
if profile == "plain":
selected = "builtin"
elif profile == "chemistry":
# MathJax has the most forgiving chemistry path for HTML, and Pandoc+TeX handles mhchem for PDF.
selected = "mathjax"
else:
# MathJax is slower than KaTeX but accepts more TeX in messy model output. Robustness beats speed here.
selected = "mathjax"
pandoc = pandoc_executable() or ""
tex = available_tex_engine() or ""
needs_pandoc = selected != "builtin" and (requested != "auto" or profile != "plain" or any(f in {"docx", "odt", "rtf", "epub"} for f in formats))
if needs_pandoc and not pandoc:
reasons.append("Pandoc unavailable; built-in fallback will be used where necessary")
if "pdf" in formats and needs_pandoc and pandoc and not tex:
reasons.append("PDF needs a TeX engine for real LaTeX output; built-in PDF fallback will be used if none is installed")
return EnginePlan(profile, selected, needs_pandoc, pandoc=pandoc, tex_engine=tex, reasons=reasons)
def _font_present(*names: str) -> Optional[Path]:
# Search both the user toolchain and bundled plugin font directories.
# The STIX math font may be bundled with the plugin while Noto fonts are
# installed into the user's LM Studio cache.
return _find_required_font(names, primary=vendored_fonts_dir())
def _copy_font_to_temp(src: Path, dst_dir: Path) -> Optional[Path]:
"""Copy a bundled font into the Pandoc working directory.
Tectonic/XeTeX on Windows can choke on absolute font paths in xdvipdfmx
syntax, producing attempts to open names like ``[C:/.../font.ttf]/OT``.
Keeping fonts next to the temporary LaTeX build and referencing them via a
simple relative ``Path={fonts/}`` avoids that particular little circus.
"""
try:
dst_dir.mkdir(parents=True, exist_ok=True)
dst = dst_dir / src.name
if (not dst.exists()) or dst.stat().st_size != src.stat().st_size:
shutil.copy2(src, dst)
return dst
except Exception:
return None
def pandoc_font_header(temp_root: Path) -> Optional[Path]:
"""Create a LaTeX header that loads bundled fonts by relative path."""
regular = _font_present("NotoSerif-Regular.ttf")
bold = _font_present("NotoSerif-Bold.ttf")
italic = _font_present("NotoSerif-Italic.ttf")
bolditalic = _font_present("NotoSerif-BoldItalic.ttf")
mono = _font_present("NotoSansMono-Regular.ttf")
mathfont = _font_present("STIXTwoMath-Regular.otf", "STIX2Math.otf")
if not regular and not mathfont:
return None
local_fonts = temp_root / "fonts"
copied: Dict[str, Path] = {}
for key, src in {
"regular": regular,
"bold": bold,
"italic": italic,
"bolditalic": bolditalic,
"mono": mono,
"mathfont": mathfont,
}.items():
if src:
dst = _copy_font_to_temp(src, local_fonts)
if dst:
copied[key] = dst
font_path = "fonts/"
lines = [
r"\usepackage{fontspec}",
r"\defaultfontfeatures{Ligatures=TeX}",
]
if copied.get("regular"):
opts = [
f"Path={{{font_path}}}",
"Extension=.ttf",
"UprightFont=NotoSerif-Regular",
]
if copied.get("bold"):
opts.append("BoldFont=NotoSerif-Bold")
if copied.get("italic"):
opts.append("ItalicFont=NotoSerif-Italic")
if copied.get("bolditalic"):
opts.append("BoldItalicFont=NotoSerif-BoldItalic")
lines.append(r"\setmainfont[%s]{NotoSerif-Regular}" % ",".join(opts))
lines.append(r"\setsansfont[%s]{NotoSerif-Regular}" % ",".join(opts))
if copied.get("mono"):
lines.append(r"\setmonofont[Path={%s},Extension=.ttf,UprightFont=NotoSansMono-Regular]{NotoSansMono-Regular}" % font_path)
if copied.get("mathfont"):
lines.append(r"\usepackage{unicode-math}")
math_base = copied["mathfont"].stem
ext = copied["mathfont"].suffix or ".otf"
lines.append(r"\setmathfont[Path={%s},Extension=%s]{%s}" % (font_path, ext, math_base))
lines += [
r"\usepackage{mathtools}",
r"\usepackage[version=4]{mhchem}",
# amssymb is intentionally not loaded with unicode-math: it collides on commands such as \eth.
r"\providecommand{\Box}{\mathord{\text{□}}}",
r"\providecommand{\square}{\Box}",
]
header = temp_root / "vendored-fonts-header.tex"
header.write_text("\n".join(lines) + "\n", encoding="utf-8")
return header
def pandoc_scientific_latex_template(temp_root: Path) -> Path:
"""Return a minimal Pandoc LaTeX template that avoids amssymb+unicode-math conflicts."""
template = temp_root / "scientific-pandoc-template.latex"
if not template.exists():
template.write_text(r'''
\documentclass[11pt]{article}
\usepackage[a4paper,margin=22mm]{geometry}
\usepackage{xcolor}
\usepackage[unicode=true]{hyperref}
\hypersetup{hidelinks}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{longtable}
\usepackage{booktabs}
\usepackage{array}
\usepackage{multirow}
\usepackage[version=4]{mhchem}
\providecommand{\Box}{\mathord{\text{□}}}
\providecommand{\square}{\Box}
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
$if(title)$
\title{$title$}
$endif$
$for(header-includes)$
$header-includes$
$endfor$
\begin{document}
$if(title)$
\maketitle
$endif$
$body$
\end{document}
'''.lstrip(), encoding="utf-8")
return template
def pandoc_math_args(renderer: str, fmt: str) -> List[str]:
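# Examples: ("katex", "html") -> ["--katex"]; ("auto", "html") -> ["--mathjax"];
# (any renderer, "epub") -> ["--mathml"]; PDF/DOCX/ODT return [] because the TeX engine and
# Pandoc's native equation writers need no extra flag.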
renderer = (renderer or "auto").lower()
if fmt == "html":
if renderer == "katex":
return ["--katex"]
if renderer in {"mathml", "epub"}:
return ["--mathml"]
return ["--mathjax"]
if fmt == "epub":
# MathML is self-contained in EPUB. MathJax in ebooks is a little circus of broken readers.
return ["--mathml"]
return []
def markdown_uses_chemistry(markdown: str) -> bool:
text = markdown or ""
return any(token in text for token in [r"\ce{", r"\pu{", r"\chemfig", r"\xrightarrow", r"\overset", r"\underset"])
def pandoc_chemistry_args(markdown: str, fmt: str) -> List[str]:
"""Enable chemistry-grade reaction notation when Pandoc/TeX is available."""
if not markdown_uses_chemistry(markdown):
return []
args: List[str] = []
if fmt == "pdf":
# amsmath/mathtools provide \xrightarrow; mhchem provides journal-style \ce{...}.
args += [
"-V", r"header-includes=\usepackage{amsmath}",
"-V", r"header-includes=\usepackage{mathtools}",
]
if r"\ce{" in markdown or r"\pu{" in markdown:
args += ["-V", r"header-includes=\usepackage[version=4]{mhchem}"]
if r"\chemfig" in markdown:
args += ["-V", r"header-includes=\usepackage{chemfig}"]
elif fmt == "html":
# MathJax normally handles \xrightarrow. \ce{} needs mhchem support in the viewer/CDN.
if r"\ce{" in markdown or r"\pu{" in markdown:
args += ["--metadata", "mathjax-config=TeX: { extensions: [mhchem.js] }"]
return args
def run_pandoc(args: List[str], cwd: Path) -> subprocess.CompletedProcess[str]:
env = os.environ.copy()
bin_dir = str(vendored_bin_dir())
if Path(bin_dir).exists():
env["PATH"] = bin_dir + os.pathsep + env.get("PATH", "")
cache_dir = vendored_tectonic_cache_dir()
if cache_dir.exists():
env["TECTONIC_CACHE_DIR"] = str(cache_dir)
return subprocess.run(args, cwd=str(cwd), text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=30 * 60, check=True, env=env)
def try_generate_with_pandoc(
formats: List[str],
out_dir: Path,
base: str,
title: str,
markdown: str,
math_renderer: str,
temp_root: Path,
) -> Tuple[List[Dict[str, Any]], List[str], List[str]]:
files: List[Dict[str, Any]] = []
warnings: List[str] = []
errors: List[str] = []
pandoc = pandoc_executable()
if not pandoc:
warnings.append("Pandoc not found (system PATH or optional pypandoc_binary); using built-in fallback renderer")
return files, warnings, errors
if not command_path("pandoc"):
warnings.append("Using bundled Pandoc from optional pypandoc_binary")
from_arg = PANDOC_FROM
for fmt in formats:
path = out_dir / f"{base}.{fmt}"
try:
source = temp_root / f"{base}_{fmt}_pandoc.md"
source.write_text(pandoc_source_for_format(markdown, fmt), encoding="utf-8")
if fmt == "md":
text = source.read_text(encoding="utf-8")
path.write_text((f"# {title}\n\n" if title else "") + text.strip() + "\n", encoding="utf-8")
files.append({"format": fmt, "path": str(path), "size": path.stat().st_size, "renderer": "pandoc-source"})
continue
cmd = [pandoc, str(source), "-f", from_arg, "--standalone", "--metadata", f"title={title}", "-o", str(path)]
cmd += pandoc_chemistry_args(markdown, fmt)
if fmt == "txt":
cmd += ["-t", "plain", "--wrap=none"]
elif fmt == "html":
cmd += ["-t", "html5", *pandoc_math_args(math_renderer, fmt)]
elif fmt == "epub":
cmd += ["-t", "epub3", *pandoc_math_args(math_renderer, fmt)]
elif fmt == "pdf":
engine = available_tex_engine()
if not engine:
warnings.append("PDF skipped by Pandoc: no TeX engine found (install xelatex/lualatex/tectonic); falling back")
continue
cmd += ["--pdf-engine", engine, "--template", str(pandoc_scientific_latex_template(temp_root))]
if engine in {"xelatex", "lualatex", "pdflatex"}:
cmd += ["--pdf-engine-opt=-no-shell-escape"]
if engine in {"xelatex", "lualatex", "tectonic"}:
font_header = pandoc_font_header(temp_root)
if font_header:
cmd += ["--include-in-header", str(font_header)]
elif engine in {"xelatex", "lualatex"}:
cmd += ["-V", "mainfont=DejaVu Serif", "-V", "sansfont=DejaVu Sans", "-V", "monofont=DejaVu Sans Mono"]
elif fmt not in {"docx", "odt", "rtf"}:
warnings.append(f"Pandoc unsupported format for this plugin path: {fmt}; falling back")
continue
completed = run_pandoc(cmd, temp_root)
if completed.stderr.strip():
warnings.append(f"pandoc {fmt}: {completed.stderr.strip()[:1000]}")
if path.exists() and path.stat().st_size > 0:
files.append({"format": fmt, "path": str(path), "size": path.stat().st_size, "renderer": "pandoc"})
else:
warnings.append(f"Pandoc did not create {fmt}; falling back")
except subprocess.CalledProcessError as exc:
msg = (exc.stderr or exc.stdout or str(exc)).strip()
warnings.append(f"pandoc {fmt} failed; falling back: {msg[:1200]}")
except Exception as exc:
warnings.append(f"pandoc {fmt} failed; falling back: {exc}")
return files, warnings, errors
def expected_scientific_tokens(source: str) -> List[Tuple[str, List[str]]]:
r"""
Expected visible math tokens inferred from the source document.
This catches the old failure mode where a PDF visually showed empty tofu
squares, but text extraction silently dropped symbols like \hbar, \nabla or
\oint. A PASS that ignores missing glyphs is not verification; it is theatre.
"""
checks: List[Tuple[str, List[str], List[str]]] = [
("hbar", [r"\\hbar", "h_bar", "hbar", "ℏ"], ["ℏ", "hbar"]),
("nabla", [r"\\nabla", "nabla", "∇"], ["∇", "nabla"]),
("partial", [r"\\partial", "partial", "∂"], ["∂", "partial"]),
("oint", [r"\\oint", "oint", "∮"], ["∮", "oint"]),
("int", [r"\\int", "Integral", "∫"], ["∫", "int"]),
("sum", [r"\\sum", "sum_", "Σ", "∑"], ["Σ", "∑", "sum"]),
("prod", [r"\\prod", "Π", "∏"], ["Π", "∏", "prod"]),
("sqrt", [r"\\sqrt", "sqrt", "√"], ["√", "sqrt"]),
("lambda", [r"\\lambda", "λ"], ["λ", "lambda"]),
("determinant", [r"\\det"], ["det"]),
("infinity", [r"\\infty", "+inf", "-inf", "∞"], ["∞", "inf"]),
("left-right-arrow", [r"\\xrightarrow", r"\\to", "->", "→"], ["→", "->"]),
("reversible-arrow", [r"\\rightleftharpoons", r"\\leftrightarrow", r"\\xrightleftharpoons", "⇌", "↔"], ["⇌", "↔", "<->", "leftharpoons"]),
("chemistry-ce", [r"\\ce{", r"\\pu{"], ["→", "+", "H", "C", "O", "N"]),
]
result: List[Tuple[str, List[str]]] = []
lower_source = source.lower()
for label, triggers, alternatives in checks:
if any(t.lower() in lower_source for t in triggers):
result.append((label, alternatives))
return result
def detect_missing_expected_tokens(text: str, expected_tokens: Optional[List[Tuple[str, List[str]]]]) -> List[str]:
if not expected_tokens:
return []
compact = re.sub(r"\s+", "", text).lower()
errors: List[str] = []
for label, alternatives in expected_tokens:
found = False
for alt in alternatives:
alt_compact = re.sub(r"\s+", "", alt).lower()
if alt_compact and alt_compact in compact:
found = True
break
if not found:
errors.append(f"expected scientific glyph/token missing after rendering: {label}")
return errors
def verify_generated_file(path: Path, fmt: str, keep_previews: bool, out_dir: Path, base: str, allow_latex_math: bool, expected_tokens: Optional[List[Tuple[str, List[str]]]] = None) -> Tuple[List[str], List[str], Dict[str, str], int]:
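# Returns (warnings, errors, preview_dirs, extracted_text_chars) for a single generated file;
# PDFs additionally get a rasterized render pass via verify_pdf_render.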
warnings: List[str] = []
errors: List[str] = []
preview_dirs: Dict[str, str] = {}
extracted_chars = 0
if not path.exists() or path.stat().st_size == 0:
return warnings, [f"{fmt}: file was not created or is empty"], preview_dirs, extracted_chars
try:
text = extract_text(path, fmt)
extracted_chars = len(text)
w, e = detect_bad_text(text, allow_latex_math=allow_latex_math, allow_markdown=(fmt == "md"))
missing_expected = detect_missing_expected_tokens(text, expected_tokens)
warnings.extend([f"{fmt}: {x}" for x in w])
errors.extend([f"{fmt}: {x}" for x in e])
errors.extend([f"{fmt}: {x}" for x in missing_expected])
except Exception as exc:
warnings.append(f"{fmt}: text extraction failed: {exc}")
if fmt == "pdf":
pdir = out_dir / f"{base}_verify"
kept, w, e = verify_pdf_render(path, keep_previews, pdir)
warnings.extend([f"pdf: {x}" for x in w])
errors.extend([f"pdf: {x}" for x in e])
if kept:
preview_dirs[fmt] = kept
return warnings, errors, preview_dirs, extracted_chars
def strip_xml_text(xml: str) -> str:
xml = re.sub(r"<[^>]+>", " ", xml)
return html.unescape(xml)
def extract_text(path: Path, fmt: str) -> str:
if fmt in {"txt", "md", "html", "rtf"}:
data = path.read_text(encoding="utf-8", errors="replace")
if fmt == "html":
data = re.sub(r"(?is)<(script|style)[^>]*>.*?<\/\1>", " ", data)
return re.sub(r"<[^>]+>", " ", html.unescape(data))
if fmt == "rtf":
data = re.sub(r"\\u(-?\d+)\?", lambda m: chr(int(m.group(1)) if int(m.group(1)) >= 0 else int(m.group(1)) + 65536), data)
data = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", data)
data = data.replace("{", " ").replace("}", " ").replace("\\", " ")
return data
return data
if fmt == "pdf":
reader = PdfReader(str(path))
return "\n".join(page.extract_text() or "" for page in reader.pages)
if fmt == "docx":
with zipfile.ZipFile(path, "r") as z:
xml = z.read("word/document.xml").decode("utf-8", "replace")
return strip_xml_text(xml)
if fmt == "odt":
with zipfile.ZipFile(path, "r") as z:
xml = z.read("content.xml").decode("utf-8", "replace")
return strip_xml_text(xml)
if fmt == "epub":
with zipfile.ZipFile(path, "r") as z:
texts = []
for name in z.namelist():
if name.endswith((".xhtml", ".html")):
texts.append(strip_xml_text(z.read(name).decode("utf-8", "replace")))
return "\n".join(texts)
return ""
def count_tofu_square_artifacts(pil_image: Image.Image, max_examples: int = 8) -> List[Tuple[int, int, int, int]]:
"""Detect small hollow square glyphs in rendered PDF pages.
This is intentionally conservative: it looks for the visual pattern produced
by missing font glyph boxes, not for every possible square-like shape.
"""
gray = pil_image.convert("L")
w, h = gray.size
ink_mask = gray.point(lambda p: 255 if p < 245 else 0, "L")
bbox = ink_mask.getbbox()
if not bbox:
return []
pad = 4
left, top, right, bottom = bbox
left = max(0, left - pad)
top = max(0, top - pad)
right = min(w, right + pad)
bottom = min(h, bottom + pad)
gray = gray.crop((left, top, right, bottom))
w, h = gray.size
pix = gray.load()
threshold = 135
rows: List[bytearray] = []
for y in range(h):
row = bytearray(w)
for x in range(w):
row[x] = 1 if pix[x, y] < threshold else 0
rows.append(row)
examples: List[Tuple[int, int, int, int]] = []
for y, row in enumerate(rows):
x = 0
while x < w:
while x < w and not row[x]:
x += 1
start = x
while x < w and row[x]:
x += 1
end = x - 1
length = end - start + 1
if not (10 <= length <= 28):
continue
for side in range(max(10, length - 3), min(29, length + 4)):
by = y + side - 1
if by >= h:
continue
xs = max(0, start - 1)
xe = min(w, end + 2)
span = max(1, xe - xs)
bottom_dark = sum(rows[by][xx] for xx in range(xs, xe)) / span
if bottom_dark < 0.70:
continue
x1, x2 = start, end
denom = by - y + 1
left_dark = sum(
1 for yy in range(y, by + 1)
if any(rows[yy][xx] for xx in range(max(0, x1 - 1), min(w, x1 + 2)))
) / denom
right_dark = sum(
1 for yy in range(y, by + 1)
if any(rows[yy][xx] for xx in range(max(0, x2 - 1), min(w, x2 + 2)))
) / denom
if left_dark < 0.70 or right_dark < 0.70:
continue
ix1, ix2 = x1 + 2, x2 - 1
iy1, iy2 = y + 2, by - 1
if ix2 <= ix1 or iy2 <= iy1:
continue
total = (ix2 - ix1) * (iy2 - iy1)
inside_dark = sum(rows[yy][xx] for yy in range(iy1, iy2) for xx in range(ix1, ix2)) / total
if inside_dark > 0.08:
continue
examples.append((left + x1, top + y, x2 - x1 + 1, by - y + 1))
if len(examples) >= max_examples:
return examples
return examples
def verify_pdf_render(path: Path, keep_previews: bool, preview_dir: Path) -> Tuple[Optional[str], List[str], List[str]]:
warnings: List[str] = []
errors: List[str] = []
kept_dir: Optional[str] = None
actual_dir = preview_dir
try:
pdf = pdfium.PdfDocument(str(path))
actual_dir.mkdir(parents=True, exist_ok=True)
for i in range(len(pdf)):
page = pdf[i]
bitmap = page.render(scale=1.4)
pil_image = bitmap.to_pil()
out = actual_dir / f"page-{i+1}.png"
pil_image.save(out)
stat = ImageStat.Stat(pil_image.convert("L"))
if stat.extrema[0][0] > 248 and stat.extrema[0][1] > 248:
warnings.append(f"PDF page {i+1} looks blank")
tofu_boxes = count_tofu_square_artifacts(pil_image)
if tofu_boxes:
examples = ", ".join(f"x={x},y={y},w={w},h={h}" for x, y, w, h in tofu_boxes[:4])
errors.append(f"PDF page {i+1} appears to contain missing-glyph square boxes ({len(tofu_boxes)} candidate(s); {examples})")
pdf.close()
if keep_previews:
kept_dir = str(actual_dir)
else:
# Verification previews are implementation detail. Do not dump them next to user files
# just because verification failed; that turns a useful failure into directory confetti.
shutil.rmtree(actual_dir, ignore_errors=True)
except Exception as exc:
errors.append(f"PDF render verification failed: {exc}")
if not keep_previews:
shutil.rmtree(actual_dir, ignore_errors=True)
return kept_dir, warnings, errors
def detect_bad_text(text: str, allow_latex_math: bool = False, allow_markdown: bool = False) -> Tuple[List[str], List[str]]:
warnings: List[str] = []
errors: List[str] = []
if len(text.strip()) < 10:
warnings.append("extracted text is very short")
bad_chars = ["□", "■", "�", "▯", "\ufffe", "\uffff"]
found_bad = [ch for ch in bad_chars if ch in text]
if found_bad:
errors.append("broken glyph artifacts found: " + " ".join(found_bad))
internal_markers = [SUB_OPEN, SUB_CLOSE, SUP_OPEN, SUP_CLOSE, RXN_ARROW_OPEN, RXN_ARROW_MID, RXN_ARROW_CLOSE]
found_markers = [m for m in internal_markers if m in text]
if found_markers or "§" in text:
unique = sorted(set(found_markers + (["§"] if "§" in text else [])))
errors.append("internal renderer marker leaked into output: " + ", ".join(unique))
if not allow_latex_math:
latex_patterns = [r"\$", r"\\text", r"\\frac", r"\\sqrt", r"\\begin", r"\\end", r"_\{", r"\^\{", r"\\\(", r"\\\[", r"\\(?=[ℏ∇∂∫∮ΣΠΛμνΨψΔδσπ∞≤≥≈√→←↔⇒⇐⇔])"]
leftovers = [pat for pat in latex_patterns if re.search(pat, text)]
if leftovers:
errors.append("LaTeX/MathJax artifacts remain: " + ", ".join(leftovers))
if (not allow_markdown) and ("**" in text or "__" in text):
errors.append("raw Markdown bold markers remain in output")
# Catch common TeX degradation artifacts from fallback rendering.
if re.search(r"\[[0-9]+(?:\.[0-9]+)?pt\]", text):
errors.append("TeX spacing command leaked into output, for example [10pt]")
if not allow_latex_math and re.search(r"(?<![A-Za-z])(?:leftharpoons|rightleftharpoons|xrightarrow|xleftarrow|(?:right)?arrow)(?![A-Za-z])", text):
errors.append("unrendered TeX arrow command leaked into output")
if re.search(r"\b([A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9°=/]{3,})\1\b", text):
errors.append("duplicated token suggests broken TeX overset/underset conversion")
if re.search(r"°Cleft", text):
errors.append("temperature and arrow command were concatenated")
weird_controls = [ch for ch in text if (ord(ch) < 32 and ch not in "\t\n\r") or 0xFFFE <= ord(ch) <= 0xFFFF]
if weird_controls:
sample = " ".join(hex(ord(ch)) for ch in sorted(set(weird_controls))[:6])
errors.append("invalid control/noncharacter glyphs found: " + sample)
if re.search(r"=\s*,", text):
errors.append("formula appears to have lost its right-hand side before a comma")
# Single '*' can be a valid multiplication operator. Do not fail on it.
return warnings, errors
def generate_one(fmt: str, path: Path, title: str, blocks: List[Block], style: str, keep_previews: bool, temp_root: Path) -> Dict[str, Any]:
if fmt == "pdf":
generate_pdf(path, title, blocks, style, keep_previews, temp_root)
elif fmt == "docx":
generate_docx(path, title, blocks)
elif fmt == "txt":
generate_txt(path, blocks)
elif fmt == "md":
generate_md(path, blocks, title)
elif fmt == "html":
generate_html(path, title, blocks)
elif fmt == "odt":
generate_odt(path, title, blocks)
elif fmt == "rtf":
generate_rtf(path, title, blocks)
elif fmt == "epub":
generate_epub(path, title, blocks)
else:
raise ValueError(f"Unsupported format: {fmt}")
return {"format": fmt, "path": str(path), "size": path.stat().st_size if path.exists() else 0}
def safe_filename_base(value: str) -> str:
base = re.sub(r"[<>:\"/\\|?*\x00-\x1F]", "_", value or "document")
base = re.sub(r"\s+", "_", base).strip("._ ")
return base[:140] or "document"
def strip_auto_version_suffix(base: str) -> str:
"""Remove model-invented retry suffixes like _2, -3, or (2)."""
value = (base or "document").strip()
patterns = [
r"(?i)^(.+?)[_-](?:v|ver|version)[_-]?([2-9]|[1-9]\d+)$",
r"(?i)^(.+?)[_-](?:copy|\u043a\u043e\u043f\u0438\u044f)[_-]?(\d+)$",
r"^(.+?)[_-]([2-9]|[1-9]\d+)$",
r"^(.+?)_?\(([2-9]|[1-9]\d+)\)$",
]
for pat in patterns:
m = re.match(pat, value)
if m:
candidate = m.group(1).strip("._- ")
if len(candidate) >= 3:
return candidate
return value
def advanced_scientific_source(source: str) -> bool:
text = source or ""
advanced_patterns = [
r"\\frac", r"\\sqrt", r"\\sum", r"\\prod", r"\\int", r"\\iint", r"\\iiint", r"\\oint",
r"\\xrightarrow", r"\\overset", r"\\underset", r"\\begin\s*\{(?:matrix|pmatrix|bmatrix|array|cases|aligned|align|gather|split)",
r"\\ce\{", r"\\chemfig", r"\\nabla", r"\\partial", r"\\hbar",
]
return any(re.search(p, text) for p in advanced_patterns)
def scientific_fallback_quality_errors(source: str, profile: str, renderer_requested: str, formats: List[str], allow_fallback_pass: bool) -> List[str]:
if allow_fallback_pass:
return []
if (renderer_requested or "auto").lower() == "builtin":
return []
if profile == "plain" or not advanced_scientific_source(source):
return []
rich_formats = [f for f in formats if f in {"pdf", "docx", "html", "odt", "rtf", "epub"}]
if not rich_formats:
return []
return [
"advanced scientific/chemistry TeX was rendered by the built-in fallback, not by Pandoc/MathJax/TeX; output is readable but not verified as academic typography. Install Pandoc plus xelatex/lualatex/tectonic for PDF, or request plain TXT/MD/HTML-only fallback output."
]
def detect_document_language(text: str) -> str:
sample = text or ""
cyr = sum(1 for ch in sample if "\u0400" <= ch <= "\u04FF")
cjk = sum(1 for ch in sample if "\u4E00" <= ch <= "\u9FFF")
if cjk >= 4:
return "zh"
if cyr >= 4:
return "ru"
low = sample.lower()
french_hits = sum(1 for w in (" le ", " la ", " les ", " des ", " une ", " avec ", "pour ", "export", "polices") if w in low)
if french_hits >= 3 or any(ch in sample for ch in "éèêàùçôîïû"):
return "fr"
return "en"
def localized_install_prompt(lang: str) -> str:
lang = (lang or "en").lower()
if lang.startswith("ru"):
return "Для улучшенного академического экспорта PDF/DOCX/ODT/EPUB нужно установить Pandoc, Tectonic и шрифты. Напишите ровно «Установить» или «Не устанавливать»."
if lang.startswith("zh"):
return "To improve academic PDF/DOCX/ODT/EPUB export, install Pandoc, Tectonic, and fonts. Reply exactly “Install” or “Do not install”."
if lang.startswith("fr"):
return "Pour améliorer l’export académique PDF/DOCX/ODT/EPUB, installer Pandoc, Tectonic et les polices. Répondez exactement « Installer » ou « Ne pas installer »."
return "To improve academic PDF/DOCX/ODT/EPUB export, install Pandoc, Tectonic, and fonts. Reply exactly “Install” or “Do not install”."
def cleanup_working_directory_artifacts(out_dir: Path, base: str, keep_report: bool, keep_previews: bool) -> List[str]:
"""Remove verification/build leftovers from the user-visible output directory.
Generated documents are preserved. This only deletes known diagnostic/build
artifacts that are useful during verification and obnoxious afterward. If
the user asked to keep previews or reports, respect that request, because
apparently some humans enjoy collecting evidence of suffering.
"""
removed: List[str] = []
def rm_path(path: Path) -> None:
try:
if path.is_dir():
shutil.rmtree(path, ignore_errors=True)
removed.append(str(path))
elif path.exists():
path.unlink()
removed.append(str(path))
except Exception:
pass
if not keep_previews:
rm_path(out_dir / f"{base}_verify")
rm_path(out_dir / f"{base}_previews")
if not keep_report:
rm_path(out_dir / f"{base}_verification_report.json")
for name in (
f"{base}.aux",
f"{base}.log",
f"{base}.out",
f"{base}.toc",
f"{base}.lof",
f"{base}.lot",
f"{base}.fls",
f"{base}.fdb_latexmk",
f"{base}.synctex.gz",
f"{base}.xdv",
f"{base}.tex",
f"{base}.tmp",
"texput.aux",
"texput.log",
"texput.out",
"texput.toc",
"texput.tex",
"texput.xdv",
"texput.fls",
"texput.fdb_latexmk",
"texput.synctex.gz",
):
rm_path(out_dir / name)
for name in (f"{base}_files", f"{base}-media", f"{base}_media"):
rm_path(out_dir / name)
return removed
def main(request_path: str) -> Dict[str, Any]:
req = json.loads(Path(request_path).read_text(encoding="utf-8"))
title = str(req.get("title") or "Document")
markdown = str(req.get("markdown") or "")
detected_language = str(req.get("user_language") or req.get("language") or detect_document_language(title + "\n" + markdown))
install_prompt_text = localized_install_prompt(detected_language)
formats = [str(f).lower() for f in req.get("formats") or [req.get("format") or "pdf"]]
formats = [f for f in formats if f in SUPPORTED_FORMATS] or ["pdf"]
original_formats = list(formats)
# When academic Pandoc/TeX export is unavailable, still create every requested
# format using the built-in non-academic renderer. A missing engine should not
# leave the user staring into the void.
fallback_all_formats = set(SUPPORTED_FORMATS)
forced_non_academic_fallback = False
skipped_due_to_missing_toolchain: List[str] = []
out_dir = resolve_safe_output_dir(req.get("output_dir"), request_path)
out_dir.mkdir(parents=True, exist_ok=True)
base = safe_filename_base(str(req.get("filename_base") or title or "document"))
if bool(req.get("overwrite_existing", True)):
base = strip_auto_version_suffix(base)
style = str(req.get("style") or "lecture")
strict = bool(req.get("strict", False))
keep_previews = bool(req.get("keep_previews", False))
keep_report = bool(req.get("keep_report", False))
# v0.4.9: publish staged outputs even when verification finds format-specific issues.
# Losing every file because TXT/RTF could not render a TeX arrow is worse than
# returning useful PDFs/DOCX/HTML plus a clear FAIL report. Models can retry and
# overwrite the same filenames on the next iteration.
keep_failed_outputs = bool(req.get("keep_failed_outputs", True))
math_renderer = str(req.get("math_renderer") or "auto").lower()
if math_renderer not in {"auto", "pandoc", "builtin", "mathjax", "katex", "mathml"}:
math_renderer = "auto"
allow_scientific_fallback_pass = False # models may not waive scientific verification
toolchain_warnings: List[str] = []
if bool(req.get("auto_install_engines", False)):
toolchain_warnings.append(
"Ignored auto_install_engines=true. Document generation never downloads or installs engines; use install_document_engines after explicit user consent."
)
markdown, tex_security_warnings = sanitize_unsafe_tex(markdown)
toolchain_warnings.extend(tex_security_warnings)
engine_plan = build_engine_plan(markdown, formats, math_renderer)
# v0.4.1: fail before rendering when the requested job needs a real
# scientific typesetter. This avoids the old behavior where the tool first
# generated broken fallback PDFs, then complained about the wreckage it had
# just made. The LM Studio chat pane can display TeX beautifully because its
# UI uses a math renderer; that does not magically give the export tool
# Pandoc, Word equations, or a TeX engine. Cruel, but factual.
rich_formats = {"pdf", "docx", "odt", "rtf", "epub"}
# Treat any detected non-plain scientific/math/physics/chemistry document in rich formats
# as eligible for academic engines. The old check only matched a narrow list like
# \\frac/\\sum/\\xrightarrow and missed common content such as \\hat{H}\\Psi or
# \\xrightleftharpoons, so the tool silently produced fallback files without
# offering the installer.
needs_external_science = (engine_plan.profile != "plain" or advanced_scientific_source(markdown)) and any(f in rich_formats for f in formats) and math_renderer != "builtin"
if needs_external_science:
missing: List[str] = []
missing_fonts: List[str] = []
if not engine_plan.pandoc:
missing.append("Pandoc is required for verified scientific PDF/DOCX/ODT/RTF/EPUB export, but it was not found in the vendored toolchain, PATH, or optional pypandoc_binary.")
if "pdf" in formats and not engine_plan.tex_engine:
missing.append("A TeX engine is required for verified scientific PDF export: install/enable vendored Tectonic or system xelatex, lualatex, tectonic, or pdflatex.")
if "pdf" in formats and engine_plan.tex_engine:
missing_fonts = _missing_required_fonts(vendored_fonts_dir())
if missing_fonts:
missing.append(
"Required PDF fonts for verified scientific export are missing from "
+ str(vendored_fonts_dir())
+ ": "
+ ", ".join(missing_fonts)
+ ". Install document engines/fonts before rendering so Tectonic/XeTeX can use Noto Serif, Noto Sans Mono, and STIX Two Math instead of silently falling back."
)
if missing:
fallback_formats = [f for f in formats if f in fallback_all_formats]
toolchain_warnings.append(
"Required scientific export components are missing; generating ALL requested formats with the built-in non-academic fallback renderer: "
+ ", ".join(fallback_formats)
+ ". PDF/DOCX/ODT/EPUB may be readable but are not verified academic typography."
)
toolchain_warnings.extend(missing)
formats = fallback_formats
forced_non_academic_fallback = True
engine_plan = build_engine_plan(markdown, formats, "builtin")
staged_files: List[Dict[str, Any]] = []
files: List[Dict[str, Any]] = []
discarded_files: List[Dict[str, Any]] = []
warnings: List[str] = list(toolchain_warnings)
errors: List[str] = []
if forced_non_academic_fallback:
errors.append(
"Academic Pandoc/TeX scientific export was skipped because document engines/fonts are missing. "
"All requested formats were generated with the built-in fallback renderer; they are readable but not academic-standard verified typography."
)
preview_dirs: Dict[str, str] = {}
cleanup: Dict[str, Any] = {"replacements": 0, "remaining_artifacts": []}
expected_tokens = expected_scientific_tokens(markdown)
renderer_used = "builtin"
temp_root = Path(tempfile.mkdtemp(prefix="ai_doc_verify_"))
staging_dir = temp_root / "staged_outputs"
staging_dir.mkdir(parents=True, exist_ok=True)
try:
# Verified export is atomic: write to staging, verify, then publish.
# Failed documents should not sit next to real user files pretending to be usable output.
should_try_pandoc = engine_plan.should_try_pandoc
generated_formats = set()
if should_try_pandoc:
p_files, p_warnings, p_errors = try_generate_with_pandoc(formats, staging_dir, base, title, markdown, engine_plan.selected_math_renderer, temp_root)
warnings.extend(p_warnings)
errors.extend(p_errors)
if p_files:
renderer_used = "pandoc"
for info in p_files:
fmt = info["format"]
generated_formats.add(fmt)
path = Path(info["path"])
allow_latex = fmt in {"html", "epub", "md", "txt", "rtf"} and engine_plan.selected_math_renderer in {"auto", "mathjax", "katex", "mathml", "pandoc"}
w, e, pdirs, chars = verify_generated_file(path, fmt, keep_previews, out_dir, base, allow_latex_math=allow_latex, expected_tokens=None)
warnings.extend(w)
errors.extend(e)
preview_dirs.update(pdirs)
info["extracted_text_chars"] = chars
staged_files.append(info)
remaining_formats = [f for f in formats if f not in generated_formats]
if math_renderer == "pandoc" and remaining_formats:
warnings.append("Pandoc was requested but could not render all formats; built-in fallback rendered the rest")
if remaining_formats:
if generated_formats:
renderer_used = "mixed"
cleaned, cleanup = clean_math_markup(markdown)
blocks = parse_blocks(cleaned)
for fmt in remaining_formats:
path = staging_dir / f"{base}.{fmt}"
info = generate_one(fmt, path, title, blocks, style, keep_previews, temp_root)
info["renderer"] = "builtin"
staged_files.append(info)
w, e, pdirs, chars = verify_generated_file(path, fmt, keep_previews, out_dir, base, allow_latex_math=False, expected_tokens=expected_tokens)
warnings.extend(w)
errors.extend(e)
preview_dirs.update(pdirs)
info["extracted_text_chars"] = chars
errors.extend(scientific_fallback_quality_errors(markdown, engine_plan.profile, math_renderer, remaining_formats, allow_scientific_fallback_pass))
# Warnings from Pandoc/Tectonic include benign things like Fontconfig noise on
# Windows, overfull boxes, or plain-text writers leaving display math as TeX.
# They should not discard otherwise useful files. Verification FAIL is now based
# on real errors only; strict is retained for API compatibility, not as a
# directory-shredder. Humans invented warnings to read them, not to delete PDFs.
ok = len(errors) == 0
if ok or keep_failed_outputs:
for info in staged_files:
fmt = info.get("format")
if not fmt:
continue
src = Path(info["path"])
dst = out_dir / f"{base}.{fmt}"
if src.exists():
shutil.copy2(src, dst)
new_info = dict(info)
new_info["path"] = str(dst)
new_info["size"] = dst.stat().st_size if dst.exists() else 0
files.append(new_info)
if (not ok) and keep_failed_outputs:
warnings.append("verification failed, but failed output files were kept because keep_failed_outputs=true")
else:
discarded_files = [
{"format": info.get("format"), "renderer": info.get("renderer"), "size": info.get("size", 0)}
for info in staged_files
]
warnings.append("verification failed; generated output files were discarded because keep_failed_outputs=false.")
removed_workdir_artifacts = cleanup_working_directory_artifacts(out_dir, base, keep_report, keep_previews)
if removed_workdir_artifacts:
cleanup["removed_workdir_artifacts"] = removed_workdir_artifacts
report = {
"ok": ok,
"renderer": renderer_used,
"math_renderer_requested": math_renderer,
"engine_plan": {
"profile": engine_plan.profile,
"selected_math_renderer": engine_plan.selected_math_renderer,
"should_try_pandoc": engine_plan.should_try_pandoc,
"pandoc": engine_plan.pandoc,
"tex_engine": engine_plan.tex_engine,
"reasons": engine_plan.reasons or [],
},
"files": files,
"discarded_files": discarded_files + [
{"format": f, "renderer": "skipped-missing-toolchain", "size": 0}
for f in skipped_due_to_missing_toolchain
],
"requested_formats": original_formats,
"cleanup": cleanup,
"verification": {
"warnings": warnings,
"errors": errors,
},
"preview_dirs": preview_dirs,
"allow_scientific_fallback_pass": allow_scientific_fallback_pass,
"keep_failed_outputs": keep_failed_outputs,
"install_required": bool(forced_non_academic_fallback),
"install_prompt": install_prompt_text if forced_non_academic_fallback else "",
"language": detected_language,
"toolchain": toolchain_status(),
}
if (not ok) and not keep_report:
report["verification"]["warnings"].append("verification failed; diagnostics were returned in the tool result and not written to the working directory. Set keep_report=true or keep_previews=true to keep diagnostic files.")
if keep_report:
report_path = out_dir / f"{base}_verification_report.json"
report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
report["report_path"] = str(report_path)
return report
finally:
shutil.rmtree(temp_root, ignore_errors=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("request_json", nargs="?")
parser.add_argument("--install-toolchain", action="store_true", help="download/extract Pandoc + Tectonic + fonts into the user toolchain cache")
parser.add_argument("--toolchain-status", action="store_true", help="print detected toolchain paths")
parser.add_argument("--source", choices=["online", "local"], default="online", help="download from official sources or extract from --archives-dir")
parser.add_argument("--platform", default="auto", help="auto, windows-x64, linux-x64, macos-x64, macos-arm64")
parser.add_argument("--archives-dir", default="", help="directory with pre-downloaded archives for source=local")
parser.add_argument("--warm-cache", action="store_true", help="warm Tectonic cache after installing")
parser.add_argument("--force", action="store_true", help="replace existing engines/fonts")
args = parser.parse_args()
if args.toolchain_status:
print(json.dumps({"ok": True, "toolchain": toolchain_status()}, ensure_ascii=False))
sys.exit(0)
if args.install_toolchain:
result = install_toolchain_command(source=args.source, platform_name=args.platform, archives_dir=args.archives_dir or None, warm_cache=bool(args.warm_cache), force=bool(args.force))
print(json.dumps(result, ensure_ascii=False))
sys.exit(0 if result.get("ok") else 1)
if not args.request_json:
print("usage: generate_document_verified.py request.json [--install-toolchain]", file=sys.stderr)
sys.exit(2)
try:
result = main(args.request_json)
response_path = None
try:
req = json.loads(Path(args.request_json).read_text(encoding="utf-8"))
response_path = req.get("response_path")
except Exception:
pass
if response_path:
Path(response_path).write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
print(json.dumps(result, ensure_ascii=False))
sys.exit(0)
except Exception as exc:
error = {
"ok": False,
"files": [],
"discarded_files": [],
"error": str(exc),
"traceback": traceback.format_exc(),
"verification": {
"warnings": [],
"errors": [str(exc)],
},
"cleanup": {"replacements": 0, "remaining_artifacts": []},
"install_required": False,
"toolchain": toolchain_status(),
}
print(json.dumps(error, ensure_ascii=False), file=sys.stdout)
sys.exit(1)