# Project Files
# scripts/aidoc_render_builtin.py
"""Built-in Markdown-like document parser and fallback renderers.
This module contains dependency-light PDF/DOCX/HTML/TXT/MD/RTF/ODT/EPUB output
used when Pandoc/TeX is unavailable or not requested.
"""
from __future__ import annotations
import html
import os
import re
import shutil
import tempfile
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from xml.sax.saxutils import escape as xml_escape
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import PageBreak, Paragraph, Preformatted, SimpleDocTemplate, Spacer, Table, TableStyle
def plugin_root() -> Path:
    """Return the plugin's root directory (two levels above this script)."""
    here = Path(__file__).resolve()
    return here.parents[1]
# Output formats the built-in fallback renderers can produce.
SUPPORTED_FORMATS = {"pdf", "docx", "txt", "md", "html", "odt", "rtf", "epub"}
# Internal, renderer-neutral markers for subscript/superscript spans. The §
# sentinels are chosen so ordinary document text is unlikely to collide.
SUB_OPEN = "§SUB§"
SUB_CLOSE = "§ENDSUB§"
SUP_OPEN = "§SUP§"
SUP_CLOSE = "§ENDSUP§"
# Markers encoding a labeled reaction arrow: OPEN <above> MID <below> CLOSE.
RXN_ARROW_OPEN = "§RXNARROW§"
RXN_ARROW_MID = "§RXNMID§"
RXN_ARROW_CLOSE = "§ENDRXNARROW§"
# Input-format string handed to Pandoc when the Pandoc path is used.
PANDOC_FROM = "markdown+tex_math_dollars+tex_math_single_backslash+raw_tex+smart"
# Detects a TeX math environment opener anywhere in a fragment.
MATH_ENV_RE = re.compile(r"\\begin\s*\{(?:matrix|pmatrix|bmatrix|Bmatrix|vmatrix|Vmatrix|smallmatrix|array|cases|aligned|align|alignat|gather|multline|equation|split)\}", re.I)
# Detects display-math delimiters ($$, \[ or a display environment) at the start of a line.
DISPLAY_MATH_DELIM_RE = re.compile(r"(^|\n)\s*(?:\$\$|\\\[|\\begin\s*\{(?:equation|align|aligned|gather|multline|split|matrix|pmatrix|bmatrix|Bmatrix|vmatrix|Vmatrix|array|cases)\})", re.I)
@dataclass
class FontSet:
    """Registered ReportLab font names, one per typographic role."""
    regular: str  # body text
    bold: str
    italic: str
    bold_italic: str
    mono: str  # code blocks / preformatted text
    math: str  # fallback font for math and symbol glyphs
@dataclass
class Segment:
    """A run of inline text with uniform styling flags."""
    text: str
    sub: bool = False  # render as subscript
    sup: bool = False  # render as superscript
    bold: bool = False
    italic: bool = False
    code: bool = False  # inline code span
@dataclass
class Block:
    """One parsed document block (paragraph, heading, list item, ...)."""
    kind: str  # "paragraph" | "heading" | "list" | "formula" | "code" | "hr" | "blank"
    text: str = ""
    level: int = 0  # heading level (1-6) when kind == "heading"
    ordered: bool = False  # list item came from an ordered list
@dataclass
class EnginePlan:
    """Resolved rendering strategy for one conversion run."""
    profile: str
    selected_math_renderer: str
    should_try_pandoc: bool  # whether to attempt Pandoc before the built-in fallback
    pandoc: str = ""  # pandoc executable (presumably a path/command) — confirm at call sites
    tex_engine: str = ""  # TeX engine name/path, if any — confirm at call sites
    reasons: Optional[List[str]] = None  # human-readable notes on why this plan was chosen
def marker_sub(value: str) -> str:
    """Wrap *value* in internal subscript markers; blank input yields ""."""
    stripped = (value or "").strip()
    if not stripped:
        return ""
    return f"{SUB_OPEN}{stripped}{SUB_CLOSE}"
def marker_sup(value: str) -> str:
    """Wrap *value* in internal superscript markers; blank input yields ""."""
    stripped = (value or "").strip()
    if not stripped:
        return ""
    return f"{SUP_OPEN}{stripped}{SUP_CLOSE}"
def marker_reaction_arrow(above: str = "", below: str = "") -> str:
    """Encode a reaction arrow with optional above/below labels as markers."""
    top = (above or "").strip()
    bottom = (below or "").strip()
    return "".join((RXN_ARROW_OPEN, top, RXN_ARROW_MID, bottom, RXN_ARROW_CLOSE))
def reaction_arrow_markers_to_text(text: str) -> str:
    """Replace internal reaction-arrow markers with a plain ' →[labels] ' form."""
    pattern = re.compile(
        re.escape(RXN_ARROW_OPEN) + r"(.*?)" + re.escape(RXN_ARROW_MID) + r"(.*?)" + re.escape(RXN_ARROW_CLOSE),
        flags=re.S,
    )

    def _expand(match: re.Match[str]) -> str:
        labels = []
        for group in (match.group(1), match.group(2)):
            cleaned = plain_text_from_markers(group).strip()
            if cleaned:
                labels.append(cleaned)
        suffix = "[" + "; ".join(labels) + "]" if labels else ""
        return " →" + suffix + " "

    return pattern.sub(_expand, text or "")
# Unicode subscript characters -> their ASCII equivalents.
REVERSE_SUBSCRIPT_MAP = {
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9",
    "₊": "+", "₋": "-", "₌": "=", "₍": "(", "₎": ")", "ₐ": "a", "ₑ": "e", "ₕ": "h", "ᵢ": "i", "ⱼ": "j",
    "ₖ": "k", "ₗ": "l", "ₘ": "m", "ₙ": "n", "ₒ": "o", "ₚ": "p", "ᵣ": "r", "ₛ": "s", "ₜ": "t", "ᵤ": "u", "ᵥ": "v", "ₓ": "x",
}
# Unicode superscript characters -> their ASCII equivalents.
REVERSE_SUPERSCRIPT_MAP = {
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "⁺": "+", "⁻": "-", "⁼": "=", "⁽": "(", "⁾": ")", "ᵃ": "a", "ᵇ": "b", "ᶜ": "c", "ᵈ": "d", "ᵉ": "e", "ᶠ": "f", "ᵍ": "g", "ʰ": "h",
    "ⁱ": "i", "ʲ": "j", "ᵏ": "k", "ˡ": "l", "ᵐ": "m", "ⁿ": "n", "ᵒ": "o", "ᵖ": "p", "ʳ": "r", "ˢ": "s", "ᵗ": "t", "ᵘ": "u", "ᵛ": "v", "ʷ": "w", "ˣ": "x", "ʸ": "y", "ᶻ": "z",
}
# Regex-escaped character classes matching any mapped sub/superscript character.
SUBSCRIPT_CHARS = "".join(re.escape(ch) for ch in REVERSE_SUBSCRIPT_MAP)
SUPERSCRIPT_CHARS = "".join(re.escape(ch) for ch in REVERSE_SUPERSCRIPT_MAP)
# TeX Greek-letter command names -> Unicode glyphs.
GREEK_COMMANDS = {
    "alpha": "α", "beta": "β", "gamma": "γ", "delta": "δ", "epsilon": "ε", "varepsilon": "ε", "zeta": "ζ", "eta": "η", "theta": "θ",
    "vartheta": "ϑ", "iota": "ι", "kappa": "κ", "lambda": "λ", "mu": "μ", "nu": "ν", "xi": "ξ", "pi": "π", "rho": "ρ", "sigma": "σ",
    "tau": "τ", "upsilon": "υ", "phi": "φ", "varphi": "φ", "chi": "χ", "psi": "ψ", "omega": "ω",
    "Gamma": "Γ", "Delta": "Δ", "Theta": "Θ", "Lambda": "Λ", "Xi": "Ξ", "Pi": "Π", "Sigma": "Σ", "Upsilon": "Υ", "Phi": "Φ", "Psi": "Ψ", "Omega": "Ω",
}
# Common TeX math/operator commands -> Unicode (or plain-text) equivalents.
LATEX_SYMBOLS = {
    "leq": "≤", "le": "≤", "geq": "≥", "ge": "≥", "neq": "≠", "ne": "≠", "approx": "≈", "sim": "≈", "simeq": "≃", "equiv": "≡",
    "propto": "∝", "times": "×", "cdot": "·", "pm": "±", "mp": "∓", "div": "÷", "infty": "∞", "infinity": "∞", "partial": "∂", "nabla": "∇",
    "hbar": "ℏ", "hslash": "ℏ", "ell": "ℓ", "emptyset": "∅", "forall": "∀", "exists": "∃", "in": "∈", "notin": "∉", "subset": "⊂", "subseteq": "⊆",
    "sum": "Σ", "prod": "Π", "int": "∫", "iint": "∬", "iiint": "∭", "oint": "∮", "rightarrow": "→", "to": "→", "leftarrow": "←", "leftrightarrow": "↔", "rightleftharpoons": "⇌", "leftrightharpoons": "⇌", "leftharpoons": "↽", "rightharpoons": "⇀", "Rightarrow": "⇒",
    "Leftarrow": "⇐", "Leftrightarrow": "⇔", "uparrow": "↑", "downarrow": "↓", "degree": "°", "circ": "°", "cdots": "⋯", "ldots": "…", "dots": "…",
    "det": "det", "log": "log", "ln": "ln", "sin": "sin", "cos": "cos", "tan": "tan", "lim": "lim", "min": "min", "max": "max", "exp": "exp",
}
# Formatting wrappers whose braces are unwrapped, keeping only the content.
FORMAT_COMMANDS = ["text", "textrm", "textit", "textbf", "mathrm", "mathit", "mathbf", "mathsf", "operatorname", "ce", "chem", "rm", "bf", "it", "emph", "mbox"]
def strip_tex_left_right_delimiters(text: str) -> str:
    r"""Remove TeX sizing delimiters without damaging commands like \rightarrow.

    The lookahead only strips \left / \right when followed by a bracket, pipe,
    dot, backslash (the next command) or end of string.  (Raw docstring: the
    original non-raw string silently turned "\r" into a carriage return.)
    """
    return re.sub(r"\\(?:left|right)\s*(?=[()\[\]{}|\\.]|$)", "", text or "")
def decode_script_chars(text: str) -> str:
    """Convert Unicode sub/superscript character runs into internal markers."""

    def _to_marker(run: str, mapping: Dict[str, str], wrap) -> str:
        # Map each script character back to ASCII, then wrap the whole run.
        return wrap("".join(mapping.get(c, c) for c in run))

    text = re.sub(
        f"([{SUBSCRIPT_CHARS}]+)",
        lambda m: _to_marker(m.group(1), REVERSE_SUBSCRIPT_MAP, marker_sub),
        text,
    )
    return re.sub(
        f"([{SUPERSCRIPT_CHARS}]+)",
        lambda m: _to_marker(m.group(1), REVERSE_SUPERSCRIPT_MAP, marker_sup),
        text,
    )
def replace_latex_commands(text: str) -> str:
    """Flatten a TeX fragment into plain Unicode text (no real math renderer).

    Applied in a deliberate order: delimiters, degree signs, labeled arrows,
    formatting wrappers, reversible arrows, fractions/roots, symbol names,
    then leftover spacing commands and escapes.
    """
    # Remove math delimiters without pretending a renderer exists, because fantasy is not an output format.
    text = text.replace("\\(", "").replace("\\)", "")
    text = text.replace("\\[", "").replace("\\]", "")
    text = text.replace("$$", "").replace("$", "")
    text = strip_tex_left_right_delimiters(text)
    text = re.sub(r"\^\s*\{?\\(?:circ|degree)\}?", "°", text)
    # Labeled arrows are common in chemistry. Keep above/below labels instead of eating them.
    def _arrow_label_for_fallback(match: re.Match[str]) -> str:
        # Group 1 is the optional [..] (rendered below the arrow in TeX),
        # group 2 the {..} (rendered above).
        below = match.group(1) or ""
        above = match.group(2) or ""
        above_clean = normalize_latex_fragment_for_fallback(above) if above.strip() else ""
        below_clean = normalize_latex_fragment_for_fallback(below) if below.strip() else ""
        return marker_reaction_arrow(above_clean, below_clean)
    # mhchem-style \xrightarrow[below]{above}; loop until a fixpoint to catch nesting.
    arrow_pat = re.compile(r"\\x(?:right|left)(?:arrow|leftharpoons)\s*(?:\[([^\]]*)\])?\s*\{([^{}]*)\}")
    while True:
        new_text = arrow_pat.sub(_arrow_label_for_fallback, text)
        if new_text == text:
            break
        text = new_text
    # Unwrap \text{...}, \mathrm{...} etc.; repeat to handle nested braces.
    for cmd in FORMAT_COMMANDS:
        pattern = re.compile(r"\\" + cmd + r"\s*\{([^{}]*)\}")
        while True:
            new = pattern.sub(lambda m: m.group(1), text)
            if new == text:
                break
            text = new
    # Reversible arrows often arrive as \underset{...}{\overset{...}{\rightleftharpoons}}.
    rev1 = re.compile(r"\\underset\s*\{([^{}]*)\}\s*\{\s*\\overset\s*\{([^{}]*)\}\s*\{\s*\\(?:rightleftharpoons|leftrightharpoons)\s*\}\s*\}")
    text = rev1.sub(lambda m: " ⇌[" + "; ".join(x for x in [normalize_latex_fragment_for_fallback(m.group(2)), normalize_latex_fragment_for_fallback(m.group(1))] if x) + "] ", text)
    rev2 = re.compile(r"\\overset\s*\{([^{}]*)\}\s*\{\s*\\underset\s*\{([^{}]*)\}\s*\{\s*\\(?:rightleftharpoons|leftrightharpoons)\s*\}\s*\}")
    text = rev2.sub(lambda m: " ⇌[" + "; ".join(x for x in [normalize_latex_fragment_for_fallback(m.group(1)), normalize_latex_fragment_for_fallback(m.group(2))] if x) + "] ", text)
    # Try labeled arrows again after \text{...} labels have been flattened.
    text = re.sub(r"\\x(?:right|left)(?:arrow|leftharpoons)\s*(?:\[([^\]]*)\])?\s*\{([^{}]*)\}", _arrow_label_for_fallback, text)
    text = re.sub(r"\\overset\s*\{([^{}]*)\}\s*\{\\to\}", lambda m: marker_reaction_arrow(normalize_latex_fragment_for_fallback(m.group(1)), ""), text)
    text = re.sub(r"\\underset\s*\{([^{}]*)\}\s*\{\\to\}", lambda m: marker_reaction_arrow("", normalize_latex_fragment_for_fallback(m.group(1))), text)
    # Common simple fractions and roots.
    frac_pat = re.compile(r"\\frac\s*\{([^{}]+)\}\s*\{([^{}]+)\}")
    while True:
        new = frac_pat.sub(lambda m: f"({m.group(1).strip()})/({m.group(2).strip()})", text)
        if new == text:
            break
        text = new
    text = re.sub(r"\\sqrt\s*\[([^\]]+)\]\s*\{([^{}]+)\}", lambda m: f"root_{m.group(1)}({m.group(2)})", text)
    text = re.sub(r"\\sqrt\s*\{([^{}]+)\}", lambda m: f"sqrt({m.group(1)})", text)
    for name, value in {**GREEK_COMMANDS, **LATEX_SYMBOLS}.items():
        # A TeX command can be followed by _, ^, punctuation or whitespace. Python's \b treats underscore as a word char,
        # so \oint_C used to lose the integral sign. Splendid little trap.
        text = re.sub(r"\\" + re.escape(name) + r"(?![A-Za-z])", value, text)
    # Remove spacing commands and braces left over from simple TeX fragments.
    text = re.sub(r"\\[,;:\s!]+", " ", text)
    text = text.replace("\\%", "%").replace("\\&", "&").replace("\\_", "_")
    # Drop a stray backslash left directly before an already-converted symbol.
    text = re.sub(r"\\(?=[ℏ∇∂∫∮ΣΠΛμνΨψΔδσπ∞≤≥≈√→←↔⇒⇐⇔])", "", text)
    return text
def _consume_tex_script_value(text: str, pos: int) -> Tuple[str, int]:
    """Consume exactly one TeX script token after ^ or _.

    *pos* points just past the ^/_ operator. Returns (value, next_position).
    Handles a balanced {...} group, a single \\command (mapped to Unicode when
    known), a digit run with one optional trailing sign, or a single
    character. Returns ("", pos) when only whitespace remains.
    """
    n = len(text)
    i = pos
    # Skip whitespace between the script operator and its operand.
    while i < n and text[i].isspace():
        i += 1
    if i >= n:
        return "", pos
    if text[i] == "{":
        # Balanced-brace group: return the inner content.
        depth = 1
        j = i + 1
        while j < n and depth > 0:
            if text[j] == "{":
                depth += 1
            elif text[j] == "}":
                depth -= 1
            j += 1
        if depth == 0:
            return text[i + 1:j - 1], j
        # Unterminated group: take everything to end of string.
        return text[i + 1:], n
    if text[i] == "\\":
        # Command token: prefer its Greek/symbol mapping, else the bare name.
        m = re.match(r"\\([A-Za-z]+|.)", text[i:])
        if m:
            cmd = m.group(1)
            value = GREEK_COMMANDS.get(cmd, LATEX_SYMBOLS.get(cmd, cmd))
            return value, i + len(m.group(0))
    if text[i].isdigit():
        # Digit run, optionally followed by one +/- (e.g. an ionic charge "2-").
        j = i + 1
        while j < n and text[j].isdigit():
            j += 1
        if j < n and text[j] in "+-":
            j += 1
        return text[i:j], j
    return text[i], i + 1
def convert_explicit_scripts(text: str) -> str:
    """Convert TeX-like scripts without creating nested internal markers.

    Single left-to-right scan: existing §SUB§/§SUP§/§RXNARROW§ spans are
    copied through verbatim (so markers never nest), and each ^/_ outside
    them is converted via _consume_tex_script_value.
    """
    if not text:
        return text
    out: List[str] = []
    i = 0
    n = len(text)
    while i < n:
        # Pass existing marker spans through untouched.
        if text.startswith(SUB_OPEN, i):
            end = text.find(SUB_CLOSE, i + len(SUB_OPEN))
            if end != -1:
                out.append(text[i:end + len(SUB_CLOSE)])
                i = end + len(SUB_CLOSE)
                continue
        if text.startswith(SUP_OPEN, i):
            end = text.find(SUP_CLOSE, i + len(SUP_OPEN))
            if end != -1:
                out.append(text[i:end + len(SUP_CLOSE)])
                i = end + len(SUP_CLOSE)
                continue
        if text.startswith(RXN_ARROW_OPEN, i):
            end = text.find(RXN_ARROW_CLOSE, i + len(RXN_ARROW_OPEN))
            if end != -1:
                out.append(text[i:end + len(RXN_ARROW_CLOSE)])
                i = end + len(RXN_ARROW_CLOSE)
                continue
        ch = text[i]
        if ch in "_^":
            value, new_i = _consume_tex_script_value(text, i + 1)
            if value:
                out.append(marker_sub(value) if ch == "_" else marker_sup(value))
            # Advance past the consumed operand (just the operator when empty).
            i = new_i
            continue
        out.append(ch)
        i += 1
    return "".join(out)
def convert_unit_exponents(text: str) -> str:
    """Superscript the exponent in compact unit tokens such as ``s2`` or ``cm3``."""
    # A unit symbol not preceded by a Latin/Cyrillic letter, followed by a lone
    # 2 or 3 (no further digit), gets its exponent raised.
    pattern = re.compile(
        r"(?<![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u0451])"
        r"((?:m|s|kg|g|mol|K|Pa|J|N|W|V|A|Hz|cm|mm|km|nm|μm|um|L|l))"
        r"([23])(?!\d)"
    )

    def _raise_exponent(match: re.Match[str]) -> str:
        return match.group(1) + marker_sup(match.group(2))

    return pattern.sub(_raise_exponent, text)
def convert_chemical_formula_token(token: str) -> str:
    """Add sub/superscript markers to one compact chemical formula token."""
    # Tokenize a compact chemical formula. Digits after element/group are subscripts; trailing charge is superscript.
    pieces: List[str] = []
    i = 0
    n = len(token)
    while i < n:
        ch = token[i]
        if ch.isdigit():
            j = i
            while j < n and token[j].isdigit():
                j += 1
            num = token[i:j]
            if j == n - 1 and token[j] in "+-":
                # Digits followed by a final +/- form an ionic charge: superscript.
                pieces.append(marker_sup(num + token[j]))
                j += 1
            elif pieces and re.search(r"[A-Za-z)]$", plain_text_from_markers("".join(pieces))):
                # Digits right after an element symbol or group close-paren: subscript.
                pieces.append(marker_sub(num))
            else:
                # Leading stoichiometric coefficient stays on the baseline.
                pieces.append(num)
            i = j
        elif ch in "+-" and i == n - 1 and pieces:
            # Bare trailing charge sign (no digits), e.g. "Na+".
            pieces.append(marker_sup(ch))
            i += 1
        else:
            pieces.append(ch)
            i += 1
    return "".join(pieces)
def looks_like_chemical_formula(token: str) -> bool:
    """Heuristically decide whether *token* is a compact chemical formula."""
    # Too short, or no digit at all: not a formula worth marking up.
    if len(token) < 2:
        return False
    if re.search(r"\d", token) is None:
        return False
    # Require a digit attached to an element-like symbol or closing paren.
    if re.search(r"[A-Z][a-z]?\d|[A-Z][a-z]?[A-Z][a-z]?\d|\)\d", token) is None:
        return False
    # Three+ lowercase letters before a digit smells like prose ("step2").
    if re.search(r"[a-z]{3,}\d", token) is not None:
        return False
    # The whole token must consist of element symbols, parens, digits, charges.
    return re.fullmatch(r"(?:[A-Z][a-z]?|\(|\)|\d+|[+-]){2,}", token) is not None
def convert_compact_chemical_formulas(text: str) -> str:
    """Find compact chemical formulas in running text and add script markers.

    The lookaround classes exclude Latin/Cyrillic letters, digits, underscore
    and § so content inside already-emitted markers is never re-matched.
    """
    # Digits right after a closing paren are group subscripts, e.g. (SO4)2.
    text = re.sub(r"\)(\d+)(?=[A-Z]|\b)", lambda m: ")" + marker_sub(m.group(1)), text)
    # Stoichiometric coefficient followed by a formula, e.g. "2H2O".
    coeff_pat = re.compile(r"(?<![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])(\d+)([A-Z][A-Za-z0-9()+\-]{1,24})(?![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])")
    def repl_coeff(m: re.Match[str]) -> str:
        coeff, token = m.group(1), m.group(2)
        if looks_like_chemical_formula(token):
            return coeff + convert_chemical_formula_token(token)
        return m.group(0)
    text = coeff_pat.sub(repl_coeff, text)
    # Bare formula tokens, e.g. "H2SO4".
    token_pat = re.compile(r"(?<![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])([A-Z][A-Za-z0-9()+\-]{1,24})(?![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])")
    def repl(m: re.Match[str]) -> str:
        token = m.group(1)
        if looks_like_chemical_formula(token):
            return convert_chemical_formula_token(token)
        return token
    return token_pat.sub(repl, text)
def plain_text_from_markers(text: str) -> str:
    """Strip all internal script markers, flattening reaction arrows to text."""
    flattened = reaction_arrow_markers_to_text(text or "")
    for marker in (SUB_OPEN, SUB_CLOSE, SUP_OPEN, SUP_CLOSE):
        flattened = flattened.replace(marker, "")
    return flattened
def normalize_plain_math_aliases(text: str) -> str:
    """Convert common model-friendly ASCII math words into real symbols.

    This is for the built-in fallback and for users who write `nabla` instead of
    `\\nabla`. Pandoc/TeX users can still write proper TeX.
    """
    # Insertion order matters: longer/case-sensitive aliases are listed before
    # the shorter, lowercase forms they would otherwise shadow.
    replacements = {
        "h_bar": "ℏ", "hbar": "ℏ", "nabla": "∇", "del": "∇",
        "partial": "∂", "inf": "∞", "infty": "∞", "infinity": "∞",
        "Lambda": "Λ", "lambda": "λ", "Delta": "Δ", "delta": "δ",
        "Gamma": "Γ", "gamma": "γ", "Theta": "Θ", "theta": "θ",
        "Sigma": "Σ", "sigma": "σ", "Pi": "Π", "pi": "π",
        "Psi": "Ψ", "psi": "ψ", "Omega": "Ω", "omega": "ω",
        "Integral": "∫", "integral": "∫", "Sum": "Σ", "sum": "Σ",
        "sqrt": "√",
    }
    for word, symbol in replacements.items():
        pattern = r"\b" + re.escape(word) + r"\b"
        text = re.sub(pattern, symbol, text)
    return text
def _split_tex_rows(body: str) -> List[List[str]]:
# Normalize TeX line-break spacing commands such as \\[10pt]. They are layout hints, not content.
body = re.sub(r"\\\\\s*\[[^\]]*\]", r"\\\\", body or "")
rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
cleaned: List[List[str]] = []
for row in rows:
row = re.sub(r"^\s*\[[^\]]*\]\s*", "", row).strip()
if row:
cleaned.append([c.strip() for c in row.split("&")])
return cleaned
def _matrix_to_unicode(env: str, body: str) -> str:
    """Render a TeX matrix environment as column-aligned Unicode rows."""
    rows = _split_tex_rows(body)
    if not rows:
        return ""
    rows = [[normalize_latex_fragment_for_fallback(c) for c in row] for row in rows]
    # Pad rows to a rectangular grid, then compute per-column widths.
    cols = max(len(r) for r in rows)
    for row in rows:
        row.extend([""] * (cols - len(row)))
    widths = [max(len(row[i]) for row in rows) for i in range(cols)]
    # Bracket pieces per environment: (top-l, top-r, mid-l, mid-r, bottom-l, bottom-r).
    left_right = {
        "pmatrix": ("⎛", "⎞", "⎜", "⎟", "⎝", "⎠"),
        "bmatrix": ("⎡", "⎤", "⎢", "⎥", "⎣", "⎦"),
        "Bmatrix": ("⎧", "⎫", "⎨", "⎬", "⎩", "⎭"),
        "vmatrix": ("│", "│", "│", "│", "│", "│"),
        "Vmatrix": ("║", "║", "║", "║", "║", "║"),
        "matrix": (" ", " ", " ", " ", " ", " "),
        "smallmatrix": (" ", " ", " ", " ", " ", " "),
    }
    top_l, top_r, mid_l, mid_r, bot_l, bot_r = left_right.get(env, left_right["matrix"])
    out = []
    for idx, row in enumerate(rows):
        # Choose bracket pieces by row position; a single row uses the top pair.
        if len(rows) == 1:
            l, r = top_l, top_r
        elif idx == 0:
            l, r = top_l, top_r
        elif idx == len(rows) - 1:
            l, r = bot_l, bot_r
        else:
            l, r = mid_l, mid_r
        cells = " ".join(row[i].ljust(widths[i]) for i in range(cols)).rstrip()
        out.append(f"{l} {cells} {r}".rstrip())
    return "\n".join(out)
def _cases_to_unicode(body: str) -> str:
    """Render a TeX ``cases`` body as a brace-decorated multiline string."""
    rows = _split_tex_rows(body)
    if not rows:
        return ""
    last = len(rows) - 1
    lines: List[str] = []
    for idx, row in enumerate(rows):
        # First row gets the opening brace piece, last the closing, rest the middle.
        if idx == 0:
            brace = "⎧"
        elif idx == last:
            brace = "⎩"
        else:
            brace = "⎨"
        expr = normalize_latex_fragment_for_fallback(row[0]) if row else ""
        cond = normalize_latex_fragment_for_fallback(" ".join(row[1:])) if len(row) > 1 else ""
        rendered = f"{brace} {expr}"
        if cond:
            rendered += f" {cond}"
        lines.append(rendered)
    return "\n".join(lines)
def _aligned_to_unicode(body: str) -> str:
    """Flatten an aligned/gather-style TeX body into plain joined lines."""
    out = []
    for cells in _split_tex_rows(body):
        rendered = [normalize_latex_fragment_for_fallback(cell) for cell in cells]
        out.append(" ".join(piece for piece in rendered if piece))
    return "\n".join(out)
def normalize_latex_fragment_for_fallback(text: str) -> str:
    """Best-effort Unicode fallback for TeX when Pandoc/TeX is unavailable.

    Environments are expanded first (to a fixpoint, for nesting), then the
    command/script/chemistry passes run, and finally leftover TeX syntax is
    stripped and whitespace tidied.
    """
    text = text or ""
    # Convert common math environments into readable multiline Unicode.
    env_pat = re.compile(r"\\begin\s*\{([A-Za-z*]+)\}(.*?)\\end\s*\{\1\}", re.S)
    while True:
        def env_repl(m: re.Match[str]) -> str:
            env, body = m.group(1), m.group(2).strip()
            if env in {"matrix", "pmatrix", "bmatrix", "Bmatrix", "vmatrix", "Vmatrix", "smallmatrix"}:
                return _matrix_to_unicode(env, body)
            if env == "cases":
                return _cases_to_unicode(body)
            if env in {"aligned", "align", "align*", "split", "gather", "multline", "equation"}:
                return _aligned_to_unicode(body)
            if env == "array":
                # Drop the column-spec argument, then treat it like a matrix.
                body = re.sub(r"^\s*\{[^{}]*\}", "", body).strip()
                return _matrix_to_unicode("matrix", body)
            return body
        new = env_pat.sub(env_repl, text)
        if new == text:
            break
        text = new
    text = replace_latex_commands(text)
    text = normalize_plain_math_aliases(text)
    text = convert_explicit_scripts(text)
    text = convert_unit_exponents(text)
    text = convert_compact_chemical_formulas(text)
    text = strip_tex_left_right_delimiters(text)
    # Remove spacing commands, alignment markers and remaining TeX syntax.
    text = text.replace("\\,", " ").replace("\\;", " ").replace("\\!", "")
    text = text.replace("&=", "=").replace("&", " ")
    text = re.sub(r"\\\\", "\n", text)
    text = re.sub(r"\\[a-zA-Z]+\*?", "", text)
    text = text.replace("{", "").replace("}", "")
    # Collapse runs of spaces/tabs and trim around newlines.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    return text.strip()
def clean_math_markup(markdown: str) -> Tuple[str, Dict[str, Any]]:
    """Normalize math markup in *markdown*; return (text, diagnostics).

    Diagnostics carry a rough "replacements" count (positional character diff
    plus length delta — not a true edit distance) and any TeX artifact
    patterns still present after normalization.
    """
    original = markdown or ""
    text = original.replace("\r\n", "\n").replace("\r", "\n")
    replacements = 0
    before = text
    text = decode_script_chars(text)
    text = normalize_latex_fragment_for_fallback(text)
    # Drop bold markers early. They often wrap formula tokens with sub/sup markers, which is where Markdown parsers go to die.
    text = text.replace("**", "").replace("__", "")
    # Remove simple italic markers around words, but keep multiplication asterisks with spaces.
    text = re.sub(r"(?<!\S)\*([^*\n]+?)\*(?!\S)", r"\1", text)
    text = re.sub(r"(?<!\w)\*([^*\n]+?)\*(?!\w)", r"\1", text)
    # Crude change metric: positionally differing characters plus length delta.
    replacements = sum(1 for a, b in zip(before, text) if a != b) + abs(len(before) - len(text))
    # Leftover TeX constructs indicate incomplete normalization; report them.
    artifact_patterns = [r"\\text", r"\\frac", r"\\sqrt", r"_\{", r"\^\{", r"\\\(", r"\\\[", r"\\begin", r"\\end"]
    remaining: List[str] = []
    for pat in artifact_patterns:
        if re.search(pat, text):
            remaining.append(pat)
    return text, {"replacements": replacements, "remaining_artifacts": remaining}
def normalize_formula_words(line: str) -> str:
    """Apply ASCII math aliases and arrow shorthands to one formula line."""
    line = normalize_plain_math_aliases(line)
    for ascii_arrow, symbol in (("->", "→"), ("=>", "⇒")):
        line = line.replace(ascii_arrow, symbol)
    # Tidy spacing inside "lim(x → ∞)" style expressions.
    return re.sub(r"lim\s*\(\s*([^)]*?)\s*→\s*∞\s*\)", r"lim(\1 → ∞)", line)
def is_formula_like_line(text: str) -> bool:
    """Heuristically decide whether a line should be styled as a formula.

    Looks for math cues (operators, symbols, marker spans, bracket-drawing
    glyphs) while rejecting long prose that merely mentions a formula.
    """
    s0 = (text or "").strip()
    # Defensive globals() check: tolerates this helper being absent at import time.
    s = plain_text_from_markers(strip_outer_emphasis_for_formula(s0) if 'strip_outer_emphasis_for_formula' in globals() else s0)
    if len(s) < 3:
        return False
    # Table rows and URLs are never formulas.
    if s.startswith(("|", "http://", "https://")):
        return False
    formula_cues = ["=", "→", "->", "⇒", "±", "√", "∫", "∮", "Σ", "Π", "∇", "∂", "ℏ", "∞", "lim", "Integral", "Sum", "Delta", "Lambda", "nabla", "partial", "psi", "^", "_", "§SUB§", "§SUP§", "⎛", "⎞", "⎜", "⎟", "⎝", "⎠", "⎧", "⎨", "⎩", "⎡", "⎤", "⎢", "⎥", "⎣", "⎦"]
    if any(cue in s for cue in formula_cues):
        # Multi-line bracket glyphs (matrices/cases) are a strong signal.
        if any(ch in s for ch in "⎛⎞⎜⎟⎝⎠⎧⎨⎩⎡⎤⎢⎥⎣⎦") and len(s.split()) <= 24:
            return True
        # Otherwise require at least one operator and not too many prose words.
        words = re.findall(r"[A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u0451]{3,}", s)
        operators = len(re.findall(r"[=+*/^_→<>±√∫∮ΣΠ∇∂]", s))
        if operators >= 1 and len(words) <= 18:
            return True
    if re.search(r"\b(?:[A-Z][a-z]?\d+){1,}[A-Za-z0-9()]*\b", s):
        # A long prose sentence mentioning V2O5 or K2O is not a display formula.
        # Previous versions styled whole explanatory paragraphs as formulas, because apparently
        # one oxide in parentheses was enough to trigger a tiny typographic coup.
        words = re.findall(r"[A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u0451]{3,}", s)
        has_reaction_operator = any(op in s for op in ["→", "->", "=", "+"])
        if len(s.split()) <= 8 or (has_reaction_operator and len(words) <= 10):
            return True
    return False
def strip_outer_emphasis_for_formula(text: str) -> str:
    """Peel matching emphasis markers (**, __, *, _) off the ends of *text*."""
    result = text.strip()
    stripped_any = True
    while stripped_any:
        stripped_any = False
        # Check the two-character markers before their single-character forms.
        for mark in ("**", "__", "*", "_"):
            wrapped = result.startswith(mark) and result.endswith(mark)
            if wrapped and len(result) > 2 * len(mark):
                result = result[len(mark):-len(mark)].strip()
                stripped_any = True
    return result
def parse_inline_no_markdown(text: str) -> List[Segment]:
    """Split *text* on internal sub/sup markers without Markdown parsing.

    Used for formula lines, where *, _ and ` are math notation rather than
    emphasis markers.
    """
    # Build the pattern from the marker constants instead of repeating their
    # literal spelling — keeps this in sync with SUB_OPEN and friends.
    token_re = re.compile(
        "(" + re.escape(SUB_OPEN) + ".*?" + re.escape(SUB_CLOSE)
        + "|" + re.escape(SUP_OPEN) + ".*?" + re.escape(SUP_CLOSE) + ")"
    )
    result: List[Segment] = []
    for part in token_re.split(text or ""):
        if not part:
            continue
        if part.startswith(SUB_OPEN) and part.endswith(SUB_CLOSE):
            result.append(Segment(part[len(SUB_OPEN):-len(SUB_CLOSE)], sub=True))
        elif part.startswith(SUP_OPEN) and part.endswith(SUP_CLOSE):
            result.append(Segment(part[len(SUP_OPEN):-len(SUP_CLOSE)], sup=True))
        else:
            result.append(Segment(part))
    return merge_segments(result)
def parse_blocks(markdown: str) -> List[Block]:
    """Parse Markdown-like text into a flat list of Block records.

    Handles fenced code blocks, ATX headings, bullet/ordered lists, horizontal
    rules, heuristically detected formula lines and plain paragraphs.
    Consecutive blank lines are collapsed into a single "blank" block.
    """
    blocks: List[Block] = []
    in_code = False
    code_lines: List[str] = []
    for raw in (markdown or "").split("\n"):
        line = raw.rstrip()
        # A ``` fence toggles code mode.
        if line.strip().startswith("```"):
            if in_code:
                blocks.append(Block("code", "\n".join(code_lines)))
                code_lines = []
                in_code = False
            else:
                in_code = True
            continue
        if in_code:
            code_lines.append(line)
            continue
        if not line.strip():
            blocks.append(Block("blank"))
            continue
        # ATX heading: one to six leading # characters.
        m = re.match(r"^(#{1,6})\s+(.*)$", line)
        if m:
            blocks.append(Block("heading", m.group(2).strip(), level=len(m.group(1))))
            continue
        # Unordered list item.
        m = re.match(r"^\s*[-*+]\s+(.*)$", line)
        if m:
            text = m.group(1).strip()
            if is_formula_like_line(text):
                text = normalize_formula_words(strip_outer_emphasis_for_formula(text))
            blocks.append(Block("list", text, ordered=False))
            continue
        # Ordered list item ("1." or "1)").
        m = re.match(r"^\s*(\d+)[.)]\s+(.*)$", line)
        if m:
            text = m.group(2).strip()
            if is_formula_like_line(text):
                text = normalize_formula_words(strip_outer_emphasis_for_formula(text))
            blocks.append(Block("list", f"{m.group(1)}. {text}", ordered=True))
            continue
        if line.strip() == "---":
            blocks.append(Block("hr"))
            continue
        text = line.strip()
        if is_formula_like_line(text):
            blocks.append(Block("formula", normalize_formula_words(strip_outer_emphasis_for_formula(text))))
        else:
            blocks.append(Block("paragraph", text))
    # An unterminated fence still flushes its collected code.
    if code_lines:
        blocks.append(Block("code", "\n".join(code_lines)))
    # Collapse runs of blank blocks into one.
    collapsed: List[Block] = []
    prev_blank = False
    for b in blocks:
        if b.kind == "blank":
            if not prev_blank:
                collapsed.append(b)
            prev_blank = True
        else:
            collapsed.append(b)
            prev_blank = False
    return collapsed
def parse_inline(text: str) -> List[Segment]:
    """Tokenize inline text: neutral sub/sup markers first, then Markdown.

    Marker spans become sub/sup segments verbatim; everything between them is
    parsed for basic Markdown emphasis and code spans.
    """
    # Build the pattern from the marker constants instead of repeating their
    # literal spelling — keeps this in sync with SUB_OPEN and friends.
    token_re = re.compile(
        "(" + re.escape(SUB_OPEN) + ".*?" + re.escape(SUB_CLOSE)
        + "|" + re.escape(SUP_OPEN) + ".*?" + re.escape(SUP_CLOSE) + ")"
    )
    result: List[Segment] = []
    for part in token_re.split(text or ""):
        if not part:
            continue
        if part.startswith(SUB_OPEN) and part.endswith(SUB_CLOSE):
            result.append(Segment(part[len(SUB_OPEN):-len(SUB_CLOSE)], sub=True))
        elif part.startswith(SUP_OPEN) and part.endswith(SUP_CLOSE):
            result.append(Segment(part[len(SUP_OPEN):-len(SUP_CLOSE)], sup=True))
        else:
            result.extend(parse_basic_markdown_inline(part))
    return merge_segments(result)
def parse_basic_markdown_inline(text: str) -> List[Segment]:
    """Parse `code`, **bold**, __bold__, *italic* and _italic_ into segments."""
    pattern = re.compile(
        r"(`[^`]+`|\*\*(.+?)\*\*|__(.+?)__|(?<!\*)\*([^\s*](?:.*?[^\s*])?)\*(?!\*)|(?<!_)_([^\s_](?:.*?[^\s_])?)_(?!_))"
    )
    segments: List[Segment] = []
    cursor = 0
    for match in pattern.finditer(text):
        # Unstyled gap before this token.
        if match.start() > cursor:
            segments.append(Segment(text[cursor:match.start()]))
        token = match.group(0)
        # The regex alternatives guarantee the token's delimiters, so the
        # first one/two characters identify its style.
        if token[0] == "`":
            segments.append(Segment(token[1:-1], code=True))
        elif token[:2] in ("**", "__"):
            segments.append(Segment(token[2:-2], bold=True))
        else:
            segments.append(Segment(token[1:-1], italic=True))
        cursor = match.end()
    if cursor < len(text):
        segments.append(Segment(text[cursor:]))
    return segments
def merge_segments(segs: List[Segment]) -> List[Segment]:
    """Drop empty segments and coalesce neighbors with identical styling."""
    merged: List[Segment] = []
    for seg in segs:
        if not seg.text:
            continue
        style = (seg.sub, seg.sup, seg.bold, seg.italic, seg.code)
        if merged:
            tail = merged[-1]
            if (tail.sub, tail.sup, tail.bold, tail.italic, tail.code) == style:
                tail.text += seg.text
                continue
        merged.append(seg)
    return merged
def segments_plain(segs: List[Segment]) -> str:
    """Concatenate the raw text of all segments, discarding styling."""
    return "".join(seg.text for seg in segs)
def blocks_to_plain(blocks: List[Block], md_mode: bool = False) -> str:
    """Render parsed blocks back to plain text (or minimal Markdown).

    With md_mode=True, headings keep their ``#`` prefixes, horizontal rules
    stay ``---`` and code blocks are re-fenced; otherwise everything flattens
    to plain lines. The result always ends with a single trailing newline.
    """
    lines: List[str] = []
    for b in blocks:
        if b.kind == "blank":
            lines.append("")
        elif b.kind == "heading":
            prefix = ("#" * max(1, min(6, b.level)) + " ") if md_mode else ""
            lines.append(prefix + segments_plain(parse_inline(b.text)))
        elif b.kind == "formula":
            # Formulas skip Markdown inline parsing: * and _ are math there.
            lines.append(segments_plain(parse_inline_no_markdown(reaction_arrow_markers_to_text(b.text))))
        elif b.kind == "list":
            # Ordered items already carry their "1. " prefix in the text.
            prefix = "- " if not b.ordered else ""
            lines.append(prefix + segments_plain(parse_inline(b.text)))
        elif b.kind == "hr":
            lines.append("---" if md_mode else "")
        elif b.kind == "code":
            if md_mode:
                lines.extend(["```", plain_text_from_markers(b.text), "```"])
            else:
                lines.append(plain_text_from_markers(b.text))
        else:
            lines.append(segments_plain(parse_inline(b.text)))
    return "\n".join(lines).strip() + "\n"
# Probe strings used to score candidate fonts' glyph coverage.
MATH_GLYPH_TEST = "ℏ∇∂∮∫∬∭ΣΠΛμνΨψΔδσπ∞≤≥≈√→←↔⇒⇐⇔±×·°"  # math/symbol glyphs
CYRILLIC_GLYPH_TEST = "\u0410\u0411\u0412\u0430\u0431\u0432\u0451\u0401"  # А Б В а б в ё Ё
BASIC_LATIN_TEST = "ABCxyz012"  # minimal Latin + digit sanity check
def font_dirs() -> List[Path]:
    """Return candidate font directories in priority order, de-duplicated."""
    win_font_dir = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts"
    dirs: List[Path] = []
    # Prefer fonts bundled with or installed by this plugin. The built-in PDF
    # fallback must use STIX/Noto when available instead of wandering off to
    # Helvetica and drawing tofu squares like a defeated printer.
    try:
        # NOTE(review): _font_search_dirs is defined elsewhere in the project;
        # its absence (NameError) is tolerated by this except block.
        dirs.extend(_font_search_dirs())
    except Exception:
        try:
            dirs.append(plugin_root() / "vendor" / "toolchain" / "fonts")
        except Exception:
            pass
    # Standard OS font locations (Windows, Linux, macOS).
    dirs.extend([
        win_font_dir,
        Path("/usr/share/fonts"),
        Path("/usr/local/share/fonts"),
        Path("/System/Library/Fonts"),
        Path("/Library/Fonts"),
    ])
    # De-duplicate by resolved path while preserving the priority order.
    seen = set()
    out: List[Path] = []
    for d in dirs:
        try:
            key = str(d.resolve())
        except Exception:
            key = str(d)
        if key in seen:
            continue
        seen.add(key)
        out.append(d)
    return out
def iter_font_candidates(names: List[str]) -> Iterable[Path]:
    """Yield existing font files matching *names*, then all other fonts.

    Each candidate is yielded at most once, honoring the order of *names*
    (exact path in each directory first, then a recursive case-insensitive
    filename search); finally every installed font is yielded so the caller
    can score it as a last resort.
    """
    seen = set()
    # (Removed an unused precomputed list of lowered names; each name is
    # lowered where it is actually compared.)
    def emit(path: Path):
        # Return the path the first time it is seen and exists, else None.
        key = str(path).lower()
        if path.exists() and key not in seen:
            seen.add(key)
            return path
        return None
    # Respect the caller's priority order. Filesystem iteration order is not a ranking system, despite what computers imply.
    for name in names:
        lname = name.lower()
        for d in font_dirs():
            if not d.exists():
                continue
            direct = emit(d / name)
            if direct:
                yield direct
            for ext in ("*.ttf", "*.otf", "*.ttc"):
                for p in d.rglob(ext):
                    if p.name.lower() == lname:
                        got = emit(p)
                        if got:
                            yield got
    # Last resort: any installed font, scored later.
    for d in font_dirs():
        if not d.exists():
            continue
        for ext in ("*.ttf", "*.otf", "*.ttc"):
            for p in d.rglob(ext):
                got = emit(p)
                if got:
                    yield got
def _ttfont_for_probe(path: Path, subfont_index: int = 0) -> Optional[TTFont]:
    """Load a TTFont for coverage probing, or None when the file won't parse."""
    try:
        return TTFont("__probe__", str(path), subfontIndex=subfont_index)
    except Exception:
        # Corrupt or unsupported font files are simply skipped by callers.
        return None
def font_score(path: Path, chars: str, max_subfonts: int = 8) -> Tuple[int, int]:
    """Score how many of *chars* the font file at *path* can draw.

    TTC collections are probed subfont-by-subfont up to *max_subfonts*.
    Returns (best_score, best_subfont_index); (-1, 0) when the file cannot
    be parsed at all.
    """
    best_score = -1
    best_index = 0
    for idx in range(max_subfonts):
        font = _ttfont_for_probe(path, idx)
        if font is None:
            if idx == 0:
                # Not even the first subfont loads: unusable file.
                return -1, 0
            # Past the end of a collection: stop probing.
            break
        # charToGlyph maps codepoints -> glyph ids; count covered probe chars.
        cmap = getattr(font.face, "charToGlyph", {}) or {}
        score = sum(1 for ch in chars if ord(ch) in cmap)
        if score > best_score:
            best_score, best_index = score, idx
        if score == len(chars):
            # Full coverage — no point probing further subfonts.
            break
    return best_score, best_index
def find_font_file(names: List[str], required_chars: str = "", min_score: Optional[int] = None) -> Optional[Tuple[str, int]]:
    """Search installed fonts for coverage of *required_chars*.

    Returns (path, subfont_index) for the first candidate reaching the
    threshold (full coverage by default), otherwise the best partial match
    with a positive score, else None.
    """
    probe_chars = required_chars or BASIC_LATIN_TEST
    threshold = len(probe_chars) if min_score is None else min_score
    best_partial: Optional[Tuple[int, str, int]] = None
    for candidate in iter_font_candidates(names):
        score, subfont = font_score(candidate, probe_chars)
        if score < 0:
            continue  # unreadable font file
        if score >= threshold:
            return str(candidate), subfont
        if best_partial is None or score > best_partial[0]:
            best_partial = (score, str(candidate), subfont)
    if best_partial and best_partial[0] > 0:
        return best_partial[1], best_partial[2]
    return None
def register_ttf_font(name: str, spec: Optional[Tuple[str, int]], fallback_name: Optional[str] = None) -> str:
    """Register *spec* under *name* with ReportLab; return a usable font name.

    When registration fails (or *spec* is empty), falls back to
    *fallback_name*, defaulting to Helvetica.
    """
    if spec:
        path, subfont = spec
        try:
            pdfmetrics.registerFont(TTFont(name, path, subfontIndex=subfont))
        except Exception:
            # Broken font file: fall through to the fallback name.
            pass
        else:
            return name
    return fallback_name or "Helvetica"
def setup_fonts() -> FontSet:
    """Locate and register the best available fonts; return their names.

    Each role (regular/bold/italic/bold-italic/mono/math) is searched
    independently; missing styles fall back to the regular face, and
    everything ultimately falls back to Helvetica via register_ttf_font.
    """
    # Prefer fonts with broad Unicode coverage. Arial often lacks ℏ/∇, which is how we got tofu squares.
    regular_spec = find_font_file(
        ["DejaVuSans.ttf", "NotoSans-Regular.ttf", "FreeSans.ttf", "LiberationSans-Regular.ttf", "Arimo-Regular.ttf", "segoeui.ttf", "arial.ttf", "Arial.ttf"],
        CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST,
    )
    # Styled faces accept partial Cyrillic coverage (min_score=6).
    bold_spec = find_font_file(["DejaVuSans-Bold.ttf", "NotoSans-Bold.ttf", "FreeSansBold.ttf", "LiberationSans-Bold.ttf", "Arimo-Bold.ttf", "segoeuib.ttf", "arialbd.ttf", "Arial Bold.ttf"], CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST, min_score=6)
    italic_spec = find_font_file(["DejaVuSans-Oblique.ttf", "NotoSans-Italic.ttf", "FreeSansOblique.ttf", "LiberationSans-Italic.ttf", "Arimo-Italic.ttf", "segoeuii.ttf", "ariali.ttf", "Arial Italic.ttf"], CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST, min_score=6)
    bold_italic_spec = find_font_file(["DejaVuSans-BoldOblique.ttf", "NotoSans-BoldItalic.ttf", "FreeSansBoldOblique.ttf", "LiberationSans-BoldItalic.ttf", "Arimo-BoldItalic.ttf", "segoeuiz.ttf", "arialbi.ttf", "Arial Bold Italic.ttf"], CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST, min_score=6)
    mono_spec = find_font_file(["DejaVuSansMono.ttf", "NotoSansMono-Regular.ttf", "FreeMono.ttf", "LiberationMono-Regular.ttf", "Cousine-Regular.ttf", "consola.ttf", "Consolas.ttf"], BASIC_LATIN_TEST, min_score=8)
    # The math face may miss a handful of glyphs and still qualify.
    math_spec = find_font_file(
        [
            "DejaVuSans.ttf", "DejaVuMathTeXGyre.ttf", "NotoSansMath-Regular.ttf", "NotoSansSymbols2-Regular.ttf", "NotoSansSymbols-Regular.ttf",
            "FreeSerif.ttf", "FreeSans.ttf", "STIXTwoMath-Regular.otf", "STIXGeneral.ttf", "Symbola.ttf", "seguisym.ttf", "cambria.ttc", "cambriamath.ttf", "arialuni.ttf",
        ],
        MATH_GLYPH_TEST,
        min_score=max(12, len(MATH_GLYPH_TEST) - 4),
    )
    regular = register_ttf_font("DocFont", regular_spec)
    bold = register_ttf_font("DocFont-Bold", bold_spec, regular)
    italic = register_ttf_font("DocFont-Italic", italic_spec, regular)
    bold_italic = register_ttf_font("DocFont-BoldItalic", bold_italic_spec, bold)
    mono = register_ttf_font("DocMono", mono_spec, regular)
    math = register_ttf_font("DocMath", math_spec, regular)
    # Family registration lets <b>/<i> markup switch faces automatically.
    try:
        pdfmetrics.registerFontFamily("DocFont", normal=regular, bold=bold, italic=italic, boldItalic=bold_italic)
    except Exception:
        pass
    return FontSet(regular, bold, italic, bold_italic, mono, math)
def font_supports_char(font_name: str, ch: str) -> bool:
    """Return True when the registered font *font_name* can draw *ch*.

    Empty input and plain ASCII always count as supported; anything else is
    looked up in the TTF face's character-to-glyph table.
    """
    if not ch or ord(ch) < 128:
        return True
    try:
        face = getattr(pdfmetrics.getFont(font_name), "face", None)
        glyph_map = getattr(face, "charToGlyph", {}) or {}
        return ord(ch) in glyph_map
    except Exception:
        # Built-in PDF fonts are not Unicode fonts. Pretending otherwise
        # created the little square apocalypse.
        return False
# Plain-ASCII stand-ins for math/Greek/arrow glyphs, used by
# pdf_text_with_fallback() when neither the primary nor the math font can
# draw the character — readable text instead of tofu boxes.
PDF_UNSUPPORTED_GLYPH_FALLBACK = {
    "ℏ": "hbar", "∇": "nabla", "∂": "partial", "∮": "oint", "∫": "int", "∬": "iint", "∭": "iiint",
    "Σ": "Sum", "Π": "Prod", "Λ": "Lambda", "Ψ": "Psi", "Ω": "Omega", "Δ": "Delta", "σ": "sigma",
    "π": "pi", "μ": "mu", "ν": "nu", "∞": "inf", "≤": "<=", "≥": ">=", "≈": "~", "√": "sqrt",
    "→": "->", "←": "<-", "↔": "<->", "⇒": "=>", "⇐": "<=", "⇔": "<=>", "±": "+/-", "×": "x", "·": "*",
}
def pdf_text_with_fallback(text: str, primary_font: str, math_font: str) -> str:
    """Build ReportLab paragraph markup for *text*, per-character font-aware.

    Characters the *primary_font* lacks are drawn with *math_font* when it
    supports them; otherwise they degrade to a readable ASCII replacement
    (PDF_UNSUPPORTED_GLYPH_FALLBACK) or, as a last resort, "?" — a glyph
    the selected font cannot draw is never emitted. Consecutive characters
    sharing a font are merged into one run; non-primary runs are wrapped in
    a <font name="..."> span.
    """
    pieces: List[str] = []
    run_font: Optional[str] = None
    run_chars: List[str] = []

    def emit_run() -> None:
        nonlocal run_chars, run_font
        if not run_chars:
            return
        escaped = html.escape("".join(run_chars))
        if run_font and run_font != primary_font:
            pieces.append(f'<font name="{run_font}">{escaped}</font>')
        else:
            pieces.append(escaped)
        run_chars = []

    for ch in text:
        chosen = primary_font
        rendered = ch
        if not font_supports_char(primary_font, ch):
            if font_supports_char(math_font, ch):
                chosen = math_font
            elif ch in PDF_UNSUPPORTED_GLYPH_FALLBACK:
                # No installed font can draw it: degrade to readable ASCII
                # rather than rendering a white square.
                rendered = PDF_UNSUPPORTED_GLYPH_FALLBACK[ch]
            elif ord(ch) > 127:
                # Last resort for unknown non-ASCII glyphs.
                rendered = "?"
        if chosen != run_font:
            emit_run()
            run_font = chosen
        run_chars.append(rendered)
    emit_run()
    return "".join(pieces)
def segs_to_pdf_markup(segs: List[Segment], fonts: FontSet, base_size: float = 10.5, prefer_math: bool = False) -> str:
    """Convert parsed inline segments into ReportLab paragraph markup.

    Sub/superscript segments are emitted bare inside <sub>/<super> tags;
    other segments may stack code (mono font), bold and italic wrappers.
    ``base_size`` is accepted for signature compatibility but is not used
    in the current implementation.
    """
    primary_font = fonts.math if prefer_math else fonts.regular
    parts: List[str] = []
    for seg in segs:
        markup = pdf_text_with_fallback(seg.text, primary_font, fonts.math)
        if seg.sub:
            parts.append(f"<sub>{markup}</sub>")
        elif seg.sup:
            parts.append(f"<super>{markup}</super>")
        else:
            if seg.code:
                markup = f'<font name="{fonts.mono}">{markup}</font>'
            if seg.bold:
                markup = f"<b>{markup}</b>"
            if seg.italic:
                markup = f"<i>{markup}</i>"
            parts.append(markup)
    return "".join(parts)
def generate_pdf(path: Path, title: str, blocks: List[Block], style: str, keep_previews: bool, temp_root: Path) -> Optional[Path]:
    """Render *blocks* to a PDF at *path* with ReportLab (built-in fallback).

    ``style``, ``keep_previews`` and ``temp_root`` belong to the shared
    renderer signature but are not used by this implementation.
    Always returns None.
    """
    fonts = setup_fonts()
    styles = getSampleStyleSheet()
    # Paragraph styles derived from the sample sheet, re-pointed at the
    # Unicode TTF fonts registered by setup_fonts().
    title_style = ParagraphStyle("title", parent=styles["Title"], fontName=fonts.bold, fontSize=18, leading=22, alignment=TA_CENTER, spaceAfter=10)
    h1 = ParagraphStyle("h1", parent=styles["Heading1"], fontName=fonts.bold, fontSize=14, leading=17, spaceBefore=8, spaceAfter=6)
    h2 = ParagraphStyle("h2", parent=styles["Heading2"], fontName=fonts.bold, fontSize=12.5, leading=15, spaceBefore=7, spaceAfter=4)
    body = ParagraphStyle("body", parent=styles["BodyText"], fontName=fonts.regular, fontSize=10.5, leading=13.7, spaceAfter=5)
    bullet = ParagraphStyle("bullet", parent=body, leftIndent=14, firstLineIndent=-10)
    code = ParagraphStyle("code", parent=body, fontName=fonts.mono, fontSize=9, leading=11, backColor=colors.HexColor("#f5f5f5"), borderPadding=5)
    formula = ParagraphStyle("formula", parent=body, fontName=fonts.math, fontSize=10.5, leading=15, leftIndent=10, rightIndent=10, spaceBefore=3, spaceAfter=7, backColor=colors.HexColor("#f7f7f7"), borderPadding=5)
    story: List[Any] = []
    if title:
        story.append(Paragraph(html.escape(title), title_style))
    for b in blocks:
        if b.kind == "blank":
            story.append(Spacer(1, 4))
        elif b.kind == "heading":
            pstyle = h1 if b.level <= 1 else h2
            story.append(Paragraph(segs_to_pdf_markup(parse_inline(b.text), fonts, pstyle.fontSize), pstyle))
        elif b.kind == "list":
            # Unordered items get a literal bullet prefix and the hanging-indent style.
            prefix = "• " if not b.ordered else ""
            story.append(Paragraph(prefix + segs_to_pdf_markup(parse_inline(b.text), fonts, body.fontSize), bullet if not b.ordered else body))
        elif b.kind == "code":
            # Preformatted preserves whitespace and newlines; no inline parsing.
            story.append(Preformatted(plain_text_from_markers(b.text), code))
        elif b.kind == "formula":
            # Formula text skips markdown parsing; prefer_math selects the math font.
            story.append(Paragraph(segs_to_pdf_markup(parse_inline_no_markdown(reaction_arrow_markers_to_text(b.text)), fonts, formula.fontSize, prefer_math=True), formula))
        elif b.kind == "hr":
            story.append(Spacer(1, 6))
        else:
            story.append(Paragraph(segs_to_pdf_markup(parse_inline(b.text), fonts, body.fontSize), body))
    doc = SimpleDocTemplate(str(path), pagesize=A4, leftMargin=1.8*cm, rightMargin=1.8*cm, topMargin=1.6*cm, bottomMargin=1.6*cm)
    def footer(canvas, doc_obj):
        # Page-number footer drawn on every page.
        canvas.saveState()
        canvas.setFont(fonts.regular, 9)
        canvas.setFillColor(colors.HexColor("#666666"))
        canvas.drawRightString(A4[0]-1.8*cm, 0.9*cm, f"Page {doc_obj.page}")
        canvas.restoreState()
    doc.build(story, onFirstPage=footer, onLaterPages=footer)
    return None
def segs_to_html(segs: List[Segment]) -> str:
    """Render parsed inline segments as an HTML fragment.

    Sub/superscript take priority and suppress other styling; otherwise
    code, bold and italic wrappers may nest (code innermost).
    """
    def render(seg: Segment) -> str:
        piece = html.escape(seg.text)
        if seg.sub:
            return f"<sub>{piece}</sub>"
        if seg.sup:
            return f"<sup>{piece}</sup>"
        if seg.code:
            piece = f"<code>{piece}</code>"
        if seg.bold:
            piece = f"<strong>{piece}</strong>"
        if seg.italic:
            piece = f"<em>{piece}</em>"
        return piece

    return "".join(render(seg) for seg in segs)
def formula_to_html(text: str) -> str:
    """Render built-in fallback formula text, including chemistry reaction arrows."""
    src = text or ""
    arrow_re = re.compile(
        re.escape(RXN_ARROW_OPEN) + r"(.*?)" + re.escape(RXN_ARROW_MID) + r"(.*?)" + re.escape(RXN_ARROW_CLOSE),
        re.S,
    )

    def inline(fragment: str) -> str:
        return segs_to_html(parse_inline_no_markdown(fragment))

    pieces: List[str] = []
    cursor = 0
    for match in arrow_re.finditer(src):
        if match.start() > cursor:
            pieces.append(inline(src[cursor:match.start()]))
        above = inline(match.group(1).strip())
        below = inline(match.group(2).strip())
        # Labels above/below a long arrow; an empty above-label keeps a
        # space placeholder so the flex column stays balanced.
        above_span = f'<span class="rxn-label">{above}</span>' if above else '<span class="rxn-label"> </span>'
        below_span = f'<span class="rxn-label rxn-below">{below}</span>' if below else ''
        pieces.append(f'<span class="rxn-arrow">{above_span}<span class="rxn-line">⟶</span>{below_span}</span>')
        cursor = match.end()
    if cursor < len(src):
        pieces.append(inline(src[cursor:]))
    return "".join(pieces)
def generate_html(path: Path, title: str, blocks: List[Block], epub_mode: bool = False) -> str:
    """Render *blocks* as a standalone HTML document at *path*.

    With ``epub_mode=True`` nothing is written to disk: only the body
    fragment is returned, for embedding in the EPUB writer's XHTML shell.
    Returns the generated markup in both modes.
    """
    body_parts = []
    if title:
        body_parts.append(f"<h1>{html.escape(title)}</h1>")
    for b in blocks:
        if b.kind == "blank":
            continue
        if b.kind == "heading":
            # Demote headings one level when the document title already
            # occupies <h1>; clamp to the valid h1..h6 range.
            lvl = max(1, min(6, b.level + (1 if title else 0)))
            body_parts.append(f"<h{lvl}>{segs_to_html(parse_inline(b.text))}</h{lvl}>")
        elif b.kind == "list":
            body_parts.append(f"<p>{'• ' if not b.ordered else ''}{segs_to_html(parse_inline(b.text))}</p>")
        elif b.kind == "code":
            # Literal code: escaped as-is, no inline markdown parsing.
            body_parts.append(f"<pre><code>{html.escape(plain_text_from_markers(b.text))}</code></pre>")
        elif b.kind == "formula":
            body_parts.append(f"<p class=\"formula\">{formula_to_html(b.text)}</p>")
        elif b.kind == "hr":
            body_parts.append("<hr />")
        else:
            body_parts.append(f"<p>{segs_to_html(parse_inline(b.text))}</p>")
    if epub_mode:
        return "\n".join(body_parts)
    doc = f"""<!doctype html>
<html lang="ru">
<head>
<meta charset="utf-8" />
<title>{html.escape(title or path.stem)}</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.45; max-width: 900px; margin: 40px auto; padding: 0 24px; }}
h1, h2, h3 {{ line-height: 1.2; }}
code, pre {{ background: #f5f5f5; }}
pre {{ padding: 12px; overflow-x: auto; }}
sub, sup {{ line-height: 0; font-size: 75%; }}
.formula {{ background: #f7f7f7; padding: 8px 10px; border-radius: 6px; }}
.rxn-arrow {{ display: inline-flex; flex-direction: column; align-items: center; vertical-align: middle; margin: 0 .25em; line-height: 1; }}
.rxn-label {{ font-size: 75%; white-space: nowrap; line-height: 1; }}
.rxn-line {{ font-size: 120%; line-height: .75; }}
.rxn-below {{ margin-top: 1px; }}
</style>
</head>
<body>
{chr(10).join(body_parts)}
</body>
</html>
"""
    path.write_text(doc, encoding="utf-8")
    return doc
def generate_txt(path: Path, blocks: List[Block]) -> None:
    """Write *blocks* to *path* as UTF-8 plain text (no markdown syntax)."""
    content = blocks_to_plain(blocks, md_mode=False)
    path.write_text(content, encoding="utf-8")
def generate_md(path: Path, blocks: List[Block], title: str) -> None:
    """Write *blocks* to *path* as UTF-8 Markdown, with an optional H1 title."""
    header = f"# {title}\n\n" if title else ""
    path.write_text(header + blocks_to_plain(blocks, md_mode=True), encoding="utf-8")
def xml_run_text(text: str) -> str:
    """Wrap *text* in a DOCX <w:t> run-text element.

    xml:space="preserve" keeps leading/trailing spaces that Word would
    otherwise strip.
    """
    escaped = xml_escape(text)
    return f'<w:t xml:space="preserve">{escaped}</w:t>'
def segs_to_docx_runs(segs: List[Segment]) -> str:
    """Convert parsed inline segments to a sequence of DOCX <w:r> runs."""
    rendered: List[str] = []
    for seg in segs:
        props: List[str] = []
        if seg.bold:
            props.append("<w:b/>")
        if seg.italic:
            props.append("<w:i/>")
        if seg.code:
            props.append('<w:rFonts w:ascii="Consolas" w:hAnsi="Consolas"/>')
        if seg.sub:
            props.append('<w:vertAlign w:val="subscript"/>')
        if seg.sup:
            props.append('<w:vertAlign w:val="superscript"/>')
        run_props = f"<w:rPr>{''.join(props)}</w:rPr>" if props else ""
        rendered.append(f"<w:r>{run_props}{xml_run_text(seg.text)}</w:r>")
    return "".join(rendered)
def docx_paragraph(text: str, style: Optional[str] = None) -> str:
    """Build a DOCX <w:p> paragraph from markdown-ish *text*, optionally styled."""
    runs = segs_to_docx_runs(parse_inline(text))
    if style:
        return f'<w:p><w:pPr><w:pStyle w:val="{style}"/></w:pPr>{runs}</w:p>'
    return f"<w:p>{runs}</w:p>"
def docx_formula_paragraph(text: str) -> str:
    """Build a DOCX paragraph for a formula block, bypassing markdown parsing."""
    segs = parse_inline_no_markdown(reaction_arrow_markers_to_text(text))
    return f"<w:p>{segs_to_docx_runs(segs)}</w:p>"
def generate_docx(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal DOCX package at *path*.

    Builds document.xml plus the mandatory package parts (content types,
    rels, styles) by hand so no python-docx dependency is needed.
    """

    def _code_paragraph(code_text: str) -> str:
        # Code blocks are literal: bypass the markdown inline parser so
        # *, _ and ` are not interpreted (the old docx_paragraph() route
        # ran them through parse_inline), use a monospace font matching
        # segs_to_docx_runs' treatment of inline code, and turn newlines
        # into explicit <w:br/> breaks — raw newlines inside <w:t> are not
        # rendered as line breaks by Word.
        runs: List[str] = []
        for i, line in enumerate(code_text.split("\n")):
            if i:
                runs.append("<w:r><w:br/></w:r>")
            runs.append(
                '<w:r><w:rPr><w:rFonts w:ascii="Consolas" w:hAnsi="Consolas"/></w:rPr>'
                f"{xml_run_text(line)}</w:r>"
            )
        return f"<w:p>{''.join(runs)}</w:p>"

    body = []
    if title:
        body.append(docx_paragraph(title, "Title"))
    for b in blocks:
        if b.kind == "blank":
            body.append("<w:p/>")
        elif b.kind == "heading":
            body.append(docx_paragraph(b.text, "Heading1" if b.level <= 1 else "Heading2"))
        elif b.kind == "list":
            body.append(docx_paragraph(("• " if not b.ordered else "") + b.text))
        elif b.kind == "code":
            body.append(_code_paragraph(plain_text_from_markers(b.text)))
        elif b.kind == "formula":
            body.append(docx_formula_paragraph(b.text))
        elif b.kind == "hr":
            body.append("<w:p><w:r><w:t>────────</w:t></w:r></w:p>")
        else:
            body.append(docx_paragraph(b.text))
    document_xml = f'''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<w:body>{''.join(body)}<w:sectPr><w:pgSz w:w="11906" w:h="16838"/><w:pgMar w:top="1134" w:right="1134" w:bottom="1134" w:left="1134"/></w:sectPr></w:body></w:document>'''
    styles_xml = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:style w:type="paragraph" w:styleId="Title"><w:name w:val="Title"/><w:rPr><w:b/><w:sz w:val="32"/></w:rPr></w:style>
<w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="Heading 1"/><w:rPr><w:b/><w:sz w:val="28"/></w:rPr></w:style>
<w:style w:type="paragraph" w:styleId="Heading2"><w:name w:val="Heading 2"/><w:rPr><w:b/><w:sz w:val="24"/></w:rPr></w:style>
</w:styles>'''
    with zipfile.ZipFile(path, "w", zipfile.ZIP_DEFLATED) as z:
        z.writestr("[Content_Types].xml", '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/><Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/></Types>''')
        z.writestr("_rels/.rels", '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>''')
        z.writestr("word/_rels/document.xml.rels", '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>''')
        z.writestr("word/document.xml", document_xml)
        z.writestr("word/styles.xml", styles_xml)
def segs_to_odt(segs: List[Segment]) -> str:
    """Convert parsed inline segments to ODT <text:span> markup.

    Combined attributes (e.g. bold+italic, code+subscript) are preserved by
    nesting spans; the previous elif-chain kept only the highest-priority
    attribute and silently dropped the rest, unlike segs_to_html() and
    segs_to_docx_runs() which stack styles. Single-attribute output is
    unchanged.
    """
    out: List[str] = []
    for s in segs:
        txt = xml_escape(s.text)
        # Wrap innermost-first so Subscript/Superscript end up outermost.
        for flag, style in (
            (s.code, "Code"),
            (s.italic, "Italic"),
            (s.bold, "Bold"),
            (s.sup, "Superscript"),
            (s.sub, "Subscript"),
        ):
            if flag:
                txt = f'<text:span text:style-name="{style}">{txt}</text:span>'
        out.append(txt)
    return "".join(out)
def generate_odt(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal ODT package at *path*.

    Builds content.xml and the manifest by hand; the automatic styles
    declared below are the names segs_to_odt() references.
    """
    body = []
    if title:
        body.append(f'<text:h text:outline-level="1">{xml_escape(title)}</text:h>')
    for b in blocks:
        if b.kind == "blank":
            body.append('<text:p text:style-name="P"/>')
        elif b.kind == "heading":
            lvl = max(1, min(6, b.level))
            body.append(f'<text:h text:outline-level="{lvl}">{segs_to_odt(parse_inline(b.text))}</text:h>')
        elif b.kind == "list":
            body.append(f'<text:p text:style-name="P">{"• " if not b.ordered else ""}{segs_to_odt(parse_inline(b.text))}</text:p>')
        elif b.kind == "code":
            # NOTE(review): "Code" is declared below with style:family="text"
            # but is referenced here as a paragraph style — verify viewers
            # accept this or whether a paragraph-family style is needed.
            body.append(f'<text:p text:style-name="Code">{xml_escape(plain_text_from_markers(b.text))}</text:p>')
        elif b.kind == "formula":
            # Formula text bypasses markdown parsing.
            body.append(f'<text:p text:style-name="P">{segs_to_odt(parse_inline_no_markdown(reaction_arrow_markers_to_text(b.text)))}</text:p>')
        elif b.kind == "hr":
            body.append('<text:p text:style-name="P">────────</text:p>')
        else:
            body.append(f'<text:p text:style-name="P">{segs_to_odt(parse_inline(b.text))}</text:p>')
    content = f'''<?xml version="1.0" encoding="UTF-8"?>
<office:document-content xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0" xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0" xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0" office:version="1.2">
<office:automatic-styles>
<style:style style:name="P" style:family="paragraph"><style:paragraph-properties fo:margin-bottom="0.15cm"/></style:style>
<style:style style:name="Bold" style:family="text"><style:text-properties fo:font-weight="bold"/></style:style>
<style:style style:name="Italic" style:family="text"><style:text-properties fo:font-style="italic"/></style:style>
<style:style style:name="Code" style:family="text"><style:text-properties style:font-name="Consolas"/></style:style>
<style:style style:name="Subscript" style:family="text"><style:text-properties style:text-position="sub 58%"/></style:style>
<style:style style:name="Superscript" style:family="text"><style:text-properties style:text-position="super 58%"/></style:style>
</office:automatic-styles>
<office:body><office:text>{''.join(body)}</office:text></office:body></office:document-content>'''
    manifest = '''<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0" manifest:version="1.2"><manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.text"/><manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/></manifest:manifest>'''
    with zipfile.ZipFile(path, "w") as z:
        # The "mimetype" entry is written first and uncompressed, per ODF packaging convention.
        z.writestr("mimetype", "application/vnd.oasis.opendocument.text", compress_type=zipfile.ZIP_STORED)
        z.writestr("META-INF/manifest.xml", manifest, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("content.xml", content, compress_type=zipfile.ZIP_DEFLATED)
def rtf_escape(s: str) -> str:
    """Escape *s* for inclusion in an RTF document body.

    Backslash and braces are backslash-escaped, newlines become \\line, and
    any non-ASCII character is emitted as RTF's signed 16-bit \\uN? escape.
    Characters beyond the BMP are encoded as a UTF-16 surrogate pair of two
    \\uN? escapes — the previous single subtract-65536 form produced one
    invalid lone code unit for them.
    """
    out: List[str] = []
    for ch in s:
        code = ord(ch)
        if ch in "\\{}":
            out.append("\\" + ch)
        elif ch == "\n":
            out.append("\\line ")
        elif code > 0xFFFF:
            # Supplementary plane: \uN carries one signed 16-bit code unit,
            # so emit the UTF-16 surrogate pair (both halves are >= 0xD800
            # and therefore always rendered in negative signed form).
            code -= 0x10000
            high = 0xD800 + (code >> 10)
            low = 0xDC00 + (code & 0x3FF)
            out.append(f"\\u{high - 0x10000}?\\u{low - 0x10000}?")
        elif code > 127:
            if code > 32767:
                # RTF \uN is a signed 16-bit value.
                code -= 65536
            out.append(f"\\u{code}?")
        else:
            out.append(ch)
    return "".join(out)
def segs_to_rtf(segs: List[Segment]) -> str:
    """Convert parsed inline segments to RTF control-word markup.

    Closing control words are emitted in reverse order of the opening ones
    so nested formatting unwinds correctly.
    """
    parts: List[str] = []
    for seg in segs:
        opens: List[str] = []
        closes: List[str] = []
        if seg.bold:
            opens.append("\\b ")
            closes.insert(0, "\\b0 ")
        if seg.italic:
            opens.append("\\i ")
            closes.insert(0, "\\i0 ")
        if seg.sub:
            opens.append("\\sub ")
            closes.insert(0, "\\nosupersub ")
        if seg.sup:
            opens.append("\\super ")
            closes.insert(0, "\\nosupersub ")
        parts.append("".join(opens) + rtf_escape(seg.text) + "".join(closes))
    return "".join(parts)
def generate_rtf(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal RTF document at *path*.

    All non-ASCII content is escaped by rtf_escape(), so the resulting file
    is pure ASCII and safe to write as UTF-8.
    """
    out: List[str] = [r"{\rtf1\ansi\deff0{\fonttbl{\f0 Arial;}{\f1 Consolas;}}\fs22 "]
    if title:
        out.append(r"\qc\b\fs36 " + rtf_escape(title) + r"\b0\fs22\par\ql ")
    for blk in blocks:
        kind = blk.kind
        if kind == "blank":
            out.append(r"\par ")
        elif kind == "heading":
            out.append(r"\b\fs28 " + segs_to_rtf(parse_inline(blk.text)) + r"\b0\fs22\par ")
        elif kind == "list":
            marker = "" if blk.ordered else r"\bullet "
            out.append(marker + segs_to_rtf(parse_inline(blk.text)) + r"\par ")
        elif kind == "code":
            # \f1 switches to the mono font declared in the font table.
            out.append(r"\f1 " + rtf_escape(plain_text_from_markers(blk.text)) + r"\f0\par ")
        elif kind == "formula":
            out.append(segs_to_rtf(parse_inline_no_markdown(reaction_arrow_markers_to_text(blk.text))) + r"\par ")
        elif kind == "hr":
            out.append(r"\par ----------------\par ")
        else:
            out.append(segs_to_rtf(parse_inline(blk.text)) + r"\par ")
    out.append("}")
    path.write_text("".join(out), encoding="utf-8")
def generate_epub(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal single-chapter EPUB 3 package at *path*."""
    # epub_mode=True makes generate_html return only the body fragment and
    # skip writing to disk, so the Path argument it receives is never used.
    body = generate_html(Path("unused.html"), title, blocks, epub_mode=True)
    chapter = f'''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="ru"><head><title>{html.escape(title or path.stem)}</title><style>body{{font-family:serif;line-height:1.45}} sub,sup{{font-size:75%;line-height:0}}</style></head><body>{body}</body></html>'''
    nav = f'''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="ru"><head><title>Navigation</title></head><body><nav epub:type="toc"><ol><li><a href="chapter1.xhtml">{html.escape(title or path.stem)}</a></li></ol></nav></body></html>'''
    opf = f'''<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid"><metadata xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:identifier id="bookid">urn:uuid:ai-to-document</dc:identifier><dc:title>{html.escape(title or path.stem)}</dc:title><dc:language>ru</dc:language></metadata><manifest><item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/><item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/></manifest><spine><itemref idref="chapter1"/></spine></package>'''
    container = '''<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"><rootfiles><rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/></rootfiles></container>'''
    with zipfile.ZipFile(path, "w") as z:
        # The "mimetype" entry is written first and uncompressed, per EPUB packaging convention.
        z.writestr("mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED)
        z.writestr("META-INF/container.xml", container, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("OEBPS/content.opf", opf, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("OEBPS/nav.xhtml", nav, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("OEBPS/chapter1.xhtml", chapter, compress_type=zipfile.ZIP_DEFLATED)