# Project Files
# scripts/aidoc_render_builtin.py
"""Built-in Markdown-like document parser and fallback renderers.
This module contains dependency-light PDF/DOCX/HTML/TXT/MD/RTF/ODT/EPUB output
used when Pandoc/TeX is unavailable or not requested.
"""
from __future__ import annotations
import html
import os
import re
import shutil
import tempfile
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from xml.sax.saxutils import escape as xml_escape
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import PageBreak, Paragraph, Preformatted, SimpleDocTemplate, Spacer, Table, TableStyle
def plugin_root() -> Path:
    """Return the plugin's root directory (two levels above this script)."""
    here = Path(__file__).resolve()
    return here.parents[1]
# Output formats the built-in fallback renderers can produce.
SUPPORTED_FORMATS = {"pdf", "docx", "txt", "md", "html", "odt", "rtf", "epub"}
# Internal, renderer-neutral markers for subscript/superscript spans. The §
# sentinels are chosen so ordinary document text is unlikely to collide.
SUB_OPEN = "§SUB§"
SUB_CLOSE = "§ENDSUB§"
SUP_OPEN = "§SUP§"
SUP_CLOSE = "§ENDSUP§"
# Markers encoding a labeled reaction arrow: OPEN <above> MID <below> CLOSE.
RXN_ARROW_OPEN = "§RXNARROW§"
RXN_ARROW_MID = "§RXNMID§"
RXN_ARROW_CLOSE = "§ENDRXNARROW§"
# Input-format string handed to Pandoc when the Pandoc path is used.
PANDOC_FROM = "markdown+tex_math_dollars+tex_math_single_backslash+raw_tex+smart"
# Detects a TeX math environment opener anywhere in a fragment.
MATH_ENV_RE = re.compile(r"\\begin\s*\{(?:matrix|pmatrix|bmatrix|Bmatrix|vmatrix|Vmatrix|smallmatrix|array|cases|aligned|align|alignat|gather|multline|equation|split)\}", re.I)
# Detects display-math delimiters ($$, \[ or a display environment) at the start of a line.
DISPLAY_MATH_DELIM_RE = re.compile(r"(^|\n)\s*(?:\$\$|\\\[|\\begin\s*\{(?:equation|align|aligned|gather|multline|split|matrix|pmatrix|bmatrix|Bmatrix|vmatrix|Vmatrix|array|cases)\})", re.I)
@dataclass
class FontSet:
    """Registered ReportLab font names, one per typographic role."""
    regular: str  # body text
    bold: str
    italic: str
    bold_italic: str
    mono: str  # code blocks / preformatted text
    math: str  # fallback font for math and symbol glyphs
@dataclass
class Segment:
    """A run of inline text with uniform styling flags."""
    text: str
    sub: bool = False  # render as subscript
    sup: bool = False  # render as superscript
    bold: bool = False
    italic: bool = False
    code: bool = False  # inline code span
@dataclass
class Block:
    """One parsed document block (paragraph, heading, list item, ...)."""
    kind: str  # "paragraph" | "heading" | "list" | "formula" | "code" | "hr" | "blank"
    text: str = ""
    level: int = 0  # heading level (1-6) when kind == "heading"
    ordered: bool = False  # list item came from an ordered list
@dataclass
class EnginePlan:
    """Resolved rendering strategy for one conversion run."""
    profile: str
    selected_math_renderer: str
    should_try_pandoc: bool  # whether to attempt Pandoc before the built-in fallback
    pandoc: str = ""  # pandoc executable (presumably a path/command) — confirm at call sites
    tex_engine: str = ""  # TeX engine name/path, if any — confirm at call sites
    reasons: Optional[List[str]] = None  # human-readable notes on why this plan was chosen
def marker_sub(value: str) -> str:
    """Wrap *value* in internal subscript markers; blank input yields ""."""
    stripped = (value or "").strip()
    if not stripped:
        return ""
    return f"{SUB_OPEN}{stripped}{SUB_CLOSE}"
def marker_sup(value: str) -> str:
    """Wrap *value* in internal superscript markers; blank input yields ""."""
    stripped = (value or "").strip()
    if not stripped:
        return ""
    return f"{SUP_OPEN}{stripped}{SUP_CLOSE}"
def marker_reaction_arrow(above: str = "", below: str = "") -> str:
    """Encode a reaction arrow with optional above/below labels as markers."""
    top = (above or "").strip()
    bottom = (below or "").strip()
    return "".join((RXN_ARROW_OPEN, top, RXN_ARROW_MID, bottom, RXN_ARROW_CLOSE))
def reaction_arrow_markers_to_text(text: str) -> str:
    """Replace internal reaction-arrow markers with a plain ' →[labels] ' form."""
    pattern = re.compile(
        re.escape(RXN_ARROW_OPEN) + r"(.*?)" + re.escape(RXN_ARROW_MID) + r"(.*?)" + re.escape(RXN_ARROW_CLOSE),
        flags=re.S,
    )

    def _expand(match: re.Match[str]) -> str:
        labels = []
        for group in (match.group(1), match.group(2)):
            cleaned = plain_text_from_markers(group).strip()
            if cleaned:
                labels.append(cleaned)
        suffix = "[" + "; ".join(labels) + "]" if labels else ""
        return " →" + suffix + " "

    return pattern.sub(_expand, text or "")
# Unicode subscript characters -> their ASCII equivalents.
REVERSE_SUBSCRIPT_MAP = {
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9",
    "₊": "+", "₋": "-", "₌": "=", "₍": "(", "₎": ")", "ₐ": "a", "ₑ": "e", "ₕ": "h", "ᵢ": "i", "ⱼ": "j",
    "ₖ": "k", "ₗ": "l", "ₘ": "m", "ₙ": "n", "ₒ": "o", "ₚ": "p", "ᵣ": "r", "ₛ": "s", "ₜ": "t", "ᵤ": "u", "ᵥ": "v", "ₓ": "x",
}
# Unicode superscript characters -> their ASCII equivalents.
REVERSE_SUPERSCRIPT_MAP = {
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "⁺": "+", "⁻": "-", "⁼": "=", "⁽": "(", "⁾": ")", "ᵃ": "a", "ᵇ": "b", "ᶜ": "c", "ᵈ": "d", "ᵉ": "e", "ᶠ": "f", "ᵍ": "g", "ʰ": "h",
    "ⁱ": "i", "ʲ": "j", "ᵏ": "k", "ˡ": "l", "ᵐ": "m", "ⁿ": "n", "ᵒ": "o", "ᵖ": "p", "ʳ": "r", "ˢ": "s", "ᵗ": "t", "ᵘ": "u", "ᵛ": "v", "ʷ": "w", "ˣ": "x", "ʸ": "y", "ᶻ": "z",
}
# Regex-escaped character classes matching any mapped sub/superscript character.
SUBSCRIPT_CHARS = "".join(re.escape(ch) for ch in REVERSE_SUBSCRIPT_MAP)
SUPERSCRIPT_CHARS = "".join(re.escape(ch) for ch in REVERSE_SUPERSCRIPT_MAP)
# TeX Greek-letter command names -> Unicode glyphs.
GREEK_COMMANDS = {
    "alpha": "α", "beta": "β", "gamma": "γ", "delta": "δ", "epsilon": "ε", "varepsilon": "ε", "zeta": "ζ", "eta": "η", "theta": "θ",
    "vartheta": "ϑ", "iota": "ι", "kappa": "κ", "lambda": "λ", "mu": "μ", "nu": "ν", "xi": "ξ", "pi": "π", "rho": "ρ", "sigma": "σ",
    "tau": "τ", "upsilon": "υ", "phi": "φ", "varphi": "φ", "chi": "χ", "psi": "ψ", "omega": "ω",
    "Gamma": "Γ", "Delta": "Δ", "Theta": "Θ", "Lambda": "Λ", "Xi": "Ξ", "Pi": "Π", "Sigma": "Σ", "Upsilon": "Υ", "Phi": "Φ", "Psi": "Ψ", "Omega": "Ω",
}
# Common TeX math/operator commands -> Unicode (or plain-text) equivalents.
LATEX_SYMBOLS = {
    "leq": "≤", "le": "≤", "geq": "≥", "ge": "≥", "neq": "≠", "ne": "≠", "approx": "≈", "sim": "≈", "simeq": "≃", "equiv": "≡",
    "propto": "∝", "times": "×", "cdot": "·", "pm": "±", "mp": "∓", "div": "÷", "infty": "∞", "infinity": "∞", "partial": "∂", "nabla": "∇",
    "hbar": "ℏ", "hslash": "ℏ", "ell": "ℓ", "emptyset": "∅", "forall": "∀", "exists": "∃", "in": "∈", "notin": "∉", "subset": "⊂", "subseteq": "⊆",
    "sum": "Σ", "prod": "Π", "int": "∫", "iint": "∬", "iiint": "∭", "oint": "∮", "rightarrow": "→", "to": "→", "leftarrow": "←", "leftrightarrow": "↔", "rightleftharpoons": "⇌", "leftrightharpoons": "⇌", "leftharpoons": "↽", "rightharpoons": "⇀", "Rightarrow": "⇒",
    "Leftarrow": "⇐", "Leftrightarrow": "⇔", "uparrow": "↑", "downarrow": "↓", "degree": "°", "circ": "°", "cdots": "⋯", "ldots": "…", "dots": "…",
    "det": "det", "log": "log", "ln": "ln", "sin": "sin", "cos": "cos", "tan": "tan", "lim": "lim", "min": "min", "max": "max", "exp": "exp",
}
# Formatting wrappers whose braces are unwrapped, keeping only the content.
FORMAT_COMMANDS = ["text", "textrm", "textit", "textbf", "mathrm", "mathit", "mathbf", "mathsf", "operatorname", "ce", "chem", "rm", "bf", "it", "emph", "mbox"]
def strip_tex_left_right_delimiters(text: str) -> str:
    r"""Remove TeX sizing delimiters without damaging commands like \rightarrow.

    The lookahead only strips \left / \right when followed by a bracket, pipe,
    dot, backslash (the next command) or end of string.  (Raw docstring: the
    original non-raw string silently turned "\r" into a carriage return.)
    """
    return re.sub(r"\\(?:left|right)\s*(?=[()\[\]{}|\\.]|$)", "", text or "")
def decode_script_chars(text: str) -> str:
    """Convert Unicode sub/superscript character runs into internal markers."""

    def _to_marker(run: str, mapping: Dict[str, str], wrap) -> str:
        # Map each script character back to ASCII, then wrap the whole run.
        return wrap("".join(mapping.get(c, c) for c in run))

    text = re.sub(
        f"([{SUBSCRIPT_CHARS}]+)",
        lambda m: _to_marker(m.group(1), REVERSE_SUBSCRIPT_MAP, marker_sub),
        text,
    )
    return re.sub(
        f"([{SUPERSCRIPT_CHARS}]+)",
        lambda m: _to_marker(m.group(1), REVERSE_SUPERSCRIPT_MAP, marker_sup),
        text,
    )
def replace_latex_commands(text: str) -> str:
    """Flatten a TeX fragment into plain Unicode text (no real math renderer).

    Applied in a deliberate order: delimiters, degree signs, labeled arrows,
    formatting wrappers, reversible arrows, fractions/roots, symbol names,
    then leftover spacing commands and escapes.
    """
    # Remove math delimiters without pretending a renderer exists, because fantasy is not an output format.
    text = text.replace("\\(", "").replace("\\)", "")
    text = text.replace("\\[", "").replace("\\]", "")
    text = text.replace("$$", "").replace("$", "")
    text = strip_tex_left_right_delimiters(text)
    text = re.sub(r"\^\s*\{?\\(?:circ|degree)\}?", "°", text)
    # Labeled arrows are common in chemistry. Keep above/below labels instead of eating them.
    def _arrow_label_for_fallback(match: re.Match[str]) -> str:
        # Group 1 is the optional [..] (rendered below the arrow in TeX),
        # group 2 the {..} (rendered above).
        below = match.group(1) or ""
        above = match.group(2) or ""
        above_clean = normalize_latex_fragment_for_fallback(above) if above.strip() else ""
        below_clean = normalize_latex_fragment_for_fallback(below) if below.strip() else ""
        return marker_reaction_arrow(above_clean, below_clean)
    # mhchem-style \xrightarrow[below]{above}; loop until a fixpoint to catch nesting.
    arrow_pat = re.compile(r"\\x(?:right|left)(?:arrow|leftharpoons)\s*(?:\[([^\]]*)\])?\s*\{([^{}]*)\}")
    while True:
        new_text = arrow_pat.sub(_arrow_label_for_fallback, text)
        if new_text == text:
            break
        text = new_text
    # Unwrap \text{...}, \mathrm{...} etc.; repeat to handle nested braces.
    for cmd in FORMAT_COMMANDS:
        pattern = re.compile(r"\\" + cmd + r"\s*\{([^{}]*)\}")
        while True:
            new = pattern.sub(lambda m: m.group(1), text)
            if new == text:
                break
            text = new
    # Reversible arrows often arrive as \underset{...}{\overset{...}{\rightleftharpoons}}.
    rev1 = re.compile(r"\\underset\s*\{([^{}]*)\}\s*\{\s*\\overset\s*\{([^{}]*)\}\s*\{\s*\\(?:rightleftharpoons|leftrightharpoons)\s*\}\s*\}")
    text = rev1.sub(lambda m: " ⇌[" + "; ".join(x for x in [normalize_latex_fragment_for_fallback(m.group(2)), normalize_latex_fragment_for_fallback(m.group(1))] if x) + "] ", text)
    rev2 = re.compile(r"\\overset\s*\{([^{}]*)\}\s*\{\s*\\underset\s*\{([^{}]*)\}\s*\{\s*\\(?:rightleftharpoons|leftrightharpoons)\s*\}\s*\}")
    text = rev2.sub(lambda m: " ⇌[" + "; ".join(x for x in [normalize_latex_fragment_for_fallback(m.group(1)), normalize_latex_fragment_for_fallback(m.group(2))] if x) + "] ", text)
    # Try labeled arrows again after \text{...} labels have been flattened.
    text = re.sub(r"\\x(?:right|left)(?:arrow|leftharpoons)\s*(?:\[([^\]]*)\])?\s*\{([^{}]*)\}", _arrow_label_for_fallback, text)
    text = re.sub(r"\\overset\s*\{([^{}]*)\}\s*\{\\to\}", lambda m: marker_reaction_arrow(normalize_latex_fragment_for_fallback(m.group(1)), ""), text)
    text = re.sub(r"\\underset\s*\{([^{}]*)\}\s*\{\\to\}", lambda m: marker_reaction_arrow("", normalize_latex_fragment_for_fallback(m.group(1))), text)
    # Common simple fractions and roots.
    frac_pat = re.compile(r"\\frac\s*\{([^{}]+)\}\s*\{([^{}]+)\}")
    while True:
        new = frac_pat.sub(lambda m: f"({m.group(1).strip()})/({m.group(2).strip()})", text)
        if new == text:
            break
        text = new
    text = re.sub(r"\\sqrt\s*\[([^\]]+)\]\s*\{([^{}]+)\}", lambda m: f"root_{m.group(1)}({m.group(2)})", text)
    text = re.sub(r"\\sqrt\s*\{([^{}]+)\}", lambda m: f"sqrt({m.group(1)})", text)
    for name, value in {**GREEK_COMMANDS, **LATEX_SYMBOLS}.items():
        # A TeX command can be followed by _, ^, punctuation or whitespace. Python's \b treats underscore as a word char,
        # so \oint_C used to lose the integral sign. Splendid little trap.
        text = re.sub(r"\\" + re.escape(name) + r"(?![A-Za-z])", value, text)
    # Remove spacing commands and braces left over from simple TeX fragments.
    text = re.sub(r"\\[,;:\s!]+", " ", text)
    text = text.replace("\\%", "%").replace("\\&", "&").replace("\\_", "_")
    # Drop a stray backslash left directly before an already-converted symbol.
    text = re.sub(r"\\(?=[ℏ∇∂∫∮ΣΠΛμνΨψΔδσπ∞≤≥≈√→←↔⇒⇐⇔])", "", text)
    return text
def _consume_tex_script_value(text: str, pos: int) -> Tuple[str, int]:
    """Consume exactly one TeX script token after ^ or _.

    *pos* points just past the ^/_ operator. Returns (value, next_position).
    Handles a balanced {...} group, a single \\command (mapped to Unicode when
    known), a digit run with one optional trailing sign, or a single
    character. Returns ("", pos) when only whitespace remains.
    """
    n = len(text)
    i = pos
    # Skip whitespace between the script operator and its operand.
    while i < n and text[i].isspace():
        i += 1
    if i >= n:
        return "", pos
    if text[i] == "{":
        # Balanced-brace group: return the inner content.
        depth = 1
        j = i + 1
        while j < n and depth > 0:
            if text[j] == "{":
                depth += 1
            elif text[j] == "}":
                depth -= 1
            j += 1
        if depth == 0:
            return text[i + 1:j - 1], j
        # Unterminated group: take everything to end of string.
        return text[i + 1:], n
    if text[i] == "\\":
        # Command token: prefer its Greek/symbol mapping, else the bare name.
        m = re.match(r"\\([A-Za-z]+|.)", text[i:])
        if m:
            cmd = m.group(1)
            value = GREEK_COMMANDS.get(cmd, LATEX_SYMBOLS.get(cmd, cmd))
            return value, i + len(m.group(0))
    if text[i].isdigit():
        # Digit run, optionally followed by one +/- (e.g. an ionic charge "2-").
        j = i + 1
        while j < n and text[j].isdigit():
            j += 1
        if j < n and text[j] in "+-":
            j += 1
        return text[i:j], j
    return text[i], i + 1
def convert_explicit_scripts(text: str) -> str:
    """Convert TeX-like scripts without creating nested internal markers.

    Single left-to-right scan: existing §SUB§/§SUP§/§RXNARROW§ spans are
    copied through verbatim (so markers never nest), and each ^/_ outside
    them is converted via _consume_tex_script_value.
    """
    if not text:
        return text
    out: List[str] = []
    i = 0
    n = len(text)
    while i < n:
        # Pass existing marker spans through untouched.
        if text.startswith(SUB_OPEN, i):
            end = text.find(SUB_CLOSE, i + len(SUB_OPEN))
            if end != -1:
                out.append(text[i:end + len(SUB_CLOSE)])
                i = end + len(SUB_CLOSE)
                continue
        if text.startswith(SUP_OPEN, i):
            end = text.find(SUP_CLOSE, i + len(SUP_OPEN))
            if end != -1:
                out.append(text[i:end + len(SUP_CLOSE)])
                i = end + len(SUP_CLOSE)
                continue
        if text.startswith(RXN_ARROW_OPEN, i):
            end = text.find(RXN_ARROW_CLOSE, i + len(RXN_ARROW_OPEN))
            if end != -1:
                out.append(text[i:end + len(RXN_ARROW_CLOSE)])
                i = end + len(RXN_ARROW_CLOSE)
                continue
        ch = text[i]
        if ch in "_^":
            value, new_i = _consume_tex_script_value(text, i + 1)
            if value:
                out.append(marker_sub(value) if ch == "_" else marker_sup(value))
            # Advance past the consumed operand (just the operator when empty).
            i = new_i
            continue
        out.append(ch)
        i += 1
    return "".join(out)
def convert_unit_exponents(text: str) -> str:
    """Superscript the exponent in compact unit tokens such as ``s2`` or ``cm3``."""
    # A unit symbol not preceded by a Latin/Cyrillic letter, followed by a lone
    # 2 or 3 (no further digit), gets its exponent raised.
    pattern = re.compile(
        r"(?<![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u0451])"
        r"((?:m|s|kg|g|mol|K|Pa|J|N|W|V|A|Hz|cm|mm|km|nm|μm|um|L|l))"
        r"([23])(?!\d)"
    )

    def _raise_exponent(match: re.Match[str]) -> str:
        return match.group(1) + marker_sup(match.group(2))

    return pattern.sub(_raise_exponent, text)
def convert_chemical_formula_token(token: str) -> str:
    """Add sub/superscript markers to one compact chemical formula token."""
    # Tokenize a compact chemical formula. Digits after element/group are subscripts; trailing charge is superscript.
    pieces: List[str] = []
    i = 0
    n = len(token)
    while i < n:
        ch = token[i]
        if ch.isdigit():
            j = i
            while j < n and token[j].isdigit():
                j += 1
            num = token[i:j]
            if j == n - 1 and token[j] in "+-":
                # Digits followed by a final +/- form an ionic charge: superscript.
                pieces.append(marker_sup(num + token[j]))
                j += 1
            elif pieces and re.search(r"[A-Za-z)]$", plain_text_from_markers("".join(pieces))):
                # Digits right after an element symbol or group close-paren: subscript.
                pieces.append(marker_sub(num))
            else:
                # Leading stoichiometric coefficient stays on the baseline.
                pieces.append(num)
            i = j
        elif ch in "+-" and i == n - 1 and pieces:
            # Bare trailing charge sign (no digits), e.g. "Na+".
            pieces.append(marker_sup(ch))
            i += 1
        else:
            pieces.append(ch)
            i += 1
    return "".join(pieces)
def looks_like_chemical_formula(token: str) -> bool:
    """Heuristically decide whether *token* is a compact chemical formula."""
    # Too short, or no digit at all: not a formula worth marking up.
    if len(token) < 2:
        return False
    if re.search(r"\d", token) is None:
        return False
    # Require a digit attached to an element-like symbol or closing paren.
    if re.search(r"[A-Z][a-z]?\d|[A-Z][a-z]?[A-Z][a-z]?\d|\)\d", token) is None:
        return False
    # Three+ lowercase letters before a digit smells like prose ("step2").
    if re.search(r"[a-z]{3,}\d", token) is not None:
        return False
    # The whole token must consist of element symbols, parens, digits, charges.
    return re.fullmatch(r"(?:[A-Z][a-z]?|\(|\)|\d+|[+-]){2,}", token) is not None
def convert_compact_chemical_formulas(text: str) -> str:
    """Find compact chemical formulas in running text and add script markers.

    The lookaround classes exclude Latin/Cyrillic letters, digits, underscore
    and § so content inside already-emitted markers is never re-matched.
    """
    # Digits right after a closing paren are group subscripts, e.g. (SO4)2.
    text = re.sub(r"\)(\d+)(?=[A-Z]|\b)", lambda m: ")" + marker_sub(m.group(1)), text)
    # Stoichiometric coefficient followed by a formula, e.g. "2H2O".
    coeff_pat = re.compile(r"(?<![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])(\d+)([A-Z][A-Za-z0-9()+\-]{1,24})(?![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])")
    def repl_coeff(m: re.Match[str]) -> str:
        coeff, token = m.group(1), m.group(2)
        if looks_like_chemical_formula(token):
            return coeff + convert_chemical_formula_token(token)
        return m.group(0)
    text = coeff_pat.sub(repl_coeff, text)
    # Bare formula tokens, e.g. "H2SO4".
    token_pat = re.compile(r"(?<![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])([A-Z][A-Za-z0-9()+\-]{1,24})(?![A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u04510-9_§])")
    def repl(m: re.Match[str]) -> str:
        token = m.group(1)
        if looks_like_chemical_formula(token):
            return convert_chemical_formula_token(token)
        return token
    return token_pat.sub(repl, text)
def plain_text_from_markers(text: str) -> str:
    """Strip all internal script markers, flattening reaction arrows to text."""
    flattened = reaction_arrow_markers_to_text(text or "")
    for marker in (SUB_OPEN, SUB_CLOSE, SUP_OPEN, SUP_CLOSE):
        flattened = flattened.replace(marker, "")
    return flattened
def normalize_plain_math_aliases(text: str) -> str:
    """Convert common model-friendly ASCII math words into real symbols.

    This is for the built-in fallback and for users who write `nabla` instead of
    `\\nabla`. Pandoc/TeX users can still write proper TeX.
    """
    # Insertion order matters: longer/case-sensitive aliases are listed before
    # the shorter, lowercase forms they would otherwise shadow.
    replacements = {
        "h_bar": "ℏ", "hbar": "ℏ", "nabla": "∇", "del": "∇",
        "partial": "∂", "inf": "∞", "infty": "∞", "infinity": "∞",
        "Lambda": "Λ", "lambda": "λ", "Delta": "Δ", "delta": "δ",
        "Gamma": "Γ", "gamma": "γ", "Theta": "Θ", "theta": "θ",
        "Sigma": "Σ", "sigma": "σ", "Pi": "Π", "pi": "π",
        "Psi": "Ψ", "psi": "ψ", "Omega": "Ω", "omega": "ω",
        "Integral": "∫", "integral": "∫", "Sum": "Σ", "sum": "Σ",
        "sqrt": "√",
    }
    for word, symbol in replacements.items():
        pattern = r"\b" + re.escape(word) + r"\b"
        text = re.sub(pattern, symbol, text)
    return text
def _split_tex_rows(body: str) -> List[List[str]]:
# Normalize TeX line-break spacing commands such as \\[10pt]. They are layout hints, not content.
body = re.sub(r"\\\\\s*\[[^\]]*\]", r"\\\\", body or "")
rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
cleaned: List[List[str]] = []
for row in rows:
row = re.sub(r"^\s*\[[^\]]*\]\s*", "", row).strip()
if row:
cleaned.append([c.strip() for c in row.split("&")])
return cleaned
def _matrix_to_unicode(env: str, body: str) -> str:
    """Render a TeX matrix environment as column-aligned Unicode rows."""
    rows = _split_tex_rows(body)
    if not rows:
        return ""
    rows = [[normalize_latex_fragment_for_fallback(c) for c in row] for row in rows]
    # Pad rows to a rectangular grid, then compute per-column widths.
    cols = max(len(r) for r in rows)
    for row in rows:
        row.extend([""] * (cols - len(row)))
    widths = [max(len(row[i]) for row in rows) for i in range(cols)]
    # Bracket pieces per environment: (top-l, top-r, mid-l, mid-r, bottom-l, bottom-r).
    left_right = {
        "pmatrix": ("⎛", "⎞", "⎜", "⎟", "⎝", "⎠"),
        "bmatrix": ("⎡", "⎤", "⎢", "⎥", "⎣", "⎦"),
        "Bmatrix": ("⎧", "⎫", "⎨", "⎬", "⎩", "⎭"),
        "vmatrix": ("│", "│", "│", "│", "│", "│"),
        "Vmatrix": ("║", "║", "║", "║", "║", "║"),
        "matrix": (" ", " ", " ", " ", " ", " "),
        "smallmatrix": (" ", " ", " ", " ", " ", " "),
    }
    top_l, top_r, mid_l, mid_r, bot_l, bot_r = left_right.get(env, left_right["matrix"])
    out = []
    for idx, row in enumerate(rows):
        # Choose bracket pieces by row position; a single row uses the top pair.
        if len(rows) == 1:
            l, r = top_l, top_r
        elif idx == 0:
            l, r = top_l, top_r
        elif idx == len(rows) - 1:
            l, r = bot_l, bot_r
        else:
            l, r = mid_l, mid_r
        cells = " ".join(row[i].ljust(widths[i]) for i in range(cols)).rstrip()
        out.append(f"{l} {cells} {r}".rstrip())
    return "\n".join(out)
def _cases_to_unicode(body: str) -> str:
    """Render a TeX ``cases`` body as a brace-decorated multiline string."""
    rows = _split_tex_rows(body)
    if not rows:
        return ""
    last = len(rows) - 1
    lines: List[str] = []
    for idx, row in enumerate(rows):
        # First row gets the opening brace piece, last the closing, rest the middle.
        if idx == 0:
            brace = "⎧"
        elif idx == last:
            brace = "⎩"
        else:
            brace = "⎨"
        expr = normalize_latex_fragment_for_fallback(row[0]) if row else ""
        cond = normalize_latex_fragment_for_fallback(" ".join(row[1:])) if len(row) > 1 else ""
        rendered = f"{brace} {expr}"
        if cond:
            rendered += f" {cond}"
        lines.append(rendered)
    return "\n".join(lines)
def _aligned_to_unicode(body: str) -> str:
    """Flatten an aligned/gather-style TeX body into plain joined lines."""
    out = []
    for cells in _split_tex_rows(body):
        rendered = [normalize_latex_fragment_for_fallback(cell) for cell in cells]
        out.append(" ".join(piece for piece in rendered if piece))
    return "\n".join(out)
def normalize_latex_fragment_for_fallback(text: str) -> str:
    """Best-effort Unicode fallback for TeX when Pandoc/TeX is unavailable.

    Environments are expanded first (to a fixpoint, for nesting), then the
    command/script/chemistry passes run, and finally leftover TeX syntax is
    stripped and whitespace tidied.
    """
    text = text or ""
    # Convert common math environments into readable multiline Unicode.
    env_pat = re.compile(r"\\begin\s*\{([A-Za-z*]+)\}(.*?)\\end\s*\{\1\}", re.S)
    while True:
        def env_repl(m: re.Match[str]) -> str:
            env, body = m.group(1), m.group(2).strip()
            if env in {"matrix", "pmatrix", "bmatrix", "Bmatrix", "vmatrix", "Vmatrix", "smallmatrix"}:
                return _matrix_to_unicode(env, body)
            if env == "cases":
                return _cases_to_unicode(body)
            if env in {"aligned", "align", "align*", "split", "gather", "multline", "equation"}:
                return _aligned_to_unicode(body)
            if env == "array":
                # Drop the column-spec argument, then treat it like a matrix.
                body = re.sub(r"^\s*\{[^{}]*\}", "", body).strip()
                return _matrix_to_unicode("matrix", body)
            return body
        new = env_pat.sub(env_repl, text)
        if new == text:
            break
        text = new
    text = replace_latex_commands(text)
    text = normalize_plain_math_aliases(text)
    text = convert_explicit_scripts(text)
    text = convert_unit_exponents(text)
    text = convert_compact_chemical_formulas(text)
    text = strip_tex_left_right_delimiters(text)
    # Remove spacing commands, alignment markers and remaining TeX syntax.
    text = text.replace("\\,", " ").replace("\\;", " ").replace("\\!", "")
    text = text.replace("&=", "=").replace("&", " ")
    text = re.sub(r"\\\\", "\n", text)
    text = re.sub(r"\\[a-zA-Z]+\*?", "", text)
    text = text.replace("{", "").replace("}", "")
    # Collapse runs of spaces/tabs and trim around newlines.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    return text.strip()
def clean_math_markup(markdown: str) -> Tuple[str, Dict[str, Any]]:
    """Normalize math markup in *markdown*; return (text, diagnostics).

    Diagnostics carry a rough "replacements" count (positional character diff
    plus length delta — not a true edit distance) and any TeX artifact
    patterns still present after normalization.
    """
    original = markdown or ""
    text = original.replace("\r\n", "\n").replace("\r", "\n")
    replacements = 0
    before = text
    text = decode_script_chars(text)
    text = normalize_latex_fragment_for_fallback(text)
    # Drop bold markers early. They often wrap formula tokens with sub/sup markers, which is where Markdown parsers go to die.
    text = text.replace("**", "").replace("__", "")
    # Remove simple italic markers around words, but keep multiplication asterisks with spaces.
    text = re.sub(r"(?<!\S)\*([^*\n]+?)\*(?!\S)", r"\1", text)
    text = re.sub(r"(?<!\w)\*([^*\n]+?)\*(?!\w)", r"\1", text)
    # Crude change metric: positionally differing characters plus length delta.
    replacements = sum(1 for a, b in zip(before, text) if a != b) + abs(len(before) - len(text))
    # Leftover TeX constructs indicate incomplete normalization; report them.
    artifact_patterns = [r"\\text", r"\\frac", r"\\sqrt", r"_\{", r"\^\{", r"\\\(", r"\\\[", r"\\begin", r"\\end"]
    remaining: List[str] = []
    for pat in artifact_patterns:
        if re.search(pat, text):
            remaining.append(pat)
    return text, {"replacements": replacements, "remaining_artifacts": remaining}
def normalize_formula_words(line: str) -> str:
    """Apply ASCII math aliases and arrow shorthands to one formula line."""
    line = normalize_plain_math_aliases(line)
    for ascii_arrow, symbol in (("->", "→"), ("=>", "⇒")):
        line = line.replace(ascii_arrow, symbol)
    # Tidy spacing inside "lim(x → ∞)" style expressions.
    return re.sub(r"lim\s*\(\s*([^)]*?)\s*→\s*∞\s*\)", r"lim(\1 → ∞)", line)
def is_formula_like_line(text: str) -> bool:
    """Heuristically decide whether a line should be styled as a formula.

    Looks for math cues (operators, symbols, marker spans, bracket-drawing
    glyphs) while rejecting long prose that merely mentions a formula.
    """
    s0 = (text or "").strip()
    # Defensive globals() check: tolerates this helper being absent at import time.
    s = plain_text_from_markers(strip_outer_emphasis_for_formula(s0) if 'strip_outer_emphasis_for_formula' in globals() else s0)
    if len(s) < 3:
        return False
    # Table rows and URLs are never formulas.
    if s.startswith(("|", "http://", "https://")):
        return False
    formula_cues = ["=", "→", "->", "⇒", "±", "√", "∫", "∮", "Σ", "Π", "∇", "∂", "ℏ", "∞", "lim", "Integral", "Sum", "Delta", "Lambda", "nabla", "partial", "psi", "^", "_", "§SUB§", "§SUP§", "⎛", "⎞", "⎜", "⎟", "⎝", "⎠", "⎧", "⎨", "⎩", "⎡", "⎤", "⎢", "⎥", "⎣", "⎦"]
    if any(cue in s for cue in formula_cues):
        # Multi-line bracket glyphs (matrices/cases) are a strong signal.
        if any(ch in s for ch in "⎛⎞⎜⎟⎝⎠⎧⎨⎩⎡⎤⎢⎥⎣⎦") and len(s.split()) <= 24:
            return True
        # Otherwise require at least one operator and not too many prose words.
        words = re.findall(r"[A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u0451]{3,}", s)
        operators = len(re.findall(r"[=+*/^_→<>±√∫∮ΣΠ∇∂]", s))
        if operators >= 1 and len(words) <= 18:
            return True
    if re.search(r"\b(?:[A-Z][a-z]?\d+){1,}[A-Za-z0-9()]*\b", s):
        # A long prose sentence mentioning V2O5 or K2O is not a display formula.
        # Previous versions styled whole explanatory paragraphs as formulas, because apparently
        # one oxide in parentheses was enough to trigger a tiny typographic coup.
        words = re.findall(r"[A-Za-z\u0410-\u042F\u0430-\u044F\u0401\u0451]{3,}", s)
        has_reaction_operator = any(op in s for op in ["→", "->", "=", "+"])
        if len(s.split()) <= 8 or (has_reaction_operator and len(words) <= 10):
            return True
    return False
def strip_outer_emphasis_for_formula(text: str) -> str:
    """Peel matching emphasis markers (**, __, *, _) off the ends of *text*."""
    result = text.strip()
    stripped_any = True
    while stripped_any:
        stripped_any = False
        # Check the two-character markers before their single-character forms.
        for mark in ("**", "__", "*", "_"):
            wrapped = result.startswith(mark) and result.endswith(mark)
            if wrapped and len(result) > 2 * len(mark):
                result = result[len(mark):-len(mark)].strip()
                stripped_any = True
    return result
def parse_inline_no_markdown(text: str) -> List[Segment]:
    """Split *text* on internal sub/sup markers without Markdown parsing.

    Used for formula lines, where *, _ and ` are math notation rather than
    emphasis markers.
    """
    # Build the pattern from the marker constants instead of repeating their
    # literal spelling — keeps this in sync with SUB_OPEN and friends.
    token_re = re.compile(
        "(" + re.escape(SUB_OPEN) + ".*?" + re.escape(SUB_CLOSE)
        + "|" + re.escape(SUP_OPEN) + ".*?" + re.escape(SUP_CLOSE) + ")"
    )
    result: List[Segment] = []
    for part in token_re.split(text or ""):
        if not part:
            continue
        if part.startswith(SUB_OPEN) and part.endswith(SUB_CLOSE):
            result.append(Segment(part[len(SUB_OPEN):-len(SUB_CLOSE)], sub=True))
        elif part.startswith(SUP_OPEN) and part.endswith(SUP_CLOSE):
            result.append(Segment(part[len(SUP_OPEN):-len(SUP_CLOSE)], sup=True))
        else:
            result.append(Segment(part))
    return merge_segments(result)
def parse_blocks(markdown: str) -> List[Block]:
    """Parse Markdown-like text into a flat list of Block records.

    Handles fenced code blocks, ATX headings, bullet/ordered lists, horizontal
    rules, heuristically detected formula lines and plain paragraphs.
    Consecutive blank lines are collapsed into a single "blank" block.
    """
    blocks: List[Block] = []
    in_code = False
    code_lines: List[str] = []
    for raw in (markdown or "").split("\n"):
        line = raw.rstrip()
        # A ``` fence toggles code mode.
        if line.strip().startswith("```"):
            if in_code:
                blocks.append(Block("code", "\n".join(code_lines)))
                code_lines = []
                in_code = False
            else:
                in_code = True
            continue
        if in_code:
            code_lines.append(line)
            continue
        if not line.strip():
            blocks.append(Block("blank"))
            continue
        # ATX heading: one to six leading # characters.
        m = re.match(r"^(#{1,6})\s+(.*)$", line)
        if m:
            blocks.append(Block("heading", m.group(2).strip(), level=len(m.group(1))))
            continue
        # Unordered list item.
        m = re.match(r"^\s*[-*+]\s+(.*)$", line)
        if m:
            text = m.group(1).strip()
            if is_formula_like_line(text):
                text = normalize_formula_words(strip_outer_emphasis_for_formula(text))
            blocks.append(Block("list", text, ordered=False))
            continue
        # Ordered list item ("1." or "1)").
        m = re.match(r"^\s*(\d+)[.)]\s+(.*)$", line)
        if m:
            text = m.group(2).strip()
            if is_formula_like_line(text):
                text = normalize_formula_words(strip_outer_emphasis_for_formula(text))
            blocks.append(Block("list", f"{m.group(1)}. {text}", ordered=True))
            continue
        if line.strip() == "---":
            blocks.append(Block("hr"))
            continue
        text = line.strip()
        if is_formula_like_line(text):
            blocks.append(Block("formula", normalize_formula_words(strip_outer_emphasis_for_formula(text))))
        else:
            blocks.append(Block("paragraph", text))
    # An unterminated fence still flushes its collected code.
    if code_lines:
        blocks.append(Block("code", "\n".join(code_lines)))
    # Collapse runs of blank blocks into one.
    collapsed: List[Block] = []
    prev_blank = False
    for b in blocks:
        if b.kind == "blank":
            if not prev_blank:
                collapsed.append(b)
            prev_blank = True
        else:
            collapsed.append(b)
            prev_blank = False
    return collapsed
def parse_inline(text: str) -> List[Segment]:
    """Tokenize inline text: neutral sub/sup markers first, then Markdown.

    Marker spans become sub/sup segments verbatim; everything between them is
    parsed for basic Markdown emphasis and code spans.
    """
    # Build the pattern from the marker constants instead of repeating their
    # literal spelling — keeps this in sync with SUB_OPEN and friends.
    token_re = re.compile(
        "(" + re.escape(SUB_OPEN) + ".*?" + re.escape(SUB_CLOSE)
        + "|" + re.escape(SUP_OPEN) + ".*?" + re.escape(SUP_CLOSE) + ")"
    )
    result: List[Segment] = []
    for part in token_re.split(text or ""):
        if not part:
            continue
        if part.startswith(SUB_OPEN) and part.endswith(SUB_CLOSE):
            result.append(Segment(part[len(SUB_OPEN):-len(SUB_CLOSE)], sub=True))
        elif part.startswith(SUP_OPEN) and part.endswith(SUP_CLOSE):
            result.append(Segment(part[len(SUP_OPEN):-len(SUP_CLOSE)], sup=True))
        else:
            result.extend(parse_basic_markdown_inline(part))
    return merge_segments(result)
def parse_basic_markdown_inline(text: str) -> List[Segment]:
    """Parse `code`, **bold**, __bold__, *italic* and _italic_ into segments."""
    pattern = re.compile(
        r"(`[^`]+`|\*\*(.+?)\*\*|__(.+?)__|(?<!\*)\*([^\s*](?:.*?[^\s*])?)\*(?!\*)|(?<!_)_([^\s_](?:.*?[^\s_])?)_(?!_))"
    )
    segments: List[Segment] = []
    cursor = 0
    for match in pattern.finditer(text):
        # Unstyled gap before this token.
        if match.start() > cursor:
            segments.append(Segment(text[cursor:match.start()]))
        token = match.group(0)
        # The regex alternatives guarantee the token's delimiters, so the
        # first one/two characters identify its style.
        if token[0] == "`":
            segments.append(Segment(token[1:-1], code=True))
        elif token[:2] in ("**", "__"):
            segments.append(Segment(token[2:-2], bold=True))
        else:
            segments.append(Segment(token[1:-1], italic=True))
        cursor = match.end()
    if cursor < len(text):
        segments.append(Segment(text[cursor:]))
    return segments
def merge_segments(segs: List[Segment]) -> List[Segment]:
    """Drop empty segments and coalesce neighbors with identical styling."""
    merged: List[Segment] = []
    for seg in segs:
        if not seg.text:
            continue
        style = (seg.sub, seg.sup, seg.bold, seg.italic, seg.code)
        if merged:
            tail = merged[-1]
            if (tail.sub, tail.sup, tail.bold, tail.italic, tail.code) == style:
                tail.text += seg.text
                continue
        merged.append(seg)
    return merged
def segments_plain(segs: List[Segment]) -> str:
    """Concatenate the raw text of all segments, discarding styling."""
    return "".join(seg.text for seg in segs)
def blocks_to_plain(blocks: List[Block], md_mode: bool = False) -> str:
    """Render parsed blocks back to plain text (or minimal Markdown).

    With md_mode=True, headings keep their ``#`` prefixes, horizontal rules
    stay ``---`` and code blocks are re-fenced; otherwise everything flattens
    to plain lines. The result always ends with a single trailing newline.
    """
    lines: List[str] = []
    for b in blocks:
        if b.kind == "blank":
            lines.append("")
        elif b.kind == "heading":
            prefix = ("#" * max(1, min(6, b.level)) + " ") if md_mode else ""
            lines.append(prefix + segments_plain(parse_inline(b.text)))
        elif b.kind == "formula":
            # Formulas skip Markdown inline parsing: * and _ are math there.
            lines.append(segments_plain(parse_inline_no_markdown(reaction_arrow_markers_to_text(b.text))))
        elif b.kind == "list":
            # Ordered items already carry their "1. " prefix in the text.
            prefix = "- " if not b.ordered else ""
            lines.append(prefix + segments_plain(parse_inline(b.text)))
        elif b.kind == "hr":
            lines.append("---" if md_mode else "")
        elif b.kind == "code":
            if md_mode:
                lines.extend(["```", plain_text_from_markers(b.text), "```"])
            else:
                lines.append(plain_text_from_markers(b.text))
        else:
            lines.append(segments_plain(parse_inline(b.text)))
    return "\n".join(lines).strip() + "\n"
# Probe strings used to score candidate fonts' glyph coverage.
MATH_GLYPH_TEST = "ℏ∇∂∮∫∬∭ΣΠΛμνΨψΔδσπ∞≤≥≈√→←↔⇒⇐⇔±×·°"  # math/symbol glyphs
CYRILLIC_GLYPH_TEST = "\u0410\u0411\u0412\u0430\u0431\u0432\u0451\u0401"  # А Б В а б в ё Ё
BASIC_LATIN_TEST = "ABCxyz012"  # minimal Latin + digit sanity check
def font_dirs() -> List[Path]:
    """Return candidate font directories in priority order, de-duplicated."""
    win_font_dir = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts"
    dirs: List[Path] = []
    # Prefer fonts bundled with or installed by this plugin. The built-in PDF
    # fallback must use STIX/Noto when available instead of wandering off to
    # Helvetica and drawing tofu squares like a defeated printer.
    try:
        # NOTE(review): _font_search_dirs is defined elsewhere in the project;
        # its absence (NameError) is tolerated by this except block.
        dirs.extend(_font_search_dirs())
    except Exception:
        try:
            dirs.append(plugin_root() / "vendor" / "toolchain" / "fonts")
        except Exception:
            pass
    # Standard OS font locations (Windows, Linux, macOS).
    dirs.extend([
        win_font_dir,
        Path("/usr/share/fonts"),
        Path("/usr/local/share/fonts"),
        Path("/System/Library/Fonts"),
        Path("/Library/Fonts"),
    ])
    # De-duplicate by resolved path while preserving the priority order.
    seen = set()
    out: List[Path] = []
    for d in dirs:
        try:
            key = str(d.resolve())
        except Exception:
            key = str(d)
        if key in seen:
            continue
        seen.add(key)
        out.append(d)
    return out
def iter_font_candidates(names: List[str]) -> Iterable[Path]:
    """Yield existing font files matching *names*, then all other fonts.

    Each candidate is yielded at most once, honoring the order of *names*
    (exact path in each directory first, then a recursive case-insensitive
    filename search); finally every installed font is yielded so the caller
    can score it as a last resort.
    """
    seen = set()
    # (Removed an unused precomputed list of lowered names; each name is
    # lowered where it is actually compared.)
    def emit(path: Path):
        # Return the path the first time it is seen and exists, else None.
        key = str(path).lower()
        if path.exists() and key not in seen:
            seen.add(key)
            return path
        return None
    # Respect the caller's priority order. Filesystem iteration order is not a ranking system, despite what computers imply.
    for name in names:
        lname = name.lower()
        for d in font_dirs():
            if not d.exists():
                continue
            direct = emit(d / name)
            if direct:
                yield direct
            for ext in ("*.ttf", "*.otf", "*.ttc"):
                for p in d.rglob(ext):
                    if p.name.lower() == lname:
                        got = emit(p)
                        if got:
                            yield got
    # Last resort: any installed font, scored later.
    for d in font_dirs():
        if not d.exists():
            continue
        for ext in ("*.ttf", "*.otf", "*.ttc"):
            for p in d.rglob(ext):
                got = emit(p)
                if got:
                    yield got
def _ttfont_for_probe(path: Path, subfont_index: int = 0) -> Optional[TTFont]:
    """Load a TTFont for coverage probing, or None when the file won't parse."""
    try:
        return TTFont("__probe__", str(path), subfontIndex=subfont_index)
    except Exception:
        # Corrupt or unsupported font files are simply skipped by callers.
        return None
def font_score(path: Path, chars: str, max_subfonts: int = 8) -> Tuple[int, int]:
    """Score how many of *chars* the font file at *path* can draw.

    TTC collections are probed subfont-by-subfont up to *max_subfonts*.
    Returns (best_score, best_subfont_index); (-1, 0) when the file cannot
    be parsed at all.
    """
    best_score = -1
    best_index = 0
    for idx in range(max_subfonts):
        font = _ttfont_for_probe(path, idx)
        if font is None:
            if idx == 0:
                # Not even the first subfont loads: unusable file.
                return -1, 0
            # Past the end of a collection: stop probing.
            break
        # charToGlyph maps codepoints -> glyph ids; count covered probe chars.
        cmap = getattr(font.face, "charToGlyph", {}) or {}
        score = sum(1 for ch in chars if ord(ch) in cmap)
        if score > best_score:
            best_score, best_index = score, idx
        if score == len(chars):
            # Full coverage — no point probing further subfonts.
            break
    return best_score, best_index
def find_font_file(names: List[str], required_chars: str = "", min_score: Optional[int] = None) -> Optional[Tuple[str, int]]:
    """Search installed fonts for coverage of *required_chars*.

    Returns (path, subfont_index) for the first candidate reaching the
    threshold (full coverage by default), otherwise the best partial match
    with a positive score, else None.
    """
    probe_chars = required_chars or BASIC_LATIN_TEST
    threshold = len(probe_chars) if min_score is None else min_score
    best_partial: Optional[Tuple[int, str, int]] = None
    for candidate in iter_font_candidates(names):
        score, subfont = font_score(candidate, probe_chars)
        if score < 0:
            continue  # unreadable font file
        if score >= threshold:
            return str(candidate), subfont
        if best_partial is None or score > best_partial[0]:
            best_partial = (score, str(candidate), subfont)
    if best_partial and best_partial[0] > 0:
        return best_partial[1], best_partial[2]
    return None
def register_ttf_font(name: str, spec: Optional[Tuple[str, int]], fallback_name: Optional[str] = None) -> str:
    """Register *spec* under *name* with ReportLab; return a usable font name.

    When registration fails (or *spec* is empty), falls back to
    *fallback_name*, defaulting to Helvetica.
    """
    if spec:
        path, subfont = spec
        try:
            pdfmetrics.registerFont(TTFont(name, path, subfontIndex=subfont))
        except Exception:
            # Broken font file: fall through to the fallback name.
            pass
        else:
            return name
    return fallback_name or "Helvetica"
def setup_fonts() -> FontSet:
    """Locate and register the best available fonts; return their names.

    Each role (regular/bold/italic/bold-italic/mono/math) is searched
    independently; missing styles fall back to the regular face, and
    everything ultimately falls back to Helvetica via register_ttf_font.
    """
    # Prefer fonts with broad Unicode coverage. Arial often lacks ℏ/∇, which is how we got tofu squares.
    regular_spec = find_font_file(
        ["DejaVuSans.ttf", "NotoSans-Regular.ttf", "FreeSans.ttf", "LiberationSans-Regular.ttf", "Arimo-Regular.ttf", "segoeui.ttf", "arial.ttf", "Arial.ttf"],
        CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST,
    )
    # Styled faces accept partial Cyrillic coverage (min_score=6).
    bold_spec = find_font_file(["DejaVuSans-Bold.ttf", "NotoSans-Bold.ttf", "FreeSansBold.ttf", "LiberationSans-Bold.ttf", "Arimo-Bold.ttf", "segoeuib.ttf", "arialbd.ttf", "Arial Bold.ttf"], CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST, min_score=6)
    italic_spec = find_font_file(["DejaVuSans-Oblique.ttf", "NotoSans-Italic.ttf", "FreeSansOblique.ttf", "LiberationSans-Italic.ttf", "Arimo-Italic.ttf", "segoeuii.ttf", "ariali.ttf", "Arial Italic.ttf"], CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST, min_score=6)
    bold_italic_spec = find_font_file(["DejaVuSans-BoldOblique.ttf", "NotoSans-BoldItalic.ttf", "FreeSansBoldOblique.ttf", "LiberationSans-BoldItalic.ttf", "Arimo-BoldItalic.ttf", "segoeuiz.ttf", "arialbi.ttf", "Arial Bold Italic.ttf"], CYRILLIC_GLYPH_TEST + BASIC_LATIN_TEST, min_score=6)
    mono_spec = find_font_file(["DejaVuSansMono.ttf", "NotoSansMono-Regular.ttf", "FreeMono.ttf", "LiberationMono-Regular.ttf", "Cousine-Regular.ttf", "consola.ttf", "Consolas.ttf"], BASIC_LATIN_TEST, min_score=8)
    # The math face may miss a handful of glyphs and still qualify.
    math_spec = find_font_file(
        [
            "DejaVuSans.ttf", "DejaVuMathTeXGyre.ttf", "NotoSansMath-Regular.ttf", "NotoSansSymbols2-Regular.ttf", "NotoSansSymbols-Regular.ttf",
            "FreeSerif.ttf", "FreeSans.ttf", "STIXTwoMath-Regular.otf", "STIXGeneral.ttf", "Symbola.ttf", "seguisym.ttf", "cambria.ttc", "cambriamath.ttf", "arialuni.ttf",
        ],
        MATH_GLYPH_TEST,
        min_score=max(12, len(MATH_GLYPH_TEST) - 4),
    )
    regular = register_ttf_font("DocFont", regular_spec)
    bold = register_ttf_font("DocFont-Bold", bold_spec, regular)
    italic = register_ttf_font("DocFont-Italic", italic_spec, regular)
    bold_italic = register_ttf_font("DocFont-BoldItalic", bold_italic_spec, bold)
    mono = register_ttf_font("DocMono", mono_spec, regular)
    math = register_ttf_font("DocMath", math_spec, regular)
    # Family registration lets <b>/<i> markup switch faces automatically.
    try:
        pdfmetrics.registerFontFamily("DocFont", normal=regular, bold=bold, italic=italic, boldItalic=bold_italic)
    except Exception:
        pass
    return FontSet(regular, bold, italic, bold_italic, mono, math)
def font_supports_char(font_name: str, ch: str) -> bool:
    """Return True when the registered font *font_name* can draw *ch*.

    Empty input and plain ASCII always count as supported; anything else is
    looked up in the TTF face's character-to-glyph table.
    """
    if not ch or ord(ch) < 128:
        return True
    try:
        face = getattr(pdfmetrics.getFont(font_name), "face", None)
        glyph_map = getattr(face, "charToGlyph", {}) or {}
        return ord(ch) in glyph_map
    except Exception:
        # Built-in PDF fonts are not Unicode fonts. Pretending otherwise
        # created the little square apocalypse.
        return False
# Plain-ASCII stand-ins for math/Greek/arrow glyphs, used by
# pdf_text_with_fallback() when neither the primary nor the math font can
# draw the character — readable text instead of tofu boxes.
PDF_UNSUPPORTED_GLYPH_FALLBACK = {
    "ℏ": "hbar", "∇": "nabla", "∂": "partial", "∮": "oint", "∫": "int", "∬": "iint", "∭": "iiint",
    "Σ": "Sum", "Π": "Prod", "Λ": "Lambda", "Ψ": "Psi", "Ω": "Omega", "Δ": "Delta", "σ": "sigma",
    "π": "pi", "μ": "mu", "ν": "nu", "∞": "inf", "≤": "<=", "≥": ">=", "≈": "~", "√": "sqrt",
    "→": "->", "←": "<-", "↔": "<->", "⇒": "=>", "⇐": "<=", "⇔": "<=>", "±": "+/-", "×": "x", "·": "*",
}
def pdf_text_with_fallback(text: str, primary_font: str, math_font: str) -> str:
    """Build ReportLab paragraph markup for *text*, per-character font-aware.

    Characters the *primary_font* lacks are drawn with *math_font* when it
    supports them; otherwise they degrade to a readable ASCII replacement
    (PDF_UNSUPPORTED_GLYPH_FALLBACK) or, as a last resort, "?" — a glyph
    the selected font cannot draw is never emitted. Consecutive characters
    sharing a font are merged into one run; non-primary runs are wrapped in
    a <font name="..."> span.
    """
    pieces: List[str] = []
    run_font: Optional[str] = None
    run_chars: List[str] = []

    def emit_run() -> None:
        nonlocal run_chars, run_font
        if not run_chars:
            return
        escaped = html.escape("".join(run_chars))
        if run_font and run_font != primary_font:
            pieces.append(f'<font name="{run_font}">{escaped}</font>')
        else:
            pieces.append(escaped)
        run_chars = []

    for ch in text:
        chosen = primary_font
        rendered = ch
        if not font_supports_char(primary_font, ch):
            if font_supports_char(math_font, ch):
                chosen = math_font
            elif ch in PDF_UNSUPPORTED_GLYPH_FALLBACK:
                # No installed font can draw it: degrade to readable ASCII
                # rather than rendering a white square.
                rendered = PDF_UNSUPPORTED_GLYPH_FALLBACK[ch]
            elif ord(ch) > 127:
                # Last resort for unknown non-ASCII glyphs.
                rendered = "?"
        if chosen != run_font:
            emit_run()
            run_font = chosen
        run_chars.append(rendered)
    emit_run()
    return "".join(pieces)
def segs_to_pdf_markup(segs: List[Segment], fonts: FontSet, base_size: float = 10.5, prefer_math: bool = False) -> str:
    """Convert parsed inline segments into ReportLab paragraph markup.

    Sub/superscript segments are emitted bare inside <sub>/<super> tags;
    other segments may stack code (mono font), bold and italic wrappers.
    ``base_size`` is accepted for signature compatibility but is not used
    in the current implementation.
    """
    primary_font = fonts.math if prefer_math else fonts.regular
    parts: List[str] = []
    for seg in segs:
        markup = pdf_text_with_fallback(seg.text, primary_font, fonts.math)
        if seg.sub:
            parts.append(f"<sub>{markup}</sub>")
        elif seg.sup:
            parts.append(f"<super>{markup}</super>")
        else:
            if seg.code:
                markup = f'<font name="{fonts.mono}">{markup}</font>'
            if seg.bold:
                markup = f"<b>{markup}</b>"
            if seg.italic:
                markup = f"<i>{markup}</i>"
            parts.append(markup)
    return "".join(parts)
def generate_pdf(path: Path, title: str, blocks: List[Block], style: str, keep_previews: bool, temp_root: Path) -> Optional[Path]:
    """Render *blocks* to a PDF at *path* with ReportLab (built-in fallback).

    ``style``, ``keep_previews`` and ``temp_root`` belong to the shared
    renderer signature but are not used by this implementation.
    Always returns None.
    """
    fonts = setup_fonts()
    styles = getSampleStyleSheet()
    # Paragraph styles derived from the sample sheet, re-pointed at the
    # Unicode TTF fonts registered by setup_fonts().
    title_style = ParagraphStyle("title", parent=styles["Title"], fontName=fonts.bold, fontSize=18, leading=22, alignment=TA_CENTER, spaceAfter=10)
    h1 = ParagraphStyle("h1", parent=styles["Heading1"], fontName=fonts.bold, fontSize=14, leading=17, spaceBefore=8, spaceAfter=6)
    h2 = ParagraphStyle("h2", parent=styles["Heading2"], fontName=fonts.bold, fontSize=12.5, leading=15, spaceBefore=7, spaceAfter=4)
    body = ParagraphStyle("body", parent=styles["BodyText"], fontName=fonts.regular, fontSize=10.5, leading=13.7, spaceAfter=5)
    bullet = ParagraphStyle("bullet", parent=body, leftIndent=14, firstLineIndent=-10)
    code = ParagraphStyle("code", parent=body, fontName=fonts.mono, fontSize=9, leading=11, backColor=colors.HexColor("#f5f5f5"), borderPadding=5)
    formula = ParagraphStyle("formula", parent=body, fontName=fonts.math, fontSize=10.5, leading=15, leftIndent=10, rightIndent=10, spaceBefore=3, spaceAfter=7, backColor=colors.HexColor("#f7f7f7"), borderPadding=5)
    story: List[Any] = []
    if title:
        story.append(Paragraph(html.escape(title), title_style))
    for b in blocks:
        if b.kind == "blank":
            story.append(Spacer(1, 4))
        elif b.kind == "heading":
            pstyle = h1 if b.level <= 1 else h2
            story.append(Paragraph(segs_to_pdf_markup(parse_inline(b.text), fonts, pstyle.fontSize), pstyle))
        elif b.kind == "list":
            # Unordered items get a literal bullet prefix and the hanging-indent style.
            prefix = "• " if not b.ordered else ""
            story.append(Paragraph(prefix + segs_to_pdf_markup(parse_inline(b.text), fonts, body.fontSize), bullet if not b.ordered else body))
        elif b.kind == "code":
            # Preformatted preserves whitespace and newlines; no inline parsing.
            story.append(Preformatted(plain_text_from_markers(b.text), code))
        elif b.kind == "formula":
            # Formula text skips markdown parsing; prefer_math selects the math font.
            story.append(Paragraph(segs_to_pdf_markup(parse_inline_no_markdown(reaction_arrow_markers_to_text(b.text)), fonts, formula.fontSize, prefer_math=True), formula))
        elif b.kind == "hr":
            story.append(Spacer(1, 6))
        else:
            story.append(Paragraph(segs_to_pdf_markup(parse_inline(b.text), fonts, body.fontSize), body))
    doc = SimpleDocTemplate(str(path), pagesize=A4, leftMargin=1.8*cm, rightMargin=1.8*cm, topMargin=1.6*cm, bottomMargin=1.6*cm)
    def footer(canvas, doc_obj):
        # Page-number footer drawn on every page.
        canvas.saveState()
        canvas.setFont(fonts.regular, 9)
        canvas.setFillColor(colors.HexColor("#666666"))
        canvas.drawRightString(A4[0]-1.8*cm, 0.9*cm, f"Page {doc_obj.page}")
        canvas.restoreState()
    doc.build(story, onFirstPage=footer, onLaterPages=footer)
    return None
def segs_to_html(segs: List[Segment]) -> str:
    """Render parsed inline segments as an HTML fragment.

    Sub/superscript take priority and suppress other styling; otherwise
    code, bold and italic wrappers may nest (code innermost).
    """
    def render(seg: Segment) -> str:
        piece = html.escape(seg.text)
        if seg.sub:
            return f"<sub>{piece}</sub>"
        if seg.sup:
            return f"<sup>{piece}</sup>"
        if seg.code:
            piece = f"<code>{piece}</code>"
        if seg.bold:
            piece = f"<strong>{piece}</strong>"
        if seg.italic:
            piece = f"<em>{piece}</em>"
        return piece

    return "".join(render(seg) for seg in segs)
def formula_to_html(text: str) -> str:
    """Render built-in fallback formula text, including chemistry reaction arrows."""
    src = text or ""
    arrow_re = re.compile(
        re.escape(RXN_ARROW_OPEN) + r"(.*?)" + re.escape(RXN_ARROW_MID) + r"(.*?)" + re.escape(RXN_ARROW_CLOSE),
        re.S,
    )

    def inline(fragment: str) -> str:
        return segs_to_html(parse_inline_no_markdown(fragment))

    pieces: List[str] = []
    cursor = 0
    for match in arrow_re.finditer(src):
        if match.start() > cursor:
            pieces.append(inline(src[cursor:match.start()]))
        above = inline(match.group(1).strip())
        below = inline(match.group(2).strip())
        # Labels above/below a long arrow; an empty above-label keeps a
        # space placeholder so the flex column stays balanced.
        above_span = f'<span class="rxn-label">{above}</span>' if above else '<span class="rxn-label"> </span>'
        below_span = f'<span class="rxn-label rxn-below">{below}</span>' if below else ''
        pieces.append(f'<span class="rxn-arrow">{above_span}<span class="rxn-line">⟶</span>{below_span}</span>')
        cursor = match.end()
    if cursor < len(src):
        pieces.append(inline(src[cursor:]))
    return "".join(pieces)
def generate_html(path: Path, title: str, blocks: List[Block], epub_mode: bool = False) -> str:
    """Render *blocks* as a standalone HTML document at *path*.

    With ``epub_mode=True`` nothing is written to disk: only the body
    fragment is returned, for embedding in the EPUB writer's XHTML shell.
    Returns the generated markup in both modes.
    """
    body_parts = []
    if title:
        body_parts.append(f"<h1>{html.escape(title)}</h1>")
    for b in blocks:
        if b.kind == "blank":
            continue
        if b.kind == "heading":
            # Demote headings one level when the document title already
            # occupies <h1>; clamp to the valid h1..h6 range.
            lvl = max(1, min(6, b.level + (1 if title else 0)))
            body_parts.append(f"<h{lvl}>{segs_to_html(parse_inline(b.text))}</h{lvl}>")
        elif b.kind == "list":
            body_parts.append(f"<p>{'• ' if not b.ordered else ''}{segs_to_html(parse_inline(b.text))}</p>")
        elif b.kind == "code":
            # Literal code: escaped as-is, no inline markdown parsing.
            body_parts.append(f"<pre><code>{html.escape(plain_text_from_markers(b.text))}</code></pre>")
        elif b.kind == "formula":
            body_parts.append(f"<p class=\"formula\">{formula_to_html(b.text)}</p>")
        elif b.kind == "hr":
            body_parts.append("<hr />")
        else:
            body_parts.append(f"<p>{segs_to_html(parse_inline(b.text))}</p>")
    if epub_mode:
        return "\n".join(body_parts)
    doc = f"""<!doctype html>
<html lang="ru">
<head>
<meta charset="utf-8" />
<title>{html.escape(title or path.stem)}</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.45; max-width: 900px; margin: 40px auto; padding: 0 24px; }}
h1, h2, h3 {{ line-height: 1.2; }}
code, pre {{ background: #f5f5f5; }}
pre {{ padding: 12px; overflow-x: auto; }}
sub, sup {{ line-height: 0; font-size: 75%; }}
.formula {{ background: #f7f7f7; padding: 8px 10px; border-radius: 6px; }}
.rxn-arrow {{ display: inline-flex; flex-direction: column; align-items: center; vertical-align: middle; margin: 0 .25em; line-height: 1; }}
.rxn-label {{ font-size: 75%; white-space: nowrap; line-height: 1; }}
.rxn-line {{ font-size: 120%; line-height: .75; }}
.rxn-below {{ margin-top: 1px; }}
</style>
</head>
<body>
{chr(10).join(body_parts)}
</body>
</html>
"""
    path.write_text(doc, encoding="utf-8")
    return doc
def generate_txt(path: Path, blocks: List[Block]) -> None:
    """Write *blocks* to *path* as UTF-8 plain text (no markdown syntax)."""
    content = blocks_to_plain(blocks, md_mode=False)
    path.write_text(content, encoding="utf-8")
def generate_md(path: Path, blocks: List[Block], title: str) -> None:
    """Write *blocks* to *path* as UTF-8 Markdown, with an optional H1 title."""
    header = f"# {title}\n\n" if title else ""
    path.write_text(header + blocks_to_plain(blocks, md_mode=True), encoding="utf-8")
def xml_run_text(text: str) -> str:
    """Wrap *text* in a DOCX <w:t> run-text element.

    xml:space="preserve" keeps leading/trailing spaces that Word would
    otherwise strip.
    """
    escaped = xml_escape(text)
    return f'<w:t xml:space="preserve">{escaped}</w:t>'
def segs_to_docx_runs(segs: List[Segment]) -> str:
    """Convert parsed inline segments to a sequence of DOCX <w:r> runs."""
    rendered: List[str] = []
    for seg in segs:
        props: List[str] = []
        if seg.bold:
            props.append("<w:b/>")
        if seg.italic:
            props.append("<w:i/>")
        if seg.code:
            props.append('<w:rFonts w:ascii="Consolas" w:hAnsi="Consolas"/>')
        if seg.sub:
            props.append('<w:vertAlign w:val="subscript"/>')
        if seg.sup:
            props.append('<w:vertAlign w:val="superscript"/>')
        run_props = f"<w:rPr>{''.join(props)}</w:rPr>" if props else ""
        rendered.append(f"<w:r>{run_props}{xml_run_text(seg.text)}</w:r>")
    return "".join(rendered)
def docx_paragraph(text: str, style: Optional[str] = None) -> str:
    """Build a DOCX <w:p> paragraph from markdown-ish *text*, optionally styled."""
    runs = segs_to_docx_runs(parse_inline(text))
    if style:
        return f'<w:p><w:pPr><w:pStyle w:val="{style}"/></w:pPr>{runs}</w:p>'
    return f"<w:p>{runs}</w:p>"
def docx_formula_paragraph(text: str) -> str:
    """Build a DOCX paragraph for a formula block, bypassing markdown parsing."""
    segs = parse_inline_no_markdown(reaction_arrow_markers_to_text(text))
    return f"<w:p>{segs_to_docx_runs(segs)}</w:p>"
def generate_docx(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal DOCX package at *path*.

    Builds document.xml plus the mandatory package parts (content types,
    rels, styles) by hand so no python-docx dependency is needed.
    """

    def _code_paragraph(code_text: str) -> str:
        # Code blocks are literal: bypass the markdown inline parser so
        # *, _ and ` are not interpreted (the old docx_paragraph() route
        # ran them through parse_inline), use a monospace font matching
        # segs_to_docx_runs' treatment of inline code, and turn newlines
        # into explicit <w:br/> breaks — raw newlines inside <w:t> are not
        # rendered as line breaks by Word.
        runs: List[str] = []
        for i, line in enumerate(code_text.split("\n")):
            if i:
                runs.append("<w:r><w:br/></w:r>")
            runs.append(
                '<w:r><w:rPr><w:rFonts w:ascii="Consolas" w:hAnsi="Consolas"/></w:rPr>'
                f"{xml_run_text(line)}</w:r>"
            )
        return f"<w:p>{''.join(runs)}</w:p>"

    body = []
    if title:
        body.append(docx_paragraph(title, "Title"))
    for b in blocks:
        if b.kind == "blank":
            body.append("<w:p/>")
        elif b.kind == "heading":
            body.append(docx_paragraph(b.text, "Heading1" if b.level <= 1 else "Heading2"))
        elif b.kind == "list":
            body.append(docx_paragraph(("• " if not b.ordered else "") + b.text))
        elif b.kind == "code":
            body.append(_code_paragraph(plain_text_from_markers(b.text)))
        elif b.kind == "formula":
            body.append(docx_formula_paragraph(b.text))
        elif b.kind == "hr":
            body.append("<w:p><w:r><w:t>────────</w:t></w:r></w:p>")
        else:
            body.append(docx_paragraph(b.text))
    document_xml = f'''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<w:body>{''.join(body)}<w:sectPr><w:pgSz w:w="11906" w:h="16838"/><w:pgMar w:top="1134" w:right="1134" w:bottom="1134" w:left="1134"/></w:sectPr></w:body></w:document>'''
    styles_xml = '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:style w:type="paragraph" w:styleId="Title"><w:name w:val="Title"/><w:rPr><w:b/><w:sz w:val="32"/></w:rPr></w:style>
<w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="Heading 1"/><w:rPr><w:b/><w:sz w:val="28"/></w:rPr></w:style>
<w:style w:type="paragraph" w:styleId="Heading2"><w:name w:val="Heading 2"/><w:rPr><w:b/><w:sz w:val="24"/></w:rPr></w:style>
</w:styles>'''
    with zipfile.ZipFile(path, "w", zipfile.ZIP_DEFLATED) as z:
        z.writestr("[Content_Types].xml", '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/><Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/></Types>''')
        z.writestr("_rels/.rels", '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>''')
        z.writestr("word/_rels/document.xml.rels", '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>''')
        z.writestr("word/document.xml", document_xml)
        z.writestr("word/styles.xml", styles_xml)
def segs_to_odt(segs: List[Segment]) -> str:
    """Convert parsed inline segments to ODT <text:span> markup.

    Combined attributes (e.g. bold+italic, code+subscript) are preserved by
    nesting spans; the previous elif-chain kept only the highest-priority
    attribute and silently dropped the rest, unlike segs_to_html() and
    segs_to_docx_runs() which stack styles. Single-attribute output is
    unchanged.
    """
    out: List[str] = []
    for s in segs:
        txt = xml_escape(s.text)
        # Wrap innermost-first so Subscript/Superscript end up outermost.
        for flag, style in (
            (s.code, "Code"),
            (s.italic, "Italic"),
            (s.bold, "Bold"),
            (s.sup, "Superscript"),
            (s.sub, "Subscript"),
        ):
            if flag:
                txt = f'<text:span text:style-name="{style}">{txt}</text:span>'
        out.append(txt)
    return "".join(out)
def generate_odt(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal ODT package at *path*.

    Builds content.xml and the manifest by hand; the automatic styles
    declared below are the names segs_to_odt() references.
    """
    body = []
    if title:
        body.append(f'<text:h text:outline-level="1">{xml_escape(title)}</text:h>')
    for b in blocks:
        if b.kind == "blank":
            body.append('<text:p text:style-name="P"/>')
        elif b.kind == "heading":
            lvl = max(1, min(6, b.level))
            body.append(f'<text:h text:outline-level="{lvl}">{segs_to_odt(parse_inline(b.text))}</text:h>')
        elif b.kind == "list":
            body.append(f'<text:p text:style-name="P">{"• " if not b.ordered else ""}{segs_to_odt(parse_inline(b.text))}</text:p>')
        elif b.kind == "code":
            # NOTE(review): "Code" is declared below with style:family="text"
            # but is referenced here as a paragraph style — verify viewers
            # accept this or whether a paragraph-family style is needed.
            body.append(f'<text:p text:style-name="Code">{xml_escape(plain_text_from_markers(b.text))}</text:p>')
        elif b.kind == "formula":
            # Formula text bypasses markdown parsing.
            body.append(f'<text:p text:style-name="P">{segs_to_odt(parse_inline_no_markdown(reaction_arrow_markers_to_text(b.text)))}</text:p>')
        elif b.kind == "hr":
            body.append('<text:p text:style-name="P">────────</text:p>')
        else:
            body.append(f'<text:p text:style-name="P">{segs_to_odt(parse_inline(b.text))}</text:p>')
    content = f'''<?xml version="1.0" encoding="UTF-8"?>
<office:document-content xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0" xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0" xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0" office:version="1.2">
<office:automatic-styles>
<style:style style:name="P" style:family="paragraph"><style:paragraph-properties fo:margin-bottom="0.15cm"/></style:style>
<style:style style:name="Bold" style:family="text"><style:text-properties fo:font-weight="bold"/></style:style>
<style:style style:name="Italic" style:family="text"><style:text-properties fo:font-style="italic"/></style:style>
<style:style style:name="Code" style:family="text"><style:text-properties style:font-name="Consolas"/></style:style>
<style:style style:name="Subscript" style:family="text"><style:text-properties style:text-position="sub 58%"/></style:style>
<style:style style:name="Superscript" style:family="text"><style:text-properties style:text-position="super 58%"/></style:style>
</office:automatic-styles>
<office:body><office:text>{''.join(body)}</office:text></office:body></office:document-content>'''
    manifest = '''<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0" manifest:version="1.2"><manifest:file-entry manifest:full-path="/" manifest:media-type="application/vnd.oasis.opendocument.text"/><manifest:file-entry manifest:full-path="content.xml" manifest:media-type="text/xml"/></manifest:manifest>'''
    with zipfile.ZipFile(path, "w") as z:
        # The "mimetype" entry is written first and uncompressed, per ODF packaging convention.
        z.writestr("mimetype", "application/vnd.oasis.opendocument.text", compress_type=zipfile.ZIP_STORED)
        z.writestr("META-INF/manifest.xml", manifest, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("content.xml", content, compress_type=zipfile.ZIP_DEFLATED)
def rtf_escape(s: str) -> str:
    """Escape *s* for inclusion in an RTF document body.

    Backslash and braces are backslash-escaped, newlines become \\line, and
    any non-ASCII character is emitted as RTF's signed 16-bit \\uN? escape.
    Characters beyond the BMP are encoded as a UTF-16 surrogate pair of two
    \\uN? escapes — the previous single subtract-65536 form produced one
    invalid lone code unit for them.
    """
    out: List[str] = []
    for ch in s:
        code = ord(ch)
        if ch in "\\{}":
            out.append("\\" + ch)
        elif ch == "\n":
            out.append("\\line ")
        elif code > 0xFFFF:
            # Supplementary plane: \uN carries one signed 16-bit code unit,
            # so emit the UTF-16 surrogate pair (both halves are >= 0xD800
            # and therefore always rendered in negative signed form).
            code -= 0x10000
            high = 0xD800 + (code >> 10)
            low = 0xDC00 + (code & 0x3FF)
            out.append(f"\\u{high - 0x10000}?\\u{low - 0x10000}?")
        elif code > 127:
            if code > 32767:
                # RTF \uN is a signed 16-bit value.
                code -= 65536
            out.append(f"\\u{code}?")
        else:
            out.append(ch)
    return "".join(out)
def segs_to_rtf(segs: List[Segment]) -> str:
    """Convert parsed inline segments to RTF control-word markup.

    Closing control words are emitted in reverse order of the opening ones
    so nested formatting unwinds correctly.
    """
    parts: List[str] = []
    for seg in segs:
        opens: List[str] = []
        closes: List[str] = []
        if seg.bold:
            opens.append("\\b ")
            closes.insert(0, "\\b0 ")
        if seg.italic:
            opens.append("\\i ")
            closes.insert(0, "\\i0 ")
        if seg.sub:
            opens.append("\\sub ")
            closes.insert(0, "\\nosupersub ")
        if seg.sup:
            opens.append("\\super ")
            closes.insert(0, "\\nosupersub ")
        parts.append("".join(opens) + rtf_escape(seg.text) + "".join(closes))
    return "".join(parts)
def generate_rtf(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal RTF document at *path*.

    All non-ASCII content is escaped by rtf_escape(), so the resulting file
    is pure ASCII and safe to write as UTF-8.
    """
    out: List[str] = [r"{\rtf1\ansi\deff0{\fonttbl{\f0 Arial;}{\f1 Consolas;}}\fs22 "]
    if title:
        out.append(r"\qc\b\fs36 " + rtf_escape(title) + r"\b0\fs22\par\ql ")
    for blk in blocks:
        kind = blk.kind
        if kind == "blank":
            out.append(r"\par ")
        elif kind == "heading":
            out.append(r"\b\fs28 " + segs_to_rtf(parse_inline(blk.text)) + r"\b0\fs22\par ")
        elif kind == "list":
            marker = "" if blk.ordered else r"\bullet "
            out.append(marker + segs_to_rtf(parse_inline(blk.text)) + r"\par ")
        elif kind == "code":
            # \f1 switches to the mono font declared in the font table.
            out.append(r"\f1 " + rtf_escape(plain_text_from_markers(blk.text)) + r"\f0\par ")
        elif kind == "formula":
            out.append(segs_to_rtf(parse_inline_no_markdown(reaction_arrow_markers_to_text(blk.text))) + r"\par ")
        elif kind == "hr":
            out.append(r"\par ----------------\par ")
        else:
            out.append(segs_to_rtf(parse_inline(blk.text)) + r"\par ")
    out.append("}")
    path.write_text("".join(out), encoding="utf-8")
def generate_epub(path: Path, title: str, blocks: List[Block]) -> None:
    """Write *blocks* as a minimal single-chapter EPUB 3 package at *path*."""
    # epub_mode=True makes generate_html return only the body fragment and
    # skip writing to disk, so the Path argument it receives is never used.
    body = generate_html(Path("unused.html"), title, blocks, epub_mode=True)
    chapter = f'''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="ru"><head><title>{html.escape(title or path.stem)}</title><style>body{{font-family:serif;line-height:1.45}} sub,sup{{font-size:75%;line-height:0}}</style></head><body>{body}</body></html>'''
    nav = f'''<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="ru"><head><title>Navigation</title></head><body><nav epub:type="toc"><ol><li><a href="chapter1.xhtml">{html.escape(title or path.stem)}</a></li></ol></nav></body></html>'''
    opf = f'''<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid"><metadata xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:identifier id="bookid">urn:uuid:ai-to-document</dc:identifier><dc:title>{html.escape(title or path.stem)}</dc:title><dc:language>ru</dc:language></metadata><manifest><item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/><item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/></manifest><spine><itemref idref="chapter1"/></spine></package>'''
    container = '''<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"><rootfiles><rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/></rootfiles></container>'''
    with zipfile.ZipFile(path, "w") as z:
        # The "mimetype" entry is written first and uncompressed, per EPUB packaging convention.
        z.writestr("mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED)
        z.writestr("META-INF/container.xml", container, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("OEBPS/content.opf", opf, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("OEBPS/nav.xhtml", nav, compress_type=zipfile.ZIP_DEFLATED)
        z.writestr("OEBPS/chapter1.xhtml", chapter, compress_type=zipfile.ZIP_DEFLATED)