python / ocr_backend.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""OCR backend for the LM Studio plugin "qwen-pdf-ocr"."""
import argparse
import base64
import io
import sys
# Force UTF-8 on both standard streams so characters like the ruble sign do
# not crash on legacy console code pages such as cp1251.
# Each stream is handled on its own: a stream that cannot be reconfigured
# (for example one replaced by an object without ``reconfigure``, or ``None``)
# must not prevent the other stream from being switched.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: keep whatever encoding the stream already has.
        pass
del _stream
import pypdfium2 as pdfium
# Supported language codes mapped to the identifier used by each OCR engine:
# the "easyocr" entry is passed to EasyOCR, the "tesseract" entry to Tesseract.
LANG_MAP = {
    code: {"easyocr": easy_code, "tesseract": tess_code}
    for code, easy_code, tess_code in (
        ("ru", "ru", "rus"),
        ("en", "en", "eng"),
        ("de", "de", "deu"),
        ("fr", "fr", "fra"),
    )
}
# Instruction text sent to the vision model, keyed by recognition mode
# ("ocr" for plain text, "layout" for structured Markdown output).
QWEN_PROMPT = dict(
    ocr=(
        "Recognize and output all text on this document page. "
        "Keep the reading order. Do not add comments."
    ),
    layout=(
        "Recognize this page and return Markdown. Tables as HTML, "
        "formulas as LaTeX, keep headings and reading order. No comments."
    ),
)
def log(msg):
    """Write a ``[backend]``-prefixed diagnostic line to stderr and flush it.

    Diagnostics go to stderr so that stdout carries only the OCR output.
    """
    line = f"[backend] {msg}"
    print(line, file=sys.stderr, flush=True)
def render_pdf(path, dpi):
    """Render every page of a PDF document to a PIL image.

    Args:
        path: Location of the PDF, passed straight to ``pdfium.PdfDocument``.
        dpi: Target resolution; converted to a pdfium scale factor.

    Returns:
        A list with one PIL image per page, in document order.
    """
    pdf = pdfium.PdfDocument(path)
    try:
        # pdfium renders at 72 DPI for scale 1.0, so dpi / 72 is the zoom factor.
        scale = dpi / 72.0
        return [
            pdf[index].render(scale=scale).to_pil()
            for index in range(len(pdf))
        ]
    finally:
        # Release the native document handle deterministically instead of
        # leaving it to garbage collection (also runs when rendering fails).
        pdf.close()
def parse_pages(spec, total):
    """Convert a 1-based page specification into sorted 0-based indices.

    The specification is a comma-separated list of page numbers and inclusive
    ranges, e.g. ``"1,3-5"``. A range may omit one bound: ``"3-"`` runs to the
    last page and ``"-2"`` starts at the first. Empty items (stray commas) are
    ignored, duplicates collapse, and pages outside ``1..total`` are dropped.

    Args:
        spec: The specification string; ``None``, empty or blank selects all pages.
        total: Number of pages in the document.

    Returns:
        Sorted list of unique 0-based page indices within ``range(total)``.

    Raises:
        ValueError: If an item is neither a number nor a valid range.
    """
    if not spec or not spec.strip():
        return list(range(total))

    selected = set()
    for token in spec.split(","):
        token = token.strip()
        if not token:
            continue  # tolerate stray commas such as "1,2,"

        first, dash, last = token.partition("-")
        if dash and not (first.strip() or last.strip()):
            # A lone "-" carries no bound at all; treat it as a typo.
            raise ValueError(f"invalid page specification: {token!r}")
        try:
            if dash:
                start = int(first) - 1 if first.strip() else 0
                stop = int(last) if last.strip() else total
            else:
                start = int(token) - 1
                stop = start + 1
        except ValueError:
            raise ValueError(f"invalid page specification: {token!r}") from None

        # Clamp before expanding so a huge bound like "1-999999999" cannot
        # materialise an enormous range only to be filtered out afterwards.
        selected.update(range(max(start, 0), min(stop, total)))
    return sorted(selected)
class EasyEngine:
    """OCR engine backed by EasyOCR; uses the GPU when torch reports CUDA."""

    def __init__(self, langs):
        """Create the EasyOCR reader for the given language codes.

        The heavy dependencies are imported here rather than at module level,
        so they are only loaded (and only required) when this engine is built.

        Args:
            langs: Language codes that are keys of ``LANG_MAP``.
        """
        import numpy as np
        import easyocr
        import torch

        easy_codes = [LANG_MAP[code]["easyocr"] for code in langs]
        use_gpu = torch.cuda.is_available()
        log(f"EasyOCR: langs={easy_codes}, gpu={use_gpu}")
        # Keep the module on the instance so ``run`` needs no import of its own.
        self.np = np
        self.reader = easyocr.Reader(easy_codes, gpu=use_gpu)

    def run(self, img, mode):
        """Return the text recognized on ``img``.

        Args:
            img: Page image; converted to RGB before recognition.
            mode: Accepted for interface parity with the other engines; unused.
        """
        pixels = self.np.array(img.convert("RGB"))
        # detail=0 yields plain strings; paragraph=True merges nearby lines.
        paragraphs = self.reader.readtext(pixels, detail=0, paragraph=True)
        text = "\n".join(paragraphs)
        return text.strip()
class TessEngine:
    """OCR engine backed by Tesseract through the ``pytesseract`` wrapper."""

    def __init__(self, langs):
        """Import pytesseract lazily and build the combined language string.

        Args:
            langs: Language codes that are keys of ``LANG_MAP``.
        """
        import pytesseract

        self.pt = pytesseract
        # The Tesseract codes are joined with "+" into one ``lang`` argument.
        tess_codes = [LANG_MAP[code]["tesseract"] for code in langs]
        self.lang = "+".join(tess_codes)
        log(f"Tesseract: lang={self.lang}")

    def run(self, img, mode):
        """Return the text recognized on ``img``.

        Args:
            img: Page image handed to ``pytesseract.image_to_string``.
            mode: Accepted for interface parity with the other engines; unused.
        """
        raw_text = self.pt.image_to_string(img, lang=self.lang)
        return raw_text.strip()
class QwenServerEngine:
    """OCR engine that sends page images to a ``/chat/completions`` endpoint.

    Each page is PNG-encoded, embedded as a base64 data URL next to the prompt
    for the selected mode, and posted to the server (LM Studio by default in
    this script).
    """

    def __init__(self, url, model, timeout=600, max_tokens=4096):
        """Store the endpoint settings.

        Args:
            url: Base API URL; ``/chat/completions`` is appended to it.
            model: Model identifier sent in every request.
            timeout: Seconds to wait for one response.
            max_tokens: Upper bound on generated tokens per page.
        """
        # Imported lazily so a missing ``requests`` only disables this engine.
        import requests

        self.requests = requests
        self.url = url.rstrip("/") + "/chat/completions"
        self.model = model
        self.timeout = timeout
        self.max_tokens = max_tokens
        log(f"Qwen via LM Studio: {self.url}, model={model}")

    def run(self, img, mode):
        """Return the model's transcription of ``img``.

        Args:
            img: Page image; converted to RGB and PNG-encoded for upload.
            mode: Key of ``QWEN_PROMPT`` selecting the instruction text.

        Returns:
            The stripped message content, or ``""`` when the server returns
            no content.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If ``mode`` is unknown or the response lacks the
                expected ``choices``/``message``/``content`` structure.
        """
        buf = io.BytesIO()
        img.convert("RGB").save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode("ascii")
        payload = {
            "model": self.model,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": QWEN_PROMPT[mode]},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/png;base64,{b64}"}},
                ],
            }],
            # Temperature 0 keeps the transcription as deterministic as possible.
            "temperature": 0,
            "max_tokens": self.max_tokens,
        }
        response = self.requests.post(self.url, json=payload, timeout=self.timeout)
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
        # A null content would otherwise fail on ``.strip()``; report it as empty.
        return (content or "").strip()
def build_engines(names, langs, url, model):
    """Instantiate the requested OCR engines, collecting failures.

    An engine that cannot be constructed (for example because its dependency
    is not installed) is skipped and reported. Unknown engine names are
    reported the same way instead of being silently ignored.

    Args:
        names: Engine names to start: "easyocr", "tesseract" and/or "qwen".
        langs: Language codes passed to the EasyOCR and Tesseract engines.
        url: Base API URL for the Qwen engine.
        model: Model identifier for the Qwen engine.

    Returns:
        A tuple ``(engines, errors)``: ``engines`` maps a display label to the
        engine instance in request order, ``errors`` is a list of
        ``"name: reason"`` strings for everything that could not be started.
    """
    # Factories are lambdas so nothing is constructed (or imported) until the
    # engine is actually requested.
    factories = {
        "easyocr": ("EasyOCR (GPU)", lambda: EasyEngine(langs)),
        "tesseract": ("Tesseract (CPU)", lambda: TessEngine(langs)),
        "qwen": ("Qwen3-VL", lambda: QwenServerEngine(url, model)),
    }
    engines = {}
    errors = []
    for name in names:
        entry = factories.get(name)
        if entry is None:
            reason = f"unknown engine (expected one of: {', '.join(factories)})"
            errors.append(f"{name}: {reason}")
            log(f"Engine '{name}' unavailable: {reason}")
            continue
        label, factory = entry
        try:
            engines[label] = factory()
        except Exception as e:
            # Boundary catch: any construction failure just disables the engine.
            errors.append(f"{name}: {e}")
            log(f"Engine '{name}' unavailable: {e}")
    return engines, errors
def main():
    """Command-line entry point: render a PDF and print OCR text per page.

    Recognized text is written to stdout as one ``===== Page N =====`` block
    per selected page; progress and diagnostics go to stderr through ``log``.
    Exits with status 1 when none of the requested engines could be started,
    and with an argparse usage error when ``--dpi`` or ``--pages`` is invalid.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf", required=True)
    ap.add_argument("--engines", default="qwen", help="qwen,easyocr,tesseract")
    ap.add_argument("--lang", default="ru,en")
    ap.add_argument("--dpi", type=int, default=200)
    ap.add_argument("--mode", choices=["ocr", "layout"], default="ocr")
    ap.add_argument("--pages")
    ap.add_argument("--lmstudio-url", default="http://localhost:1234/v1")
    ap.add_argument("--model", default="qwen/qwen3-vl-8b")
    args = ap.parse_args()

    if args.dpi <= 0:
        ap.error("--dpi must be a positive integer")

    # Keep only supported languages, falling back to English; tell the user
    # which codes were dropped instead of discarding them silently.
    requested = [code.strip() for code in args.lang.split(",") if code.strip()]
    unsupported = [code for code in requested if code not in LANG_MAP]
    if unsupported:
        log(f"Ignoring unsupported languages: {', '.join(unsupported)}")
    langs = [code for code in requested if code in LANG_MAP] or ["en"]
    names = [n.strip() for n in args.engines.split(",") if n.strip()]

    log(f"Rendering PDF (dpi={args.dpi})...")
    # NOTE: every page is rendered here, even when --pages selects a subset.
    images = render_pdf(args.pdf, args.dpi)
    try:
        page_idx = parse_pages(args.pages, len(images))
    except ValueError as e:
        # Report a malformed --pages value as a usage error, not a traceback.
        ap.error(f"invalid --pages value {args.pages!r}: {e}")
    log(f"Pages to process: {len(page_idx)} of {len(images)}")
    if not page_idx:
        log("No pages selected; nothing to recognize.")

    engines, errors = build_engines(names, langs, args.lmstudio_url, args.model)
    if not engines:
        print("Failed to start any OCR engine:\n" + "\n".join(errors), flush=True)
        sys.exit(1)

    # Engine labels are only printed when several engines run on the same page.
    multi = len(engines) > 1
    for n, i in enumerate(page_idx, 1):
        log(f"Page {i + 1} ({n}/{len(page_idx)})...")
        block = [f"===== Page {i + 1} ====="]
        for ename, eng in engines.items():
            try:
                txt = eng.run(images[i], args.mode)
            except Exception as e:
                # One failing engine or page must not abort the whole run.
                txt = f"(engine error: {e})"
            if multi:
                block.append(f"--- {ename} ---\n{txt or '(empty)'}")
            else:
                block.append(txt or "(empty)")
        # Flush per page so a consumer reading stdout gets results as they come.
        print("\n".join(block) + "\n", flush=True)
    log("Done.")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()