Project Files
src/fastvlm_server/qwen3_vl_runner.py
"""Qwen3-VL grounding runner — persistent subprocess worker.
Protocol:
- Startup: prints {"ready": true} after model loads, or {"ready": false, "error": "..."} and exits.
- Request: JSON line {"image": "<base64>", "task": "<OD>|<OPEN_VOCABULARY_DETECTION>label"}
- Response: JSON line {"ok": true, "result": {"bboxes": [...], "labels": [...], "width": int, "height": int}}
or {"ok": false, "error": "..."}
Output format from Qwen3-VL:
The model returns JSON with bbox_2d arrays in 0–1000 normalized coordinate space:
[{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
Conversion: pixel = coord / 1000 * image_dimension
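
Example exchange (illustrative values, not captured output):
    -> {"image": "<base64 PNG>", "task": "<OD>"}
    <- {"ok": true, "result": {"bboxes": [[64.0, 96.0, 320.0, 288.0]],
                               "labels": ["person"], "width": 640, "height": 480}}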
"""
from __future__ import annotations

import base64
import io
import json
import logging
import os
import re
import sys
import tempfile

from PIL import Image

logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# Regex to extract a JSON array embedded in ```json ... ``` fences (or bare JSON).
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*([\s\S]*?)```", re.IGNORECASE)

# Fallback: extract individual complete bbox_2d objects from truncated output.
_ITEM_RE = re.compile(
    r'\{"bbox_2d":\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\],\s*"label":\s*"([^"]+)"\}',
    re.IGNORECASE,
)
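# Illustrative string the fallback regex matches (shape taken from the docstring):
#   '{"bbox_2d": [100, 200, 500, 600], "label": "person"}'
#   -> groups ('100', '200', '500', '600', 'person')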

_JSON_FORMAT = (
    " Output JSON only — a JSON array where each element has"
    " 'bbox_2d' ([x1, y1, x2, y2] as integers normalized 0–1000) and 'label' (a string)."
    " No prose, no markdown, no explanation."
)

_OD_INSTRUCTION = (
    "Detect objects in the image with strict hierarchical prioritization.\n\n"
    "PRIORITY 1 (CRITICAL - MUST DETECT FIRST):\n"
    "- You MUST detect \"human face\" (highest priority if a person is present)\n"
    "- You MUST detect \"person\" (if no face is clearly visible or if the person is the main subject)\n\n"
    "PRIORITY 2 (MAIN SUBJECT / HERO ELEMENT):\n"
    "- The most visually prominent object or subject that is NOT part of the background.\n"
    "- Use specific, concrete labels (e.g., 'red car', 'fluffy owl toy').\n"
    "- Avoid generic terms like 'object' or 'thing'.\n\n"
    "PRIORITY 3 (CONTEXTUAL BACKGROUND ELEMENTS):\n"
    "- Only detect background elements if they are significant to the scene composition OR if the main subject is interacting with them.\n"
    "- Do not detect minor or redundant background details.\n\n"
    "PRIORITY 4 (DETAILS OF THE MAIN SUBJECT / HERO ELEMENT):\n"
    "- All visible body parts (hands, feet, arms, legs).\n"
    "- Elements of the face, as far as clearly detectable and in focus on close-ups: nose, mouth, left and right eyes, eyebrows, and ears.\n"
    "- Anatomical details, as far as recognizable as focused or prominent (e.g., 'iris', 'pupil', 'eyelid').\n\n"
    "RULES:\n"
    "- Maximum 16 objects total.\n"
    "- Each bounding box must be unique and non-redundant.\n"
    "- For clothing, name the specific garment (e.g., 'tank top', 'jeans').\n"
    "- For body parts, qualify by position (e.g., 'left hand').\n"
    "- NEVER prioritize background elements over the main subject or human face.\n"
    "- NEVER prioritize anatomical details over general concepts unless they are the sole focus (e.g., only detect 'eyes' when 'human face' is the dominant part of the image).\n"
    "- If the main subject is a person, focus on the person and their immediate interactions. Ignore background elements unless they are directly involved in the interaction."
)


def _build_prompt(task: str) -> str:
    """Map a task string to the full prompt sent to the model."""
    if task.startswith("<OPEN_VOCABULARY_DETECTION>"):
        label = task[len("<OPEN_VOCABULARY_DETECTION>"):].strip()
        return f"Detect all instances of '{label}' in the image." + _JSON_FORMAT
    if task == "<OD>":
        # The built-in instruction can be overridden via the DETECT_OD_PROMPT env var.
        od_instruction = os.environ.get("DETECT_OD_PROMPT") or _OD_INSTRUCTION
        return od_instruction + _JSON_FORMAT
    return task + _JSON_FORMAT
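
# Example (illustrative): _build_prompt("<OPEN_VOCABULARY_DETECTION>cat") returns
# "Detect all instances of 'cat' in the image." followed by _JSON_FORMAT.
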
def _parse_output(text: str, width: int, height: int) -> dict:
    """Parse Qwen3-VL bbox_2d JSON output into bboxes/labels with pixel coords.

    Qwen3-VL emits:

        ```json
        [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
        ```

    Coordinates are in 0–1000 normalized space and are converted to pixels here.
    If the JSON is truncated (max_tokens hit), falls back to extracting individual
    complete items via regex so partial results are not silently lost.
    """
    bboxes: list = []
    labels: list = []
    seen: set = set()

    # Extract JSON from code fence if present, otherwise try the raw text.
    m = _JSON_FENCE_RE.search(text)
    json_str = m.group(1).strip() if m else text.strip()

    items: list | None = None
    try:
        parsed = json.loads(json_str)
        items = parsed if isinstance(parsed, list) else [parsed]
    except json.JSONDecodeError:
        # Output was likely truncated at max_tokens — recover whatever complete
        # objects are present using the item-level regex.
        logger.warning(
            "Qwen3-VL JSON parse failed (output may be truncated). "
            "Attempting partial recovery. Raw text length: %d, first 120: %r",
            len(text), text[:120],
        )
        recovered = [
            {"bbox_2d": [int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))],
             "label": m.group(5)}
            for m in _ITEM_RE.finditer(text)
        ]
        if recovered:
            logger.warning("Partial recovery: %d items extracted.", len(recovered))
            items = recovered
        else:
            logger.warning("Partial recovery yielded no items.")
            return {"bboxes": bboxes, "labels": labels, "width": width, "height": height}
    for item in items:
        if not isinstance(item, dict):
            continue
        bb = item.get("bbox_2d")
        label = item.get("label", "")
        if not isinstance(bb, list) or len(bb) != 4:
            continue
        nx1, ny1, nx2, ny2 = bb
        # Reject non-numeric or out-of-range coordinates (hallucinated huge numbers, negatives).
        if not all(isinstance(v, (int, float)) and 0 <= v <= 1000 for v in (nx1, ny1, nx2, ny2)):
            continue
        # Reject degenerate boxes with zero or negative area.
        if nx2 <= nx1 or ny2 <= ny1:
            continue
        # Skip detections that cover the entire image (normalized coords ≈ [0, 0, 1000, 1000]).
        if nx1 < 10 and ny1 < 10 and nx2 > 990 and ny2 > 990:
            continue
        # Deduplicate: skip entries with identical (rounded bbox, label).
        dedup_key = (round(nx1), round(ny1), round(nx2), round(ny2), label)
        if dedup_key in seen:
            continue
        seen.add(dedup_key)
        # Convert 0–1000 normalized coordinates to pixels.
        x1 = nx1 / 1000 * width
        y1 = ny1 / 1000 * height
        x2 = nx2 / 1000 * width
        y2 = ny2 / 1000 * height
        bboxes.append([x1, y1, x2, y2])
        labels.append(label)

    return {"bboxes": bboxes, "labels": labels, "width": width, "height": height}


def main() -> None:
    if len(sys.argv) < 2:
        print(json.dumps({"ready": False, "error": "Usage: qwen3_vl_runner.py <model_path>"}), flush=True)
        sys.exit(1)
    model_path = sys.argv[1]

    try:
        from mlx_vlm import load, generate

        model, processor = load(model_path)
        print(json.dumps({"ready": True}), flush=True)
    except Exception as e:
        print(json.dumps({"ready": False, "error": str(e)}), flush=True)
        sys.exit(1)
    for raw_line in sys.stdin:
        line = raw_line.strip()
        if not line:
            continue
        tmp_path: str | None = None
        try:
            req = json.loads(line)
            image_bytes = base64.b64decode(req["image"])
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            width, height = image.size

            # generate() only accepts a file path, not a PIL object.
            fd, tmp_path = tempfile.mkstemp(suffix=".png")
            os.close(fd)
            image.save(tmp_path, format="PNG")

            prompt_text = _build_prompt(req.get("task", "<OD>"))
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": tmp_path},
                        {"type": "text", "text": prompt_text},
                    ],
                }
            ]
            formatted = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Fixed seed keeps sampling deterministic for identical requests.
            import mlx.core as mx
            mx.random.seed(175308301)

            result = generate(
                model,
                processor,
                formatted,
                tmp_path,
                max_tokens=4096,
                temperature=0.3,
                top_p=0.8,
                top_k=50,
                repetition_penalty=1.0,
                verbose=False,
            )
            text = result.text if hasattr(result, "text") else str(result)
            logger.info("Raw model output (%d chars): %s", len(text), text[:2000])

            detection = _parse_output(text, width, height)
            print(json.dumps({"ok": True, "result": detection}), flush=True)
        except Exception as e:
            print(json.dumps({"ok": False, "error": str(e)}), flush=True)
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass


if __name__ == "__main__":
    main()
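

# Minimal parent-process sketch (illustrative; the model path and image filename
# below are examples, not project values):
#
#   import base64, json, subprocess, sys
#
#   proc = subprocess.Popen(
#       [sys.executable, "qwen3_vl_runner.py", "/path/to/model"],
#       stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
#   )
#   assert json.loads(proc.stdout.readline())["ready"]
#   with open("photo.png", "rb") as f:
#       payload = {"image": base64.b64encode(f.read()).decode(), "task": "<OD>"}
#   proc.stdin.write(json.dumps(payload) + "\n")
#   proc.stdin.flush()
#   print(json.loads(proc.stdout.readline()))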