Project Files
src/fastvlm_server/qwen3_vl_runner.py
"""Qwen3-VL grounding runner — persistent subprocess worker.
Protocol:
- Startup: prints {"ready": true} after model loads, or {"ready": false, "error": "..."} and exits.
- Request: JSON line {"image": "<base64>", "task": "<OD>|<OPEN_VOCABULARY_DETECTION>label"}
- Response: JSON line {"ok": true, "result": {"bboxes": [...], "labels": [...], "width": int, "height": int}}
or {"ok": false, "error": "..."}
Output format from Qwen3-VL:
The model returns JSON with bbox_2d arrays in 0–1000 normalized coordinate space:
[{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
Conversion: pixel = coord / 1000 * image_dimension
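
Example exchange (illustrative values, not captured output):
    -> {"image": "<base64 PNG>", "task": "<OD>"}
    <- {"ok": true, "result": {"bboxes": [[64.0, 96.0, 320.0, 288.0]],
                               "labels": ["person"], "width": 640, "height": 480}}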
"""
from __future__ import annotations

import base64
import io
import json
import logging
import os
import re
import sys
import tempfile

from PIL import Image

logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# Regex to extract a JSON array embedded in ```json ... ``` fences (or bare JSON).
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*([\s\S]*?)```", re.IGNORECASE)

# Fallback: extract individual complete bbox_2d objects from truncated output.
_ITEM_RE = re.compile(
    r'\{"bbox_2d":\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\],\s*"label":\s*"([^"]+)"\}',
    re.IGNORECASE,
)
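# Illustrative string the fallback regex matches (shape taken from the docstring):
#   '{"bbox_2d": [100, 200, 500, 600], "label": "person"}'
#   -> groups ('100', '200', '500', '600', 'person')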

_JSON_FORMAT = (
    " Output JSON only — a JSON array where each element has"
    " 'bbox_2d' ([x1, y1, x2, y2] as integers normalized 0–1000) and 'label' (a string)."
    " No prose, no markdown, no explanation."
)

_OD_INSTRUCTION = (
    "Detect objects in the image with strict hierarchical prioritization.\n\n"
    "PRIORITY 1 (CRITICAL - MUST DETECT FIRST):\n"
    "- You MUST detect \"human face\" (highest priority if a person is present)\n"
    "- You MUST detect \"person\" (if no face is clearly visible or if the person is the main subject)\n\n"
    "PRIORITY 2 (MAIN SUBJECT / HERO ELEMENT):\n"
    "- The most visually prominent object or subject that is NOT part of the background.\n"
    "- Use specific, concrete labels (e.g., 'red car', 'fluffy owl toy').\n"
    "- Avoid generic terms like 'object' or 'thing'.\n\n"
    "PRIORITY 3 (CONTEXTUAL BACKGROUND ELEMENTS):\n"
    "- Only detect background elements if they are significant to the scene composition OR if the main subject is interacting with them.\n"
    "- Do not detect minor or redundant background details.\n\n"
    "PRIORITY 4 (DETAILS OF THE MAIN SUBJECT / HERO ELEMENT):\n"
    "- All visible body parts (hands, feet, arms, legs).\n"
    "- Elements of the face, as far as clearly detectable and in focus on close-ups: nose, mouth, left and right eyes, eyebrows, and ears.\n"
    "- Anatomical details, as far as recognizable as focused or prominent (e.g., 'iris', 'pupil', 'eyelid').\n\n"
    "RULES:\n"
    "- Maximum 16 objects total.\n"
    "- Each bounding box must be unique and non-redundant.\n"
    "- For clothing, name the specific garment (e.g., 'tank top', 'jeans').\n"
    "- For body parts, qualify by position (e.g., 'left hand').\n"
    "- NEVER prioritize background elements over the main subject or human face.\n"
    "- NEVER prioritize anatomical details over general concepts unless they are the sole focus (e.g., only detect 'eyes' when 'human face' is the dominant part of the image).\n"
    "- If the main subject is a person, focus on the person and their immediate interactions. Ignore background elements unless they are directly involved in the interaction."
)


def _build_prompt(task: str) -> str:
    """Map a task string to the full prompt sent to the model."""
    if task.startswith("<OPEN_VOCABULARY_DETECTION>"):
        label = task[len("<OPEN_VOCABULARY_DETECTION>"):].strip()
        return f"Detect all instances of '{label}' in the image." + _JSON_FORMAT
    if task == "<OD>":
        # The built-in instruction can be overridden via the DETECT_OD_PROMPT env var.
        od_instruction = os.environ.get("DETECT_OD_PROMPT") or _OD_INSTRUCTION
        return od_instruction + _JSON_FORMAT
    return task + _JSON_FORMAT
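
# Example (illustrative): _build_prompt("<OPEN_VOCABULARY_DETECTION>cat") returns
# "Detect all instances of 'cat' in the image." followed by _JSON_FORMAT.
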
def _parse_output(text: str, width: int, height: int) -> dict:
    """Parse Qwen3-VL bbox_2d JSON output into bboxes/labels with pixel coords.

    Qwen3-VL emits:

        ```json
        [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
        ```

    Coordinates are in 0–1000 normalized space and are converted to pixels here.
    If the JSON is truncated (max_tokens hit), falls back to extracting individual
    complete items via regex so partial results are not silently lost.
    """
    bboxes: list = []
    labels: list = []
    seen: set = set()

    # Extract JSON from code fence if present, otherwise try the raw text.
    m = _JSON_FENCE_RE.search(text)
    json_str = m.group(1).strip() if m else text.strip()

    items: list | None = None
    try:
        parsed = json.loads(json_str)
        items = parsed if isinstance(parsed, list) else [parsed]
    except json.JSONDecodeError:
        # Output was likely truncated at max_tokens — recover whatever complete
        # objects are present using the item-level regex.
        logger.warning(
            "Qwen3-VL JSON parse failed (output may be truncated). "
            "Attempting partial recovery. Raw text length: %d, first 120: %r",
            len(text), text[:120],
        )
        recovered = [
            {"bbox_2d": [int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))],
             "label": m.group(5)}
            for m in _ITEM_RE.finditer(text)
        ]
        if recovered:
            logger.warning("Partial recovery: %d items extracted.", len(recovered))
            items = recovered
        else:
            logger.warning("Partial recovery yielded no items.")
            return {"bboxes": bboxes, "labels": labels, "width": width, "height": height}
    for item in items:
        if not isinstance(item, dict):
            continue
        bb = item.get("bbox_2d")
        label = item.get("label", "")
        if not isinstance(bb, list) or len(bb) != 4:
            continue
        nx1, ny1, nx2, ny2 = bb
        # Reject non-numeric or out-of-range coordinates (hallucinated huge numbers, negatives).
        if not all(isinstance(v, (int, float)) and 0 <= v <= 1000 for v in (nx1, ny1, nx2, ny2)):
            continue
        # Reject degenerate boxes with zero or negative area.
        if nx2 <= nx1 or ny2 <= ny1:
            continue
        # Skip detections that cover the entire image (normalized coords ≈ [0, 0, 1000, 1000]).
        if nx1 < 10 and ny1 < 10 and nx2 > 990 and ny2 > 990:
            continue
        # Deduplicate: skip entries with identical (rounded bbox, label).
        dedup_key = (round(nx1), round(ny1), round(nx2), round(ny2), label)
        if dedup_key in seen:
            continue
        seen.add(dedup_key)
        # Convert 0–1000 normalized coordinates to pixels.
        x1 = nx1 / 1000 * width
        y1 = ny1 / 1000 * height
        x2 = nx2 / 1000 * width
        y2 = ny2 / 1000 * height
        bboxes.append([x1, y1, x2, y2])
        labels.append(label)

    return {"bboxes": bboxes, "labels": labels, "width": width, "height": height}


def main() -> None:
    if len(sys.argv) < 2:
        print(json.dumps({"ready": False, "error": "Usage: qwen3_vl_runner.py <model_path>"}), flush=True)
        sys.exit(1)
    model_path = sys.argv[1]

    try:
        from mlx_vlm import load, generate

        model, processor = load(model_path)
        print(json.dumps({"ready": True}), flush=True)
    except Exception as e:
        print(json.dumps({"ready": False, "error": str(e)}), flush=True)
        sys.exit(1)
    for raw_line in sys.stdin:
        line = raw_line.strip()
        if not line:
            continue
        tmp_path: str | None = None
        try:
            req = json.loads(line)
            image_bytes = base64.b64decode(req["image"])
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            width, height = image.size

            # generate() only accepts a file path, not a PIL object.
            fd, tmp_path = tempfile.mkstemp(suffix=".png")
            os.close(fd)
            image.save(tmp_path, format="PNG")

            prompt_text = _build_prompt(req.get("task", "<OD>"))
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": tmp_path},
                        {"type": "text", "text": prompt_text},
                    ],
                }
            ]
            formatted = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Fixed seed keeps sampling deterministic for identical requests.
            import mlx.core as mx
            mx.random.seed(175308301)

            result = generate(
                model,
                processor,
                formatted,
                tmp_path,
                max_tokens=4096,
                temperature=0.3,
                top_p=0.8,
                top_k=50,
                repetition_penalty=1.0,
                verbose=False,
            )
            text = result.text if hasattr(result, "text") else str(result)
            logger.info("Raw model output (%d chars): %s", len(text), text[:2000])

            detection = _parse_output(text, width, height)
            print(json.dumps({"ok": True, "result": detection}), flush=True)
        except Exception as e:
            print(json.dumps({"ok": False, "error": str(e)}), flush=True)
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass


if __name__ == "__main__":
    main()
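

# Minimal parent-process sketch (illustrative; the model path and image filename
# below are examples, not project values):
#
#   import base64, json, subprocess, sys
#
#   proc = subprocess.Popen(
#       [sys.executable, "qwen3_vl_runner.py", "/path/to/model"],
#       stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
#   )
#   assert json.loads(proc.stdout.readline())["ready"]
#   with open("photo.png", "rb") as f:
#       payload = {"image": base64.b64encode(f.read()).decode(), "task": "<OD>"}
#   proc.stdin.write(json.dumps(payload) + "\n")
#   proc.stdin.flush()
#   print(json.loads(proc.stdout.readline()))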