// Project Files
// src/tools/annotate_image.ts
import { tool, type Tool, type ToolsProviderController, type ToolCallContext } from "@lmstudio/sdk";
import { z } from "zod";
import path from "path";
import fs from "fs";
import { pathToFileURL } from "url";
import {
syncAttachmentsToState,
getActiveChatContext,
resolveActiveLMStudioChatId,
getLMStudioWorkingDir,
readState,
writeStateAtomic,
generatePreviewFromBuffer,
appendImages,
getHealthyServerBaseUrl,
toHttpOriginalUrl,
toHttpPreviewUrl,
buildAuditLogger,
formatToolMetaBlock,
getSelfPluginIdentifier,
VARIANT_FULL_CONFIG,
} from "../core-bundle.mjs";
import { drawBboxesOnImage } from "../helpers/drawBboxesOnImage.js";
import {
detectLmStudioVisionBatch,
ensureLmStudioVisionInstanceReady,
type VisionAnalysisItem,
type VisionDetectionAnalyzerConfig,
type VisionDetectionBatchResult,
} from "../services/lmStudioVisionAnalyzer.js";
import { reportToolStatus, reportToolStep } from "../helpers/toolProgress.js";
import { defaultPluginSettings, globalConfigSchematics } from "../config.js";
// ─────────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Builds a compact UTC timestamp for the current instant in the form
 * `YYYYMMDDTHHMMSSmmmZ` (date, `T`, time including milliseconds, `Z`).
 */
function isoStampCompact(): string {
  const now = new Date();
  const pad = (value: number, width: number): string => String(value).padStart(width, "0");

  const datePart = [
    String(now.getUTCFullYear()),
    pad(now.getUTCMonth() + 1, 2), // getUTCMonth() is zero-based
    pad(now.getUTCDate(), 2),
  ].join("");

  const timePart = [
    pad(now.getUTCHours(), 2),
    pad(now.getUTCMinutes(), 2),
    pad(now.getUTCSeconds(), 2),
    pad(now.getUTCMilliseconds(), 3),
  ].join("");

  return `${datePart}T${timePart}Z`;
}
/**
 * Parses a pool-prefixed image notation such as "a1", "v2", "i3" or "p4".
 *
 * The input is trimmed and lower-cased first. The leading letter selects the pool
 * (a = attachment, v = variant, i = image, p = picture) and the digits give the
 * index, which is raised to at least 1.
 *
 * @returns the pool and index, or null when the text is not exactly one pool
 *          letter followed by digits.
 */
function parsePrefixedNotation(
  s: string
): { pool: "attachment" | "image" | "variant" | "picture"; index: number } | null {
  const normalized = String(s || "").trim().toLowerCase();
  const parts = /^([avip])(\d+)$/.exec(normalized);
  if (parts === null) return null;

  const [, letter, digits] = parts;
  let pool: "attachment" | "image" | "variant" | "picture";
  switch (letter) {
    case "a":
      pool = "attachment";
      break;
    case "v":
      pool = "variant";
      break;
    case "i":
      pool = "image";
      break;
    default:
      // The regex only admits a/v/i/p, so the remaining letter is "p".
      pool = "picture";
  }
  return { pool, index: Math.max(1, parseInt(digits, 10)) };
}
/**
 * Returns the text produced by `formatToolMetaBlock()` (imported from the core bundle).
 * The result is interpolated at the end of the `annotate_image` tool description.
 */
function formatPluginMeta(): string {
return formatToolMetaBlock();
}
/**
 * Fetches the plugin's global configuration from the controller.
 *
 * Looks for `getGlobalPluginConfig` first and `getGlobalConfig` second, then calls
 * whichever exists with `globalConfigSchematics`.
 *
 * @returns the value returned by the accessor, or null when no accessor exists or
 *          the call throws.
 */
function getGlobalConfig(ctl: ToolsProviderController): any | null {
  const accessor = (ctl as any).getGlobalPluginConfig || (ctl as any).getGlobalConfig;
  if (accessor) {
    try {
      return accessor.call(ctl, globalConfigSchematics);
    } catch {
      // Best effort: any failure is reported to the caller as "no config".
    }
  }
  return null;
}
/**
 * Reads `key` from the global config object via its `get` method.
 *
 * @returns the stored value when it is a string; otherwise `fallback`. This
 *          includes a missing config object, a non-string value, or a throwing lookup.
 */
function getGlobalString(gcfg: any | null, key: string, fallback: string): string {
  let candidate: unknown;
  try {
    candidate = gcfg?.get(key);
  } catch {
    return fallback;
  }
  if (typeof candidate === "string") return candidate;
  return fallback;
}
/**
 * Reads `key` from the global config object via its `get` method.
 *
 * @returns the stored value when it is a finite number; otherwise `fallback`. This
 *          includes a missing config object, NaN/Infinity, a non-number, or a throwing lookup.
 */
function getGlobalNumber(gcfg: any | null, key: string, fallback: number): number {
  let candidate: unknown;
  try {
    candidate = gcfg?.get(key);
  } catch {
    return fallback;
  }
  if (typeof candidate !== "number") return fallback;
  return Number.isFinite(candidate) ? candidate : fallback;
}
/**
 * Expand (positive) or shrink (negative) a bbox [x1, y1, x2, y2] by the same margin on every side.
 *
 * - Number → margin is that percentage of the box diagonal.
 * - String → numeric value with an optional suffix: '%' (percent of the diagonal, also the
 *   default when no suffix is given) or 'px' (absolute pixels).
 *
 * The result is rounded to integers and every edge is clamped into the image
 * (0 … imgW - 1 horizontally, 0 … imgH - 1 vertically).
 *
 * A shrink is limited per axis to half of the box extent, so an oversized negative
 * adjust collapses the box onto its centre line instead of producing an inverted box.
 *
 * @param bbox        box corners [x1, y1, x2, y2]
 * @param frameAdjust margin as described above
 * @param imgW        image width used for clamping
 * @param imgH        image height used for clamping
 * @returns the adjusted box; `bbox` itself (unchanged) when `frameAdjust` is an
 *          unparseable string or yields a non-finite margin.
 */
function applyFrameAdjust(
  bbox: [number, number, number, number],
  frameAdjust: number | string,
  imgW: number,
  imgH: number,
): [number, number, number, number] {
  const [x1, y1, x2, y2] = bbox;
  const diag = Math.hypot(x2 - x1, y2 - y1);

  let deltaPx: number;
  if (typeof frameAdjust === "string") {
    const m = frameAdjust.trim().match(/^([+-]?\d+(?:\.\d+)?)\s*(%|px)?$/i);
    if (!m) return bbox;
    const val = parseFloat(m[1]);
    deltaPx = m[2]?.toLowerCase() === "px" ? val : (val / 100) * diag;
  } else {
    deltaPx = (frameAdjust / 100) * diag;
  }
  // A NaN/Infinity margin would poison every coordinate; treat it like an invalid string.
  if (!Number.isFinite(deltaPx)) return bbox;

  // Positive margins pass through untouched. Negative margins are capped at half the
  // extent of the axis so the two edges can meet but never cross.
  const limitShrink = (lo: number, hi: number): number =>
    Math.max(deltaPx, -Math.max(0, hi - lo) / 2);
  const dx = limitShrink(x1, x2);
  const dy = limitShrink(y1, y2);

  // Clamp both edges on both sides so no coordinate can leave the image.
  const clamp = (value: number, max: number): number => Math.max(0, Math.min(max, value));

  return [
    clamp(Math.round(x1 - dx), imgW - 1),
    clamp(Math.round(y1 - dy), imgH - 1),
    clamp(Math.round(x2 + dx), imgW - 1),
    clamp(Math.round(y2 + dy), imgH - 1),
  ];
}
// ─────────────────────────────────────────────────────────────────────────────
// Schema
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Accepts image notations either as free text or as a ready-made string array.
 * For text input, every occurrence of one of the letters a/i/v/p followed by digits is
 * extracted and lower-cased. The resulting list must hold between 1 and 16 entries.
 */
const FlexibleTargetsList = z
  .union([
    z.string().transform((text) => {
      const hits = text.match(/[aivp]\d+/gi);
      return hits ? hits.map((hit) => hit.toLowerCase()) : [];
    }),
    z.array(z.string()),
  ])
  .refine((list) => list.length > 0, "targets must contain at least one notation")
  .refine((list) => list.length < 17, "targets must contain at most 16 notations");
/**
 * Pre-parse step shared by x1/y1/x2/y2. A string that starts with "[" is parsed as JSON so
 * that arrays sent as text (e.g. "[null, 50]") are accepted. Anything else, including text
 * that fails to parse, is passed through unchanged for the schema to validate.
 */
function parseBracketedJson(val: unknown): unknown {
  if (typeof val === "string" && val.trim().startsWith("[")) {
    try {
      return JSON.parse(val);
    } catch {
      // Not valid JSON — hand the raw string to the validator, which will reject it.
    }
  }
  return val;
}

/**
 * Builds a fresh optional schema for one box-edge override:
 * a single number, or an array of numbers/nulls.
 */
function coordOverrideSchema() {
  return z
    .preprocess(parseBracketedJson, z.union([z.number(), z.array(z.union([z.number(), z.null()]))]))
    .optional();
}

/**
 * Zod parameter shape of the `annotate_image` tool. Every field is optional; the
 * `describe()` texts are the per-parameter descriptions attached to the schema.
 */
const AnnotateImageParamsShape = {
  targets: FlexibleTargetsList.optional().describe(
    "One or more image notations to process. Each notation is a letter followed by a number: " +
    "a=attachment (a1, a2, …), i=generated image (i1, i2, …), v=variant (v1, v2, …), p=picture (p1, p2, …). " +
    "Pass via the targets field, e.g. annotate_image({\"targets\":[\"a1\", \"i3\"]}). " +
    "Omit when there is exactly one image — it will be selected automatically."
  ),
  task: z
    .string()
    .optional()
    .default("")
    .describe(
      "What to detect. Omit for full-image general object detection. " +
      "Use natural language to target specific subjects (e.g. 'all faces and hands', 'the dog', 'cars and bicycles'). " +
      "Not used on correction calls — detections are loaded from state."
    ),
  color: z
    .string()
    .optional()
    .describe("Box color for all detected objects. CSS color name or hex, e.g. 'pink', 'red', '#FF0000'. Default: pink."),
  lineWeight: z
    .coerce.number().int().min(1).max(50)
    .optional()
    .describe("Line thickness in pixels for all bounding boxes. Default: 5."),
  frameAdjust: z
    .union([z.number(), z.string()])
    .optional()
    .describe(
      "Expand (positive) or shrink (negative) bounding boxes before drawing. " +
      "Number: percent of box diagonal (e.g. 5 = +5 %). " +
      "String: value + optional 'px' or '%' suffix, e.g. '10px', '-5%'. " +
      "Without detectLabel: applied to all boxes. With detectLabel: applied to the selected box only. Default: 5."
    ),
  detectLabel: z
    .union([
      // Comma-separated text → trimmed, non-empty labels.
      z.string().transform((s) =>
        s.split(/\s*,\s*/).map((x) => x.trim()).filter((x) => x.length > 0)
      ),
      z.array(z.string().min(1)),
    ])
    .optional()
    .describe(
      "On a correction call: label(s) to match (case-insensitive). " +
      "Single string, comma-separated list ('left eye, right eye'), or JSON array. " +
      "When set, ONLY the matching detection(s) are drawn — all others are omitted. " +
      "Single label with no detectIndex: auto-expands to ALL detections for that label (Option A). " +
      "canvas may be the original source (e.g. a1) or the previous annotate_image result (e.g. i3)."
    ),
  detectIndex: z
    .union([
      // Text such as "[2, 4, 7]" → every digit run becomes an index.
      z.string().transform((s) => (s.match(/\d+/g) ?? []).map(Number)),
      // Arrays must be tried before the coercing scalar branch: z.coerce.number() applies
      // Number(input), and Number([]) === 0, so an empty list would otherwise be turned
      // into [0] and suppress the "no index given" handling downstream.
      z.array(z.coerce.number().int().min(0)),
      z.coerce.number().int().min(0).transform((n) => [n]),
    ])
    .optional()
    .describe(
      "Zero-based index or list of indices, parallel to detectLabel. " +
      "indices[li] ?? indices[0] ?? 0 for missing entries. Default: 0. " +
      "Single label + multiple indices draws that label at each specified occurrence (Option B). " +
      "Bracket notation accepted: '[2, 4, 7]'."
    ),
  x1: coordOverrideSchema().describe(
    "Manual left edge(s) in original image pixels. " +
    "Scalar: applies to all selected detections. Array (parallel to detectLabel): null = keep stored value. " +
    "E.g. x1=[null,50,null] moves only the second detection's left edge."
  ),
  y1: coordOverrideSchema().describe("Manual top edge(s) in original image pixels. Scalar or array (null = keep stored). See x1."),
  x2: coordOverrideSchema().describe("Manual right edge(s) in original image pixels. Scalar or array (null = keep stored). See x1."),
  y2: coordOverrideSchema().describe("Manual bottom edge(s) in original image pixels. Scalar or array (null = keep stored). See x1."),
} satisfies Record<string, z.ZodTypeAny>;
// ─────────────────────────────────────────────────────────────────────────────
// Internal types
// ─────────────────────────────────────────────────────────────────────────────
/**
 * One detected object as kept on an image record's `detections` list, so that a later
 * correction call can redraw it without running inference again.
 */
type StoredDetection = {
/** Detection label; compared case-insensitively when matching `detectLabel`. */
label: string;
/**
 * Box corners (left, top, right, bottom), in the coordinate space of the record's
 * imageWidth × imageHeight — presumably original-image pixels; confirm against the analyzer.
 */
bbox: { x1: number; y1: number; x2: number; y2: number };
/**
 * Crop values copied from the detection result's cropLeft/cropRight/cropTop/cropBottom.
 * NOTE(review): their exact meaning is defined by the vision analyzer, not visible here.
 */
crop?: { cropLeft: number; cropRight: number; cropTop: number; cropBottom: number };
};
/**
 * A target after classification, discriminated by `mode`:
 * - "detect": fresh inference is needed. `previewBuf` is the image handed to the vision
 *   analyzer; `origBuf` is the image the boxes are drawn onto.
 * - "redraw": boxes are redrawn from a stored record. `detections`, `task`, `imageWidth`
 *   and `imageHeight` come from that record; `origBuf` is the image to draw onto.
 */
type ResolvedEntry =
| { mode: "detect"; id: string; previewBuf: Buffer; origBuf: Buffer }
| {
mode: "redraw";
id: string; // original draw source (e.g. "a1")
origBuf: Buffer;
task: string;
detections: StoredDetection[];
imageWidth: number;
imageHeight: number;
};
/**
 * Picks the coordinate override that applies to the box at position `index`.
 * (Noted in the original as analogous to resolveBoxOverride in mask.)
 *
 * - scalar override → that same value for every box
 * - array override  → the entry at `index`; a null or missing entry means "no override"
 * - no override     → undefined
 *
 * A result of undefined tells the caller to keep the stored coordinate.
 */
function resolveCoordOverride(
  override: number | (number | null)[] | undefined,
  index: number,
): number | undefined {
  // Scalars (and the "nothing supplied" case) are returned as they are.
  if (!Array.isArray(override)) return override;
  const slot = override[index];
  return slot ?? undefined;
}
/**
 * Option A helper: lists the occurrence indices of `label` among the stored detections.
 * Labels are compared case-insensitively. With N > 1 matches the result is [0, 1, …, N-1];
 * with one match or none it is [0].
 */
function expandDetectIndices(detections: StoredDetection[], label: string): number[] {
  const wanted = label.toLowerCase();
  const hits = detections.filter((det) => det.label.toLowerCase() === wanted).length;
  if (hits <= 1) return [0];
  return Array.from(Array(hits).keys());
}
// ─────────────────────────────────────────────────────────────────────────────
// Tool factory
// ─────────────────────────────────────────────────────────────────────────────
export function createAnnotateImageTool(ctl: ToolsProviderController): Tool {
return tool({
name: "annotate_image",
description: `Highlights specific areas or elements on images. If well described, these elements are precisely framed with bounding boxes in the chosen color.
Detections are saved to state for later correction, refinement, or re-drawing with adjusted labels and edge positions.
--- Correction call (omit task) ---
Redraws from stored detections without inference. canvas may be the original source (e.g. a1) or a previous annotate_image result (e.g. i3).
Without detectLabel: ALL stored detections are drawn with the given color/lineWeight/frameAdjust.
With detectLabel: ONLY the matching detection(s) are drawn.
- Option A — single label, no detectIndex: auto-expands to ALL detections for that label.
detectLabel="face" with 8 stored faces → draws all 8 faces.
- Option B — single label, multiple detectIndex: draws that label at each specified occurrence.
detectLabel="face", detectIndex="[2,4,7]" → draws face #2, #4, #7.
- Multi-label — comma-separated or array: one detection per label.
detectLabel="left eye, right eye" → draws both eyes.
Adjusting box edges (x1/y1/x2/y2):
- Scalar: applies the same value to all selected boxes.
detectLabel="face, hand, dog", y2=300 → all three boxes get bottom edge at y=300.
- Array (parallel to detectLabel, null = keep stored value): per-box override.
detectLabel="face, hand, dog", y2=[null, null, 240] → only dog's bottom edge moves to y=240.
- Partial: omitted axes always keep the stored value.
detectLabel="face", y2=240 → only the bottom edge changes, x1/y1/x2 unchanged.
--- Re-detect with new prompt ---
Pass task to force fresh inference even on an already-annotated image. Replaces stored detections.
Parameters:
- targets: Image notation(s), e.g. ["a1"]. Omit when exactly one image is available.
- task: What to detect (natural language). Omit on a correction call. Providing task always triggers fresh inference.
- color: Box color (CSS name or hex). Default: pink.
- lineWeight: Line thickness in pixels. Default: 5.
- frameAdjust: Expand (+) or shrink (−) boxes as % of box diagonal or absolute px. Default: 5.
- detectLabel: Label(s) to match (case-insensitive). Comma-separated or array. When set, draws ONLY matching detections.
- detectIndex: Index or list of indices, parallel to detectLabel. Bracket notation '[2,4,7]' accepted. Default: 0.
- x1/y1/x2/y2: Box edge override(s) in original image pixels. Scalar or array parallel to detectLabel (null = keep stored).
${formatPluginMeta()}`,
parameters: AnnotateImageParamsShape,
implementation: async (args: any, ctx: ToolCallContext) => {
try {
// ── Parse args ─────────────────────────────────────────────────────
let rawTargets: string[] = [];
if (Array.isArray(args?.targets)) {
rawTargets = args.targets.map((s: any) => String(s).trim()).filter(Boolean);
} else if (typeof args?.targets === "string" && args.targets.trim()) {
const trimmed = args.targets.trim();
if (trimmed.startsWith("[")) {
try {
const parsed = JSON.parse(trimmed);
rawTargets = Array.isArray(parsed)
? parsed.map((s: any) => String(s).trim()).filter(Boolean)
: trimmed.split(/[\s,]+/).map((s: string) => s.trim()).filter(Boolean);
} catch {
rawTargets = trimmed.split(/[\s,]+/).map((s: string) => s.trim()).filter(Boolean);
}
} else {
rawTargets = trimmed.split(/[\s,]+/).map((s: string) => s.trim()).filter(Boolean);
}
}
const taskArg = typeof args?.task === "string" && args.task.trim() ? args.task.trim() : "";
const globalColor = typeof args?.color === "string" && args.color.trim() ? args.color.trim() : "pink";
const globalLineWeight = typeof args?.lineWeight === "number"
? Math.max(1, Math.min(50, Math.round(args.lineWeight))) : 5;
const globalFrameAdjust: number | string = args?.frameAdjust !== undefined ? args.frameAdjust : 5;
// detectLabel: string → string[] via comma-split; array → as-is
let globalLabels: string[] | undefined;
{
const dlRaw = args?.detectLabel;
if (Array.isArray(dlRaw)) {
const arr = dlRaw.map((s: any) => String(s).trim()).filter(Boolean);
if (arr.length > 0) globalLabels = arr;
} else if (typeof dlRaw === "string" && dlRaw.trim()) {
globalLabels = dlRaw.split(/\s*,\s*/).map((x) => x.trim()).filter(Boolean);
}
}
// detectIndex: number → [n]; string → extract digit runs; array → as-is
let globalIndices: number[] = [];
{
const diRaw = args?.detectIndex;
if (Array.isArray(diRaw)) {
globalIndices = diRaw.map((n: any) => typeof n === "number" ? Math.floor(n) : parseInt(String(n), 10)).filter((n) => !isNaN(n) && n >= 0);
} else if (typeof diRaw === "string" && diRaw.trim()) {
globalIndices = (diRaw.match(/\d+/g) ?? []).map(Number);
} else if (typeof diRaw === "number" && diRaw >= 0) {
globalIndices = [Math.floor(diRaw)];
}
}
const rawX1 = args?.x1;
const rawY1 = args?.y1;
const rawX2 = args?.x2;
const rawY2 = args?.y2;
// Normalise: scalar number | number-or-null array | undefined
function normaliseCoord(raw: any): number | (number | null)[] | undefined {
if (raw === undefined || raw === null) return undefined;
if (typeof raw === "number") return raw;
if (Array.isArray(raw)) return raw.map((v: any) => (v === null || v === undefined) ? null : Number(v));
if (typeof raw === "string") {
const t = raw.trim();
if (t.startsWith("[")) { try { const p = JSON.parse(t); if (Array.isArray(p)) return p.map((v: any) => (v === null || v === undefined) ? null : Number(v)); } catch {} }
const n = Number(t); return isNaN(n) ? undefined : n;
}
return undefined;
}
const manualX1 = normaliseCoord(rawX1);
const manualY1 = normaliseCoord(rawY1);
const manualX2 = normaliseCoord(rawX2);
const manualY2 = normaliseCoord(rawY2);
const hasAnyManualCoord = manualX1 !== undefined || manualY1 !== undefined || manualX2 !== undefined || manualY2 !== undefined;
console.log("[annotate_image] invoked", { targets: rawTargets, task: taskArg, color: globalColor, lineWeight: globalLineWeight, frameAdjust: globalFrameAdjust, detectLabels: globalLabels, detectIndices: globalIndices, manualCoords: { x1: manualX1, y1: manualY1, x2: manualX2, y2: manualY2 } });
// ── Resolve working directory ──────────────────────────────────────
let currentLmChatId: string | null = null;
let currentLmWorkingDir: string | null = null;
try {
const chatCtx = await getActiveChatContext();
if ((chatCtx as any)?.chatId) currentLmChatId = (chatCtx as any).chatId;
if ((chatCtx as any)?.workingDir) currentLmWorkingDir = (chatCtx as any).workingDir;
} catch {}
if (!currentLmChatId) {
try {
const resolved = await resolveActiveLMStudioChatId();
if ((resolved as any)?.ok) currentLmChatId = (resolved as any).chatId;
} catch {}
}
const primaryOutDir: string | undefined =
currentLmWorkingDir ||
(currentLmChatId ? getLMStudioWorkingDir(currentLmChatId) : undefined);
if (!primaryOutDir) {
return {
content: [{ type: "text", text: "annotate_image failed: could not resolve LM Studio chat working directory." }],
isError: true as const,
};
}
await fs.promises.mkdir(primaryOutDir, { recursive: true }).catch(() => {});
// ── Sync attachments ──────────────────────────────────────────────
try {
await syncAttachmentsToState(primaryOutDir, false, Number.MAX_SAFE_INTEGER);
} catch (e) {
console.warn("[annotate_image] attachment sync failed (non-fatal):", (e as any)?.message ?? e);
}
// ── Read state ────────────────────────────────────────────────────
const st = await readState(primaryOutDir);
const attachments: any[] = Array.isArray(st?.attachments) ? st.attachments : [];
const pictures: any[] = Array.isArray(st?.pictures) ? st.pictures : [];
const imageRecords: any[] = Array.isArray(st?.images) ? st.images : [];
const variantRecords: any[] = Array.isArray(st?.variants) ? st.variants : [];
// ── Buffer resolution helpers ─────────────────────────────────────
async function resolvePreviewBuf(notation: string): Promise<Buffer> {
const pref = parsePrefixedNotation(notation);
if (!pref) throw new Error(`Invalid notation: ${notation}`);
if (pref.pool === "attachment") {
const rec = attachments.find((a: any) => a?.a === pref.index);
const r = rec && typeof rec.preview === "string" ? rec.preview : "";
if (!r) throw new Error(`Preview for a${pref.index} not found.`);
return fs.promises.readFile(path.join(primaryOutDir!, r));
}
if (pref.pool === "image") {
const rec = imageRecords.find((r: any) => r?.i === pref.index);
const r = rec && typeof rec.preview === "string" ? rec.preview : "";
if (!r) throw new Error(`Preview for i${pref.index} not found.`);
return fs.promises.readFile(path.join(primaryOutDir!, r));
}
if (pref.pool === "variant") {
const rec = variantRecords.find((v: any) => v?.v === pref.index);
const r = rec && typeof rec.preview === "string" ? rec.preview : "";
if (!r) throw new Error(`Preview for v${pref.index} not found.`);
return fs.promises.readFile(path.join(primaryOutDir!, r));
}
const rec = pictures.find((p: any) => p?.p === pref.index);
const r = rec && typeof rec.preview === "string" ? rec.preview : "";
if (!r) throw new Error(`Preview for p${pref.index} not found.`);
return fs.promises.readFile(path.join(primaryOutDir!, r));
}
async function resolveOriginalBuf(notation: string, fallback: Buffer): Promise<Buffer> {
try {
const pref = parsePrefixedNotation(notation);
if (!pref) return fallback;
if (pref.pool === "attachment") {
const rec = attachments.find((a: any) => a?.a === pref.index);
const abs = rec && typeof rec.originAbs === "string" ? rec.originAbs : "";
if (!abs) return fallback;
return await fs.promises.readFile(abs);
}
let rec: any;
if (pref.pool === "image") rec = imageRecords.find((r: any) => r?.i === pref.index);
else if (pref.pool === "variant") rec = variantRecords.find((v: any) => v?.v === pref.index);
else rec = pictures.find((p: any) => p?.p === pref.index);
const fn = rec && typeof rec.filename === "string" ? rec.filename : "";
if (!fn) return fallback;
return await fs.promises.readFile(path.join(primaryOutDir!, fn));
} catch {
return fallback;
}
}
// ── Resolve auto-selected source when no targets given ─────────────
let autoId: string | null = null;
if (rawTargets.length === 0) {
const total = attachments.length + variantRecords.length + imageRecords.length + pictures.length;
if (total === 0) {
return { content: [{ type: "text", text: "No source image available." }], isError: true as const };
}
if (total > 1) {
return { content: [{ type: "text", text: "Ambiguous source — specify targets explicitly." }], isError: true as const };
}
if (attachments.length === 1) autoId = `a${typeof attachments[0]?.a === "number" ? attachments[0].a : 1}`;
else if (variantRecords.length === 1) autoId = `v${typeof variantRecords[0]?.v === "number" ? variantRecords[0].v : 1}`;
else if (imageRecords.length === 1) autoId = `i${typeof imageRecords[0]?.i === "number" ? imageRecords[0].i : 1}`;
else autoId = `p${pictures[0]?.p ?? 1}`;
rawTargets = [autoId];
}
// ── Classify each target: detect or redraw ────────────────────────
// If task is explicitly provided, always run fresh inference (ignore any prior record).
const forceDetect = taskArg.length > 0;
const resolvedEntries: ResolvedEntry[] = [];
for (const rawId of rawTargets) {
let drawSourceId = rawId;
let stateRec: any = null;
// Case A: target is an iN detection/annotation result
const pref = parsePrefixedNotation(rawId);
if (pref?.pool === "image") {
const imgRec = imageRecords.find((r: any) => r?.i === pref.index);
if (
imgRec &&
Array.isArray(imgRec.detections) &&
imgRec.detections.length > 0 &&
typeof imgRec.imageWidth === "number"
) {
stateRec = imgRec;
drawSourceId = (typeof imgRec.detectSource === "string" && imgRec.detectSource)
? imgRec.detectSource
: rawId;
}
}
// Case B: target is an original (a1, p1, …) with a prior detection on record
if (!stateRec) {
const prior = [...imageRecords]
.reverse()
.find(
(r: any) =>
(r?.detectSource === rawId) &&
Array.isArray(r.detections) &&
r.detections.length > 0 &&
typeof r.imageWidth === "number",
);
if (prior) {
stateRec = prior;
drawSourceId = rawId;
}
}
try {
if (stateRec && !forceDetect) {
const preview = await resolvePreviewBuf(drawSourceId).catch(() => null);
const origBuf = preview
? await resolveOriginalBuf(drawSourceId, preview)
: Buffer.alloc(0);
resolvedEntries.push({
mode: "redraw",
id: drawSourceId,
origBuf,
task: typeof stateRec.task === "string" ? stateRec.task : taskArg,
detections: stateRec.detections as StoredDetection[],
imageWidth: stateRec.imageWidth as number,
imageHeight: stateRec.imageHeight as number,
});
} else {
const previewBuf = await resolvePreviewBuf(rawId);
const origBuf = await resolveOriginalBuf(rawId, previewBuf);
resolvedEntries.push({ mode: "detect", id: rawId, previewBuf, origBuf });
}
} catch (e) {
return {
content: [{ type: "text", text: String((e as any)?.message || e) }],
isError: true as const,
};
}
}
// ── Run Qwen3-VL for detect-mode entries ───────────────────────────
const detectEntries = resolvedEntries.filter((e): e is Extract<ResolvedEntry, { mode: "detect" }> => e.mode === "detect");
const progressTotalSteps = detectEntries.length > 0
? detectEntries.length + resolvedEntries.length + 4
: resolvedEntries.length + 3;
let batchResult: VisionDetectionBatchResult | null = null;
if (detectEntries.length > 0) {
const globalConfig = getGlobalConfig(ctl);
const visionBaseUrl = getGlobalString(globalConfig, "embeddingBaseUrl", process.env.LMSTUDIO_VISION_API_BASE_URL || defaultPluginSettings.embeddingBaseUrl);
const visionApiKey = getGlobalString(globalConfig, "embeddingApiKey", process.env.LMSTUDIO_VISION_API_KEY || defaultPluginSettings.embeddingApiKey);
const visionModelKey = getGlobalString(globalConfig, "qwen3VlModelPath", process.env.LMSTUDIO_VISION_MODEL_KEY || defaultPluginSettings.qwen3VlModelPath);
const envDetectMaxTokens = Number.parseInt(process.env.DETECT_MAX_TOKENS || "", 10);
const envDetectTemperature = Number.parseFloat(process.env.DETECT_TEMPERATURE || "");
const configuredDetectMaxTokens = Math.floor(getGlobalNumber(
globalConfig,
"detectMaxTokens",
Number.isFinite(envDetectMaxTokens) && envDetectMaxTokens > 0 ? envDetectMaxTokens : defaultPluginSettings.detectMaxTokens
));
const configuredDetectTemperature = getGlobalNumber(
globalConfig,
"detectTemperature",
Number.isFinite(envDetectTemperature) ? envDetectTemperature : defaultPluginSettings.detectTemperature
);
const detectionConfig: VisionDetectionAnalyzerConfig = {
task: taskArg,
odPrompt: getGlobalString(globalConfig, "qwen3VlOdPrompt", process.env.DETECT_OD_PROMPT || defaultPluginSettings.qwen3VlOdPrompt) || undefined,
maxTokens: configuredDetectMaxTokens,
temperature: configuredDetectTemperature,
timeoutMs: 120_000,
};
const tmpPaths: string[] = [];
const detectionItems: VisionAnalysisItem[] = [];
for (const entry of detectEntries) {
const tmpPath = path.join(primaryOutDir, `_tmp_annotate_src_${entry.id}_${Date.now()}.png`);
await fs.promises.writeFile(tmpPath, entry.previewBuf);
tmpPaths.push(tmpPath);
detectionItems.push({ id: entry.id, filePath: tmpPath });
}
try {
reportToolStatus(ctx, `Detecting objects in ${detectEntries.length} image${detectEntries.length === 1 ? "" : "s"}...`);
reportToolStep(ctx, 1, progressTotalSteps, `Preparing ${detectEntries.length} image${detectEntries.length === 1 ? "" : "s"} for annotation detection...`);
const ready = await ensureLmStudioVisionInstanceReady({
baseUrl: visionBaseUrl,
apiKey: visionApiKey,
modelKey: visionModelKey,
status: (message) => { try { ctx.status(message); } catch {} },
});
if (!ready.ok) {
throw new Error(ready.error);
}
batchResult = {
results: [],
totalInferenceTimeMs: 0,
backend: "vision-api",
};
for (let idx = 0; idx < detectionItems.length; idx++) {
const item = detectionItems[idx];
reportToolStep(ctx, idx + 2, progressTotalSteps, `Detecting objects in ${item.id} (${idx + 1}/${detectionItems.length})...`);
const singleResult = await detectLmStudioVisionBatch([item], {
...detectionConfig,
baseUrl: visionBaseUrl,
apiKey: visionApiKey,
model: visionModelKey,
});
batchResult.results.push(...singleResult.results);
batchResult.totalInferenceTimeMs += singleResult.totalInferenceTimeMs;
batchResult.backend = singleResult.backend;
}
try {
const totalObjects = batchResult.results.reduce((s, r) => s + (r.objects?.length ?? 0), 0);
const ms = Math.round(batchResult.totalInferenceTimeMs);
reportToolStep(ctx, detectEntries.length + 2, progressTotalSteps, `${totalObjects} object${totalObjects === 1 ? "" : "s"} found (${ms}ms); drawing boxes...`);
} catch {}
} finally {
for (const tp of tmpPaths) await fs.promises.unlink(tp).catch(() => {});
}
if (!batchResult || !batchResult.results.length) {
return {
content: [{ type: "text", text: "annotate_image: no results returned from detection API." }],
isError: true as const,
};
}
} else {
reportToolStatus(ctx, `Redrawing ${resolvedEntries.length} annotated image${resolvedEntries.length === 1 ? "" : "s"} from stored detections...`);
reportToolStep(ctx, 1, progressTotalSteps, `Redrawing ${resolvedEntries.length} annotated image${resolvedEntries.length === 1 ? "" : "s"} from stored detections...`);
}
// ── Per-entry: apply frameAdjust, draw, save ───────────────────────
const variantPreviewSpec = (VARIANT_FULL_CONFIG as any).preview;
const stamp = isoStampCompact();
let nextI = Math.max(1, st.counters?.nextImageI ?? 1);
const imageRecordsForState: any[] = [];
const resultEntries: Array<{
id: string;
i: number;
isRedraw: boolean;
task: string;
detObjects: StoredDetection[];
imageWidth: number;
imageHeight: number;
savedPath: string;
savedFileUrl: string;
savedSize: number;
preview: any | null;
httpOriginal: string;
httpPreview: string;
inferenceTimeMs: number;
}> = [];
const httpBase = await getHealthyServerBaseUrl();
let detectResultIdx = 0;
let resolvedIdx = 0;
const drawBaseStep = detectEntries.length > 0 ? detectEntries.length + 3 : 2;
for (const entry of resolvedEntries) {
reportToolStep(ctx, drawBaseStep + resolvedIdx, progressTotalSteps, `Drawing annotation for ${entry.id} (${resolvedIdx + 1}/${resolvedEntries.length})...`);
resolvedIdx++;
let rawBboxes: [number, number, number, number][];
let imgW: number;
let imgH: number;
let detObjects: StoredDetection[];
let isRedraw: boolean;
let entryTask: string;
let inferenceTimeMs = 0;
let bboxesAlreadyAdjusted = false;
if (entry.mode === "redraw") {
imgW = entry.imageWidth;
imgH = entry.imageHeight;
isRedraw = true;
entryTask = entry.task;
if (globalLabels !== undefined && globalLabels.length > 0) {
// detectLabel mode: resolve label/index pairs, draw ONLY the selected detections
let labels = [...globalLabels];
let indices = [...globalIndices];
// Option A: single label + no explicit index → auto-expand to all detections for that label
if (labels.length === 1 && indices.length === 0) {
const allIndices = expandDetectIndices(entry.detections, labels[0]);
if (allIndices.length > 1) {
labels = Array(allIndices.length).fill(labels[0]);
indices = allIndices;
}
}
// Option B: single label + multiple explicit indices → expand labels to match
if (labels.length === 1 && indices.length > 1) {
labels = Array(indices.length).fill(labels[0]);
}
// Pre-group detections by lowercase label once — O(1) lookup per box.
const detByLabel = new Map<string, StoredDetection[]>();
for (const d of entry.detections) {
const key = d.label.toLowerCase();
if (!detByLabel.has(key)) detByLabel.set(key, []);
detByLabel.get(key)!.push(d);
}
const resolvedBoxes: { det: StoredDetection; bbox: [number, number, number, number] }[] = [];
for (let li = 0; li < labels.length; li++) {
const label = labels[li];
const idx = indices[li] ?? indices[0] ?? 0;
const selectedDet = detByLabel.get(label.toLowerCase())?.[idx];
if (!selectedDet) {
const available = [...new Set(entry.detections.map((d) => d.label))].join(", ");
return {
content: [{ type: "text", text: `annotate_image: label '${label}' (index ${idx}) not found in stored detections. Available: ${available || "(none)"}` }],
isError: true as const,
};
}
// Coord overrides: resolveCoordOverride per axis, fall back to stored value.
// Applies to all selected boxes; scalar = same for all, array = parallel (null = keep stored).
const ox1 = hasAnyManualCoord ? resolveCoordOverride(manualX1 as any, li) : undefined;
const oy1 = hasAnyManualCoord ? resolveCoordOverride(manualY1 as any, li) : undefined;
const ox2 = hasAnyManualCoord ? resolveCoordOverride(manualX2 as any, li) : undefined;
const oy2 = hasAnyManualCoord ? resolveCoordOverride(manualY2 as any, li) : undefined;
const bbox: [number, number, number, number] = [
ox1 ?? selectedDet.bbox.x1,
oy1 ?? selectedDet.bbox.y1,
ox2 ?? selectedDet.bbox.x2,
oy2 ?? selectedDet.bbox.y2,
];
resolvedBoxes.push({ det: selectedDet, bbox });
}
rawBboxes = resolvedBoxes.map(({ bbox }) => applyFrameAdjust(bbox, globalFrameAdjust, imgW, imgH));
detObjects = resolvedBoxes.map(({ det, bbox }) => ({
...det,
bbox: { x1: bbox[0], y1: bbox[1], x2: bbox[2], y2: bbox[3] },
}));
bboxesAlreadyAdjusted = true;
} else if (globalIndices.length > 0) {
// No detectLabel but explicit detectIndex: select stored detections by position.
const selected: StoredDetection[] = [];
for (const idx of globalIndices) {
const det = entry.detections[idx];
if (!det) {
return {
content: [{ type: "text", text: `annotate_image: detectIndex ${idx} out of range (${entry.detections.length} stored detections).` }],
isError: true as const,
};
}
selected.push(det);
}
rawBboxes = selected.map((d) => [d.bbox.x1, d.bbox.y1, d.bbox.x2, d.bbox.y2] as [number, number, number, number]);
detObjects = selected;
bboxesAlreadyAdjusted = false;
} else {
// No detectLabel, no detectIndex: draw all stored boxes, apply frameAdjust to all
rawBboxes = entry.detections.map((d) => [d.bbox.x1, d.bbox.y1, d.bbox.x2, d.bbox.y2] as [number, number, number, number]);
detObjects = entry.detections;
}
} else {
const detResult = batchResult!.results[detectResultIdx++];
rawBboxes = detResult.objects.map((o) => o.bbox as [number, number, number, number]);
imgW = detResult.imageWidth;
imgH = detResult.imageHeight;
detObjects = detResult.objects.map((o) => ({
label: o.label,
bbox: { x1: o.bbox[0], y1: o.bbox[1], x2: o.bbox[2], y2: o.bbox[3] },
crop: { cropLeft: o.cropLeft, cropRight: o.cropRight, cropTop: o.cropTop, cropBottom: o.cropBottom },
}));
isRedraw = false;
entryTask = taskArg;
inferenceTimeMs = detResult.inferenceTimeMs ?? 0;
}
// ── Render, persist and record the annotated image for this entry ──
// Boxes flagged as already frame-adjusted are drawn as-is; otherwise
// applyFrameAdjust is applied to every box here, so the adjustment is
// applied exactly once regardless of which branch produced rawBboxes.
const adjustedBboxes = bboxesAlreadyAdjusted
? rawBboxes
: rawBboxes.map((bbox) => applyFrameAdjust(bbox, globalFrameAdjust, imgW, imgH));
// Draw onto the entry's original image buffer with one color / line weight
// (palette disabled). NOTE(review): sourceDims presumably tells the helper
// which coordinate space the boxes are expressed in — confirm in drawBboxesOnImage.
const annotatedBuf = await drawBboxesOnImage(entry.origBuf, adjustedBboxes, {
sourceDims: { width: imgW, height: imgH },
palette: false,
color: globalColor,
lineWeight: globalLineWeight,
});
// Reserve the next image index and write the result as image-<stamp>-i<N>.png
// into primaryOutDir.
const currentI = nextI++;
const baseName = `image-${stamp}-i${currentI}`;
const savedPath = path.join(primaryOutDir, `${baseName}.png`);
await fs.promises.writeFile(savedPath, annotatedBuf);
const savedFileUrl = pathToFileURL(savedPath).toString();
const savedSize = annotatedBuf.length;
// Preview generation is best-effort: on failure `preview` stays null and only
// a warning is logged; the full-size PNG written above is unaffected.
let preview: any = null;
try {
const p = await generatePreviewFromBuffer(annotatedBuf, primaryOutDir, `${baseName}.png`, variantPreviewSpec);
preview = {
ok: true as const,
filePath: p.previewAbs,
fileName: p.previewFilename,
fileUrl: pathToFileURL(p.previewAbs).toString(),
size_bytes: p.data.length,
width: p.width,
height: p.height,
mimeType: "image/jpeg" as const,
dataBase64: p.data.toString("base64"),
};
} catch (e) {
console.warn(`[annotate_image] preview generation failed for ${entry.id}:`, String(e));
}
// HTTP URLs are empty strings when no server base URL is known. The preview
// URL additionally requires a chat id and a successfully generated preview.
const httpOriginal = httpBase
? toHttpOriginalUrl(`${baseName}.png`, httpBase, currentLmChatId || undefined) : "";
const httpPreview = (() => {
if (!httpBase || !currentLmChatId || !preview?.fileName) return "";
return toHttpPreviewUrl(preview.fileName, httpBase, currentLmChatId);
})();
// Record that is appended to the image state further below. Detections are
// stored from detObjects (not from adjustedBboxes); the frame adjust that was
// used for drawing is recorded separately in annotateFrameAdjust.
imageRecordsForState.push({
filename: `${baseName}.png`,
// NOTE(review): the preview name is rebuilt here instead of using
// preview.fileName (which httpPreview above relies on) — confirm both
// always agree with what generatePreviewFromBuffer actually writes.
preview: preview ? `preview-${baseName}.jpg` : undefined,
i: currentI,
sourceTool: `${getSelfPluginIdentifier()}/annotate_image`,
detectSource: entry.id,
task: entryTask,
annotateColor: globalColor,
annotateLineWeight: globalLineWeight,
annotateFrameAdjust: globalFrameAdjust,
imageWidth: imgW,
imageHeight: imgH,
detections: detObjects.map((d) => ({
label: d.label,
bbox: { x1: d.bbox.x1, y1: d.bbox.y1, x2: d.bbox.x2, y2: d.bbox.y2 },
// Detections without crop info are stored with an empty crop object.
crop: d.crop ?? {},
})),
});
// In-memory result used below for the audit log and the tool response.
resultEntries.push({
id: entry.id,
i: currentI,
isRedraw,
task: entryTask,
detObjects,
imageWidth: imgW,
imageHeight: imgH,
savedPath,
savedFileUrl,
savedSize,
preview,
httpOriginal,
httpPreview,
inferenceTimeMs,
});
}
// ── Persist new image records ─────────────────────────────────────
// Best-effort step: the state is re-read from primaryOutDir, the records
// collected for this call are appended, and the state is written back only
// when appendImages reports a change. Any failure is logged as a warning and
// does not abort the tool call.
reportToolStep(ctx, progressTotalSteps - 1, progressTotalSteps, "Updating image state and audit log...");
try {
  const persistedState = await readState(primaryOutDir);
  const { changed: stateChanged } = appendImages(persistedState, imageRecordsForState);
  if (stateChanged) await writeStateAtomic(primaryOutDir, persistedState);
} catch (stateErr) {
  console.warn("[annotate_image] state update failed:", String(stateErr));
}
// ── Audit log ─────────────────────────────────────────────────────
// Records the request parameters and one output entry per generated image.
// Optional URL/preview fields are only included when they are available.
// Best-effort: an audit failure must never fail the tool call, but it is
// reported with a warning (same style as the state-update failure) instead
// of being swallowed silently.
try {
  const audit = buildAuditLogger({ backend: "annotate_image", mode: "annotate_image" as any, requestId: undefined });
  if (currentLmChatId) audit.setChatId(currentLmChatId);
  audit.setUserRequest({ targets: rawTargets, task: taskArg, color: globalColor, lineWeight: globalLineWeight, frameAdjust: globalFrameAdjust } as any);
  audit.setOutput({
    images: resultEntries.map((r) => ({
      id: r.id,
      i: r.i,
      redraw: r.isRedraw,
      // Only the number of detections is audited, not the boxes themselves.
      detections: r.detObjects.length,
      path: r.savedPath,
      url: r.savedFileUrl,
      bytes: r.savedSize,
      ...(r.httpOriginal ? { http_url: r.httpOriginal } : {}),
      ...(r.preview ? { preview_path: r.preview.filePath, preview_url: r.preview.fileUrl } : {}),
      ...(r.httpPreview ? { http_preview_url: r.httpPreview } : {}),
    })),
  } as any);
  await audit.write();
} catch (e) {
  console.warn("[annotate_image] audit log failed:", String(e));
}
// ── Assemble result ───────────────────────────────────────────────
// Builds the tool response: optional inline preview images, an optional
// total-inference-time line, and a JSON summary of every annotated image.
reportToolStep(ctx, progressTotalSteps, progressTotalSteps, "Assembling annotation result...");
// One compact summary per result. inferenceTimeMs is only included when it
// is greater than zero.
const summaries = resultEntries.map((r) => ({
tool: "annotate_image",
source: r.id,
i: r.i,
redraw: r.isRedraw,
color: globalColor,
lineWeight: globalLineWeight,
frameAdjust: globalFrameAdjust,
...(r.inferenceTimeMs > 0 ? { inferenceTimeMs: r.inferenceTimeMs } : {}),
detections: r.detObjects.map((d) => ({
label: d.label,
bbox: { x1: d.bbox.x1, y1: d.bbox.y1, x2: d.bbox.x2, y2: d.bbox.y2 },
})),
}));
// PREVIEW_IN_CHAT: previews are shown when the variable is unset; when set,
// only "1" or a case-insensitive "true" enables them.
const envPreviewRaw = process.env["PREVIEW_IN_CHAT"];
const previewInChat =
envPreviewRaw === undefined
? true
: envPreviewRaw === "1" || envPreviewRaw.toLowerCase() === "true";
// Build target notations for hints (e.g. ["i9", "i10"])
const resultNotations = resultEntries.map((r) => `i${r.i}`);
const targetsJson = JSON.stringify(resultNotations);
// reviewHintFalse is attached to the JSON summary when previews are NOT shown
// in chat; reviewHintTrue is attached to each inline preview image item.
const reviewHintFalse =
`Carefully examine the preview to make absolutely sure that the object detection matches your intent. Registered as ${resultNotations.join(", ")}. Use show_image({"targets":${targetsJson}}) to show, review_image({"targets":${targetsJson}}) to review, or annotate_image({"targets":${targetsJson}}) to apply corrections.`;
const reviewHintTrue =
`Carefully examine the preview to make absolutely sure that the object detection matches your intent. This is an image file. Present the image to the user by using the markdown above. Registered as ${resultNotations.join(", ")}. Use review_image({"targets":${targetsJson}}) to review, or annotate_image({"targets":${targetsJson}}) to apply corrections.`;
const content: any[] = [];
for (const r of resultEntries) {
// NOTE(review): in the code visible here this value is only referenced by
// the commented-out lines below — confirm whether it is still needed.
const fallbackPreviewUrl = r.preview?.fileUrl || r.savedFileUrl;
// An image item is emitted only when previews are enabled AND preview
// generation succeeded for this result.
if (previewInChat && r.preview) {
const fname = String(r.preview.fileName || "");
content.push({
type: "image",
fileName: fname,
mimeType: r.preview.mimeType,
// NOTE(review): markdown is an empty string here although reviewHintTrue
// tells the model to use "the markdown above" — confirm this is intended.
markdown: ``,
$hint: reviewHintTrue,
} as any);
}
// TODO: Restore when LM Studio renders file/HTTP links again
// content.push({ type: "text", text: `Preview i${r.i}: ${r.httpPreview || fallbackPreviewUrl}` });
// content.push({ type: "text", text: `Original i${r.i}: ${r.httpOriginal || r.savedFileUrl}` });
}
// The total inference time is reported only when a detection batch exists
// and its total is greater than zero.
if (batchResult && batchResult.totalInferenceTimeMs > 0) {
content.push({ type: "text", text: `Total inference time: ${Math.round(batchResult.totalInferenceTimeMs)}ms` });
}
// A single result is serialized as a bare object, several as an array.
content.push({
type: "text",
text: JSON.stringify(summaries.length === 1 ? summaries[0] : summaries),
...(previewInChat ? {} : { $hint: reviewHintFalse }),
});
return { content };
} catch (error) {
return {
content: [{ type: "text", text: `annotate_image failed: ${(error as Error).message || String(error)}` }],
isError: true as const,
};
}
},
});
}