// Project Files
// src/tools/detect_object.ts
import { tool, type Tool, type ToolsProviderController, type ToolCallContext } from "@lmstudio/sdk";
import { z } from "zod";
import path from "path";
import fs from "fs";
import { pathToFileURL } from "url";
import {
syncAttachmentsToState,
getActiveChatContext,
resolveActiveLMStudioChatId,
getLMStudioWorkingDir,
readState,
writeStateAtomic,
generatePreviewFromBuffer,
appendImages,
getHealthyServerBaseUrl,
toHttpOriginalUrl,
toHttpPreviewUrl,
buildAuditLogger,
getSelfPluginIdentifier,
VARIANT_FULL_CONFIG,
} from "../core-bundle.mjs";
/**
 * Draws rectangular outlines for each bounding box onto the given image.
 *
 * Box coordinates are interpreted in `sourceDims` space (when provided) and
 * rescaled to the decoded image's own dimensions before drawing, so boxes
 * reported against a preview can be drawn on the full-resolution original.
 * Colors cycle through a fixed high-contrast palette; outlines are 2px deep.
 *
 * @param buffer - Encoded source image bytes (any format Jimp can decode).
 * @param bboxes - Boxes as [x1, y1, x2, y2] in `sourceDims` coordinate space.
 * @param sourceDims - Coordinate space the boxes were reported in; omit when
 *   boxes are already in the image's pixel space (scale factors become 1).
 * @returns PNG-encoded image bytes with the outlines applied.
 */
async function drawBboxesOnImage(
  buffer: Buffer,
  bboxes: [number, number, number, number][],
  sourceDims?: { width: number; height: number }
): Promise<Buffer> {
  // Resolve Jimp via require() so this works from both CJS and ESM bundles.
  const requireFn =
    typeof require !== "undefined" ? require : (await import("module")).createRequire(__filename);
  const jimpModule: any = requireFn("jimp");
  const JimpApi: any = jimpModule.Jimp ?? jimpModule.default ?? jimpModule;
  if (!JimpApi || typeof JimpApi.read !== "function") {
    throw new Error("drawBboxesOnImage: Jimp.read not available");
  }
  const image: any = await JimpApi.read(buffer);
  // Dimension accessors differ across Jimp releases: try the getter method,
  // then the plain property, then the raw bitmap as a last resort.
  const readDim = (method: string, prop: "width" | "height"): number => {
    if (typeof image[method] === "function") return image[method]();
    if (typeof image[prop] === "number") return image[prop];
    return image.bitmap?.[prop] || 0;
  };
  const imageWidth = readDim("getWidth", "width");
  const imageHeight = readDim("getHeight", "height");
  // Scale factors: convert bbox coords from sourceDims space to this image's
  // space. When sourceDims matches the image (or is omitted), both are 1.
  const scaleX = sourceDims && sourceDims.width > 0 ? imageWidth / sourceDims.width : 1;
  const scaleY = sourceDims && sourceDims.height > 0 ? imageHeight / sourceDims.height : 1;
  // Distinct RGBA colors, cycled per box index.
  const palette: [number, number, number, number][] = [
    [0xff, 0x3b, 0x30, 0xff],
    [0x34, 0xc7, 0x59, 0xff],
    [0x00, 0x7a, 0xff, 0xff],
    [0xff, 0x9f, 0x0a, 0xff],
    [0xbf, 0x5a, 0xf2, 0xff],
    [0xff, 0xd6, 0x0a, 0xff],
  ];
  const thickness = 2;
  const clampX = (v: number) => Math.max(0, Math.min(imageWidth - 1, Math.round(v)));
  const clampY = (v: number) => Math.max(0, Math.min(imageHeight - 1, Math.round(v)));
  bboxes.forEach((box, boxIndex) => {
    const [r, g, b, a] = palette[boxIndex % palette.length];
    // Jimp pixel colors are packed 32-bit RGBA integers.
    const color = (((r & 0xff) << 24) | ((g & 0xff) << 16) | ((b & 0xff) << 8) | (a & 0xff)) >>> 0;
    const left = clampX(box[0] * scaleX);
    const top = clampY(box[1] * scaleY);
    const right = clampX(box[2] * scaleX);
    const bottom = clampY(box[3] * scaleY);
    // Paint the outline `thickness` pixels deep, insetting toward the center.
    for (let t = 0; t < thickness; t++) {
      for (let x = left; x <= right; x++) {
        if (top + t < imageHeight) image.setPixelColor(color, x, top + t);
        if (bottom - t >= 0) image.setPixelColor(color, x, bottom - t);
      }
      for (let y = top; y <= bottom; y++) {
        if (left + t < imageWidth) image.setPixelColor(color, left + t, y);
        if (right - t >= 0) image.setPixelColor(color, right - t, y);
      }
    }
  });
  // Encode as PNG. Newer Jimp getBuffer variants return a promise; the oldest
  // API only offers callback-style getBuffer, handled in the fallback below.
  const encoded: any =
    typeof image.getBufferAsync === "function"
      ? image.getBufferAsync("image/png")
      : image.getBuffer("image/png");
  if (encoded && typeof encoded.then === "function") {
    return encoded as Promise<Buffer>;
  }
  // Legacy Jimp v0 callback style
  return new Promise<Buffer>((resolve, reject) =>
    image.getBuffer("image/png", (err: any, data: Buffer) =>
      err ? reject(err) : resolve(data)
    )
  );
}
import {
analyzeMlxDetectionBatch,
type MlxDetectionAnalyzerConfig,
type MlxAnalysisItem,
} from "../services/mlxVisionAnalyzer.js";
import { FLORENCE2_MODEL_PATH, DETECT_ENDPOINT } from "../config.js";
import { ensureDetectServerRunning } from "../detect-server-manager.js";
/**
 * Returns the current UTC time as a compact timestamp, e.g.
 * "20240102T030405678Z" (ISO-8601 with separators removed), suitable for
 * embedding in filenames.
 */
function isoStampCompact(): string {
  // toISOString() is always UTC with millisecond precision:
  // "2024-01-02T03:04:05.678Z" -> "20240102T030405678Z".
  return new Date().toISOString().replace(/[-:.]/g, "");
}
/**
 * Parses a canvas notation like "a1", "i3", "v2", or "p4" into its pool and
 * 1-based index. Input is trimmed and case-insensitive; indexes below 1 are
 * clamped to 1. Returns null for anything that does not match.
 */
function parsePrefixedNotation(
  s: string
): { pool: "attachment" | "image" | "variant" | "picture"; index: number } | null {
  const normalized = String(s || "").trim().toLowerCase();
  const match = /^([avip])(\d+)$/.exec(normalized);
  if (match === null) return null;
  // Single-letter prefix -> pool name.
  const poolByLetter = {
    a: "attachment",
    v: "variant",
    i: "image",
    p: "picture",
  } as const;
  return {
    pool: poolByLetter[match[1] as keyof typeof poolByLetter],
    index: Math.max(1, parseInt(match[2], 10)),
  };
}
/**
 * Builds a short metadata footer identifying this plugin for tool
 * descriptions.
 *
 * Prefers the manifest's owner/name pair, then the package name, and finally
 * a hard-coded identifier when neither file can be read from the cwd.
 */
function formatPluginMeta(): string {
  try {
    const readJson = (fileName: string): any =>
      JSON.parse(fs.readFileSync(path.join(process.cwd(), fileName), "utf-8"));
    const pkg = readJson("package.json");
    const mf = readJson("manifest.json");
    const id =
      mf?.owner && mf?.name ? `${mf.owner}/${mf.name}` : pkg?.name || "ceveyne/analyse-image";
    return `Plugin-Identifier: ${id}\nPlugin version: ${pkg?.version || ""}`;
  } catch {
    // Missing or unreadable metadata files: fall back to the static identifier.
    return "Plugin-Identifier: ceveyne/analyse-image";
  }
}
/**
 * Flexible targets schema: accepts array OR comma/space-separated string.
 * Analogous to analyse_image targets — supports single or batch detection.
 */
const FlexibleTargetsList = z
  .union([
    // String form: "a1, i3" / "a1 i3" is split on commas/whitespace into a
    // list of non-empty notations, e.g. ["a1", "i3"].
    z.string().transform((s) =>
      s
        .split(/[\s,]+/)
        .map((x) => x.trim())
        .filter((x) => x.length > 0)
    ),
    // Array form: already a list of notation strings.
    z.array(z.string()),
  ])
  // Both refinements see the normalized array (post-transform for strings).
  .refine((arr) => arr.length >= 1, "targets must contain at least one notation")
  .refine((arr) => arr.length <= 16, "targets must contain at most 16 notations");
// Zod parameter shape for detect_object. `satisfies` validates that every
// value is a Zod schema while keeping the literal key/value types intact.
const DetectObjectParamsShape = {
  // Optional: when omitted, the implementation auto-selects the single
  // available image (and errors if the choice is ambiguous).
  targets: FlexibleTargetsList.optional().describe(
    "One or more source image notations (e.g. ['a1','i3'] or 'a1, i3'). " +
      "Omit when there is exactly one image — it will be selected automatically."
  ),
  // Backend-specific detection instruction; defaults to Florence-2 generic OD.
  task: z
    .string()
    .optional()
    .default("<OD>")
    .describe("Florence-2 detection task token. Default: '<OD>' (object detection)."),
} satisfies Record<string, z.ZodTypeAny>;
/**
 * Builds the `detect_object` tool.
 *
 * For each requested source image (attachment aN, image iN, variant vN, or
 * picture pN) the implementation: resolves preview and full-resolution
 * buffers, runs the configured detection backend over the previews, draws
 * colored bounding boxes onto the original-resolution files, saves each
 * annotated result as a new image record (iN) with a JPEG preview, appends
 * the records to chat state, writes an audit entry, and returns previews
 * plus a JSON summary (labels, bboxes, crop percentages).
 *
 * @param ctl - Tools provider controller from the LM Studio SDK.
 * @returns The configured `detect_object` Tool instance.
 */
export function createDetectObjectTool(ctl: ToolsProviderController): Tool {
  return tool({
    name: "detect_object",
    description: `Detect objects in one or more images and draw colored bounding boxes on each result.
For each source image, returns a new annotated image (saved as iN) with bounding boxes for each detected object, plus a JSON summary with labels, coordinates, and crop percentages. The active backend is configured in plugin settings (Florence-2 or Qwen3-VL).
Parameters:
- targets: One or more image notations (e.g. ['a1','i3'] or 'a1, i3'). Accepts array or comma-separated string. Omit when there is exactly one image.
- task: What to detect.
- Florence-2 backend — use task tokens: '<OD>' (generic objects with labels), '<DENSE_REGION_CAPTION>' (richer captions per region), '<REGION_PROPOSAL>' (regions without labels), '<OPEN_VOCABULARY_DETECTION>dog' (specific concept, replace 'dog').
- Qwen3-VL backend — use natural language: e.g. 'Detect all faces and hands' or 'Find the cat and the laptop'. Omit for full-image general detection.
${formatPluginMeta()}`,
    parameters: DetectObjectParamsShape,
    implementation: async (args: any, ctx: ToolCallContext) => {
      try {
        // Parse targets: accept array or comma/space-separated string (analogous to analyse_image)
        let rawTargets: string[] = [];
        if (Array.isArray(args?.targets)) {
          rawTargets = args.targets.map((s: any) => String(s).trim()).filter(Boolean);
        } else if (typeof args?.targets === "string" && args.targets.trim()) {
          rawTargets = args.targets.trim().split(/[\s,]+/).map((s: string) => s.trim()).filter(Boolean);
        }
        // Empty/blank task falls back to Florence-2 generic object detection.
        const task =
          typeof args?.task === "string" && args.task.trim() ? args.task.trim() : "<OD>";
        console.log("[detect_object] invoked", { targets: rawTargets, task });
        // Resolve the active chat id and its working directory; all output
        // files (annotated images, previews, state) live in that directory.
        let currentLmChatId: string | null = null;
        let currentLmWorkingDir: string | null = null;
        try {
          const chatCtx = await getActiveChatContext();
          if ((chatCtx as any)?.chatId) currentLmChatId = (chatCtx as any).chatId;
          if ((chatCtx as any)?.workingDir) currentLmWorkingDir = (chatCtx as any).workingDir;
        } catch {}
        // Fallback resolver when the chat context did not yield a chat id.
        if (!currentLmChatId) {
          try {
            const resolved = await resolveActiveLMStudioChatId();
            if ((resolved as any)?.ok) currentLmChatId = (resolved as any).chatId;
          } catch {}
        }
        const primaryOutDir: string | undefined =
          currentLmWorkingDir ||
          (currentLmChatId ? getLMStudioWorkingDir(currentLmChatId) : undefined);
        if (!primaryOutDir) {
          console.error("[detect_object] could not resolve working directory");
          return {
            content: [
              {
                type: "text",
                text: "detect_object failed: could not resolve LM Studio chat working directory.",
              },
            ],
            isError: true as const,
          };
        }
        console.log("[detect_object] primaryOutDir:", primaryOutDir);
        await fs.promises.mkdir(primaryOutDir, { recursive: true }).catch(() => {});
        // Sync attachments so state is up to date
        console.log("[detect_object] syncing attachments...");
        try {
          await syncAttachmentsToState(primaryOutDir, false, Number.MAX_SAFE_INTEGER);
        } catch (e) {
          console.warn("[detect_object] attachment sync failed (non-fatal):", (e as any)?.message ?? e);
        }
        console.log("[detect_object] attachment sync done");
        console.log("[detect_object] reading state...");
        // Load the four source pools from chat state; each resolves
        // notations (aN/iN/vN/pN) to files in primaryOutDir.
        const st = await readState(primaryOutDir);
        const attachments: any[] = Array.isArray(st?.attachments) ? st.attachments : [];
        const pictures: any[] = Array.isArray(st?.pictures) ? st.pictures : [];
        const imageRecords: any[] = Array.isArray(st?.images) ? st.images : [];
        const images: Array<{ i: number; path: string }> = imageRecords
          .filter((r: any) => r && typeof r.filename === "string")
          .sort((a: any, b: any) => (a.i || 0) - (b.i || 0))
          .map((r: any) => ({ i: r.i || 1, path: path.join(primaryOutDir, r.filename) }));
        const variantRecords: any[] = Array.isArray(st?.variants) ? st.variants : [];
        const variants: Array<{ v: number; path: string }> = variantRecords
          .filter((v: any) => v && typeof v.filename === "string")
          .map((v: any) => ({ v: v.v || 1, path: path.join(primaryOutDir, v.filename) }));
        console.log("[detect_object] state:", {
          attachments: attachments.length,
          images: images.length,
          variants: variants.length,
          pictures: pictures.length,
        });
        // Resolve source buffers for each target (or auto-select if none given)
        // `buf` is the preview (sent to detection); `origBuf` the full-res file.
        type SourceEntry = { id: string; buf: Buffer; origBuf: Buffer };
        const sourceEntries: SourceEntry[] = [];
        // Reads the preview file for a canvas notation; throws if the
        // notation is invalid or the pool record has no preview.
        async function resolveOneBuf(rawCanvas: string): Promise<Buffer> {
          const pref = parsePrefixedNotation(rawCanvas);
          if (!pref) throw new Error(`Invalid canvas notation: ${rawCanvas}`);
          if (pref.pool === "attachment") {
            const rec = attachments.find((a: any) => a?.a === pref.index);
            const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
            if (!previewRel) throw new Error(`Preview for attachment a${pref.index} not found.`);
            return fs.promises.readFile(path.join(primaryOutDir!, previewRel));
          } else if (pref.pool === "image") {
            const rec = imageRecords.find((r: any) => r?.i === pref.index);
            const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
            if (!previewRel) throw new Error(`Preview for image i${pref.index} not found.`);
            return fs.promises.readFile(path.join(primaryOutDir!, previewRel));
          } else if (pref.pool === "variant") {
            const rec = variantRecords.find((v: any) => v?.v === pref.index);
            const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
            if (!previewRel) throw new Error(`Preview for variant v${pref.index} not found.`);
            return fs.promises.readFile(path.join(primaryOutDir!, previewRel));
          } else {
            const rec = pictures.find((p: any) => p?.p === pref.index);
            const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
            if (!previewRel) throw new Error(`Preview for picture p${pref.index} not found.`);
            return fs.promises.readFile(path.join(primaryOutDir!, previewRel));
          }
        }
        // Resolves the original (full-resolution) file for a canvas notation.
        // Falls back to previewFallback when the original is unavailable.
        async function resolveOriginalBuf(rawCanvas: string, previewFallback: Buffer): Promise<Buffer> {
          try {
            const pref = parsePrefixedNotation(rawCanvas);
            if (!pref) return previewFallback;
            if (pref.pool === "attachment") {
              // Attachments store an absolute origin path; other pools store
              // a filename relative to the working directory.
              const rec = attachments.find((a: any) => a?.a === pref.index);
              const originAbs = rec && typeof rec.originAbs === "string" ? rec.originAbs : "";
              if (!originAbs) return previewFallback;
              return await fs.promises.readFile(originAbs);
            }
            let rec: any;
            if (pref.pool === "image") rec = imageRecords.find((r: any) => r?.i === pref.index);
            else if (pref.pool === "variant") rec = variantRecords.find((v: any) => v?.v === pref.index);
            else rec = pictures.find((p: any) => p?.p === pref.index);
            const filename = rec && typeof rec.filename === "string" ? rec.filename : "";
            if (!filename) return previewFallback;
            return await fs.promises.readFile(path.join(primaryOutDir!, filename));
          } catch {
            return previewFallback;
          }
        }
        try {
          if (rawTargets.length > 0) {
            for (const t of rawTargets) {
              const buf = await resolveOneBuf(t);
              const origBuf = await resolveOriginalBuf(t, buf);
              sourceEntries.push({ id: t, buf, origBuf });
            }
          } else {
            // Auto-select single source
            const total = attachments.length + variantRecords.length + imageRecords.length + pictures.length;
            if (total === 0) throw new Error("No source image available.");
            if (total > 1) throw new Error("Ambiguous source — specify targets explicitly.");
            // Exactly one source exists; find it in pool-priority order.
            let buf: Buffer;
            let id: string;
            if (attachments.length === 1) {
              const rec = attachments[0];
              const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
              if (!previewRel) throw new Error("Preview for attachment not found.");
              buf = await fs.promises.readFile(path.join(primaryOutDir!, previewRel));
              id = `a${typeof rec.a === "number" ? rec.a : 1}`;
            } else if (variantRecords.length === 1) {
              const rec = variantRecords[0];
              const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
              if (!previewRel) throw new Error("Preview for variant not found.");
              buf = await fs.promises.readFile(path.join(primaryOutDir!, previewRel));
              id = `v${typeof rec.v === "number" ? rec.v : 1}`;
            } else if (imageRecords.length === 1) {
              const rec = imageRecords[0];
              const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
              if (!previewRel) throw new Error("Preview for image not found.");
              buf = await fs.promises.readFile(path.join(primaryOutDir!, previewRel));
              id = `i${typeof rec.i === "number" ? rec.i : 1}`;
            } else {
              const rec = pictures[0];
              const previewRel = rec && typeof rec.preview === "string" ? rec.preview : "";
              if (!previewRel) throw new Error("Preview for picture not found.");
              buf = await fs.promises.readFile(path.join(primaryOutDir!, previewRel));
              id = `p${rec.p ?? 1}`;
            }
            const origBuf = await resolveOriginalBuf(id, buf);
            sourceEntries.push({ id, buf, origBuf });
          }
        } catch (e) {
          return {
            content: [{ type: "text", text: String((e as any)?.message || e) }],
            isError: true as const,
          };
        }
        console.log("[detect_object] sources resolved:", sourceEntries.map((s) => s.id));
        // Start managed server unless TTL=0 (external server mode)
        const serverTTL = parseInt(process.env.SERVER_TTL ?? "1440", 10);
        if (serverTTL !== 0) {
          try {
            await ensureDetectServerRunning(
              {
                port: parseInt(process.env.MLX_VISION_PORT ?? "8765", 10),
                mlxVisionModelPath: process.env.MLX_VISION_MODEL_PATH ?? "",
                mlxVisionEnabled: process.env.MLX_VISION_ENABLED !== "false",
                florence2ModelPath: process.env.FLORENCE2_MODEL_PATH || process.env.DETECT_MODEL_PATH || "",
                backend: process.env.FASTVLM_BACKEND || "mlx",
                maxTokens: process.env.MLX_VISION_MAX_TOKENS ? parseInt(process.env.MLX_VISION_MAX_TOKENS, 10) : undefined,
                temperature: process.env.MLX_VISION_TEMPERATURE ? parseFloat(process.env.MLX_VISION_TEMPERATURE) : undefined,
                detectBackend: process.env.FASTVLM_DETECT_BACKEND || "florence2",
                qwen3VlModelPath: process.env.FASTVLM_QWEN3_VL_MODEL_PATH || "",
              },
              // Progress messages are surfaced as tool status; failures to
              // report status are ignored.
              (msg) => { try { ctx.status(msg); } catch {} }
            );
          } catch (e) {
            throw new Error(`Failed to start detection server: ${(e as Error).message || String(e)}`);
          }
        }
        try {
          ctx.status(`Detecting objects in ${sourceEntries.length} image${sourceEntries.length === 1 ? "" : "s"}…`);
        } catch {}
        // Write source buffers to temp files and build detection items
        const detectPort = parseInt(process.env.MLX_VISION_PORT ?? "8765", 10);
        const detectionConfig: MlxDetectionAnalyzerConfig = {
          endpoint: process.env.DETECT_ENDPOINT || DETECT_ENDPOINT || `http://localhost:${detectPort}/detect`,
          task,
          florence2ModelPath: process.env.FLORENCE2_MODEL_PATH || FLORENCE2_MODEL_PATH,
          timeoutMs: 120_000,
        };
        // The detection service consumes file paths, so previews are staged
        // as temp PNGs in the working dir and removed in the finally below.
        const tmpPaths: string[] = [];
        const detectionItems: MlxAnalysisItem[] = [];
        for (const entry of sourceEntries) {
          const tmpPath = path.join(primaryOutDir, `_tmp_detect_src_${entry.id}_${Date.now()}.png`);
          await fs.promises.writeFile(tmpPath, entry.buf);
          tmpPaths.push(tmpPath);
          detectionItems.push({ id: entry.id, filePath: tmpPath });
        }
        console.log("[detect_object] calling detection API for", detectionItems.length, "items");
        let batchResult;
        try {
          batchResult = await analyzeMlxDetectionBatch(detectionItems, detectionConfig);
          console.log("[detect_object] detection API returned:", {
            results: batchResult.results.length,
            totalMs: batchResult.totalInferenceTimeMs,
          });
          try {
            const totalObjects = batchResult.results.reduce((s, r) => s + (r.objects?.length ?? 0), 0);
            const ms = Math.round(batchResult.totalInferenceTimeMs);
            ctx.status(`${totalObjects} object${totalObjects === 1 ? "" : "s"} found across ${batchResult.results.length} image${batchResult.results.length === 1 ? "" : "s"} (${ms}ms) — drawing bounding boxes…`);
          } catch {}
        } finally {
          // Best-effort cleanup of the staged temp files.
          for (const tp of tmpPaths) await fs.promises.unlink(tp).catch(() => {});
        }
        if (!batchResult.results.length) {
          return {
            content: [{ type: "text", text: "detect_object: no results returned from Florence-2 API." }],
            isError: true as const,
          };
        }
        // Per-image: draw bboxes, save, generate preview, build state records
        const variantPreviewSpec = (VARIANT_FULL_CONFIG as any).preview;
        const stamp = isoStampCompact();
        // NOTE(review): nextI comes from the state read done before detection;
        // assumes no concurrent writer bumps nextImageI meanwhile — verify.
        let nextI = Math.max(1, st.counters?.nextImageI ?? 1);
        const imageRecordsForState: any[] = [];
        const resultEntries: Array<{
          id: string;
          i: number;
          detResult: typeof batchResult.results[0];
          savedPath: string;
          savedFileUrl: string;
          savedSize: number;
          preview: any | null;
          httpOriginal: string;
          httpPreview: string;
        }> = [];
        // Optional HTTP base for serving results; empty URLs when unavailable.
        const httpBase = await getHealthyServerBaseUrl();
        for (let idx = 0; idx < batchResult.results.length; idx++) {
          const detResult = batchResult.results[idx];
          const sourceId = sourceEntries[idx]?.id ?? `canvas${idx + 1}`;
          const origBuf = sourceEntries[idx].origBuf;
          const currentI = nextI++;
          const bboxes = detResult.objects.map((o) => o.bbox as [number, number, number, number]);
          console.log(`[detect_object] drawing ${bboxes.length} bboxes for ${sourceId}...`);
          // Draw on the original-resolution file; sourceDims carries the preview space in which
          // the detection server reported its bbox coordinates so they get scaled up correctly.
          const annotatedBuf = await drawBboxesOnImage(origBuf, bboxes, {
            width: detResult.imageWidth,
            height: detResult.imageHeight,
          });
          const baseName = `image-${stamp}-i${currentI}`;
          const savedPath = path.join(primaryOutDir, `${baseName}.png`);
          await fs.promises.writeFile(savedPath, annotatedBuf);
          const savedFileUrl = pathToFileURL(savedPath).toString();
          const savedSize = annotatedBuf.length;
          console.log(`[detect_object] annotated image written: ${savedPath} (${savedSize} bytes)`);
          // Preview generation is non-fatal; `preview` stays null on failure.
          let preview: any = null;
          try {
            const p = await generatePreviewFromBuffer(annotatedBuf, primaryOutDir, `${baseName}.png`, variantPreviewSpec);
            preview = {
              ok: true as const,
              filePath: p.previewAbs,
              fileName: p.previewFilename,
              fileUrl: pathToFileURL(p.previewAbs).toString(),
              size_bytes: p.data.length,
              width: p.width,
              height: p.height,
              mimeType: "image/jpeg" as const,
              dataBase64: p.data.toString("base64"),
            };
          } catch (e) {
            console.warn(`[detect_object] preview generation failed for ${sourceId}:`, String(e));
          }
          const httpOriginal = httpBase
            ? toHttpOriginalUrl(`${baseName}.png`, httpBase, currentLmChatId || undefined)
            : "";
          const httpPreview = (() => {
            if (!httpBase || !currentLmChatId || !preview?.fileName) return "";
            return toHttpPreviewUrl(preview.fileName, httpBase, currentLmChatId);
          })();
          // Record persisted to chat state; `detections` keeps both raw bbox
          // coordinates and crop percentages per detected object.
          imageRecordsForState.push({
            filename: `${baseName}.png`,
            preview: preview ? `preview-${baseName}.jpg` : undefined,
            i: currentI,
            sourceTool: `${getSelfPluginIdentifier()}/detect_object`,
            detectSource: sourceId,
            task,
            imageWidth: detResult.imageWidth,
            imageHeight: detResult.imageHeight,
            detections: detResult.objects.map((o) => ({
              label: o.label,
              bbox: { x1: o.bbox[0], y1: o.bbox[1], x2: o.bbox[2], y2: o.bbox[3] },
              crop: {
                cropLeft: o.cropLeft,
                cropRight: o.cropRight,
                cropTop: o.cropTop,
                cropBottom: o.cropBottom,
              },
            })),
          });
          resultEntries.push({ id: sourceId, i: currentI, detResult, savedPath, savedFileUrl, savedSize, preview, httpOriginal, httpPreview });
        }
        // Update state once for all images
        console.log("[detect_object] updating state...");
        try {
          // Re-read state just before writing to pick up concurrent changes.
          const stateForUpdate = await readState(primaryOutDir);
          const appendResult = appendImages(stateForUpdate, imageRecordsForState);
          if (appendResult.changed) {
            await writeStateAtomic(primaryOutDir, stateForUpdate);
            console.log("[detect_object] state written, nextImageI:", stateForUpdate.counters?.nextImageI);
          }
        } catch (e) {
          console.warn("[detect_object] state update failed:", String(e));
        }
        // Audit log
        try {
          const audit = buildAuditLogger({ backend: "detect_object", mode: "detect_object" as any, requestId: undefined });
          if (currentLmChatId) audit.setChatId(currentLmChatId);
          audit.setUserRequest({ targets: rawTargets, task });
          audit.setOutput({
            images: resultEntries.map((r) => ({
              id: r.id,
              i: r.i,
              detections: r.detResult.objects.length,
              path: r.savedPath,
              url: r.savedFileUrl,
              bytes: r.savedSize,
              ...(r.httpOriginal ? { http_url: r.httpOriginal } : {}),
              ...(r.preview ? { preview_path: r.preview.filePath, preview_url: r.preview.fileUrl } : {}),
              ...(r.httpPreview ? { http_preview_url: r.httpPreview } : {}),
            })),
          });
          await audit.write();
        } catch (e) {
          console.warn("[detect_object] audit logging failed:", String(e));
        }
        // Assemble tool result
        // PREVIEW_IN_CHAT defaults to true; only "1"/"true" enable it when set.
        const envPreviewRaw = process.env["PREVIEW_IN_CHAT"];
        const previewInChat =
          envPreviewRaw === undefined
            ? true
            : envPreviewRaw === "1" || envPreviewRaw.toLowerCase() === "true";
        const summaries = resultEntries.map((r) => ({
          tool: "detect_object",
          source: r.id,
          i: r.i,
          imageWidth: r.detResult.imageWidth,
          imageHeight: r.detResult.imageHeight,
          task,
          inferenceTimeMs: r.detResult.inferenceTimeMs,
          detections: r.detResult.objects.map((o) => ({
            label: o.label,
            bbox: { x1: o.bbox[0], y1: o.bbox[1], x2: o.bbox[2], y2: o.bbox[3] },
            // Crop values are percentages; px values are derived from the
            // detection image dimensions.
            crop: {
              left: { pct: o.cropLeft, px: Math.round((o.cropLeft / 100) * r.detResult.imageWidth) },
              right: { pct: o.cropRight, px: Math.round((o.cropRight / 100) * r.detResult.imageWidth) },
              top: { pct: o.cropTop, px: Math.round((o.cropTop / 100) * r.detResult.imageHeight) },
              bottom: { pct: o.cropBottom, px: Math.round((o.cropBottom / 100) * r.detResult.imageHeight) },
            },
            crop_tool_hint: "Pass crop.left.pct as cropLeft, crop.right.pct as cropRight, crop.top.pct as cropTop, crop.bottom.pct as cropBottom to the crop tool.",
          })),
        }));
        const reviewHint = "Carefully examine the preview and comment on how well the object detection matches your intent.";
        const content: any[] = [];
        for (const r of resultEntries) {
          const fallbackPreviewUrl = r.preview?.fileUrl || r.savedFileUrl;
          const previewLine = `Preview i${r.i}: ${r.httpPreview ? r.httpPreview : fallbackPreviewUrl}`;
          const originalLine = `Original i${r.i}: ${r.httpOriginal ? r.httpOriginal : r.savedFileUrl}`;
          if (previewInChat && r.preview) {
            const fname = String(r.preview.fileName || "");
            content.push({
              type: "image",
              fileName: fname,
              mimeType: r.preview.mimeType,
              // NOTE(review): markdown is an empty template literal here even
              // though $hint says "use the markdown above" — possibly stripped
              // content; verify against the original source.
              markdown: ``,
              $hint: "This is an image file. Present the image to the user by using the markdown above.",
            } as any);
          }
          content.push({ type: "text", text: previewLine });
          content.push({ type: "text", text: originalLine });
        }
        const totalMs = Math.round(batchResult.totalInferenceTimeMs);
        if (totalMs > 0) {
          content.push({ type: "text", text: `Total inference time: ${totalMs}ms` });
        }
        content.push({
          type: "text",
          text: JSON.stringify(summaries.length === 1 ? summaries[0] : summaries),
          $hint: reviewHint,
        });
        return { content };
      } catch (error) {
        // Top-level catch: any unexpected failure becomes an error result.
        return {
          content: [
            {
              type: "text",
              text: `detect_object failed: ${(error as Error).message || String(error)}`,
            },
          ],
          isError: true as const,
        };
      }
    },
  });
}