Project Files
src / services / lmStudioVisionAnalyzer.ts
import fs from "fs";
import path from "path";
import { getLogsDir } from "../core-bundle.mjs";
/** One image to be sent to the vision model. */
export interface VisionAnalysisItem {
/** Caller-chosen identifier; it is copied onto the result entry produced for this image. */
id: string;
/** Path of the image file; the file is read from disk and sent inline as a base64 data URL. */
filePath: string;
}
/** Text answer produced by the vision model for a single image. */
export interface VisionAnalysisResult {
/** Identifier of the input item this result belongs to. */
id: string;
/** Message text extracted from the chat response (trimmed; may be empty). */
text: string;
/** Wall-clock duration of the chat request for this image, in milliseconds. */
inferenceTimeMs: number;
}
/** Outcome of analysing a list of images one after another. */
export interface VisionAnalysisBatchResult {
/** One entry per input item, in input order. */
results: VisionAnalysisResult[];
/** Sum of the per-image request durations, in milliseconds. */
totalInferenceTimeMs: number;
/** Name of the backend that produced the results; the functions in this file set "vision-api". */
backend: string;
}
/** A single object reported by the detection prompt. */
export interface VisionDetectionObject {
/** Label text reported by the model (an empty string when the model gave none). */
label: string;
/** Bounding box as [x1, y1, x2, y2] in image pixels, scaled from the model's 0-1000 normalized coordinates. */
bbox: [number, number, number, number];
/** Distance from the left image edge to x1, as a percentage of the image width. */
cropLeft: number;
/** Distance from x2 to the right image edge, as a percentage of the image width. */
cropRight: number;
/** Distance from the top image edge to y1, as a percentage of the image height. */
cropTop: number;
/** Distance from y2 to the bottom image edge, as a percentage of the image height. */
cropBottom: number;
}
/** Detection outcome for a single image. */
export interface VisionDetectionResult {
/** Identifier of the input item this result belongs to. */
id: string;
/** Objects parsed from the model's answer; empty when nothing usable was returned. */
objects: VisionDetectionObject[];
/** Image width in pixels, as read from the image file header. */
imageWidth: number;
/** Image height in pixels, as read from the image file header. */
imageHeight: number;
/** Wall-clock duration of the chat request for this image, in milliseconds. */
inferenceTimeMs: number;
}
/** Outcome of running detection over a list of images one after another. */
export interface VisionDetectionBatchResult {
/** One entry per input item, in input order. */
results: VisionDetectionResult[];
/** Sum of the per-image request durations, in milliseconds. */
totalInferenceTimeMs: number;
/** Name of the backend that produced the results; the functions in this file set "vision-api". */
backend: string;
}
/** Options controlling the object-detection prompt and request. */
export interface VisionDetectionAnalyzerConfig {
/** Label to detect. When non-empty, the prompt asks for all instances of it and odPrompt is not used. */
task?: string;
/** Free-form detection instruction, used when task is empty. Prompt building throws when both are empty. */
odPrompt?: string;
/** Max output tokens; only sent when it is a positive finite number (floored to an integer). */
maxTokens?: number;
/** Sampling temperature; only sent when it is a finite number. */
temperature?: number;
/** Per-request timeout in ms; detectLmStudioVisionBatch falls back to 120000 when omitted. */
timeoutMs?: number;
}
/** Options for describing images with the vision model. */
export interface LmStudioVisionAnalyzerConfig {
/**
 * Vision API root/base URL, e.g. http://127.0.0.1:1234/v1.
 * A trailing /v1 or /api/v1 is stripped before the /api/v1/... endpoint paths are appended.
 */
baseUrl: string;
/** Optional bearer token for the Vision API; sent as an Authorization header when non-blank. */
apiKey?: string;
/** Vision API model key; "vision-capability-priming" is used when omitted or empty. */
model?: string;
/** Prompt sent to the vision model for each image; "Describe the image." is used when omitted or empty. */
prompt?: string;
/** Max output tokens for response; only sent when it is a positive finite number. */
maxTokens?: number;
/** Sampling temperature; only sent when it is a finite number. */
temperature?: number;
/** Per-request timeout in ms; 180000 is used when omitted. */
timeoutMs?: number;
}
/** Options for making sure a vision model instance is loaded before use. */
export interface LmStudioVisionEnsureConfig {
/** Vision API root/base URL; a trailing /v1 or /api/v1 is stripped before use. */
baseUrl: string;
/** Optional bearer token for the Vision API. */
apiKey?: string;
/** Key of the model to load; matched case-insensitively against each model's key or id from /api/v1/models. */
modelKey: string;
/** Optional progress callback, called with a "Loading <modelKey>..." message before a load is requested; errors it throws are ignored. */
status?: (message: string) => void;
}
/** Matches the first Markdown code fence (optionally tagged "json", case-insensitive) and captures its contents. */
const JSON_FENCE_RE = /```(?:json)?\s*([\s\S]*?)```/i;
/**
 * Matches one `{"bbox_2d": [a, b, c, d], "label": "..."}` object whose coordinates are
 * digit-only integers and whose keys appear in that order. Global, so every occurrence
 * in a text can be iterated; captures the four coordinates and the label.
 */
const ITEM_RE = /\{\s*"bbox_2d":\s*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\],\s*"label":\s*"([^"]+)"\s*\}/gi;
/** Output-format instruction appended to every detection prompt. */
const JSON_FORMAT =
" Output JSON only — a JSON array where each element has" +
" 'bbox_2d' ([x1, y1, x2, y2] as integers normalized 0–1000) and 'label' (a string)." +
" No prose, no markdown, no explanation.";
/** Label-style instruction inserted into prompts that are built from a free-form odPrompt. */
const LABEL_FORMAT_RULE =
"\n\nLABEL FORMAT RULE (mandatory):" +
"\n- Labels must be concise and specific: 2–4 words maximum." +
"\n- Examples: 'plugin list', 'plugin name', 'human face', 'left hand', 'red car', 'fluffy owl toy'";
/**
 * Reduces a configured Vision API base URL to the bare server root so that
 * `/api/v1/...` endpoint paths can be appended to it.
 *
 * Surrounding whitespace, trailing slashes and one trailing `/v1` or `/api/v1`
 * segment (case-insensitive) are removed.
 *
 * @param baseUrl Configured base URL; a falsy value is treated as an empty string.
 * @returns The normalized root, or an empty string when nothing is left.
 */
function normalizeLmApiRoot(baseUrl: string): string {
  return String(baseUrl || "")
    .trim()
    // Strip trailing slashes first: otherwise ".../v1//" keeps its "/v1" suffix
    // and callers would build ".../v1/api/v1/..." endpoints.
    .replace(/\/+$/, "")
    .replace(/\/(api\/v1|v1)$/i, "")
    // Removing the suffix can expose further slashes (e.g. "http://host//v1").
    .replace(/\/+$/, "");
}
/**
 * Builds the HTTP headers for a Vision API request.
 *
 * @param apiKey Optional bearer token; ignored when missing or blank.
 * @param contentType When true, a JSON `Content-Type` header is included.
 * @returns A fresh header map (possibly empty).
 */
function authHeaders(apiKey: string | undefined, contentType = false): Record<string, string> {
  const pairs: Array<[string, string]> = [];
  if (contentType) {
    pairs.push(["Content-Type", "application/json"]);
  }
  const token = apiKey?.trim();
  if (token) {
    pairs.push(["Authorization", `Bearer ${token}`]);
  }
  return Object.fromEntries(pairs);
}
/**
 * Logs the metadata of an outgoing chat request to the console and, on a
 * best-effort basis, appends the same line (timestamped) to
 * `user-docs-plugin.log` inside the logs directory.
 *
 * @param metadata Request details to serialize as JSON into the log line.
 */
function logVisionRequestMetadata(metadata: Record<string, unknown>): void {
  const message = "[LmStudioVisionAnalyzer] /api/v1/chat request " + JSON.stringify(metadata);
  console.info(message);

  try {
    const directory = getLogsDir();
    const directoryExists = fs.existsSync(directory);
    if (!directoryExists) {
      fs.mkdirSync(directory, { recursive: true });
    }
    const logFile = path.join(directory, "user-docs-plugin.log");
    const entry = `${new Date().toISOString()} - ${message}\n`;
    fs.appendFileSync(logFile, entry, "utf8");
  } catch {
    // File logging is best-effort: a failure here must never break the request.
  }
}
/**
 * Tells whether a model entry reports at least one loaded instance.
 *
 * @param modelInfo Model entry; only its `loaded_instances` property is inspected.
 * @returns True when `loaded_instances` is a non-empty array.
 */
function hasLoadedInstances(modelInfo: any): boolean {
  const instances = modelInfo?.loaded_instances;
  if (!Array.isArray(instances)) {
    return false;
  }
  return instances.length > 0;
}
/**
 * Queries `/api/v1/models` and reports whether the requested model currently
 * has a loaded instance.
 *
 * Any failure (empty base URL, non-OK response, network error, 5 second
 * timeout, model not listed) is reported as `{ loaded: false }`.
 *
 * @param baseUrl Configured Vision API base URL.
 * @param apiKey Optional bearer token.
 * @param modelKey Model key to look for; compared case-insensitively after trimming.
 * @returns The loaded flag and, when the model is listed, the key/id it is listed under.
 */
async function getVisionModelState(
  baseUrl: string,
  apiKey: string | undefined,
  modelKey: string
): Promise<{ loaded: boolean; modelKey?: string }> {
  const notLoaded = (): { loaded: boolean; modelKey?: string } => ({ loaded: false });
  // A model entry is identified by its `key`, falling back to `id`.
  const keyOf = (entry: any): string => String(entry?.key || entry?.id || "").trim();

  const root = normalizeLmApiRoot(baseUrl);
  if (!root) {
    return notLoaded();
  }
  const wantedKey = modelKey.trim().toLowerCase();

  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), 5000);
  try {
    const reply = await fetch(`${root}/api/v1/models`, {
      headers: authHeaders(apiKey),
      signal: aborter.signal,
    });
    if (!reply.ok) {
      return notLoaded();
    }

    // The list may be the body itself, or live under `models` or `data`.
    const body = (await reply.json()) as any;
    let listed: any[] = [];
    if (Array.isArray(body)) {
      listed = body;
    } else if (Array.isArray(body?.models)) {
      listed = body.models;
    } else if (Array.isArray(body?.data)) {
      listed = body.data;
    }

    const found = listed.find((entry: any) => keyOf(entry).toLowerCase() === wantedKey);
    if (!found) {
      return notLoaded();
    }
    return {
      loaded: hasLoadedInstances(found),
      modelKey: keyOf(found) || undefined,
    };
  } catch {
    return notLoaded();
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Asks the Vision API to load a model via `POST /api/v1/models/load`.
 *
 * The request is aborted after 10 minutes. Failures are returned, not thrown.
 * A response counts as failed when the HTTP status is not OK or when the body
 * carries an `error.message`, `error` or `message` value.
 *
 * @param baseUrl Configured Vision API base URL.
 * @param apiKey Optional bearer token.
 * @param modelKey Key of the model to load.
 * @returns `{ ok: true }` on success, otherwise `{ ok: false, error }` with a readable message.
 */
async function loadVisionInstanceViaApi(
  baseUrl: string,
  apiKey: string | undefined,
  modelKey: string
): Promise<{ ok: true } | { ok: false; error: string }> {
  const apiRoot = normalizeLmApiRoot(baseUrl);
  if (!apiRoot) {
    return { ok: false, error: "Vision API base URL is empty." };
  }

  const loadTimeoutMs = 600_000;

  // Single place that builds the user-facing failure message.
  const fail = (detail: string): { ok: false; error: string } => ({
    ok: false,
    error: `Vision API could not load '${modelKey}' via /api/v1/models/load. This can happen when there are not enough system resources available. Error: ${detail}`,
  });

  // Error payloads are not guaranteed to be strings; serialize objects so the
  // message never degrades to "[object Object]".
  const describe = (value: unknown): string => {
    if (typeof value === "string") return value;
    try {
      const encoded: string | undefined = JSON.stringify(value);
      return encoded === undefined ? String(value) : encoded;
    } catch {
      return String(value);
    }
  };

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), loadTimeoutMs);
  try {
    const response = await fetch(`${apiRoot}/api/v1/models/load`, {
      method: "POST",
      headers: authHeaders(apiKey, true),
      body: JSON.stringify({
        model: modelKey,
        echo_load_config: true,
      }),
      signal: controller.signal,
    });

    const text = await response.text().catch(() => "");
    let data: any = null;
    if (text.trim()) {
      try {
        data = JSON.parse(text);
      } catch {
        // Non-JSON body: keep it so it can still be reported as detail.
        data = { raw: text };
      }
    }

    const apiError = data?.error?.message || data?.error || data?.message;
    if (!response.ok || apiError) {
      const detail = apiError
        ? describe(apiError)
        : text || `${response.status} ${response.statusText}`;
      return fail(detail);
    }
    return { ok: true };
  } catch (error: any) {
    const detail = error?.name === "AbortError"
      ? `request timed out after ${loadTimeoutMs} ms`
      : error?.message || String(error);
    return fail(detail);
  } finally {
    clearTimeout(timeout);
  }
}
/**
 * Makes sure the configured vision model has a loaded instance, loading it
 * through the API when necessary.
 *
 * @param config Connection settings, model key and optional status callback.
 * @returns `{ ok: true, loaded }` where `loaded` is true only when this call
 *   triggered the load (false when the model was already loaded), or
 *   `{ ok: false, error }` when the key is empty, the load failed, or the model
 *   is not reported as loaded afterwards.
 */
export async function ensureLmStudioVisionInstanceReady(
  config: LmStudioVisionEnsureConfig
): Promise<{ ok: true; loaded: boolean } | { ok: false; error: string }> {
  const requestedKey = String(config.modelKey || "").trim();
  if (requestedKey === "") {
    return {
      ok: false,
      error: "Vision API mode is active, but Qwen3-VL model key is empty.",
    };
  }

  const before = await getVisionModelState(config.baseUrl, config.apiKey, requestedKey);
  if (before.loaded) {
    // Already available: nothing was loaded by this call.
    return { ok: true, loaded: false };
  }

  try {
    config.status?.(`Loading ${requestedKey}...`);
  } catch {
    // A failing status callback must not prevent the load.
  }

  const outcome = await loadVisionInstanceViaApi(config.baseUrl, config.apiKey, requestedKey);
  if (!outcome.ok) {
    return outcome;
  }

  // Re-query the model list to confirm the load actually took effect.
  const after = await getVisionModelState(config.baseUrl, config.apiKey, requestedKey);
  if (!after.loaded) {
    return {
      ok: false,
      error: `Vision API loaded '${requestedKey}' via /api/v1/models/load, but /api/v1/models did not report it as loaded.`,
    };
  }

  const reportedKey = after.modelKey?.trim().toLowerCase();
  if (reportedKey !== requestedKey.toLowerCase()) {
    return {
      ok: false,
      error: `Vision API loaded a model, but /api/v1/models reports '${after.modelKey || "unknown model"}' instead of '${requestedKey}'.`,
    };
  }

  return { ok: true, loaded: true };
}
/**
 * Picks the MIME type for an image from its file extension (case-insensitive).
 *
 * @param filePath Path or file name of the image.
 * @returns `image/jpeg`, `image/webp` or `image/gif` for the matching
 *   extensions; `image/png` for everything else.
 */
function mimeFromPath(filePath: string): string {
  switch (path.extname(filePath).toLowerCase()) {
    case ".jpg":
    case ".jpeg":
      return "image/jpeg";
    case ".webp":
      return "image/webp";
    case ".gif":
      return "image/gif";
    default:
      return "image/png";
  }
}
/**
 * Reads three bytes starting at `offset` as a little-endian unsigned integer.
 *
 * @param buffer Source bytes.
 * @param offset Index of the least significant byte.
 * @returns The combined 24-bit value.
 */
function readUInt24LE(buffer: Buffer, offset: number): number {
  const low = buffer[offset];
  const middle = buffer[offset + 1] << 8;
  const high = buffer[offset + 2] << 16;
  return low | middle | high;
}
/**
 * Reads the pixel dimensions from a PNG header.
 *
 * The data must start with the full 8-byte PNG signature and its first chunk
 * must be `IHDR`; width and height are the two big-endian 32-bit integers at
 * the start of that chunk's data. Checking only the letters "PNG" would let
 * unrelated or non-standard data through and return garbage dimensions.
 *
 * @param buffer Leading bytes of the file (at least 24 are needed).
 * @returns The dimensions, or null when the data is not a PNG with a leading IHDR chunk.
 */
function readPngDimensions(buffer: Buffer): { width: number; height: number } | null {
  if (buffer.length < 24) return null;

  const signature = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a];
  if (!signature.every((byte, index) => buffer[index] === byte)) return null;

  // Bytes 8-11 hold the chunk length, 12-15 the chunk type.
  if (buffer.toString("ascii", 12, 16) !== "IHDR") return null;

  return {
    width: buffer.readUInt32BE(16),
    height: buffer.readUInt32BE(20),
  };
}
/**
 * Reads the pixel dimensions from a GIF header.
 *
 * @param buffer Leading bytes of the file (at least 10 are needed).
 * @returns Width and height taken from the little-endian 16-bit fields at
 *   offsets 6 and 8, or null when the data does not start with `GIF87a`/`GIF89a`.
 */
function readGifDimensions(buffer: Buffer): { width: number; height: number } | null {
  const looksLikeGif =
    buffer.length >= 10 && ["GIF87a", "GIF89a"].includes(buffer.toString("ascii", 0, 6));
  if (!looksLikeGif) {
    return null;
  }
  const width = buffer.readUInt16LE(6);
  const height = buffer.readUInt16LE(8);
  return { width, height };
}
/**
 * Reads the pixel dimensions from a WebP (RIFF) header.
 *
 * Handles the three first-chunk types `VP8X`, `VP8L` and `VP8 `; a `VP8L`
 * chunk is only accepted when byte 20 is 0x2f.
 *
 * @param buffer Leading bytes of the file (at least 30 are needed).
 * @returns The dimensions, or null when the data is not a recognised WebP.
 */
function readWebpDimensions(buffer: Buffer): { width: number; height: number } | null {
  if (buffer.length < 30) return null;

  const isRiffWebp =
    buffer.toString("ascii", 0, 4) === "RIFF" && buffer.toString("ascii", 8, 12) === "WEBP";
  if (!isRiffWebp) return null;

  switch (buffer.toString("ascii", 12, 16)) {
    case "VP8X":
      // Two 24-bit little-endian fields, each stored minus one.
      return {
        width: readUInt24LE(buffer, 24) + 1,
        height: readUInt24LE(buffer, 27) + 1,
      };
    case "VP8L": {
      if (buffer[20] !== 0x2f) return null;
      // Two 14-bit fields packed into one 32-bit word, each stored minus one.
      const packed = buffer.readUInt32LE(21);
      return {
        width: (packed & 0x3fff) + 1,
        height: ((packed >> 14) & 0x3fff) + 1,
      };
    }
    case "VP8 ":
      // Lower 14 bits of the 16-bit fields at offsets 26 and 28.
      return {
        width: buffer.readUInt16LE(26) & 0x3fff,
        height: buffer.readUInt16LE(28) & 0x3fff,
      };
    default:
      return null;
  }
}
/**
 * Scans JPEG segments for the first frame header and returns its dimensions.
 *
 * @param buffer Bytes of the file; must begin with 0xFF 0xD8.
 * @returns Width and height from the frame header, or null when the data is
 *   not a JPEG, a 0xD9/0xDA marker is reached first, or a segment is malformed.
 */
function readJpegDimensions(buffer: Buffer): { width: number; height: number } | null {
  const startsWithSoi = buffer.length >= 4 && buffer[0] === 0xff && buffer[1] === 0xd8;
  if (!startsWithSoi) return null;

  // Frame-header markers: 0xC0-0xCF except 0xC4, 0xC8 and 0xCC.
  const isFrameHeader = (code: number): boolean =>
    code >= 0xc0 && code <= 0xcf && code !== 0xc4 && code !== 0xc8 && code !== 0xcc;

  let cursor = 2;
  while (cursor + 9 < buffer.length) {
    // Resynchronize on the next 0xFF byte.
    if (buffer[cursor] !== 0xff) {
      cursor += 1;
      continue;
    }
    // Skip any run of 0xFF bytes; the marker code is the first other byte.
    while (cursor < buffer.length && buffer[cursor] === 0xff) {
      cursor += 1;
    }
    const code = buffer[cursor];
    cursor += 1;

    if (code === 0xd9 || code === 0xda) return null;
    if (cursor + 2 > buffer.length) return null;

    // The length field counts itself but not the marker bytes.
    const length = buffer.readUInt16BE(cursor);
    if (length < 2 || cursor + length > buffer.length) return null;

    if (isFrameHeader(code) && length >= 7) {
      // Layout after the length field: 1 byte, then height, then width.
      return {
        height: buffer.readUInt16BE(cursor + 3),
        width: buffer.readUInt16BE(cursor + 5),
      };
    }
    cursor += length;
  }
  return null;
}
/**
 * Reads an image file and determines its pixel dimensions by trying the PNG,
 * JPEG, WebP and GIF header parsers in that order.
 *
 * @param filePath Path of the image file.
 * @returns The first positive width/height pair found.
 * @throws Error when no parser recognises the file or a dimension is not positive;
 *   file read errors propagate unchanged.
 */
async function readImageDimensions(filePath: string): Promise<{ width: number; height: number }> {
  const bytes = await fs.promises.readFile(filePath);

  const parsers = [readPngDimensions, readJpegDimensions, readWebpDimensions, readGifDimensions];
  let size: { width: number; height: number } | null = null;
  for (const parse of parsers) {
    size = parse(bytes);
    if (size) break;
  }

  const usable = size !== null && size.width > 0 && size.height > 0;
  if (!size || !usable) {
    throw new Error(`Could not determine image dimensions for ${filePath}`);
  }
  return size;
}
/**
 * Collects the text of a chat response.
 *
 * Every `output` entry of type "message" contributes its `content`: either a
 * string, or an array whose parts are strings or objects with a string `text`
 * (preferred) or `content`. When nothing was collected, a top-level string
 * `text` and then `content` are used as fallbacks.
 *
 * @param data Parsed response body of unknown shape.
 * @returns The pieces joined with newlines and trimmed; empty when none exist.
 */
function extractMessageText(data: any): string {
  const textOfPart = (part: any): string | undefined => {
    if (typeof part === "string") return part;
    if (typeof part?.text === "string") return part.text;
    if (typeof part?.content === "string") return part.content;
    return undefined;
  };

  const collected: string[] = [];
  const entries: any[] = Array.isArray(data?.output) ? data.output : [];
  for (const entry of entries) {
    if (entry?.type !== "message") continue;
    const body = entry?.content;
    const parts: any[] = typeof body === "string" ? [body] : Array.isArray(body) ? body : [];
    for (const part of parts) {
      const found = textOfPart(part);
      if (found !== undefined) collected.push(found);
    }
  }

  if (collected.length === 0) {
    if (typeof data?.text === "string") {
      collected.push(data.text);
    } else if (typeof data?.content === "string") {
      collected.push(data.content);
    }
  }

  return collected.join("\n").trim();
}
/**
 * Builds the prompt for an object-detection request.
 *
 * A non-blank `task` wins: the prompt asks for all instances of that label.
 * Otherwise the free-form `odPrompt` is used, followed by the label-format
 * rule. Both variants end with the JSON output-format instruction.
 *
 * @param task Label to detect, if any.
 * @param odPrompt Free-form instruction used when `task` is blank.
 * @returns The complete prompt text.
 * @throws Error when both `task` and `odPrompt` are blank.
 */
function buildDetectPrompt(task: string | undefined, odPrompt: string | undefined): string {
  const target = String(task || "").trim();
  if (target.length > 0) {
    return `Detect all instances of '${target}' in the image.${JSON_FORMAT}`;
  }

  const freeForm = String(odPrompt || "").trim();
  if (freeForm.length === 0) {
    throw new Error("No OD prompt available: odPrompt not set and DETECT_OD_PROMPT env var not set");
  }
  return [freeForm, LABEL_FORMAT_RULE, JSON_FORMAT].join("");
}
/**
 * Converts a pixel bounding box into crop margins, each expressed as a
 * percentage of the corresponding image dimension.
 *
 * @param bbox Box as [x1, y1, x2, y2] in pixels.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @returns The margins between the box and the left, right, top and bottom image edges.
 */
function bboxToCrop(
  bbox: [number, number, number, number],
  width: number,
  height: number
): { cropLeft: number; cropRight: number; cropTop: number; cropBottom: number } {
  const percentOf = (part: number, whole: number): number => (part / whole) * 100;
  const [left, top, right, bottom] = bbox;

  const cropLeft = percentOf(left, width);
  const cropRight = percentOf(width - right, width);
  const cropTop = percentOf(top, height);
  const cropBottom = percentOf(height - bottom, height);
  return { cropLeft, cropRight, cropTop, cropBottom };
}
/**
 * Turns the model's detection answer into a list of objects in pixel space.
 *
 * The answer is parsed as JSON (taking the contents of the first code fence
 * when one is present). If that fails, individual `{"bbox_2d": ..., "label": ...}`
 * objects are recovered from the raw text with a regular expression.
 *
 * An item is dropped when its box is not four finite numbers in 0-1000, has
 * no positive area, covers practically the whole image, or repeats an earlier
 * item with the same rounded box and label.
 *
 * @param text Raw answer text from the model.
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @returns The accepted objects with pixel boxes and crop percentages.
 */
function parseQwen3VlDetectionOutput(text: string, width: number, height: number): VisionDetectionObject[] {
  const fenced = JSON_FENCE_RE.exec(text);
  const candidate = (fenced ? fenced[1] : text).trim();

  let rawItems: any[];
  try {
    const decoded = JSON.parse(candidate);
    rawItems = Array.isArray(decoded) ? decoded : [decoded];
  } catch {
    // Not valid JSON as a whole: salvage every well-formed item from the raw text.
    ITEM_RE.lastIndex = 0;
    rawItems = Array.from(text.matchAll(ITEM_RE), (hit) => ({
      bbox_2d: [Number(hit[1]), Number(hit[2]), Number(hit[3]), Number(hit[4])],
      label: hit[5],
    }));
  }

  const inNormalizedRange = (value: number): boolean =>
    Number.isFinite(value) && value >= 0 && value <= 1000;

  const detections: VisionDetectionObject[] = [];
  const alreadyEmitted = new Set<string>();

  for (const raw of rawItems) {
    if (!raw || typeof raw !== "object") continue;

    const box = raw.bbox_2d;
    const label = String(raw.label || "");
    if (!Array.isArray(box) || box.length !== 4) continue;

    const [nx1, ny1, nx2, ny2] = box.map((value: unknown) => Number(value));
    if (![nx1, ny1, nx2, ny2].every(inNormalizedRange)) continue;

    // Require a positive area.
    if (nx2 <= nx1 || ny2 <= ny1) continue;

    // Skip boxes that span essentially the entire image.
    const coversWholeImage = nx1 < 10 && ny1 < 10 && nx2 > 990 && ny2 > 990;
    if (coversWholeImage) continue;

    const signature = [Math.round(nx1), Math.round(ny1), Math.round(nx2), Math.round(ny2), label].join(":");
    if (alreadyEmitted.has(signature)) continue;
    alreadyEmitted.add(signature);

    // Scale from the 0-1000 normalized grid to pixels.
    const pixelBox: [number, number, number, number] = [
      (nx1 / 1000) * width,
      (ny1 / 1000) * height,
      (nx2 / 1000) * width,
      (ny2 / 1000) * height,
    ];
    detections.push({
      label,
      bbox: pixelBox,
      ...bboxToCrop(pixelBox, width, height),
    });
  }

  return detections;
}
/**
 * Sends one image plus a text prompt to `POST /api/v1/chat` and returns the
 * extracted answer text.
 *
 * The image is read from disk and embedded as a base64 data URL. Request
 * metadata (never the image itself) is logged before sending.
 *
 * The timeout covers the whole exchange, including reading the response body:
 * the timer is only cleared once the body has been consumed or the request
 * has failed, so a stalled body cannot hang the caller indefinitely.
 *
 * @param item Image to send.
 * @param prompt Text prompt accompanying the image.
 * @param config Connection and sampling options; `timeoutMs` defaults to
 *   180000 and `model` to "vision-capability-priming".
 * @returns Answer text, elapsed time in ms, image size in bytes and the
 *   reported model instance id ("" when absent).
 * @throws Error when the base URL is empty, the request times out, the server
 *   answers with a non-OK status, or the request/parsing fails; file read
 *   errors propagate unchanged.
 */
async function chatOnce(
  item: VisionAnalysisItem,
  prompt: string,
  config: {
    baseUrl: string;
    apiKey?: string;
    model?: string;
    maxTokens?: number;
    temperature?: number;
    timeoutMs?: number;
  }
): Promise<{ text: string; elapsedMs: number; bytes: number; modelInstanceId: string }> {
  const apiRoot = normalizeLmApiRoot(config.baseUrl);
  if (!apiRoot) {
    throw new Error("Vision API base URL is empty");
  }
  const endpoint = `${apiRoot}/api/v1/chat`;
  const timeoutMs = config.timeoutMs ?? 180_000;
  const model = config.model || "vision-capability-priming";

  const buf = await fs.promises.readFile(item.filePath);
  const dataUrl = `data:${mimeFromPath(item.filePath)};base64,${buf.toString("base64")}`;

  const payload: any = {
    model,
    input: [
      { type: "text", content: prompt },
      { type: "image", data_url: dataUrl },
    ],
    store: false,
  };
  // Optional sampling parameters are only sent when they hold usable values.
  if (typeof config.maxTokens === "number" && Number.isFinite(config.maxTokens) && config.maxTokens > 0) {
    payload.max_output_tokens = Math.floor(config.maxTokens);
  }
  if (typeof config.temperature === "number" && Number.isFinite(config.temperature)) {
    payload.temperature = config.temperature;
  }

  logVisionRequestMetadata({
    configuredBaseUrl: config.baseUrl,
    apiRoot,
    endpoint,
    model,
    store: payload.store,
    max_output_tokens: payload.max_output_tokens ?? null,
    temperature: payload.temperature ?? null,
    promptChars: prompt.length,
    imageBytes: buf.byteLength,
    inputTypes: Array.isArray(payload.input) ? payload.input.map((part: any) => part.type) : [],
    payloadKeys: Object.keys(payload),
  });

  const headers = authHeaders(config.apiKey, true);
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  const startedAt = Date.now();
  let data: any;
  try {
    const resp = await fetch(endpoint, {
      method: "POST",
      headers,
      body: JSON.stringify(payload),
      signal: controller.signal,
    });
    if (!resp.ok) {
      const detail = await resp.text().catch(() => "(no body)");
      throw new Error(`Vision API ${resp.status}: ${detail}`);
    }
    // Still guarded by the abort timer: the body may arrive long after the headers.
    data = await resp.json();
  } catch (error: any) {
    if (error?.name === "AbortError") {
      throw new Error(`Vision API timed out after ${timeoutMs}ms`);
    }
    throw new Error(`Vision API failed: ${error?.message || String(error)}`);
  } finally {
    clearTimeout(timeout);
  }

  return {
    text: extractMessageText(data),
    elapsedMs: Date.now() - startedAt,
    bytes: buf.byteLength,
    modelInstanceId: typeof data?.model_instance_id === "string" ? data.model_instance_id : "",
  };
}
/**
 * Describes each image with the vision model, one request at a time and in
 * input order.
 *
 * @param items Images to analyse; an empty list returns an empty batch without any request.
 * @param config Connection, prompt and sampling options; the prompt defaults
 *   to "Describe the image.".
 * @returns Per-image texts and timings plus the summed request time.
 * @throws Whatever the underlying chat request throws; the batch stops at the first failure.
 */
export async function analyzeLmStudioVisionBatch(
  items: VisionAnalysisItem[],
  config: LmStudioVisionAnalyzerConfig
): Promise<VisionAnalysisBatchResult> {
  const batch: VisionAnalysisBatchResult = {
    results: [],
    totalInferenceTimeMs: 0,
    backend: "vision-api",
  };
  if (items.length === 0) {
    return batch;
  }

  for (const image of items) {
    console.info(
      `[LmStudioVisionAnalyzer] /api/v1/chat start mode=analyze id=${image.id} timeoutMs=${config.timeoutMs ?? 180_000}`
    );
    const reply = await chatOnce(image, config.prompt || "Describe the image.", config);
    console.info(
      `[LmStudioVisionAnalyzer] /api/v1/chat ok mode=analyze id=${image.id} bytes=${reply.bytes} elapsedMs=${reply.elapsedMs} modelInstance=${reply.modelInstanceId || "?"}`
    );

    batch.results.push({
      id: image.id,
      text: reply.text,
      inferenceTimeMs: reply.elapsedMs,
    });
    batch.totalInferenceTimeMs += reply.elapsedMs;
  }

  return batch;
}
/**
 * Runs object detection on each image, one request at a time and in input order.
 *
 * For every image the dimensions are read from the file, the detection prompt
 * is sent, and the answer is parsed into pixel-space objects.
 *
 * @param items Images to process; an empty list returns an empty batch
 *   without building a prompt or sending any request.
 * @param config Detection prompt options plus connection settings; the
 *   timeout defaults to 120000 ms and the model to "vision-capability-priming".
 * @returns Per-image detections, dimensions and timings plus the summed request time.
 * @throws Error when no prompt can be built, an image's dimensions cannot be
 *   read, or a chat request fails; the batch stops at the first failure.
 */
export async function detectLmStudioVisionBatch(
  items: VisionAnalysisItem[],
  config: VisionDetectionAnalyzerConfig & {
    baseUrl: string;
    apiKey?: string;
    model?: string;
  }
): Promise<VisionDetectionBatchResult> {
  const batch: VisionDetectionBatchResult = {
    results: [],
    totalInferenceTimeMs: 0,
    backend: "vision-api",
  };
  if (items.length === 0) {
    return batch;
  }

  const detectionPrompt = buildDetectPrompt(config.task, config.odPrompt);

  for (const image of items) {
    const size = await readImageDimensions(image.filePath);
    const requestTimeoutMs = config.timeoutMs ?? 120_000;
    console.info(
      `[LmStudioVisionAnalyzer] /api/v1/chat start mode=detect id=${image.id} timeoutMs=${requestTimeoutMs}`
    );

    const reply = await chatOnce(image, detectionPrompt, {
      baseUrl: config.baseUrl,
      apiKey: config.apiKey,
      model: config.model || "vision-capability-priming",
      maxTokens: config.maxTokens,
      temperature: config.temperature,
      timeoutMs: requestTimeoutMs,
    });

    const found = parseQwen3VlDetectionOutput(reply.text, size.width, size.height);
    console.info(
      `[LmStudioVisionAnalyzer] /api/v1/chat ok mode=detect id=${image.id} objects=${found.length} bytes=${reply.bytes} elapsedMs=${reply.elapsedMs} modelInstance=${reply.modelInstanceId || "?"}`
    );

    batch.results.push({
      id: image.id,
      objects: found,
      imageWidth: size.width,
      imageHeight: size.height,
      inferenceTimeMs: reply.elapsedMs,
    });
    batch.totalInferenceTimeMs += reply.elapsedMs;
  }

  return batch;
}