// Project Files
// src/services/mlxVisionAnalyzer.ts
import fs from "fs";
import path from "path";
/* -------------------------------------------------------------------------- */
/* MLX Vision Analyzer: HTTP API wrapper for localhost:8765/analyze */
/* -------------------------------------------------------------------------- */
/** Configuration for calls to the MLX Vision /analyze HTTP endpoint. */
export interface MlxVisionAnalyzerConfig {
  /** Full URL to the MLX /analyze endpoint, e.g. "http://localhost:8765/analyze" */
  endpoint: string;
  /** Prompt sent to the vision model for each image. Empty = model default. */
  prompt?: string;
  /** Max tokens for response (1-4096, default: 128) */
  maxTokens?: number;
  /** Temperature (0.0-2.0, default: 0.7) */
  temperature?: number;
  /** Timeout in ms (default: 30_000) */
  timeoutMs?: number;
}

/** One image to analyze, keyed by a caller-chosen stable ID. */
export interface MlxAnalysisItem {
  /** Stable identifier for this image (e.g., "a1", "v3", "img_0") */
  id: string;
  /** Absolute path to the image file (JPEG/PNG/WebP) */
  filePath: string;
}

/** Per-image analysis result returned by the /analyze endpoint. */
export interface MlxAnalysisResult {
  /** Image identifier (echoed from input) */
  id: string;
  /** Vision model's text response */
  text: string;
  /** Inference time for this image (ms) */
  inferenceTimeMs: number;
}

/** Batch-level result wrapping all per-image analyses. */
export interface MlxAnalysisBatchResult {
  /** Per-image results (same order as input) */
  results: MlxAnalysisResult[];
  /** Total inference time for the batch (ms) */
  totalInferenceTimeMs: number;
  /** Backend identifier (always "mlx") */
  backend: string;
}
/**
 * Check if the MLX Vision Analyzer endpoint is reachable.
 * Sends a minimal request with no images; expects a 400 (bad request) or 200.
 * A network error or 503 means the service is down.
 *
 * @param endpoint - Full URL of the /analyze endpoint to probe
 * @returns true if the service responded with anything other than 503
 */
export async function isMlxVisionAvailable(endpoint: string): Promise<boolean> {
  const controller = new AbortController();
  // Hard 3s cap on the probe — availability checks must be fast.
  const timeout = setTimeout(() => controller.abort(), 3000);
  try {
    const resp = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ images: [], prompt: "ping" }),
      signal: controller.signal,
    });
    // 400 = "no images" validation error → service is alive
    // 200 = empty success → also alive
    // 503 = model not loaded → NOT available for inference
    return resp.status !== 503;
  } catch {
    // Network error or abort → service is down.
    return false;
  } finally {
    // BUGFIX: the timer was previously leaked when fetch threw, keeping the
    // Node event loop alive for up to 3s and firing a useless abort() later.
    clearTimeout(timeout);
  }
}
/**
 * Analyze one or more images using the MLX Vision API.
 * Returns structured results with image IDs preserved for client-side mapping.
 *
 * @param items - Array of images with stable IDs and file paths
 * @param config - MLX API configuration (endpoint, prompt, etc.)
 * @returns Batch result with per-image descriptions and timing info
 * @throws Error if a file cannot be read, the network call fails, the server
 *   returns a non-2xx status, or the response body is not valid JSON
 */
export async function analyzeMlxVisionBatch(
  items: MlxAnalysisItem[],
  config: MlxVisionAnalyzerConfig
): Promise<MlxAnalysisBatchResult> {
  // Empty input: skip file I/O and the network round-trip entirely.
  if (!items.length) {
    return { results: [], totalInferenceTimeMs: 0, backend: "mlx" };
  }

  // 1. Read image files → base64. Reads are independent, so run them in
  //    parallel instead of awaiting one file at a time.
  const images = await Promise.all(
    items.map(async (it) => ({
      id: it.id,
      data: (await fs.promises.readFile(it.filePath)).toString("base64"),
    }))
  );

  // 2. Build request payload. Optional fields are only included when set so
  //    the server applies its own documented defaults otherwise.
  const payload: Record<string, unknown> = { images };
  if (config.prompt !== undefined && config.prompt !== "") {
    payload.prompt = config.prompt;
  }
  if (config.maxTokens !== undefined) {
    payload.max_tokens = config.maxTokens;
  }
  if (config.temperature !== undefined) {
    payload.temperature = config.temperature;
  }

  // 3. Call MLX /analyze (batch). The timer is cleared in `finally` so no
  //    pending timeout outlives the request on any path.
  const controller = new AbortController();
  const timeoutMs = config.timeoutMs ?? 30_000;
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  let data: any; // shape validated field-by-field in step 4
  try {
    let resp: Awaited<ReturnType<typeof fetch>>;
    try {
      resp = await fetch(config.endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
    } catch (e) {
      // Network-level failure (refused, DNS, abort/timeout).
      throw new Error(
        `MLX Vision API /analyze failed: ${(e as Error).message || String(e)}`
      );
    }
    if (!resp.ok) {
      // BUGFIX: previously this throw was re-caught below and re-wrapped,
      // producing doubled messages ("…failed: MLX Vision API 500: …").
      const detail = await resp.text().catch(() => "(no body)");
      throw new Error(`MLX Vision API ${resp.status}: ${detail}`);
    }
    try {
      data = await resp.json();
    } catch (e) {
      // Body was not valid JSON.
      throw new Error(
        `MLX Vision API /analyze failed: ${(e as Error).message || String(e)}`
      );
    }
  } finally {
    clearTimeout(timeout);
  }

  // 4. Parse response defensively — missing/odd fields get safe defaults.
  const results: MlxAnalysisResult[] = Array.isArray(data?.results)
    ? data.results.map((r: any) => ({
        id: String(r?.id || ""),
        text: String(r?.text || "").trim(),
        inferenceTimeMs:
          typeof r?.inference_time_ms === "number" ? r.inference_time_ms : 0,
      }))
    : [];
  const totalInferenceTimeMs =
    typeof data?.total_inference_time_ms === "number"
      ? data.total_inference_time_ms
      : 0;
  const backend = typeof data?.backend === "string" ? data.backend : "mlx";
  return { results, totalInferenceTimeMs, backend };
}
/**
* Convenience wrapper: analyze a single image and return the text description.
* Throws if the API call fails or returns no result.
*/
export async function analyzeMlxVisionSingle(
filePath: string,
config: MlxVisionAnalyzerConfig
): Promise<string> {
const id = `img_${Date.now()}`;
const batch = await analyzeMlxVisionBatch([{ id, filePath }], config);
if (!batch.results.length) {
throw new Error("MLX Vision API returned no results");
}
return batch.results[0].text;
}
/* -------------------------------------------------------------------------- */
/* Detection types (Florence-2 /detect endpoint) */
/* -------------------------------------------------------------------------- */
/** One detected object: absolute bbox plus edge-relative crop percentages. */
export interface MlxDetectionObject {
  /** Object class label reported by the model ("" when none was returned) */
  label: string;
  /** Absolute pixel bounding box: [x1, y1, x2, y2] */
  bbox: [number, number, number, number];
  /** Offset of x1 from the left edge, as a percentage of image width */
  cropLeft: number;
  /** Offset of x2 from the right edge, as a percentage of image width */
  cropRight: number;
  /** Offset of y1 from the top edge, as a percentage of image height */
  cropTop: number;
  /** Offset of y2 from the bottom edge, as a percentage of image height */
  cropBottom: number;
}

/** Per-image detection result returned by the /detect endpoint. */
export interface MlxDetectionResult {
  /** Image identifier (echoed from input) */
  id: string;
  /** Detected objects with bboxes and crop percentages */
  objects: MlxDetectionObject[];
  /** Source image width in pixels (0 if not reported by the server) */
  imageWidth: number;
  /** Source image height in pixels (0 if not reported by the server) */
  imageHeight: number;
  /** Inference time for this image (ms) */
  inferenceTimeMs: number;
}

/** Batch-level result wrapping all per-image detections. */
export interface MlxDetectionBatchResult {
  /** Per-image results (same order as input) */
  results: MlxDetectionResult[];
  /** Total inference time for the batch (ms) */
  totalInferenceTimeMs: number;
  /** Backend identifier (always "florence2") */
  backend: string;
}

/** Configuration for calls to the Florence-2 /detect HTTP endpoint. */
export interface MlxDetectionAnalyzerConfig {
  /** Full URL to the /detect endpoint, e.g. "http://localhost:8765/detect" */
  endpoint: string;
  /** Florence-2 task token, e.g. "<OD>" (defaults to "<OD>" when omitted) */
  task?: string;
  /** Absolute path to the Florence-2 model directory */
  florence2ModelPath: string;
  /** Timeout in ms (default: 60_000) */
  timeoutMs?: number;
}
/**
 * Convert an absolute pixel bounding box into crop percentages measured from
 * each image edge (left/right/top/bottom).
 *
 * @param bbox - [x1, y1, x2, y2] in pixels
 * @param W - image width in pixels
 * @param H - image height in pixels
 * @returns crop percentages; all zeros when W or H is not positive
 */
function bboxToCrop(
  bbox: [number, number, number, number],
  W: number,
  H: number
): { cropLeft: number; cropRight: number; cropTop: number; cropBottom: number } {
  // BUGFIX: callers default a missing width/height to 0, which previously
  // produced NaN/Infinity percentages here. Guard non-positive dimensions.
  if (W <= 0 || H <= 0) {
    return { cropLeft: 0, cropRight: 0, cropTop: 0, cropBottom: 0 };
  }
  const [x1, y1, x2, y2] = bbox;
  return {
    cropLeft: (x1 / W) * 100,
    cropRight: ((W - x2) / W) * 100,
    cropTop: (y1 / H) * 100,
    cropBottom: ((H - y2) / H) * 100,
  };
}
/**
 * Detect objects in one or more images using the Florence-2 /detect endpoint.
 * Returns structured bounding boxes with crop percentages for each image.
 *
 * @param items - Array of images with stable IDs and file paths
 * @param config - Detection API configuration (endpoint, task, florence2ModelPath, etc.)
 * @returns Batch result with per-image detections and timing info
 * @throws Error if a file cannot be read, the network call fails, the server
 *   returns a non-2xx status, or the response body is not valid JSON
 */
export async function analyzeMlxDetectionBatch(
  items: MlxAnalysisItem[],
  config: MlxDetectionAnalyzerConfig
): Promise<MlxDetectionBatchResult> {
  // Empty input: skip file I/O and the network round-trip entirely.
  if (!items.length) {
    return { results: [], totalInferenceTimeMs: 0, backend: "florence2" };
  }

  // 1. Read image files → base64. Reads are independent, so run them in
  //    parallel instead of awaiting one file at a time.
  const images = await Promise.all(
    items.map(async (it) => ({
      id: it.id,
      data: (await fs.promises.readFile(it.filePath)).toString("base64"),
    }))
  );

  // 2. Build request payload (snake_case keys per the server-side API).
  const payload = {
    images,
    task: config.task ?? "<OD>",
    florence2_model_path: config.florence2ModelPath,
  };

  // 3. Call /detect. The timer is cleared in `finally` so no pending timeout
  //    outlives the request on any path.
  const controller = new AbortController();
  const timeoutMs = config.timeoutMs ?? 60_000;
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  let data: any; // shape validated field-by-field in step 4
  try {
    let resp: Awaited<ReturnType<typeof fetch>>;
    try {
      resp = await fetch(config.endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
    } catch (e) {
      // Network-level failure (refused, DNS, abort/timeout).
      throw new Error(
        `Florence-2 /detect API failed: ${(e as Error).message || String(e)}`
      );
    }
    if (!resp.ok) {
      // BUGFIX: previously this throw was re-caught below and re-wrapped,
      // producing doubled messages ("…failed: Florence-2 /detect API 500: …").
      const detail = await resp.text().catch(() => "(no body)");
      throw new Error(`Florence-2 /detect API ${resp.status}: ${detail}`);
    }
    try {
      data = await resp.json();
    } catch (e) {
      // Body was not valid JSON.
      throw new Error(
        `Florence-2 /detect API failed: ${(e as Error).message || String(e)}`
      );
    }
  } finally {
    clearTimeout(timeout);
  }

  // 4. Parse response defensively — missing/odd fields get safe defaults.
  const results: MlxDetectionResult[] = Array.isArray(data?.results)
    ? data.results.map((r: any) => {
        const W: number = typeof r?.width === "number" ? r.width : 0;
        const H: number = typeof r?.height === "number" ? r.height : 0;
        const rawBboxes: [number, number, number, number][] = Array.isArray(
          r?.bboxes
        )
          ? r.bboxes
          : [];
        const rawLabels: string[] = Array.isArray(r?.labels) ? r.labels : [];
        // Pair each bbox with its label by index; missing labels become "".
        const objects: MlxDetectionObject[] = rawBboxes.map((bbox, idx) => ({
          label: rawLabels[idx] ?? "",
          bbox,
          ...bboxToCrop(bbox, W, H),
        }));
        return {
          id: String(r?.id || ""),
          objects,
          imageWidth: W,
          imageHeight: H,
          inferenceTimeMs:
            typeof r?.inference_time_ms === "number" ? r.inference_time_ms : 0,
        };
      })
    : [];
  const totalInferenceTimeMs =
    typeof data?.total_inference_time_ms === "number"
      ? data.total_inference_time_ms
      : 0;
  return { results, totalInferenceTimeMs, backend: "florence2" };
}