// Project file: src/fastvlm-server-manager.ts
// Manages the lifecycle of the local FastVLM Python server process.
import fs from "fs";
import os from "os";
import path from "path";
import { spawn, execFileSync } from "child_process";
// How often to probe GET /health while waiting for the server to come up.
const HEALTH_POLL_INTERVAL_MS = 1_000;
// Give up waiting for a healthy server after this long.
const HEALTH_POLL_TIMEOUT_MS = 90_000;
// Per-request timeout for each individual /health or /status fetch.
const HEALTH_FETCH_TIMEOUT_MS = 2_000;
// Pinned pip packages installed into the FastVLM venv on every platform.
const VENV_PACKAGES_COMMON = [
  "fastapi==0.128.8",
  "uvicorn[standard]==0.39.0",
  "pillow==10.4.0",
  "transformers==4.57.6",
  "torch==2.8.0",
  "einops==0.8.2",
  "timm==1.0.26",
  "tokenizers==0.22.2",
  "sentencepiece==0.2.1",
  "huggingface_hub==0.36.2",
];
// Extra packages added only on macOS (MLX / CoreML stack).
const VENV_PACKAGES_MACOS = [
  "mlx==0.29.3",
  "mlx-lm==0.29.1",
  "mlx-vlm==0.1.13",
  "coremltools==9.0",
];
// Effective install list: macOS gets the MLX/CoreML packages on top of the common set.
const VENV_PACKAGES = process.platform === "darwin"
  ? [...VENV_PACKAGES_COMMON, ...VENV_PACKAGES_MACOS]
  : VENV_PACKAGES_COMMON;
// Port of the server last started/adopted by ensureFastvlmServerRunning;
// consumed by stopActiveFastvlmServer. null until a server has been managed.
let _activePort: number | null = null;
/** Root of the plugin checkout — resolved as the current working directory. */
function pluginRoot(): string {
  const root = process.cwd();
  return root;
}
/** Location of the shared FastVLM virtualenv: ~/.fastvlm/venv */
function venvDir(): string {
  const home = os.homedir();
  return path.join(home, ".fastvlm", "venv");
}
/** Python interpreter inside the FastVLM venv. */
function venvPython(): string {
  const binDir = path.join(venvDir(), "bin");
  return path.join(binDir, "python3");
}
/** pip executable inside the FastVLM venv. */
function venvPip(): string {
  const binDir = path.join(venvDir(), "bin");
  return path.join(binDir, "pip");
}
/** True when the venv's python3 binary is present on disk. */
function venvExists(): boolean {
  const interpreter = venvPython();
  return fs.existsSync(interpreter);
}
/**
 * Major.minor version ("3.9", "3.13", …) reported by the venv's Python
 * interpreter, or null when the interpreter is missing, times out, or
 * prints something unexpected.
 */
function venvPythonVersion(): string | null {
  try {
    const output = execFileSync(venvPython(), ["--version"], {
      encoding: "utf-8",
      timeout: 5_000,
    });
    const match = /^Python (\d+\.\d+)/.exec(output.trim());
    return match?.[1] ?? null;
  } catch {
    // Spawn failure, timeout, or non-zero exit all mean "unknown".
    return null;
  }
}
/** Directory holding the manager's PID file and server log. */
function logsDir(): string {
  const root = pluginRoot();
  return path.join(root, "logs");
}
/** PID file path (handed to the Python server via --pid-file). */
function pidFilePath(): string {
  const name = "fastvlm-server.pid";
  return path.join(logsDir(), name);
}
/** Shared log file path used by both this manager and the Python server. */
function logFilePath(): string {
  const name = "mlx-vision-server.log";
  return path.join(logsDir(), name);
}
/**
 * Spawns `cmd args` in `cwd`, streaming each non-empty trimmed output line
 * (stdout and stderr) to `onLine` and, when `logFile` is given, appending it
 * there as well.
 *
 * Fix: "data" chunks are not line-aligned, so a line split across two chunks
 * used to be emitted as two broken fragments. Partial lines are now buffered
 * per stream and flushed when the process closes.
 *
 * @returns resolves on exit code 0; rejects on spawn error or non-zero exit.
 */
function runAndStream(
  cmd: string,
  args: string[],
  cwd: string,
  onLine: (line: string) => void,
  logFile?: string
): Promise<void> {
  return new Promise((resolve, reject) => {
    const child = spawn(cmd, args, { cwd, stdio: ["ignore", "pipe", "pipe"] });
    const logStream = logFile ? fs.createWriteStream(logFile, { flags: "a" }) : null;
    // Partial-line buffers, one per stream so stdout/stderr never interleave
    // inside a single line.
    const buffers = { out: "", err: "" };
    function emit(line: string) {
      const t = line.trim();
      if (t) {
        onLine(t);
        logStream?.write(t + "\n");
      }
    }
    function makeOnData(key: "out" | "err") {
      return (data: Buffer) => {
        buffers[key] += data.toString();
        const parts = buffers[key].split("\n");
        // Last element is an unterminated fragment — keep it for the next chunk.
        buffers[key] = parts.pop() ?? "";
        for (const part of parts) emit(part);
      };
    }
    child.stdout.on("data", makeOnData("out"));
    child.stderr.on("data", makeOnData("err"));
    child.on("error", (err) => {
      logStream?.end();
      reject(err);
    });
    child.on("close", (code) => {
      // Flush any trailing output that lacked a final newline.
      emit(buffers.out);
      emit(buffers.err);
      logStream?.end();
      if (code === 0) resolve();
      else reject(new Error(`${path.basename(cmd)} exited with code ${code}`));
    });
  });
}
// Python 3.9 from Xcode CLT — matches the working venv (lib/python3.9/).
// mlx-vlm 0.1.13 requires this version for llava_qwen2 remapping support.
// Used by ensureVenv to (re)create the venv when the existing one is not 3.9.
const PYTHON39 = "/Library/Developer/CommandLineTools/usr/bin/python3.9";
/**
 * Creates the FastVLM virtualenv if needed and applies the mlx-vlm patches.
 *
 * An existing venv whose interpreter is not Python 3.9 is deleted and rebuilt
 * (the pinned stack targets 3.9 — see the PYTHON39 note above). Progress
 * messages go to onStatus; subprocess output is mirrored into logFile.
 */
async function ensureVenv(onStatus: (msg: string) => void, logFile: string): Promise<void> {
  if (venvExists()) {
    const detected = venvPythonVersion();
    const wrongVersion = detected !== null && !detected.startsWith("3.9");
    if (wrongVersion) {
      onStatus(`Python environment is ${detected} — rebuilding with Python 3.9…`);
      fs.rmSync(venvDir(), { recursive: true, force: true });
    }
  }
  if (!venvExists()) {
    onStatus("Creating Python environment…");
    await runAndStream(PYTHON39, ["-m", "venv", venvDir()], pluginRoot(), onStatus, logFile);
    onStatus("Installing dependencies — this may take a few minutes…");
    await runAndStream(venvPip(), ["install", ...VENV_PACKAGES], pluginRoot(), onStatus, logFile);
  }
  await ensureVenvPatches(onStatus, logFile);
  onStatus("Python environment ready");
}
// Applies all mlx-vlm patches required for FastVLM-7B support.
// Runs on every startup so that gaps left by older venv setups are filled.
// Every patch is idempotent: each one checks for an already-patched marker
// string before rewriting the target file.
async function ensureVenvPatches(onStatus: (msg: string) => void, logFile: string): Promise<void> {
  // Resolve site-packages dynamically — the venv Python version may vary (3.9, 3.13, …).
  const libDir = path.join(venvDir(), "lib");
  const pythonSubdir = fs.existsSync(libDir)
    ? fs.readdirSync(libDir).find((d) => d.startsWith("python")) ?? "python3.9"
    : "python3.9";
  const sitePackages = path.join(venvDir(), "lib", pythonSubdir, "site-packages");
  const mlxVlmRoot = path.join(sitePackages, "mlx_vlm");
  // Ensure coremltools is installed — required by models/fastvlm/fastvlm.py.
  // Use a dedicated marker file instead of checking .dist-info (whose path includes the
  // Python version and would not be found if the venv was created with a different version).
  const coremltoolsMarker = path.join(venvDir(), "coremltools_installed.marker");
  if (!fs.existsSync(coremltoolsMarker)) {
    onStatus("Installing missing dependencies…");
    await runAndStream(venvPip(), ["install", "coremltools==9.0"], pluginRoot(), onStatus, logFile);
    fs.writeFileSync(coremltoolsMarker, new Date().toISOString());
  }
  // Patch 1: utils.py — three sub-patches required for FastVLM support
  const mlxVlmUtils = path.join(mlxVlmRoot, "utils.py");
  if (fs.existsSync(mlxVlmUtils)) {
    let src = fs.readFileSync(mlxVlmUtils, "utf-8");
    // 1a: add "llava_qwen2" -> "fastvlm" to MODEL_REMAPPING
    src = src.replace(
      /MODEL_REMAPPING\s*=\s*\{([^}]*)\}/,
      (match) => {
        // Already patched — leave the dict untouched.
        if (match.includes("llava_qwen2")) return match;
        return match.replace(/\}$/, ', "llava_qwen2": "fastvlm"}');
      }
    );
    // 1b: add "import coremltools" after "import mlx.nn as nn" if not already present
    if (!src.includes("import coremltools")) {
      src = src.replace(
        /^(import mlx\.nn as nn\s*\n)/m,
        "$1import coremltools\n"
      );
    }
    // 1c: guard sanitize_weights(model_class.VisionModel, ...) with hasattr check
    // and load CoreML vision tower for models that have no VisionModel (e.g. fastvlm)
    // NOTE(review): the leading whitespace inside these embedded Python lines —
    // and in the target regex — must match utils.py's real indentation exactly
    // for the patch to apply and for the result to be valid Python; verify
    // against a working venv, as whitespace may have been collapsed in review.
    if (!src.includes("hasattr(model_class, 'VisionModel')")) {
      src = src.replace(
        / weights = sanitize_weights\(\s*model_class\.VisionModel, weights, model_config\.vision_config\s*\)/,
        [
          " if hasattr(model_class, 'VisionModel'):",
          " weights = sanitize_weights(",
          " model_class.VisionModel, weights, model_config.vision_config",
          " )",
          " else:",
          " # Load CoreML vision tower (used by fastvlm)",
          " print(\"Looking for CoreML vision tower\")",
          " coreml_file = glob.glob(str(model_path / \"*.mlpackage\"))",
          " if len(coreml_file) > 0:",
          " assert len(coreml_file) == 1, \"Found multiple vision model files.\"",
          " print(f\"Loading {coreml_file[0]} vision tower\")",
          " model.vision_tower = coremltools.models.MLModel(coreml_file[0], compute_units=coremltools.ComputeUnit.CPU_ONLY)",
        ].join("\n")
      );
    }
    fs.writeFileSync(mlxVlmUtils, src, "utf-8");
  }
  // Patch 2: prompt_utils.py — add llava_qwen2 format entry and fix null chat_template
  const mlxVlmPromptUtils = path.join(mlxVlmRoot, "prompt_utils.py");
  if (fs.existsSync(mlxVlmPromptUtils)) {
    let src = fs.readFileSync(mlxVlmPromptUtils, "utf-8");
    // 2a: register the llava_qwen2 message format right after the llava entry.
    src = src.replace(
      /("llava":\s*"message_list_with_image",)(\s*"llava_next":)/,
      (match, llavaEntry, llavaNextEntry) => {
        // Idempotence guard checks the whole file, not just the matched span.
        if (src.includes('"llava_qwen2"')) return match;
        return `${llavaEntry}\n "llava_qwen2": "message_with_image_token_new_line",${llavaNextEntry}`;
      }
    );
    // 2b: also require chat_template to be non-None before using it.
    src = src.replace(
      /if "chat_template" in processor\.__dict__\.keys\(\):/,
      (match) => {
        if (src.includes("processor.chat_template is not None")) return match;
        return `if ("chat_template" in processor.__dict__.keys()) and (processor.chat_template is not None):`;
      }
    );
    fs.writeFileSync(mlxVlmPromptUtils, src, "utf-8");
  }
  // Patch 3: copy mlx_vlm/models/fastvlm/ — the model implementation missing from the wheel.
  // Always re-copy so updated source files (e.g. copy=False fix in fastvlm.py) are applied.
  // NOTE(review): readdirSync here assumes the source dir contains only files
  // (no subdirectories) — copyFileSync would throw on a directory entry.
  const fastvlmModelDest = path.join(mlxVlmRoot, "models", "fastvlm");
  const fastvlmModelSrc = path.join(pluginRoot(), "src", "fastvlm_server", "mlx_vlm_patches", "models", "fastvlm");
  fs.mkdirSync(fastvlmModelDest, { recursive: true });
  for (const file of fs.readdirSync(fastvlmModelSrc)) {
    fs.copyFileSync(path.join(fastvlmModelSrc, file), path.join(fastvlmModelDest, file));
  }
}
// Homebrew Python 3.10 interpreter used to create the separate Qwen3-VL venv
// (see ensureQwen3VlVenv below).
const PYTHON310 = "/opt/homebrew/bin/python3.10";
/** Location of the dedicated Qwen3-VL virtualenv: ~/.fastvlm/qwen3vl_venv */
function qwen3VlVenvDir(): string {
  const home = os.homedir();
  return path.join(home, ".fastvlm", "qwen3vl_venv");
}
/** pip executable inside the Qwen3-VL venv. */
function qwen3VlVenvPip(): string {
  const binDir = path.join(qwen3VlVenvDir(), "bin");
  return path.join(binDir, "pip");
}
/** True when the Qwen3-VL venv's python3 binary exists on disk. */
function qwen3VlVenvExists(): boolean {
  const interpreter = path.join(qwen3VlVenvDir(), "bin", "python3");
  return fs.existsSync(interpreter);
}
/** Marker file recording that the Qwen3-VL venv finished installing. */
function qwen3VlReadyMarker(): string {
  const marker = "qwen3vl_ready.marker";
  return path.join(qwen3VlVenvDir(), marker);
}
/**
 * Provisions the dedicated Python 3.10 venv used by the qwen3-vl detect
 * backend. Idempotent: a marker file written after a successful install
 * short-circuits later calls. Subprocess output is streamed to onStatus
 * and appended to logFile.
 */
async function ensureQwen3VlVenv(onStatus: (msg: string) => void, logFile: string): Promise<void> {
  if (fs.existsSync(qwen3VlReadyMarker())) return;
  if (!qwen3VlVenvExists()) {
    onStatus("Creating Qwen3-VL Python environment (Python 3.10)…");
    await runAndStream(PYTHON310, ["-m", "venv", qwen3VlVenvDir()], pluginRoot(), onStatus, logFile);
  }
  onStatus("Installing mlx-vlm for Qwen3-VL — this may take a few minutes…");
  const pip = qwen3VlVenvPip();
  // Upgrade pip first, then install pillow plus mlx-vlm from git.
  await runAndStream(pip, ["install", "--upgrade", "pip"], pluginRoot(), onStatus, logFile);
  await runAndStream(
    pip,
    ["install", "pillow", "git+https://github.com/Blaizzy/mlx-vlm.git"],
    pluginRoot(),
    onStatus,
    logFile
  );
  fs.writeFileSync(qwen3VlReadyMarker(), new Date().toISOString());
  onStatus("Qwen3-VL environment ready");
}
/**
 * Reads the PID recorded in the PID file, or null when the file is missing,
 * unreadable, or does not contain a positive integer.
 */
function readPid(): number | null {
  try {
    const contents = fs.readFileSync(pidFilePath(), "utf-8");
    const pid = parseInt(contents.trim(), 10);
    if (Number.isFinite(pid) && pid > 0) return pid;
    return null;
  } catch {
    return null;
  }
}
/**
 * Liveness check via signal 0 — no signal is actually delivered; a throw
 * means the process is gone (or not signalable by this user).
 */
function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
  } catch {
    return false;
  }
  return true;
}
/**
 * fetch() wrapper that aborts the request after timeoutMs via an
 * AbortController. The timer is cleared once the request settles either way.
 */
function fetchWithTimeout(url: string, options: RequestInit, timeoutMs: number): Promise<Response> {
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), timeoutMs);
  const request = fetch(url, { ...options, signal: aborter.signal });
  return request.finally(() => clearTimeout(timer));
}
/**
 * Polls GET /health on the local server until it answers 2xx or the overall
 * timeout elapses. onAttempt is invoked with the 1-based attempt number
 * before each probe; the poll sleeps between attempts.
 *
 * @returns true once a healthy response is seen, false on timeout.
 */
async function pollHealth(port: number, onAttempt: (n: number) => void): Promise<boolean> {
  const healthUrl = `http://127.0.0.1:${port}/health`;
  const giveUpAt = Date.now() + HEALTH_POLL_TIMEOUT_MS;
  for (let attempt = 1; Date.now() < giveUpAt; attempt++) {
    onAttempt(attempt);
    try {
      const response = await fetchWithTimeout(healthUrl, {}, HEALTH_FETCH_TIMEOUT_MS);
      if (response.ok) return true;
    } catch {
      // connection refused / timed out — server still starting
    }
    await new Promise<void>((resolve) => setTimeout(resolve, HEALTH_POLL_INTERVAL_MS));
  }
  return false;
}
/** Server launch parameters, derived from the plugin configuration. */
export interface FastVLMServerConfig {
  /** TCP port the server listens on (bound to 127.0.0.1 only). */
  port: number;
  /** Absolute path to the FastVLM model directory. Empty string = skip --model arg. */
  modelPath: string;
  /** Whether to load the FastVLM model on server start. */
  mlxVisionEnabled: boolean;
  /** Absolute path to the Florence-2 model directory. Empty string = Florence-2 not loaded. */
  florence2ModelPath?: string;
  /** Backend mode: 'ane' (CoreML+ANE) or 'mlx' (MLX-only). Default: 'mlx'. */
  backend?: string;
  /** Max tokens for generation. Omitted = server default. */
  maxTokens?: number;
  /** Sampling temperature. Omitted = server default. */
  temperature?: number;
  /** Object detection backend: 'florence2' or 'qwen3-vl'. Default: 'florence2'. */
  detectBackend?: string;
  /** Absolute path to the Qwen3-VL model directory. Required when detectBackend='qwen3-vl'. */
  qwen3VlModelPath?: string;
}
/**
 * Ensures the FastVLM server process is running and healthy.
 *
 * If a healthy server is already running (detected via PID file + /health),
 * returns immediately. Otherwise spawns a new detached process.
 *
 * @param config Server parameters derived from plugin config
 * @param onStatus Callback for streaming status messages (ctx.status)
 * @throws if the server fails to become healthy within the timeout
 */
export async function ensureFastvlmServerRunning(
  config: FastVLMServerConfig,
  onStatus: (msg: string) => void
): Promise<void> {
  const { port } = config;
  // Record the port so stopActiveFastvlmServer can find this server later.
  _activePort = port;
  // Adopt any healthy server already listening on the port — unless the backend differs.
  // The PID file may be stale (wrong PID) or absent, but the server can still be running.
  const requestedBackend = config.backend ?? "mlx";
  const requestedDetectBackend = config.detectBackend ?? "florence2";
  try {
    const res = await fetchWithTimeout(
      `http://127.0.0.1:${port}/health`,
      {},
      HEALTH_FETCH_TIMEOUT_MS
    );
    if (res.ok) {
      // Check if the running server's backend or detect_backend differs from what we need.
      let runningBackend: string | null = null;
      let runningDetectBackend: string | null = null;
      try {
        const statusRes = await fetchWithTimeout(
          `http://127.0.0.1:${port}/status`,
          {},
          HEALTH_FETCH_TIMEOUT_MS
        );
        if (statusRes.ok) {
          const statusJson = await statusRes.json() as { backend?: string; detect_backend?: string };
          runningBackend = statusJson.backend ?? null;
          runningDetectBackend = statusJson.detect_backend ?? null;
        }
      } catch {
        // /status unreachable — treat as matching to avoid unnecessary restart.
      }
      // null means "unknown" and counts as a match (see the catch above).
      const backendMismatch = runningBackend !== null && runningBackend !== requestedBackend;
      const detectBackendMismatch = runningDetectBackend !== null && runningDetectBackend !== requestedDetectBackend;
      if (backendMismatch || detectBackendMismatch) {
        const isoNow = () => new Date().toISOString().replace(/\.\d+Z$/, "Z");
        fs.appendFileSync(logFilePath(), `[mgr] ${isoNow()} Backend mismatch: running=${runningBackend}/${runningDetectBackend}, requested=${requestedBackend}/${requestedDetectBackend} — restarting [${port}]\n`);
        onStatus("Restarting server (backend changed)…");
        await stopFastvlmServer(port);
        // Fall through to spawn with new backend.
      } else {
        // Adopting the running server — still make sure the qwen3-vl venv
        // exists if that detect backend was requested, then return.
        if (requestedDetectBackend === "qwen3-vl") {
          const logFile = logFilePath();
          fs.mkdirSync(logsDir(), { recursive: true });
          await ensureQwen3VlVenv(onStatus, logFile);
        }
        const pid = readPid();
        fs.appendFileSync(logFilePath(), `[mgr] ${new Date().toISOString().replace(/\.\d+Z$/, "Z")} Adopted [${port}]${pid !== null ? ` — PID ${pid}` : ""}\n`);
        return;
      }
    }
  } catch {
    // Nothing healthy on the port — proceed to spawn.
  }
  // Kill any stale process recorded in the PID file so the port is free before spawn.
  // NOTE(review): SIGTERM is sent but not awaited, so the old process may still
  // hold the port when the new server starts — confirm the server retries its
  // bind, or that the health-poll loop below absorbs the delay.
  const stalePid = readPid();
  if (stalePid !== null && isProcessAlive(stalePid)) {
    try { process.kill(stalePid, "SIGTERM"); } catch { /* already gone */ }
  }
  fs.mkdirSync(logsDir(), { recursive: true });
  const logFile = logFilePath();
  // MLX/FastVLM is macOS-only; force-disable on other platforms
  const effectiveConfig: FastVLMServerConfig = process.platform !== "darwin"
    ? { ...config, mlxVisionEnabled: false }
    : config;
  await ensureVenv(onStatus, logFile);
  if (effectiveConfig.detectBackend === "qwen3-vl") {
    await ensureQwen3VlVenv(onStatus, logFile);
  }
  // Build the server command line from the effective config.
  const args: string[] = [
    "-m",
    "fastvlm_server",
    "--port",
    String(port),
    "--host",
    "127.0.0.1",
    "--pid-file",
    pidFilePath(),
  ];
  if (effectiveConfig.mlxVisionEnabled && effectiveConfig.modelPath.trim()) {
    args.push("--model", effectiveConfig.modelPath.trim());
  }
  const backend = effectiveConfig.backend ?? "mlx";
  args.push("--backend", backend);
  if (effectiveConfig.maxTokens !== undefined) {
    args.push("--max-tokens", String(effectiveConfig.maxTokens));
  }
  if (effectiveConfig.temperature !== undefined) {
    args.push("--temperature", String(effectiveConfig.temperature));
  }
  if (effectiveConfig.florence2ModelPath?.trim()) {
    args.push("--florence2-model-path", effectiveConfig.florence2ModelPath.trim());
  }
  if (effectiveConfig.detectBackend) {
    args.push("--detect-backend", effectiveConfig.detectBackend);
  }
  if (effectiveConfig.qwen3VlModelPath?.trim()) {
    args.push("--qwen3-vl-model-path", effectiveConfig.qwen3VlModelPath.trim());
  }
  // Plugin mode: always lazy-load models on first request
  args.push("--lazy");
  // Python server writes its own log via --log-file (FileHandler in setup_logging).
  // stdout/stderr are redirected to /dev/null by the bash intermediary below.
  // NOTE(review): with the direct-spawn TEST MODE below there is no bash
  // intermediary any more (stdio is "ignore"); update this comment when the
  // test block is resolved.
  args.push("--log-file", logFile);
  onStatus("Starting server…");
  // PYTHONPATH points at src/ so "-m fastvlm_server" resolves from the plugin tree.
  const pythonPath = path.join(pluginRoot(), "src");
  const isoNow = () => new Date().toISOString().replace(/\.\d+Z$/, "Z");
  fs.appendFileSync(logFile, `[mgr] ${isoNow()} Starting [${port}] model=${effectiveConfig.modelPath.trim() || "(none)"}\n`);
  // TEST MODE: direct spawn without double-fork — to verify whether LM Studio
  // kills this process on plugin unload/reload. If the server survives without
  // double-fork, we can remove the bash intermediary permanently.
  // Revert this block to the double-fork variant once the test result is known.
  const child = spawn(venvPython(), args, {
    cwd: pluginRoot(),
    detached: true,
    stdio: "ignore",
    env: { ...process.env, PYTHONPATH: pythonPath },
  });
  child.unref();
  // NOTE(review): this log line says "via double-fork" but the spawn above is
  // the direct TEST MODE variant — misleading until the block is reverted.
  fs.appendFileSync(logFile, `[mgr] ${isoNow()} Spawned [${port}] via double-fork\n`);
  // NOTE(review): the attempt number n is ignored — every poll reports the
  // same "Loading…" status.
  const ready = await pollHealth(port, (n) => {
    onStatus("Loading…");
  });
  if (!ready) {
    throw new Error(
      `FastVLM server did not become healthy within ${HEALTH_POLL_TIMEOUT_MS / 1000}s. ` +
      `Check logs: ${logFile}`
    );
  }
  const startedPid = readPid();
  fs.appendFileSync(logFile, `[mgr] ${isoNow()} Started [${port}]${startedPid !== null ? ` — PID ${startedPid}` : ""}\n`);
  onStatus("Server ready");
}
/**
 * Shuts down the server gracefully via POST /shutdown, falling back to
 * SIGTERM when the HTTP request fails. The PID file is removed afterwards
 * regardless of outcome. Only acts when the recorded PID is still alive.
 */
export async function stopFastvlmServer(port: number): Promise<void> {
  const logFile = logFilePath();
  const isoNow = () => new Date().toISOString().replace(/\.\d+Z$/, "Z");
  const pid = readPid();
  if (pid !== null && isProcessAlive(pid)) {
    fs.appendFileSync(logFile, `[mgr] ${isoNow()} Stopping [${port}]...\n`);
    let shutdownRequested = true;
    try {
      await fetchWithTimeout(
        `http://127.0.0.1:${port}/shutdown`,
        { method: "POST" },
        3_000
      );
    } catch {
      shutdownRequested = false;
    }
    if (!shutdownRequested) {
      // Graceful route failed — fall back to a signal.
      try {
        process.kill(pid, "SIGTERM");
      } catch {
        // process already gone
      }
    }
    fs.appendFileSync(logFile, `[mgr] ${isoNow()} Stopped [${port}]\n`);
  }
  try {
    fs.unlinkSync(pidFilePath());
  } catch {
    // already gone
  }
}
/**
 * Stops whichever server this process session last started or adopted.
 * Uses the port recorded by the most recent ensureFastvlmServerRunning call,
 * falling back to the default port 8765 when none was recorded.
 */
export async function stopActiveFastvlmServer(): Promise<void> {
  const port = _activePort ?? 8765;
  await stopFastvlmServer(port);
}