// Project file: src/config.ts
import { createConfigSchematics } from "@lmstudio/sdk";
/**
 * Global plugin configuration schema.
 *
 * Declares every user-visible setting for the vision tooling: chat preview
 * toggle, MLX Vision (FastVLM) server options, Florence-2 / Qwen3-VL object
 * detection options, server lifetime, and the local HTTP image server port.
 *
 * NOTE: field keys (e.g. "mlxVisionEndpoint") are the persisted identifiers —
 * do not rename them, or existing user configs will silently lose values.
 */
export const globalConfigSchematics = createConfigSchematics()
  // Chat UX: inline image previews attached to tool responses.
  .field(
    "PREVIEW_IN_CHAT",
    "boolean",
    {
      displayName: "Previews in Chat",
      subtitle:
        "When enabled, tool responses include inline image previews. Recommended for local models without vision capability.",
    },
    true
  )
  // Whether the FastVLM vision model is loaded when the server starts.
  .field(
    "mlxVisionEnabled",
    "boolean",
    {
      displayName: "MLX Vision: Load Model",
      subtitle:
        "When enabled, the FastVLM model is loaded on server start. Disable if you do not use vision analysis.",
    },
    true
  )
  // Backend selector: off = MLX Metal GPU, on = CoreML CPU vision tower.
  .field(
    "mlxVisionBackend",
    "boolean",
    {
      displayName: "FastVLM: CoreML Vision Backend",
      subtitle:
        "Off (default): MLX Metal GPU. On: CoreML CPU inference for vision — frees Metal GPU for language generation (requires fastvithd.mlpackage).",
      engineDoesNotSupport: true,
    },
    false
  )
  // HTTP endpoint the plugin calls for image analysis.
  .field(
    "mlxVisionEndpoint",
    "string",
    {
      displayName: "MLX Vision Endpoint",
      subtitle:
        "URL of the MLX Vision /analyze endpoint. Default: http://localhost:8765/analyze",
      placeholder: "http://localhost:8765/analyze",
      engineDoesNotSupport: true,
    },
    "http://localhost:8765/analyze"
  )
  // Default analysis prompt; the multi-line default is assembled below.
  .field(
    "mlxVisionPrompt",
    "string",
    {
      displayName: "Vision Prompt",
      subtitle:
        "Default prompt sent to the vision model. Leave empty to use the model default.",
      placeholder: "",
      isParagraph: true,
    },
    [
      "Analyze this image based strictly on what is directly visible. Do not infer, assume, or complete information that is not present.",
      "",
      "STEP 1 — GROUND TRUTH (always required):",
      'Before any detailed analysis, state in one sentence what the image actually shows (e.g., "This image shows a person", "This image shows a geometric shape", "This image shows a product on a plain background"). If the image does not contain a person, skip all person-specific sections below and describe only what is present.',
      "",
      "---",
      "",
      "IF AND ONLY IF a person is visible, describe:",
      "",
      "1. SUBJECT PHYSICAL CHARACTERISTICS:",
      "   - Face shape, skin tone, facial features (eyes, nose, lips, eyebrows) — only what is clearly visible",
      "   - Hair: color, length, style, texture, specific arrangement",
      "   - Visible age indicators and gender markers based on physical traits alone",
      "",
      "2. CLOTHING & DESIGN ELEMENTS — only if clothing is present:",
      "   - Garment type, colors, patterns, textures (only if present and visible)",
      "   - Specific design features (lines, shapes, geometric elements)",
      "   - Color palette with exact color names where possible",
      "",
      "3. COMPOSITION & FRAMING:",
      "   - What is included in the frame (head position, body coverage)",
      "   - Background characteristics and spatial relationships",
      "   - Lighting quality, direction, shadow patterns",
      "",
      "4. CULTURAL & ETHNIC INDICATORS:",
      "   - Specific facial features that suggest ethnic background",
      "   - Any visible cultural markers in clothing or styling",
      "   - Note only what is visually present, not inferred",
      "",
      "---",
      "",
      "IF no person is visible, describe only:",
      "- Shape, form, color, texture, and spatial relationships of what is actually present",
      "- Composition and framing as above",
      "",
      "---",
      "",
      "INTERPRETATIONS (separate section, always):",
      "- Based solely on the observable facts above, note any stylistic intentions or design approaches suggested by the visual evidence",
      "- Clearly distinguish between what IS seen and what CAN BE INFERRED",
      "",
      'Avoid vague terms like "beautiful," "modern," "gender-neutral" unless supported by specific visual evidence. Never describe content that is not present in the image.',
    ].join("\n")
  )
  // HTTP endpoint the plugin calls for object detection.
  .field(
    "detectEndpoint",
    "string",
    {
      displayName: "Florence-2 Detect Endpoint",
      subtitle:
        "URL of the Florence-2 /detect endpoint. Default: http://localhost:8765/detect",
      placeholder: "http://localhost:8765/detect",
      engineDoesNotSupport: true,
    },
    "http://localhost:8765/detect"
  )
  // Local FastVLM weights location (empty by default — user must set it).
  .field(
    "mlxVisionModelPath",
    "string",
    {
      displayName: "MLX Vision: Model Path",
      subtitle:
        "Absolute path to the FastVLM model directory (e.g. FastVLM-7B-int4). Required for Node-managed server mode.",
      placeholder: "~/Documents/Models/FastVLM-7B-MLX",
    },
    ""
  )
  // Port shared by the /analyze and /detect endpoints.
  .field(
    "mlxVisionPort",
    "numeric",
    {
      displayName: "MLX Vision: Port",
      subtitle:
        "Port for the local FastVLM server (shared with Florence-2 detect). Default: 8765.",
    },
    8765
  )
  // Generation length cap for vision responses.
  .field(
    "mlxVisionMaxTokens",
    "numeric",
    {
      displayName: "MLX Vision: Max Tokens",
      subtitle: "Maximum response length in tokens (1–4096). Default: 384.",
    },
    384
  )
  // Sampling temperature for vision responses.
  .field(
    "mlxVisionTemperature",
    "numeric",
    {
      displayName: "MLX Vision: Temperature",
      subtitle: "Sampling temperature (0.0–2.0). Default: 0.7.",
    },
    0.7
  )
  // Whether the detection model is loaded when the server starts.
  .field(
    "detectEnabled",
    "boolean",
    {
      displayName: "Detection: Load Model",
      subtitle:
        "When enabled, the detection model is loaded on server start. Disable if object detection is not needed.",
    },
    true
  )
  // Local Florence-2 weights location (empty by default — user must set it).
  .field(
    "detectModelPath",
    "string",
    {
      displayName: "Florence-2: Model Path",
      subtitle:
        "Absolute path to the Florence-2 model directory. Required for Node-managed server mode.",
      placeholder: "~/Documents/Models/Florence-2-large",
    },
    ""
  )
  // Detection backend selector: off = Florence-2, on = Qwen3-VL.
  .field(
    "detectBackend",
    "boolean",
    {
      displayName: "Detection Backend: Use Qwen3-VL",
      subtitle:
        "When enabled, Qwen3-VL is used for object detection instead of Florence-2. Requires Qwen3-VL Model Path below.",
      engineDoesNotSupport: false,
    },
    false
  )
  // Default object-detection instruction for the Qwen3-VL backend.
  .field(
    "qwen3VlOdPrompt",
    "string",
    {
      displayName: "Qwen3-VL: Object Detection Prompt",
      subtitle:
        "Instruction sent to Qwen3-VL for default object detection (task omitted or '<OD>'). Leave empty to use the built-in default.",
      placeholder: "",
      isParagraph: true,
      engineDoesNotSupport: false,
    },
    [
      "Detect objects in the image with strict hierarchical prioritization.",
      "",
      "PRIORITY 1 (CRITICAL - MUST DETECT FIRST):",
      '- You MUST detect "human face" (highest priority if a person is present)',
      '- You MUST detect "person" (if no face is clearly visible or if the person is the main subject)',
      "",
      "PRIORITY 2 (MAIN SUBJECT / HERO ELEMENT):",
      "- The most visually prominent object or subject that is NOT part of the background.",
      "- Use specific, concrete labels (e.g., 'red car', 'fluffy owl toy').",
      "- Avoid generic terms like 'object' or 'thing'.",
      "",
      "PRIORITY 3 (CONTEXTUAL BACKGROUND ELEMENTS):",
      "- Only detect background elements if they are significant to the scene composition OR if the main subject is interacting with them.",
      "- Do not detect minor or redundant background details.",
      "",
      "PRIORITY 4 (FOCUSSED MAIN SUBJECT / HERO ELEMENT):",
      "- All visible body parts (hands, feet, arms, legs).",
      "- Elements of the face, as far as clearly detectable and focussed on close-ups: nose, mouth, left and right eyes, eyebrows and ears",
      // BUGFIX: was over-escaped (\\\" → literal backslashes leaked into the
      // prompt the model receives); now emits plain quotes like the lines above.
      "- anatomical details, as far as recognizable as \"focussed\" or \"prominent\" (e.g., 'iris', 'pupil', 'eyelid')",
      "",
      "RULES:",
      "- Maximum 16 objects total.",
      "- Each bounding box must be unique and non-redundant.",
      "- For clothing, name the specific garment (e.g., 'tank top', 'jeans').",
      "- For body parts, qualify by position (e.g., 'left hand').",
      "- NEVER prioritize background elements over the main subject or human face.",
      "- NEVER prioritize anatomical details over general concepts unless they are solely focussed (e.g. only detect 'eyes' unless 'human face' is the dominant part of the image)",
      "- If the main subject is a person, focus on the person and their immediate interactions. Ignore background elements unless they are directly involved in the interaction.",
    ].join("\n")
  )
  // Local Qwen3-VL weights location; required when detectBackend is on.
  .field(
    "qwen3VlModelPath",
    "string",
    {
      displayName: "Qwen3-VL: Model Path",
      subtitle:
        "Absolute path to the Qwen3-VL MLX model directory (e.g. Qwen3-VL-8B-Instruct-MLX-4bit). Required when Detection Backend is set to Qwen3-VL.",
      placeholder: "~/.lmstudio/models/lmstudio-community/Qwen3-VL-8B-Instruct-MLX-4bit",
      engineDoesNotSupport: false,
    },
    ""
  )
  // Server lifetime policy: 0 = never start, N = idle-offload minutes, 1440 = pinned.
  .field(
    "serverTTL",
    "numeric",
    {
      displayName: "Server TTL (minutes)",
      subtitle:
        "Controls server lifetime. 0 = do not start server; N = offload model after N minutes of inactivity; 1440 = keep loaded until LM Studio exits.",
      engineDoesNotSupport: true,
    },
    1440
  )
  // Port for the localhost static image server.
  .field(
    "HTTP_SERVER_PORT",
    "numeric",
    {
      displayName: "Local HTTP Server Port",
      subtitle:
        "Port for serving generated images over localhost (default: 54760).",
      engineDoesNotSupport: true,
    },
    54760
  )
  // Whether PNG-embedded Draw Things parameters are appended to results.
  .field(
    "includeGenerationMetadata",
    "boolean",
    {
      displayName: "Include Generation Metadata",
      subtitle:
        "When enabled, Draw Things generation parameters (prompt, model, sampler, seed, …) embedded in PNG files are appended to each analysis result.",
      engineDoesNotSupport: false,
    },
    true
  )
  .build();
/** Florence-2 model directory from the environment; empty string when unset. */
export const FLORENCE2_MODEL_PATH: string =
  process.env.FLORENCE2_MODEL_PATH ?? "";
/** Detection endpoint URL; overridable via the DETECT_ENDPOINT env var. */
export const DETECT_ENDPOINT: string =
  process.env.DETECT_ENDPOINT ?? "http://localhost:8765/detect";