/** One conversation turn sent in the request's `messages` array: the speaker's role and the text. */
export type ChatMessage = { role: "system" | "user" | "assistant"; content: string };

/** Connection and sampling settings accepted by `complete` and `streamComplete`. */
export type CompleteOptions = {
  /** Interpolated into the request URL as `http://${host}/v1/chat/completions`. */
  host: string;
  /** Sent as the request's `model` field. */
  model: string;
  /** Sent as the request's `temperature` field. */
  temperature: number;
  /** Sent as `max_tokens`; when omitted, the request uses 1500. */
  maxTokens?: number;
};

/**
 * Sends a non-streaming chat-completion request and returns the text of the
 * first choice.
 *
 * @param messages - Conversation to send, in order.
 * @param opts - Target host, model and sampling settings.
 * @returns The non-empty `message.content` of the first choice. A reply that
 *   was truncated but still carries some content is returned as-is.
 * @throws Error when the HTTP status is not OK (the message carries the status
 *   and the response body), or when the first choice has no content. If the
 *   server reports `finish_reason: "length"` for that empty reply, the message
 *   names `max_tokens` as the cause instead of leaving a bare "empty content".
 */
export async function complete(
  messages: ChatMessage[],
  opts: CompleteOptions,
): Promise<string> {
  // Token budget. A narrator turn is ~150 words = ~300 tokens of CONTENT, but
  // Qwen3 with thinking spends 200-500 tokens on reasoning_content, which
  // also count toward max_tokens. At 512 the reply was cut off before the
  // A/B/C options (observed in session 997e90…). 1500 = comfortable margin
  // for thinking + content + 3 options. If a no-thinking model is loaded,
  // lower it via opts.maxTokens.
  const maxTokens = opts.maxTokens ?? 1500;

  const res = await fetch(`http://${opts.host}/v1/chat/completions`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      model: opts.model,
      messages,
      temperature: opts.temperature,
      max_tokens: maxTokens,
      stream: false,
    }),
  });
  if (!res.ok) throw new Error(`LM Studio ${opts.host} ${res.status}: ${await res.text()}`);

  const json = (await res.json()) as {
    choices?: Array<{ finish_reason?: string | null; message?: { content?: string } }>;
  };
  const choice = json.choices?.[0];
  const content = choice?.message?.content;
  if (!content) {
    const base = `LM Studio ${opts.host} returned empty content`;
    // A thinking model can burn the whole budget on reasoning and emit no
    // content at all; say so rather than reporting an unexplained empty reply.
    if (choice?.finish_reason === "length") {
      throw new Error(
        `${base}: generation stopped at max_tokens=${maxTokens} (finish_reason=length) ` +
        `before any content was produced. Raise maxTokens or shorten the prompt.`,
      );
    }
    throw new Error(base);
  }
  return content;
}

/**
 * One incremental piece yielded by `streamComplete`: `"token"` carries a
 * fragment of `delta.content`, `"reasoning"` a fragment of
 * `delta.reasoning_content`.
 */
export type StreamChunk = { kind: "token"; tok: string } | { kind: "reasoning"; tok: string };

/**
 * Sends a streaming chat-completion request and yields the reply piece by
 * piece as the server-sent events arrive.
 *
 * Yields a `"reasoning"` chunk for every `delta.reasoning_content` fragment
 * and a `"token"` chunk for every `delta.content` fragment. Only content
 * fragments are accumulated into the final return value. `data:` payloads
 * that are not valid JSON are skipped.
 *
 * The stream is considered finished at the `[DONE]` sentinel or when the body
 * closes, whichever comes first; the same end-of-stream checks run in both
 * cases. The response body is always cancelled on exit, including when the
 * consumer stops iterating early or an error is thrown.
 *
 * @param messages - Conversation to send, in order.
 * @param opts - Target host, model and sampling settings.
 * @returns The concatenation of all content fragments (generator return value).
 * @throws Error when the HTTP status is not OK or the response has no body,
 *   when the server emits an `event: error` frame, when the last reported
 *   `finish_reason` is `"length"` (reply truncated by `max_tokens`), or when
 *   the stream finished without any content.
 */
export async function* streamComplete(
  messages: ChatMessage[],
  opts: CompleteOptions,
): AsyncGenerator<StreamChunk, string, void> {
  // Token budget. A narrator turn is ~150 words = ~300 tokens of CONTENT, but
  // Qwen3 with thinking spends 200-500 tokens on reasoning_content, which
  // also count toward max_tokens. At 512 the reply was cut off before the
  // A/B/C options (observed in session 997e90…). 1500 = comfortable margin
  // for thinking + content + 3 options. If a no-thinking model is loaded,
  // lower it via opts.maxTokens.
  const maxTokens = opts.maxTokens ?? 1500;

  const res = await fetch(`http://${opts.host}/v1/chat/completions`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      model: opts.model,
      messages,
      temperature: opts.temperature,
      max_tokens: maxTokens,
      stream: true,
    }),
  });
  if (!res.ok || !res.body) {
    throw new Error(`LM Studio ${opts.host} ${res.status}: ${await res.text()}`);
  }

  const reader = res.body.getReader();
  const decoder = new TextDecoder();
  let buf = "";
  let content = "";
  let evt = "";
  let lastFinishReason: string | null = null;

  try {
    pump: while (true) {
      const { value, done } = await reader.read();
      // At end of body, flush the decoder and terminate the buffer so a final
      // line that lacks its trailing newline is still parsed.
      buf += done ? decoder.decode() + "\n" : decoder.decode(value, { stream: true });

      let nl: number;
      while ((nl = buf.indexOf("\n")) !== -1) {
        const line = buf.slice(0, nl).trim();
        buf = buf.slice(nl + 1);
        if (line.startsWith("event:")) { evt = line.slice(6).trim(); continue; }
        if (!line.startsWith("data:")) continue;
        const data = line.slice(5).trim();
        if (data === "[DONE]") break pump;
        if (evt === "error") throw new Error(`LM Studio SSE error: ${data}`);

        // Only the parse is guarded: yielding inside the try would swallow an
        // error the consumer injects with generator.throw().
        let parsed: {
          choices?: Array<{
            finish_reason?: string | null;
            delta?: { content?: string; reasoning_content?: string } | null;
          } | null>;
        } | null;
        try {
          parsed = JSON.parse(data);
        } catch {
          continue; // ignore malformed chunks
        }
        const choice = parsed?.choices?.[0] ?? {};
        const delta = choice.delta ?? {};
        if (choice.finish_reason) lastFinishReason = choice.finish_reason;
        if (delta.reasoning_content) yield { kind: "reasoning", tok: delta.reasoning_content };
        if (delta.content) {
          content += delta.content;
          yield { kind: "token", tok: delta.content };
        }
      }
      if (done) break;
    }
  } finally {
    // Release the HTTP body on every exit path (normal end, thrown error,
    // consumer breaking out of the loop). Cancelling a finished or errored
    // stream may reject; that is not interesting here.
    await reader.cancel().catch(() => undefined);
  }

  if (lastFinishReason === "length") {
    // Cut off by max_tokens — surface an explicit error. Without it the user
    // just sees an incomplete turn (narration + italics without A/B/C) and
    // cannot tell why.
    throw new Error(
      `LM Studio a coupé la génération à max_tokens=${maxTokens} (finish_reason=length). ` +
      `Le tour est tronqué. Augmente maxTokens dans la config ou réduis le system prompt.`,
    );
  }
  if (!content) throw new Error(`LM Studio ${opts.host} streamed empty content`);
  return content;
}
// livre-heros-bac

// livre-heros-bac