Project Files
src
config.ts
index.ts
promptPreprocessor.ts
.gitignore
manifest.json
package-lock.json
package.json
README.md
tsconfig.json
src / promptPreprocessor.ts
import {
  type ChatMessage,
  type FileHandle,
  type LLMDynamicHandle,
  type PredictionProcessStatusController,
  type PromptPreprocessorController,
} from "@lmstudio/sdk";
import { configSchematics } from "./config";

// Hybrid context-injection strategy.
//
// The llama-server engine's OpenAI-compatible serializer only accepts image content parts;
// any non-image FileHandle that survives preprocessing throws EngineProtocolCapabilityError
// ('application/pdf'). We therefore DETACH all non-image handles up front (preventing the
// crash at its root), parse them to text, and then decide — by projected token load —
// whether to inject the full extracted text or fall back to retrieval (RAG).

// Routing threshold: the combined token count of the extracted document text and the
// existing conversation history may occupy at most this fraction of the model's context
// length; above it, preprocessing falls back from full-content injection to retrieval.
const FULL_CONTENT_BUDGET = 0.85;
// Warning threshold: when the extracted document text ALONE (history not included)
// exceeds this fraction of the context length, a truncation warning status is shown.
const TRUNCATION_WARNING_FRACTION = 0.8;

/** One attachment that was parsed to text successfully, plus its measured size. */
interface ParsedDocument {
  /** The detached file handle the text was extracted from. */
  file: FileHandle;
  /** Text content returned by the document parser for this file. */
  content: string;
  /** Number of tokens in `content`, as counted by the loaded LLM. */
  tokenCount: number;
}

/**
 * Prompt-preprocessor entry point implementing the hybrid context-injection strategy.
 *
 * Detaches every non-image attachment from `userMessage`, parses each one to text, and then
 * either inlines the full extracted text or falls back to retrieval, depending on the
 * projected token load (extracted documents + existing history) relative to the model's
 * context length.
 *
 * @param ctl - Controller providing the client, status reporting, history and abort signal.
 * @param userMessage - The incoming user message; its non-image files are removed in place.
 * @returns The message itself (with non-image files detached) when there is nothing to
 *   inject, the message with its text replaced by the full-content envelope, or the
 *   retrieval prompt as a plain string.
 * @throws Re-throws the underlying error when the request was aborted while parsing, so a
 *   cancellation is never reported as a per-file parse failure.
 */
export async function preprocess(
  ctl: PromptPreprocessorController,
  userMessage: ChatMessage,
): Promise<ChatMessage | string> {
  const originalUserPrompt = userMessage.getText();

  // (1) Capture and DETACH every non-image file handle before any applyPromptTemplate call.
  // Detaching here is what prevents the 'application/pdf' capability crash; the captured
  // handles are reused for parsing (full-content) and for files.retrieve (retrieval).
  const files = userMessage.consumeFiles(ctl.client, file => file.type !== "image");
  if (files.length === 0) {
    return userMessage;
  }

  const model = await ctl.client.llm.model();
  const status = ctl.createStatus({
    status: "loading",
    text: `Analyzing ${files.length} attached document(s)...`,
  });

  // (2) Parse each handle to text and count its tokens. Counting on the extracted string is
  // safe — no file handle is attached. (4) Each parse is isolated in try/catch: a failure is
  // surfaced as a status and the file is skipped rather than aborting preprocessing.
  const parsed: ParsedDocument[] = [];
  for (const file of files) {
    const fileStatus = status.addSubStatus({
      status: "loading",
      text: `Parsing ${file.name}...`,
    });
    try {
      const { content } = await ctl.client.files.parseDocument(file, {
        signal: ctl.abortSignal,
        onProgress: progress => {
          fileStatus.setState({
            status: "loading",
            text: `Parsing ${file.name}... (${(progress * 100).toFixed(1)}%)`,
          });
        },
      });
      const tokenCount = await model.countTokens(content);
      parsed.push({ file, content, tokenCount });
      fileStatus.setState({ status: "done", text: `Parsed ${file.name} (${tokenCount} tokens)` });
    } catch (error) {
      // A cancellation is not a parse failure: stop immediately instead of labelling the
      // file as broken and carrying on with the remaining files and the routing decision.
      if (ctl.abortSignal.aborted) {
        fileStatus.setState({ status: "canceled", text: `Canceled parsing ${file.name}` });
        status.setState({ status: "canceled", text: `Document analysis canceled` });
        throw error;
      }
      const message = error instanceof Error ? error.message : String(error);
      fileStatus.setState({ status: "error", text: `Failed to parse ${file.name}: ${message}` });
      ctl.debug(`Failed to parse ${file.name}: ${message}`);
    }
  }

  if (parsed.length === 0) {
    status.setState({ status: "error", text: `No attached document could be parsed` });
    return userMessage;
  }

  // (3) Decide routing from projected context load: extracted document tokens + the tokens
  // already present in the conversation history.
  const extractedTokens = parsed.reduce((sum, doc) => sum + doc.tokenCount, 0);
  const historyTokens = await measureHistoryTokens(ctl, model);
  const contextLength = await model.getContextLength();
  const projectedTokens = extractedTokens + historyTokens;
  const fullContentBudget = Math.floor(FULL_CONTENT_BUDGET * contextLength);

  ctl.debug(
    `Routing: extracted=${extractedTokens}, history=${historyTokens}, ` +
      `projected=${projectedTokens}, contextLength=${contextLength}, budget=${fullContentBudget}`,
  );

  // (5) Truncation warning when extracted content alone is large. (There is no native
  // "warning" status; "done" + a "⚠ WARNING" label is the honest non-failure representation.)
  // NOTE(review): this also fires when the retrieval route is taken below, where the full
  // extracted text is not injected — confirm that is the intended behavior.
  if (extractedTokens > TRUNCATION_WARNING_FRACTION * contextLength) {
    const percent = ((extractedTokens / contextLength) * 100).toFixed(0);
    ctl.createStatus({
      status: "done",
      text: `⚠ אזהרה · WARNING · Extracted content occupies ${percent}% of context window. תוכן עשוי להיחתך · Response may be truncated.`,
    });
  }

  // (3) Route.
  if (projectedTokens <= fullContentBudget) {
    status.setState({
      status: "done",
      text: `Strategy: inject-full-content (${projectedTokens} / ${contextLength} tokens)`,
    });
    return prepareDocumentContextInjection(userMessage, originalUserPrompt, parsed);
  }

  status.setState({
    status: "done",
    text: `Strategy: retrieval (${projectedTokens} tokens exceeds ${fullContentBudget}-token budget)`,
  });
  // Only hand retrieval the handles that parsed successfully. Files that failed above were
  // already reported as skipped; passing them on again would contradict that status and
  // would be inconsistent with the full-content route, which also ignores them.
  const parsedFiles = parsed.map(doc => doc.file);
  return await prepareRetrievalResultsContextInjection(ctl, originalUserPrompt, parsedFiles);
}

/**
 * Estimates the token cost of the conversation so far.
 *
 * The pulled history has its non-image file handles removed before the prompt template is
 * applied, so templating cannot trip over the 'application/pdf' capability gap. This is a
 * best-effort measurement: any failure is logged through `ctl.debug` and reported as 0,
 * which at worst nudges routing slightly toward full-content injection.
 *
 * @param ctl - Controller used to pull the history and to emit the debug message.
 * @param model - LLM handle used to apply the prompt template and count tokens.
 * @returns The token count of the templated history, or 0 when measuring fails.
 */
async function measureHistoryTokens(
  ctl: PromptPreprocessorController,
  model: LLMDynamicHandle,
): Promise<number> {
  try {
    const history = await ctl.pullHistory();
    // Only the token count matters here, so drop every attachment that is not an image.
    history.consumeFiles(ctl.client, attachment => attachment.type !== "image");
    const rendered = await model.applyPromptTemplate(history);
    const tokenTotal = await model.countTokens(rendered);
    return tokenTotal;
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    ctl.debug(`History token measurement failed; treating history as 0 tokens: ${reason}`);
    return 0;
  }
}

/**
 * Full-content injection route.
 *
 * Builds a text-only prompt consisting of a preamble, one delimited section per parsed
 * document (in the order given), and the original user query, then replaces the text of
 * `input` with it. No file handles are involved: the caller has already detached and
 * parsed them.
 *
 * @param input - Message whose text is replaced in place.
 * @param originalUserPrompt - The user's query as originally typed.
 * @param parsed - Documents whose extracted text is inlined.
 * @returns The same `input` instance, now carrying the assembled prompt.
 */
function prepareDocumentContextInjection(
  input: ChatMessage,
  originalUserPrompt: string,
  parsed: ParsedDocument[],
): ChatMessage {
  const preamble = "The following content was extracted from the file(s) provided by the user.\n\n";

  // Each document is fenced by '---' lines and START/END markers naming the file.
  const documentSections = parsed
    .map(
      ({ file, content }) =>
        `---\nSTART OF DOCUMENT: ${file.name}\n\n${content}\n\nEND OF DOCUMENT: ${file.name}\n---\n\n`,
    )
    .join("");

  const closing = `Based on the document content above, respond to the user query.\n\nUser query: ${originalUserPrompt}`;

  input.replaceText(preamble + documentSections + closing);
  return input;
}

/**
 * Retrieval route.
 *
 * Loads the embedding model, runs `files.retrieve` for the user query over the supplied
 * file handles (they are passed in directly rather than read from message attachments),
 * keeps the entries scoring above the configured affinity threshold, and turns them into a
 * citation-bearing prompt. Per-file progress is mirrored into sub-statuses of a single
 * retrieval status.
 *
 * @param ctl - Controller providing plugin config, client, status, debug and citations.
 * @param originalUserPrompt - The user's query; used for retrieval and echoed in the prompt.
 * @param files - File handles to search.
 * @returns The prompt text: numbered citations plus instructions when something relevant
 *   was found, otherwise an instruction to tell the user no citations were found.
 */
async function prepareRetrievalResultsContextInjection(
  ctl: PromptPreprocessorController,
  originalUserPrompt: string,
  files: Array<FileHandle>,
): Promise<string> {
  const settings = ctl.getPluginConfig(configSchematics);
  const maxCitations = settings.get("retrievalLimit");
  const minAffinity = settings.get("retrievalAffinityThreshold");

  // One sub-status per file, registered when the retriever announces its work list.
  const perFileStatus = new Map<FileHandle, PredictionProcessStatusController>();

  const overallStatus = ctl.createStatus({
    status: "loading",
    text: `Loading an embedding model for retrieval...`,
  });
  const embeddingModel = await ctl.client.embedding.model("nomic-ai/nomic-embed-text-v1.5-GGUF", {
    signal: ctl.abortSignal,
  });
  overallStatus.setState({
    status: "loading",
    text: `Retrieving relevant citations for user query...`,
  });

  const retrieval = await ctl.client.files.retrieve(originalUserPrompt, files, {
    embeddingModel,
    limit: maxCitations,
    signal: ctl.abortSignal,
    onFileProcessList: pendingFiles => {
      pendingFiles.forEach(pending => {
        const subStatus = overallStatus.addSubStatus({
          status: "waiting",
          text: `Process ${pending.name} for retrieval`,
        });
        perFileStatus.set(pending, subStatus);
      });
    },
    onFileProcessingStart: current => {
      const subStatus = perFileStatus.get(current)!;
      subStatus.setState({ status: "loading", text: `Processing ${current.name} for retrieval` });
    },
    onFileProcessingEnd: finished => {
      const subStatus = perFileStatus.get(finished)!;
      subStatus.setState({ status: "done", text: `Processed ${finished.name} for retrieval` });
    },
    onFileProcessingStepProgress: (current, step, progressInStep) => {
      let verb = "Embedding";
      if (step === "loading") {
        verb = "Loading";
      } else if (step === "chunking") {
        verb = "Chunking";
      }
      const subStatus = perFileStatus.get(current)!;
      subStatus.setState({
        status: "loading",
        text: `${verb} ${current.name} for retrieval (${(progressInStep * 100).toFixed(1)}%)`,
      });
    },
  });

  // Drop weak matches in place: the same result object is later handed to addCitations.
  retrieval.entries = retrieval.entries.filter(entry => entry.score > minAffinity);

  const querySuffix = `\n\nUser Query:\n\n${originalUserPrompt}`;
  const citationCount = retrieval.entries.length;
  let processedContent: string;

  if (citationCount === 0) {
    overallStatus.setState({
      status: "canceled",
      text: `No relevant citations found for user query`,
    });
    ctl.debug("No relevant citations found for user query");
    processedContent =
      `Important: No citations were found in the user files for the user query. ` +
      `In less than one sentence, inform the user of this. ` +
      `Then respond to the query to the best of your ability.` +
      querySuffix;
  } else {
    overallStatus.setState({
      status: "done",
      text: `Retrieved ${citationCount} relevant citations for user query`,
    });
    ctl.debug("Retrieval results", retrieval);

    // Citations are numbered from 1 in the order the retriever returned them.
    const citationText = retrieval.entries
      .map((entry, index) => `Citation ${index + 1}: "${entry.content}"\n\n`)
      .join("");
    await ctl.addCitations(retrieval);

    processedContent =
      "The following citations were found in the files provided by the user:\n\n" +
      citationText +
      `Use the citations above to respond to the user query, only if they are relevant. ` +
      `Otherwise, respond to the best of your ability without them.` +
      querySuffix;
  }

  ctl.debug("Processed content", processedContent);
  return processedContent;
}
rag-v1

rag-v1