// src/promptPreprocessor.ts
import {
type ChatMessage,
type FileHandle,
type LLMDynamicHandle,
type PredictionProcessStatusController,
type PromptPreprocessorController,
} from "@lmstudio/sdk";
import { configSchematics } from "./config";
// Hybrid context-injection strategy.
//
// The llama-server engine's OpenAI-compatible serializer only accepts image content parts;
// any non-image FileHandle that survives preprocessing throws EngineProtocolCapabilityError
// ('application/pdf'). We therefore DETACH all non-image handles up front (preventing the
// crash at its root), parse them to text, and then decide — by projected token load —
// whether to inject the full extracted text or fall back to retrieval (RAG).
// Routing budget, as a fraction of the model's context window. The projected load
// (extracted document tokens + existing history tokens) may occupy at most this fraction;
// above it, preprocess() falls back from full-content injection to retrieval.
const FULL_CONTENT_BUDGET = 0.85;
// Warning threshold, as a fraction of the model's context window. It is compared against the
// extracted document tokens alone (history excluded); exceeding it is what makes the
// "response may be truncated" warning applicable.
const TRUNCATION_WARNING_FRACTION = 0.8;
/** One attachment that was parsed to text, together with the token count of that text. */
interface ParsedDocument {
// Handle of the attachment the text was extracted from (its `name` labels the document).
file: FileHandle;
// Text returned by the document parser for this attachment.
content: string;
// Number of tokens in `content`, as counted by the model's tokenizer.
tokenCount: number;
}
/**
 * Prompt preprocessor entry point: converts non-image attachments into text context.
 *
 * Pipeline:
 *  1. Detach every non-image file handle from the message.
 *  2. Parse each handle to text and count its tokens; a file that fails to parse is reported
 *     through its status entry and skipped.
 *  3. Route on projected load (document tokens + history tokens): inject the full text when it
 *     fits within FULL_CONTENT_BUDGET of the context window, otherwise fall back to retrieval
 *     over the files that parsed successfully.
 *
 * @param ctl - Preprocessor controller supplying the client, status UI, debug log and abort
 *   signal.
 * @param userMessage - Incoming user message. Its non-image attachments are consumed, and on
 *   the full-content path its text is replaced in place.
 * @returns The message itself (unchanged when there is nothing to process, rewritten on the
 *   full-content path) or the replacement prompt text on the retrieval path.
 * @throws Re-throws the underlying error when the abort signal has fired during parsing, so a
 *   cancellation is never reported as an unparseable file.
 */
export async function preprocess(
  ctl: PromptPreprocessorController,
  userMessage: ChatMessage,
): Promise<string | ChatMessage> {
  const originalUserPrompt = userMessage.getText();

  // (1) Capture and DETACH every non-image file handle before any applyPromptTemplate call.
  // Detaching here is what prevents the 'application/pdf' capability crash; the captured
  // handles are reused for parsing (full-content) and for files.retrieve (retrieval).
  const files = userMessage.consumeFiles(ctl.client, file => file.type !== "image");
  if (files.length === 0) {
    return userMessage;
  }

  const model = await ctl.client.llm.model();
  const status = ctl.createStatus({
    status: "loading",
    text: `Analyzing ${files.length} attached document(s)...`,
  });

  // (2) Parse each handle to text and count its tokens. Counting on the extracted string is
  // safe — no file handle is attached. (4) Each parse is isolated in try/catch: a failure is
  // surfaced as a status and the file is skipped rather than aborting preprocessing.
  const parsed: ParsedDocument[] = [];
  for (const file of files) {
    const fileStatus = status.addSubStatus({
      status: "loading",
      text: `Parsing ${file.name}...`,
    });
    try {
      const { content } = await ctl.client.files.parseDocument(file, {
        signal: ctl.abortSignal,
        onProgress: progress => {
          fileStatus.setState({
            status: "loading",
            text: `Parsing ${file.name}... (${(progress * 100).toFixed(1)}%)`,
          });
        },
      });
      const tokenCount = await model.countTokens(content);
      parsed.push({ file, content, tokenCount });
      fileStatus.setState({ status: "done", text: `Parsed ${file.name} (${tokenCount} tokens)` });
    } catch (error) {
      // A cancellation is not a parse failure: stop instead of marking this (and every
      // remaining) file as broken and then routing on a partial document set.
      if (ctl.abortSignal.aborted) {
        fileStatus.setState({ status: "canceled", text: `Canceled parsing ${file.name}` });
        status.setState({ status: "canceled", text: `Document analysis canceled` });
        throw error;
      }
      const message = error instanceof Error ? error.message : String(error);
      fileStatus.setState({ status: "error", text: `Failed to parse ${file.name}: ${message}` });
      ctl.debug(`Failed to parse ${file.name}: ${message}`);
    }
  }
  if (parsed.length === 0) {
    status.setState({ status: "error", text: `No attached document could be parsed` });
    return userMessage;
  }

  // (3) Decide routing from projected context load: extracted document tokens + the tokens
  // already present in the conversation history.
  const extractedTokens = parsed.reduce((sum, doc) => sum + doc.tokenCount, 0);
  const historyTokens = await measureHistoryTokens(ctl, model);
  const contextLength = await model.getContextLength();
  const projectedTokens = extractedTokens + historyTokens;
  const fullContentBudget = Math.floor(FULL_CONTENT_BUDGET * contextLength);
  ctl.debug(
    `Routing: extracted=${extractedTokens}, history=${historyTokens}, ` +
      `projected=${projectedTokens}, contextLength=${contextLength}, budget=${fullContentBudget}`,
  );

  // (3) Route.
  if (projectedTokens <= fullContentBudget) {
    // (5) Truncation warning when the extracted content alone is large. It is only emitted on
    // this route, because only here is the extracted text actually placed in the context; on
    // the retrieval route the claim "occupies N% of context window" would be false.
    // (There is no native "warning" status; "done" + a warning label is the honest
    // non-failure representation.)
    if (extractedTokens > TRUNCATION_WARNING_FRACTION * contextLength) {
      const percent = ((extractedTokens / contextLength) * 100).toFixed(0);
      // Non-ASCII text is written as \u escapes so the source stays ASCII-safe:
      //   \u26A0 = warning sign, \u00B7 = middle dot,
      //   first Hebrew word = "warning", Hebrew phrase = "content may be truncated".
      ctl.createStatus({
        status: "done",
        text:
          `\u26A0 \u05D0\u05D6\u05D4\u05E8\u05D4 \u00B7 WARNING \u00B7 ` +
          `Extracted content occupies ${percent}% of context window. ` +
          `\u05EA\u05D5\u05DB\u05DF \u05E2\u05E9\u05D5\u05D9 \u05DC\u05D4\u05D9\u05D7\u05EA\u05DA ` +
          `\u00B7 Response may be truncated.`,
      });
    }
    status.setState({
      status: "done",
      text: `Strategy: inject-full-content (${projectedTokens} / ${contextLength} tokens)`,
    });
    return prepareDocumentContextInjection(userMessage, originalUserPrompt, parsed);
  }

  status.setState({
    status: "done",
    text: `Strategy: retrieval (${projectedTokens} tokens exceeds ${fullContentBudget}-token budget)`,
  });
  // Forward only the handles that parsed: files reported as failed above stay skipped on
  // this route too, instead of being handed to the retriever.
  const retrievableFiles = parsed.map(doc => doc.file);
  return await prepareRetrievalResultsContextInjection(ctl, originalUserPrompt, retrievableFiles);
}
/**
 * Best-effort token count of the conversation history.
 *
 * The pulled history has its non-image file handles removed before it is run through the
 * prompt template, so templating never sees a document attachment. Any error along the way
 * is logged through `ctl.debug` and reported as 0 tokens, which can only bias the caller
 * toward full-content injection.
 *
 * @param ctl - Controller used to pull the history and to write the debug log.
 * @param model - Model handle used to apply the prompt template and count tokens.
 * @returns Token count of the templated history, or 0 if it could not be measured.
 */
async function measureHistoryTokens(
  ctl: PromptPreprocessorController,
  model: LLMDynamicHandle,
): Promise<number> {
  const isNonImage = (handle: FileHandle): boolean => handle.type !== "image";

  try {
    const history = await ctl.pullHistory();
    // Only token counts matter here, so attachments are dropped from the pulled history.
    history.consumeFiles(ctl.client, isNonImage);
    const templated = await model.applyPromptTemplate(history);
    const tokenTotal = await model.countTokens(templated);
    return tokenTotal;
  } catch (failure) {
    let reason: string;
    if (failure instanceof Error) {
      reason = failure.message;
    } else {
      reason = String(failure);
    }
    ctl.debug(`History token measurement failed; treating history as 0 tokens: ${reason}`);
    return 0;
  }
}
/**
 * Full-content injection path.
 *
 * Builds one text envelope (a preamble, one delimited section per parsed document, then the
 * instruction and the original query) and writes it into the message with `replaceText`.
 *
 * @param input - Message whose text is replaced in place.
 * @param originalUserPrompt - The user's query as typed, appended after the documents.
 * @param parsed - Documents to embed, in the order they should appear.
 * @returns The same `input` message, now carrying the assembled envelope.
 */
function prepareDocumentContextInjection(
  input: ChatMessage,
  originalUserPrompt: string,
  parsed: ParsedDocument[],
): ChatMessage {
  // One delimited section per document; the file name appears on both fences.
  const renderSection = ({ file, content }: ParsedDocument): string =>
    `---\nSTART OF DOCUMENT: ${file.name}\n\n${content}\n\nEND OF DOCUMENT: ${file.name}\n---\n\n`;

  const preamble = "The following content was extracted from the file(s) provided by the user.\n\n";
  const closing = `Based on the document content above, respond to the user query.\n\nUser query: ${originalUserPrompt}`;

  const envelope = [preamble, ...parsed.map(renderSection), closing].join("");
  input.replaceText(envelope);
  return input;
}
/**
 * Retrieval path. Runs the user query against the captured file handles (passed in directly,
 * NOT pulled from message attachments) and turns the best-scoring chunks into a prompt.
 *
 * Steps: read `retrievalLimit` and `retrievalAffinityThreshold` from the plugin config, load
 * the embedding model, call `files.retrieve` while mirroring per-file progress into status
 * entries, keep only entries whose score is strictly above the threshold, register them via
 * `ctl.addCitations`, and build the replacement prompt text.
 *
 * @param ctl - Controller supplying config, client, status UI, debug log and abort signal.
 * @param originalUserPrompt - The user's query; used as the retrieval query and echoed at the
 *   end of the returned prompt.
 * @param files - File handles to retrieve from.
 * @returns Prompt text containing the numbered citations, or — when nothing scores above the
 *   threshold — an instruction to tell the user that no citations were found.
 */
async function prepareRetrievalResultsContextInjection(
  ctl: PromptPreprocessorController,
  originalUserPrompt: string,
  files: Array<FileHandle>,
): Promise<string> {
  const pluginConfig = ctl.getPluginConfig(configSchematics);
  const retrievalLimit = pluginConfig.get("retrievalLimit");
  const retrievalAffinityThreshold = pluginConfig.get("retrievalAffinityThreshold");

  // Per-file status entries, created when the retriever announces its work list.
  const statusSteps = new Map<FileHandle, PredictionProcessStatusController>();
  // Progress display is cosmetic: if a callback names a handle that was never announced,
  // skip the update instead of throwing and failing the whole retrieval.
  const showFileProgress = (file: FileHandle, status: "loading" | "done", text: string): void => {
    statusSteps.get(file)?.setState({ status, text });
  };

  const retrievingStatus = ctl.createStatus({
    status: "loading",
    text: `Loading an embedding model for retrieval...`,
  });
  const model = await ctl.client.embedding.model("nomic-ai/nomic-embed-text-v1.5-GGUF", {
    signal: ctl.abortSignal,
  });
  retrievingStatus.setState({
    status: "loading",
    text: `Retrieving relevant citations for user query...`,
  });

  const result = await ctl.client.files.retrieve(originalUserPrompt, files, {
    embeddingModel: model,
    limit: retrievalLimit,
    signal: ctl.abortSignal,
    onFileProcessList(filesToProcess) {
      for (const file of filesToProcess) {
        statusSteps.set(
          file,
          retrievingStatus.addSubStatus({
            status: "waiting",
            text: `Process ${file.name} for retrieval`,
          }),
        );
      }
    },
    onFileProcessingStart(file) {
      showFileProgress(file, "loading", `Processing ${file.name} for retrieval`);
    },
    onFileProcessingEnd(file) {
      showFileProgress(file, "done", `Processed ${file.name} for retrieval`);
    },
    onFileProcessingStepProgress(file, step, progressInStep) {
      const verb = step === "loading" ? "Loading" : step === "chunking" ? "Chunking" : "Embedding";
      showFileProgress(
        file,
        "loading",
        `${verb} ${file.name} for retrieval (${(progressInStep * 100).toFixed(1)}%)`,
      );
    },
  });

  // Filter in place on `result` so that addCitations below registers only the kept entries.
  result.entries = result.entries.filter(entry => entry.score > retrievalAffinityThreshold);

  let processedContent = "";
  const numRetrievals = result.entries.length;
  if (numRetrievals > 0) {
    retrievingStatus.setState({
      status: "done",
      text: `Retrieved ${numRetrievals} relevant citations for user query`,
    });
    ctl.debug("Retrieval results", result);
    processedContent += "The following citations were found in the files provided by the user:\n\n";
    result.entries.forEach((entry, index) => {
      processedContent += `Citation ${index + 1}: "${entry.content}"\n\n`;
    });
    await ctl.addCitations(result);
    processedContent +=
      `Use the citations above to respond to the user query, only if they are relevant. ` +
      `Otherwise, respond to the best of your ability without them.` +
      `\n\nUser Query:\n\n${originalUserPrompt}`;
  } else {
    retrievingStatus.setState({
      status: "canceled",
      text: `No relevant citations found for user query`,
    });
    ctl.debug("No relevant citations found for user query");
    processedContent =
      `Important: No citations were found in the user files for the user query. ` +
      `In less than one sentence, inform the user of this. ` +
      `Then respond to the query to the best of your ability.` +
      `\n\nUser Query:\n\n${originalUserPrompt}`;
  }
  ctl.debug("Processed content", processedContent);
  return processedContent;
}