Project Files
dist / promptPreprocessor.js
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.preprocess = preprocess;
const sdk_1 = require("@lmstudio/sdk");
const config_1 = require("./config");
const MemoryService_1 = require("./memory/MemoryService");
const embeddingCache_1 = require("./retrieval/embeddingCache");
const queryExpansion_1 = require("./retrieval/queryExpansion");
const hybridSearch_1 = require("./retrieval/hybridSearch");
const dedupe_1 = require("./retrieval/dedupe");
// ── CHANGE 1 ──────────────────────────────────────────────────────────────────
// Import the single-chunk helper directly so we can compress inside the same
// .map() pass that builds BudgetedChunks (see CHANGE 3 below), eliminating
// the intermediate string[] allocation that compressChunks() created.
const compression_1 = require("./retrieval/compression");
const tokenBudget_1 = require("./retrieval/tokenBudget");
const reranker_1 = require("./retrieval/reranker");
const promptInjection_1 = require("./security/promptInjection");
const document_context_1 = require("./templates/document_context");
const text_1 = require("./utils/text");
// Finished retrieval results, keyed by buildCacheKey(). Each entry has the
// shape { value, expiresAt } where expiresAt is an epoch-millisecond deadline.
const retrievalCache = new Map();
// Lifetime of a retrievalCache entry in milliseconds (300_000 ms = 5 minutes).
const CACHE_TTL_MS = 300_000;
// Hard cap on live cache entries *after* TTL sweeping.
const CACHE_MAX_SIZE = 64;
// Upper bound on how many files selectRetrievalFiles() hands to retrieval.
const MAX_RETRIEVAL_FILES = 8;
// preprocess() only saves a prompt to memory when the raw prompt has at
// least this many characters.
const MEMORY_MIN_PROMPT_LENGTH = 20;
// Per-file memoisation, keyed by the file object itself (WeakMap, so an entry
// does not keep its file object alive).
const fileSignatureCache = new WeakMap();
const fileTimestampCache = new WeakMap();
const fileTextLikeCache = new WeakMap();
// Memoised output of selectRetrievalFiles(), keyed by the ";"-joined file
// signatures of its input list.
const fileSelectionCache = new Map();
// Once fileSelectionCache holds this many entries, the earliest-inserted one
// is dropped before a new one is stored.
const FILE_SELECTION_CACHE_MAX = 128;
// ── Loop prevention: per-query retrieval timeout ──────────────────────────
// If ctl.client.files.retrieve() stalls (e.g. the embedding model is loading
// or the LM Studio RPC layer is backed up), every query in the Promise.all
// waits indefinitely. A hard timeout converts a hang into a non-abort error,
// which the per-query catch block handles by returning an empty result set —
// letting the other queries (if any) still contribute, and unblocking the
// pipeline so the next user message isn't also stalled.
// Deadline in milliseconds (8 seconds) applied to each individual
// ctl.client.files.retrieve() call via withTimeout().
const RETRIEVAL_TIMEOUT_MS = 8_000;
/**
 * Races `promise` against a deadline of `ms` milliseconds.
 *
 * @param {Promise<*>|*} promise - The operation to wait for.
 * @param {number} ms - Deadline in milliseconds.
 * @returns {Promise<*>} Settles like `promise` if it settles first; otherwise
 *   rejects with an Error whose message is "retrieval timed out after <ms>ms".
 */
function withTimeout(promise, ms) {
    let timeoutHandle;
    const deadline = new Promise((_resolve, reject) => {
        timeoutHandle = setTimeout(() => {
            reject(new Error(`retrieval timed out after ${ms}ms`));
        }, ms);
    });
    // Cancel the pending timer whichever side wins, so a finished call never
    // leaves a live setTimeout behind to keep the event loop busy.
    const cancelDeadline = () => clearTimeout(timeoutHandle);
    return Promise.race([promise, deadline]).finally(cancelDeadline);
}
// ── Loop prevention: in-flight retrieval deduplication ───────────────────
// If preprocess() is called concurrently for the same prompt+files combination
// (LM Studio occasionally fires the preprocessor more than once before the
// first call has resolved), without this guard each concurrent call launches
// its own full Promise.all(queries) fan-out — doubling or tripling the number
// of embedding-model round-trips. The resulting load spike can push latencies
// past the RETRIEVAL_TIMEOUT_MS threshold, which causes failures, which the
// caller can retry, creating a tight loop of stalled → timeout → retry.
// Sharing a single in-flight Promise collapses concurrent callers onto one
// retrieval operation and breaks that cycle.
// Maps a retrieval cache key to the promise of the retrieval currently running
// for that key; retrieveAcrossQueries() adds the entry when it starts work and
// removes it when that promise settles.
const inFlightRetrievals = new Map();
// ── Loop prevention: consecutive-failure circuit breaker ─────────────────
// If the embedding model is unavailable (not loaded, wrong name, OOM crash),
// every preprocess() call fails, the error is caught, the call returns
// userMessage, and LM Studio immediately fires the next turn's preprocessor —
// which fails again, instantly, forever. Tracking failures and backing off
// converts that tight loop into a graceful degradation: after
// MAX_CONSECUTIVE_FAILURES the plugin stops attempting retrieval until
// FAILURE_RESET_MS has elapsed, then tries once more.
// Number of back-to-back retrieval failures after which preprocess() stops
// attempting retrieval.
const MAX_CONSECUTIVE_FAILURES = 3;
// How long (milliseconds, 30 seconds) preprocess() keeps skipping retrieval
// after the most recent failure once the limit above has been reached.
const FAILURE_RESET_MS = 30_000;
// Mutable breaker state, written by preprocess(): the current failure streak
// and the Date.now() value recorded at the latest failure.
let consecutiveFailures = 0;
let lastFailureTime = 0;
/**
 * Tells whether a caught value represents a cancellation.
 *
 * @param {*} error - Any thrown value.
 * @returns {boolean} true only for a non-null object that has a `name`
 *   property equal to "AbortError".
 */
function isAbortError(error) {
    if (!error || typeof error !== "object") {
        return false;
    }
    return "name" in error && error.name === "AbortError";
}
/**
 * Shrinks the retrieval cache in two steps:
 *   1. Remove every entry whose `expiresAt` is at or before the current time.
 *   2. If more than CACHE_MAX_SIZE entries are still present, remove the
 *      earliest-inserted ones until exactly CACHE_MAX_SIZE remain.
 */
function pruneRetrievalCache() {
    const cutoff = Date.now();
    const staleKeys = [...retrievalCache]
        .filter(([, entry]) => entry.expiresAt <= cutoff)
        .map(([key]) => key);
    staleKeys.forEach((key) => retrievalCache.delete(key));

    const excess = retrievalCache.size - CACHE_MAX_SIZE;
    if (excess <= 0) {
        return;
    }
    // Map iteration follows insertion order, so the leading keys are the oldest.
    [...retrievalCache.keys()]
        .slice(0, excess)
        .forEach((key) => retrievalCache.delete(key));
}
/**
 * Looks for a usable timestamp on a file object and memoises the answer.
 *
 * Fields are tried in a fixed order; the first one holding either a finite
 * number or a string that Date.parse() understands wins.
 *
 * @param {object} file - File object to inspect.
 * @returns {number|null} The timestamp found (a numeric field is returned
 *   as-is, a string field as its Date.parse() result), or null if none.
 */
function getFileTimestamp(file) {
    if (fileTimestampCache.has(file)) {
        return fileTimestampCache.get(file);
    }
    const toMillis = (raw) => {
        if (typeof raw === "number" && Number.isFinite(raw)) {
            return raw;
        }
        if (typeof raw === "string") {
            const parsed = Date.parse(raw);
            return Number.isNaN(parsed) ? null : parsed;
        }
        return null;
    };
    const fieldsInPriorityOrder = [
        "lastModified",
        "updatedAt",
        "modifiedAt",
        "mtime",
        "timestamp",
        "createdAt",
        "created",
    ];
    let timestamp = null;
    for (const field of fieldsInPriorityOrder) {
        const millis = toMillis(file[field]);
        if (millis !== null) {
            timestamp = millis;
            break;
        }
    }
    // A null outcome is stored too, so a file without a timestamp is only
    // scanned once.
    fileTimestampCache.set(file, timestamp);
    return timestamp;
}
/**
 * Sort comparator that puts newer files first.
 *
 * @param {object} a - First file.
 * @param {object} b - Second file.
 * @returns {number} `bTime - aTime` when both files have a timestamp;
 *   -1 / 1 when only `a` / only `b` has one (the timestamped file sorts
 *   first); 0 when neither has one.
 */
function compareFileByTimestamp(a, b) {
    const left = getFileTimestamp(a);
    const right = getFileTimestamp(b);
    const leftKnown = left !== null;
    const rightKnown = right !== null;
    if (leftKnown && rightKnown) {
        return right - left;
    }
    if (leftKnown === rightKnown) {
        // Neither side has a timestamp.
        return 0;
    }
    return leftKnown ? -1 : 1;
}
/**
 * Decides whether a file looks like something text can be retrieved from,
 * and memoises the decision per file object.
 *
 * A file qualifies when any of these holds:
 *   - it declares a non-empty type other than "image" (case-insensitive);
 *   - its name/path ends in one of the listed text or source extensions;
 *   - it carries a truthy content, text, document or snippet property.
 *
 * @param {object} file - File object to classify.
 * @returns {boolean} true when the file is considered text-like.
 */
function isTextLikeFile(file) {
    const memo = fileTextLikeCache.get(file);
    if (memo !== undefined) {
        return memo;
    }
    const declaredType = String(file.type ?? "").toLowerCase();
    const displayName = String(file.name ??
        file.fileName ??
        file.filename ??
        file.originalName ??
        file.path ??
        "").toLowerCase();
    let verdict;
    if (declaredType !== "" && declaredType !== "image") {
        verdict = true;
    }
    else if (/\.(txt|md|markdown|json|js|ts|py|java|c|cpp|cs|html|css|csv|yml|yaml|xml|log)$/i.test(displayName)) {
        verdict = true;
    }
    else {
        verdict = Boolean(file.content || file.text || file.document || file.snippet);
    }
    fileTextLikeCache.set(file, verdict);
    return verdict;
}
/**
 * Chooses at most MAX_RETRIEVAL_FILES files to run retrieval against.
 *
 *   - No text-like file at all: the first MAX_RETRIEVAL_FILES input files.
 *   - Text-like files within the limit: exactly those, in input order.
 *   - Too many text-like files: the MAX_RETRIEVAL_FILES that sort first
 *     under compareFileByTimestamp.
 *
 * The outcome is memoised under the ";"-joined signatures of the input, so a
 * later call whose files have the same signatures gets the same array back.
 *
 * @param {object[]} files - Candidate files.
 * @returns {object[]} The selected files.
 */
function selectRetrievalFiles(files) {
    const selectionKey = files.map(fileSignature).join(";");
    const remembered = fileSelectionCache.get(selectionKey);
    if (remembered) {
        return remembered;
    }
    const textLike = files.filter(isTextLikeFile);
    let chosen;
    if (textLike.length === 0) {
        chosen = files.slice(0, MAX_RETRIEVAL_FILES);
    }
    else if (textLike.length <= MAX_RETRIEVAL_FILES) {
        chosen = textLike;
    }
    else {
        chosen = [...textLike].sort(compareFileByTimestamp).slice(0, MAX_RETRIEVAL_FILES);
    }
    // Make room first: a full cache gives up its earliest-inserted entry.
    if (fileSelectionCache.size >= FILE_SELECTION_CACHE_MAX) {
        const [oldestKey] = fileSelectionCache.keys();
        if (oldestKey !== undefined) {
            fileSelectionCache.delete(oldestKey);
        }
    }
    fileSelectionCache.set(selectionKey, chosen);
    return chosen;
}
/**
 * Pulls the textual payload out of a retrieval result item.
 *
 * @param {*} value - A string, or an object carrying the text in one of the
 *   properties content, text, chunk, document, snippet or value.
 * @returns {string} The string itself; for an object, String() of the first
 *   of those properties that is neither null nor undefined; "" otherwise.
 */
function toText(value) {
    if (typeof value === "string") {
        return value;
    }
    if (value === null || typeof value !== "object") {
        return "";
    }
    const textFields = ["content", "text", "chunk", "document", "snippet", "value"];
    for (const field of textFields) {
        const candidate = value[field];
        if (candidate !== null && candidate !== undefined) {
            return String(candidate);
        }
    }
    return "";
}
/**
 * Reads a numeric relevance score from a retrieval result item.
 *
 * @param {*} value - Item that may carry score, affinity, relevance or
 *   similarity (checked in that order; null/undefined are skipped).
 * @returns {number} The first such property as a number, or 0 when the item
 *   is not an object, has none of them, or the value is not a finite number.
 */
function toScore(value) {
    if (value === null || typeof value !== "object") {
        return 0;
    }
    const scoreFields = ["score", "affinity", "relevance", "similarity"];
    let raw = 0;
    for (const field of scoreFields) {
        const candidate = value[field];
        if (candidate !== null && candidate !== undefined) {
            raw = candidate;
            break;
        }
    }
    const numeric = typeof raw === "number" ? raw : Number(raw);
    if (!Number.isFinite(numeric)) {
        return 0;
    }
    return numeric;
}
/**
 * Finds a human-readable source label on a retrieval result item.
 *
 * @param {*} value - Item that may carry fileName, filename, originalName,
 *   sourceName or path (checked in that order; null/undefined are skipped).
 * @returns {string} String() of the first such property, or "" when the item
 *   is not an object or has none of them.
 */
function toCitation(value) {
    if (!value || typeof value !== "object") {
        return "";
    }
    const labelFields = ["fileName", "filename", "originalName", "sourceName", "path"];
    for (const field of labelFields) {
        const candidate = value[field];
        if (candidate !== null && candidate !== undefined) {
            return String(candidate);
        }
    }
    return "";
}
/**
 * Converts a raw retrieval item into the chunk record used by the rest of
 * the pipeline.
 *
 * @param {*} value - Raw item as returned by retrieval.
 * @returns {{text: string, score: number, citation: (string|undefined),
 *   sourceName: (string|undefined), confidence: number}} `text` is the
 *   trimmed text passed through truncate() with a limit of 5000; `citation`
 *   and `sourceName` both hold the source label (undefined when empty);
 *   `confidence` is the score limited to the range 0..1.
 */
function normalizeResult(value) {
    const body = (0, text_1.truncate)(toText(value).trim(), 5000);
    const sourceLabel = toCitation(value) || undefined;
    const rawScore = toScore(value);
    const boundedScore = Math.min(1, Math.max(0, rawScore));
    return {
        text: body,
        score: rawScore,
        citation: sourceLabel,
        sourceName: sourceLabel,
        confidence: boundedScore,
    };
}
/**
 * Builds a string that identifies a file for cache-key purposes and memoises
 * it per file object.
 *
 * The signature is "<type>|<name>|<id>", where each field is the first
 * non-nullish property from its candidate list. Empty fields keep their
 * position: dropping them made a file with only a type and an id
 * indistinguishable from one with only a type and a name of the same text,
 * so two different files could share cached retrieval results. Output is
 * unchanged for files that have all three fields. A field value that itself
 * contains "|" can still collide with a neighbouring field.
 *
 * When none of the three fields is present, the signature is stableHash()
 * over the file's own enumerable "key:value" pairs, sorted by key.
 *
 * @param {object} file - File object to identify.
 * @returns {string} The file's signature.
 */
function fileSignature(file) {
    const cached = fileSignatureCache.get(file);
    if (cached !== undefined) {
        return cached;
    }
    const record = file;
    const fields = [
        String(record.type ?? ""),
        String(record.name ??
            record.fileName ??
            record.filename ??
            record.originalName ??
            record.path ??
            ""),
        String(record.id ?? record.uuid ?? record.hash ?? record.source ?? ""),
    ];
    const hasIdentifyingField = fields.some(Boolean);
    const signature = hasIdentifyingField
        ? fields.join("|")
        : (0, text_1.stableHash)(Object.keys(record)
            .sort()
            .map(key => `${key}:${String(record[key])}`)
            .join("|"));
    fileSignatureCache.set(file, signature);
    return signature;
}
/**
 * Derives the retrieval cache key for one prompt / file set / settings
 * combination.
 *
 * @param {string} prompt - Prompt being retrieved for.
 * @param {object[]} files - Files searched; represented by their signatures.
 * @param {number} limit - Retrieval limit in effect.
 * @param {number} multiQueryCount - Query count in effect.
 * @param {number} threshold - Score threshold in effect.
 * @returns {*} stableHash() of the five parts joined with a NUL character.
 */
function buildCacheKey(prompt, files, limit, multiQueryCount, threshold) {
    const signatures = files.map(fileSignature).join(";");
    const settings = [limit, multiQueryCount, threshold].map((setting) => String(setting));
    const material = [prompt, signatures, ...settings].join("\u0000");
    return (0, text_1.stableHash)(material);
}
/**
 * Resolves the configured affinity threshold to a number in 0..1.
 * A configured 0 is kept (it means "keep every chunk"); only a missing,
 * empty or non-numeric setting falls back to 0.55.
 */
function resolveAffinityThreshold(configured) {
    const isUnset = configured === null || configured === undefined || configured === "";
    const parsed = isUnset ? Number.NaN : Number(configured);
    return Number.isFinite(parsed) ? Math.max(0, Math.min(1, parsed)) : 0.55;
}
/**
 * Turns whatever files.retrieve() resolved with into a plain array of items.
 * Accepts a bare array, an object exposing an `entries` array, or any other
 * iterable; anything else yields [].
 * NOTE(review): the `entries` shape is what the LM Studio SDK documentation
 * describes for retrieval results — confirm against the SDK version in use.
 */
function extractRetrievalItems(raw) {
    if (Array.isArray(raw)) {
        return raw;
    }
    if (raw && typeof raw === "object") {
        if (Array.isArray(raw.entries)) {
            return raw.entries;
        }
        if (typeof raw[Symbol.iterator] === "function") {
            return Array.from(raw);
        }
    }
    return [];
}
/**
 * Runs retrieval for `prompt` over `files` using every expanded query, then
 * filters, de-duplicates and ranks the combined chunks.
 *
 * Results are served from retrievalCache while fresh, and concurrent calls
 * with the same cache key share one in-flight promise.
 *
 * @param {object} ctl - Plugin controller; supplies the plugin config, the
 *   SDK client and the abort signal.
 * @param {string} prompt - Prompt to retrieve context for.
 * @param {object[]} files - Files to search.
 * @returns {Promise<object[]>} Ranked chunk records (see normalizeResult).
 * @throws Rethrows AbortError untouched. Also rejects when the embedding
 *   model cannot be obtained, or when every query failed — so the caller's
 *   failure counter sees the outage instead of an empty "success".
 */
async function retrieveAcrossQueries(ctl, prompt, files) {
    const config = ctl.getPluginConfig(config_1.configSchematics);
    const retrievalLimit = Math.max(1, Math.min(24, Number(config.get("retrievalLimit")) || 6));
    const multiQueryCount = Math.max(1, Math.min(6, Number(config.get("multiQueryCount")) || 2));
    const threshold = resolveAffinityThreshold(config.get("retrievalAffinityThreshold"));
    const cacheKey = buildCacheKey(prompt, files, retrievalLimit, multiQueryCount, threshold);
    const cached = retrievalCache.get(cacheKey);
    if (cached && cached.expiresAt > Date.now()) {
        return cached.value;
    }
    // In-flight deduplication: return the existing promise rather than launching
    // a second identical retrieval fan-out (see comment near inFlightRetrievals).
    // NOTE(review): the shared work uses the first caller's ctl.abortSignal, so
    // later callers are cancelled along with it — confirm that is acceptable.
    const inflight = inFlightRetrievals.get(cacheKey);
    if (inflight !== undefined)
        return inflight;
    const promise = (async () => {
        const embeddingModel = await (0, embeddingCache_1.getEmbeddingModel)(ctl);
        const queries = (0, queryExpansion_1.expandQueries)(prompt, multiQueryCount);
        // All queries run concurrently: each retrieve call is I/O-bound, so the
        // wall-clock cost is roughly that of the slowest query rather than the
        // sum (multiQueryCount falls back to 2 when not configured).
        //
        // An AbortError from any query rejects Promise.all immediately and
        // propagates to the caller. Every other per-query error (timeout, RPC
        // failure, ...) is recorded so the remaining queries can still contribute.
        // Each call is wrapped in withTimeout() so a stalled embedding model
        // cannot hang the plugin indefinitely.
        const queryResults = await Promise.all(queries.map(async (query) => {
            try {
                const raw = await withTimeout(ctl.client.files.retrieve(query, files, {
                    embeddingModel,
                    limit: retrievalLimit,
                    signal: ctl.abortSignal,
                }), RETRIEVAL_TIMEOUT_MS);
                return { query, items: extractRetrievalItems(raw), failed: false, error: undefined };
            }
            catch (error) {
                if (isAbortError(error))
                    throw error;
                return { query, items: [], failed: true, error };
            }
        }));
        const failedQueries = queryResults.filter(result => result.failed);
        if (queries.length > 0 && failedQueries.length === queries.length) {
            // Nothing succeeded: surface it as a failure rather than returning
            // (and caching) an empty result for the whole TTL.
            throw new Error("all retrieval queries failed", { cause: failedQueries[0].error });
        }
        const results = [];
        for (const { query, items } of queryResults) {
            for (const item of items) {
                const normalized = normalizeResult(item);
                if (!normalized.text)
                    continue;
                normalized.score = (0, hybridSearch_1.hybridScore)(query, normalized.text, normalized.score);
                if (normalized.score < threshold)
                    continue;
                results.push(normalized);
            }
        }
        const deduped = (0, dedupe_1.dedupeResults)(results);
        const ranked = deduped.length <= 4
            ? deduped
            : (0, reranker_1.rerankChunks)(deduped, prompt);
        if (failedQueries.length === 0) {
            // Only complete results are cached; a partial result is returned to
            // this caller but recomputed next time. Delete-then-set moves a
            // refreshed key to the newest position in insertion order.
            retrievalCache.delete(cacheKey);
            retrievalCache.set(cacheKey, {
                value: ranked,
                expiresAt: Date.now() + CACHE_TTL_MS,
            });
        }
        // Prune after inserting so the size cap holds once the new entry is in.
        pruneRetrievalCache();
        return ranked;
    })();
    inFlightRetrievals.set(cacheKey, promise);
    // Always remove the entry on settlement so a failed retrieval does not block
    // the key for future callers. then(release, release) is used instead of
    // finally(): finally() returns a promise that rejects along with `promise`,
    // and with nothing attached to it that would raise an unhandledRejection.
    const release = () => {
        if (inFlightRetrievals.get(cacheKey) === promise) {
            inFlightRetrievals.delete(cacheKey);
        }
    };
    promise.then(release, release);
    return promise;
}
/**
 * Computes the budget handed to fitToBudget() for the selected chunks.
 *
 * Starts from 10_000 (12_000 when more than six sources are selected), adds
 * 40% of the whitespace-normalised prompt length up to 8_000, and keeps the
 * total within 6_000..18_000.
 *
 * @param {string} prompt - The prompt being answered.
 * @param {number} selectedSources - Number of sources selected.
 * @returns {number} The budget value.
 */
function adaptiveBudget(prompt, selectedSources) {
    const clamp = (value, floor, ceiling) => Math.max(floor, Math.min(ceiling, value));
    const promptChars = (0, text_1.normalizeWhitespace)(prompt).length;
    const promptAllowance = Math.min(8_000, Math.floor(promptChars * 0.4));
    const startingBudget = selectedSources > 6 ? 12_000 : 10_000;
    return clamp(startingBudget + promptAllowance, 6_000, 18_000);
}
/**
 * Prompt preprocessor entry point: augments the user's message with context
 * retrieved from the files attached to the conversation.
 *
 * The original message is returned untouched when there is nothing to do
 * (blank prompt, already-augmented prompt, no non-image files, no retrieved
 * or budget-fitting chunks), when the circuit breaker is open, or when
 * retrieval fails with anything other than an AbortError.
 *
 * @param {object} ctl - Plugin controller (config, history, SDK client).
 * @param {object} userMessage - Incoming user message; must offer getText().
 * @returns {Promise<*>} `userMessage` itself, or the augmented prompt
 *   wrapped with the SDK's text() helper.
 * @throws Rethrows AbortError raised during retrieval.
 */
async function preprocess(ctl, userMessage) {
    const rawPrompt = userMessage.getText();
    if (!rawPrompt.trim())
        return userMessage;
    // ── Loop prevention: augmentation guard ──────────────────────────────────
    // Reject messages that were already augmented by this plugin, so a stored
    // and replayed preprocessed message is not augmented a second time.
    if (rawPrompt.includes(document_context_1.RAG_AUGMENTED_MARKER))
        return userMessage;
    // ── Loop prevention: circuit breaker ─────────────────────────────────────
    // After MAX_CONSECUTIVE_FAILURES failures in a row, skip retrieval until
    // FAILURE_RESET_MS has passed since the last failure.
    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
        if (Date.now() - lastFailureTime < FAILURE_RESET_MS)
            return userMessage;
        // Backoff window has elapsed: allow exactly one probe. Leaving the
        // counter one short of the limit means a failed probe re-opens the
        // breaker at once, while a successful one resets it to 0 below.
        // (Resetting to 0 here would permit a full burst of failures per window.)
        consecutiveFailures = MAX_CONSECUTIVE_FAILURES - 1;
    }
    const config = ctl.getPluginConfig(config_1.configSchematics);
    const prompt = config.get("enablePromptInjectionProtection")
        ? (0, promptInjection_1.sanitizePrompt)(rawPrompt)
        : rawPrompt;
    const history = await ctl.pullHistory();
    history.append(userMessage);
    const files = history
        .getAllFiles(ctl.client)
        .filter((file) => file.type !== "image");
    if (!files.length)
        return userMessage;
    const retrievalFiles = selectRetrievalFiles(files);
    let retrieved = [];
    try {
        retrieved = await retrieveAcrossQueries(ctl, prompt, retrievalFiles);
        // Successful retrieval — reset the failure counter.
        consecutiveFailures = 0;
    }
    catch (error) {
        if (isAbortError(error))
            throw error;
        consecutiveFailures++;
        lastFailureTime = Date.now();
        return userMessage;
    }
    if (!retrieved.length)
        return userMessage;
    // Configured source count limited to 1..5 (5 when unset), and never more
    // than the number of chunks actually retrieved.
    const selectedSourceCount = Math.min(retrieved.length, Math.max(1, Math.min(5, Number(config.get("selectedSourceCount")) || 5)));
    // Compression and chunk construction happen in one pass over the selected
    // slice. Compression only applies when enabled and at least three chunks
    // were retrieved.
    const useCompression = Boolean(config.get("enableCompression"));
    const enriched = retrieved
        .slice(0, selectedSourceCount)
        .map(item => ({
        ...item,
        text: useCompression && retrieved.length >= 3 ? (0, compression_1.compressChunk)(item.text) : item.text,
    }));
    const selected = (0, tokenBudget_1.fitToBudget)(enriched, adaptiveBudget(prompt, selectedSourceCount));
    if (!selected.length)
        return userMessage;
    // One citation line per distinct source, numbered by the 1-based position
    // of the first selected chunk that came from it.
    const activeSources = new Map();
    selected.forEach((item, index) => {
        const key = item.sourceName || item.citation || `chunk-${index + 1}`;
        if (!activeSources.has(key))
            activeSources.set(key, index + 1);
    });
    const citations = [...activeSources.entries()]
        .map(([name, index]) => `[${index}] ${name}`)
        .join("\n");
    let memory = "";
    if (config.get("enableMemory")) {
        const topic = MemoryService_1.MemoryService.extractTopic(rawPrompt);
        memory = MemoryService_1.MemoryService.retrieve(topic);
        if (topic && rawPrompt.length >= MEMORY_MIN_PROMPT_LENGTH) {
            // Store `prompt` (sanitised when injection protection is on) rather
            // than the raw text: saved memory is fed back into later contexts,
            // so saving the raw prompt would let it bypass the sanitiser.
            // NOTE(review): assumes sanitizePrompt() output is suitable for
            // storage — confirm.
            MemoryService_1.MemoryService.save(topic, (0, text_1.truncate)(prompt, 240));
        }
    }
    return (0, sdk_1.text)((0, document_context_1.buildDocumentContext)({
        citations,
        context: selected.map(item => item.text).join("\n\n---\n\n"),
        memory,
        prompt,
    }));
}