/**
* Vector-RAG recall — the pure core of Phase F (no LM Studio dependency).
*
* The Phase C rolling summary keeps continuity (the *fil*) but is bounded: it
* recompresses fine old details away. This module is the other half of the
* documented hybrid — summary = continuity, retrieval = precise recall. It keeps
* the **verbatim text** of messages that have scrolled out of the protected
* recent window in a per-save store, and each turn surfaces the few past
* messages most semantically similar to the recent text (the `# Relevant past
* events` block).
*
* Everything here is pure arithmetic/string work: the embeddings are produced by
* `embed.ts` (I/O) and passed in, so this stays unit-testable. The cosine
* similarity and token estimate are the same kind of machinery used by
* `world/scan.ts` and `characters/select.ts`, but this module imports them from
* `shared/vector.ts` rather than from either of those — `memory` stays decoupled
* from `world` and `characters` (see CLAUDE.md). The token estimate is the same
* cheap `chars/4` heuristic.
*/
import { MemoryChunk } from "../state/schema.js";
import { cosineSimilarity, estimateTokens } from "../shared/vector.js";
/** Tuning knobs for `extendStore` (what to archive, how much to keep). */
export interface ExtendStoreOptions {
/**
* Number of most-recent messages NOT archived — they are still in the live
* window, so archiving (and later recalling) them would duplicate context.
* `extendStore` floors this and treats negative values as 0.
*/
protectRecent: number;
/**
* Hard cap on stored chunks; the oldest are dropped past this. `extendStore`
* floors this, and a value of 0 or less disables the cap (nothing is dropped).
*/
maxStore: number;
/** Turn stamped on newly-archived chunks (for recency/debug). */
turn: number;
}
/** What `extendStore` hands back: the updated store plus bookkeeping. */
export interface ExtendStoreResult {
/** The store after new chunks were appended and the cap was applied. */
store: MemoryChunk[];
/**
* New `storedMessageCount` marker (messages consumed into the store): the
* conversation index at which the next call should start looking.
*/
storedMessageCount: number;
/** How many oldest chunks the cap dropped this call (for the debug log). */
dropped: number;
}
/**
 * Archive the messages that have left the protected recent window. Pure: the
 * incoming `store` and `conversation` arrays are never mutated.
 *
 * Considers `conversation[storedMessageCount : len - protectRecent]`, trims each
 * message and appends it as a chunk stamped with `options.turn`. Blank messages
 * and messages whose trimmed text is already stored are skipped (dedup, so an
 * edit/regen of the tail can't double-store). The store is then trimmed to the
 * most-recent `maxStore` chunks; a `maxStore` of 0 or less (or NaN) disables
 * the cap.
 *
 * The returned marker is `max(start, len - protectRecent)`, where `start` is the
 * incoming marker clamped to `[0, len]`. It therefore never moves backwards
 * past `start`, and already-considered messages are not re-considered even
 * when the cap drops old chunks.
 *
 * Malformed numbers cannot poison the marker (a NaN marker would otherwise be
 * returned as NaN and silently disable archiving on every later call):
 * - a fractional `storedMessageCount` is floored;
 * - a NaN `storedMessageCount` restarts from 0 (dedup keeps chunks that are
 *   still stored from being added twice);
 * - a NaN `protectRecent` protects the whole conversation, so nothing is
 *   archived this call.
 *
 * @param store Chunks archived so far, oldest first.
 * @param storedMessageCount Marker returned by the previous call.
 * @param conversation The full message list, oldest first.
 * @param options Protected-window size, store cap and turn stamp.
 * @returns The new store, the new marker and how many chunks the cap dropped.
 */
export function extendStore(
  store: MemoryChunk[],
  storedMessageCount: number,
  conversation: string[],
  options: ExtendStoreOptions,
): ExtendStoreResult {
  const total = conversation.length;

  // NaN is checked after flooring so anything that floors to NaN is caught.
  // Infinity makes `total - protectRecent` fall below `start`: nothing archived.
  const flooredProtect = Math.floor(options.protectRecent);
  const protectRecent = Number.isNaN(flooredProtect)
    ? Infinity
    : Math.max(0, flooredProtect);
  const maxStore = Math.max(0, Math.floor(options.maxStore));

  const flooredMarker = Math.floor(storedMessageCount);
  const marker = Number.isNaN(flooredMarker) ? 0 : flooredMarker;
  const start = Math.max(0, Math.min(marker, total));
  const end = Math.max(start, total - protectRecent);

  // Dedup against everything already stored, and against this batch itself.
  const seen = new Set(store.map((c) => c.text.trim()));
  const next = [...store];
  for (const raw of conversation.slice(start, end)) {
    const text = raw.trim();
    if (!text || seen.has(text)) continue;
    seen.add(text);
    next.push({ text, turn: options.turn });
  }

  // Keep only the newest `maxStore` chunks; 0 means "no cap".
  let dropped = 0;
  let trimmed = next;
  if (maxStore > 0 && next.length > maxStore) {
    dropped = next.length - maxStore;
    trimmed = next.slice(dropped);
  }
  return { store: trimmed, storedMessageCount: end, dropped };
}
/** Inputs for `selectMemories`: the query, the chunk vectors and the limits. */
export interface RecallOptions {
/** Embedding of the recent text (the query). Null disables recall. */
queryEmbedding: number[] | null;
/**
* Per-chunk embeddings, aligned by index to the `store` array. A chunk whose
* entry is null (or missing) is skipped rather than scored.
*/
chunkEmbeddings: (number[] | null)[];
/**
* Noise floor: a chunk must reach at least this cosine similarity to count
* (the comparison is inclusive).
*/
threshold: number;
/**
* Keep at most this many chunks — the closest. Floored, with negative values
* treated as 0. Non-finite means "no cap".
*/
topK: number;
/** Token budget for the whole recall block. Default: unlimited. */
maxTokens?: number;
}
/**
 * Pick which archived messages to surface this turn. Pure and deterministic.
 *
 * 1. Every chunk that has an embedding is scored against the query by cosine
 *    similarity; chunks below `threshold` are discarded.
 * 2. The survivors are ranked best-first and cut to the closest `topK`.
 * 3. Walking that ranking best-first, a chunk is kept only if its estimated
 *    token cost still fits in `maxTokens`; one that does not fit is skipped,
 *    and a cheaper, lower-ranked chunk may still be taken after it.
 * 4. The kept chunks come back in **chronological** (store) order, so the
 *    excerpt reads as a coherent slice of the past, not a relevance jumble.
 *
 * @param store Archived chunks, oldest first.
 * @param options Query embedding, per-chunk embeddings and the limits.
 * @returns The recalled chunks in store order; empty when recall is disabled
 *   (null query), the store is empty, or nothing clears the floor.
 */
export function selectMemories(
  store: MemoryChunk[],
  options: RecallOptions,
): MemoryChunk[] {
  const query = options.queryEmbedding;
  if (!query || store.length === 0) return [];

  const budget = options.maxTokens ?? Infinity;

  // Score every chunk that has a vector; keep those at or above the floor.
  const candidates: { index: number; score: number }[] = [];
  for (let position = 0; position < store.length; position += 1) {
    const embedding = options.chunkEmbeddings[position];
    if (!embedding) continue;
    const score = cosineSimilarity(query, embedding);
    if (score >= options.threshold) {
      candidates.push({ index: position, score });
    }
  }

  // A non-finite topK means "no cap"; otherwise floor it and clamp at zero.
  const limit = Number.isFinite(options.topK)
    ? Math.max(0, Math.floor(options.topK))
    : Infinity;

  // Best-first ranking, then the top-K cut.
  candidates.sort((left, right) => right.score - left.score);
  const shortlist = candidates.slice(0, limit);

  // Spend the token budget best-first. A hit that does not fit is skipped
  // rather than ending the loop, since a cheaper later hit may still fit.
  const chosen: number[] = [];
  let spent = 0;
  for (const hit of shortlist) {
    const cost = estimateTokens(store[hit.index].text);
    if (spent + cost > budget) continue;
    chosen.push(hit.index);
    spent += cost;
  }

  // Back to store (chronological) order for the final excerpt.
  chosen.sort((left, right) => left - right);
  return chosen.map((index) => store[index]);
}
/**
 * Render the `# Relevant past events` block from the recalled chunks.
 *
 * Each chunk's text is trimmed and emitted verbatim on its own line(s) under a
 * fixed two-line header; chunks that are blank after trimming are left out.
 * The block is kept apart from the `# Story so far` summary so the model reads
 * it as precise recall rather than continuity.
 *
 * @param chunks Recalled chunks, in the order they should appear.
 * @returns The newline-joined block, or null when nothing was recalled.
 */
export function recallBlock(chunks: MemoryChunk[]): string | null {
  const excerpts: string[] = [];
  for (const chunk of chunks) {
    const excerpt = chunk.text.trim();
    if (excerpt) excerpts.push(excerpt);
  }

  // No usable excerpt: the caller omits the block entirely.
  if (excerpts.length === 0) return null;

  const lines = [
    "# Relevant past events",
    "(Earlier moments surfaced because they relate to the current scene.)",
  ].concat(excerpts);
  return lines.join("\n");
}