// Project Files
// src/retrieval/tokenBudget.ts
import { normalizeWhitespace } from "../utils/text";
/**
 * A candidate piece of retrieved text that competes for room in a
 * character budget.
 */
export interface BudgetedChunk {
  /** The chunk's content. */
  text: string;

  /** Primary ranking value; chunks with a higher score are ordered first. */
  score: number;

  /** Source label; when non-empty it is preferred over `citation` as the source key. */
  sourceName?: string;

  /** Citation label; used as the source key when `sourceName` is missing or empty. */
  citation?: string;

  /** Tie-breaker between equal scores (higher first); treated as 0 when absent. */
  confidence?: number;
}
/**
 * Chooses the label under which a chunk is grouped with others from the
 * same source.
 *
 * A non-empty `sourceName` wins, then a non-empty `citation`; when both are
 * missing or empty the key falls back to `chunk-<index>`.
 *
 * @param chunk - Chunk whose source label is wanted.
 * @param index - Position used to build the fallback key.
 * @returns The grouping key for the chunk.
 */
function sourceKey(chunk: BudgetedChunk, index: number): string {
  const { sourceName, citation } = chunk;
  if (sourceName) {
    return sourceName;
  }
  if (citation) {
    return citation;
  }
  return `chunk-${index}`;
}
/**
 * Selects the best-ranked chunks whose combined size fits within `maxChars`.
 *
 * Ranking: higher `score` first, then higher `confidence` (missing counts as
 * 0), then shorter raw `text`.
 *
 * Cost: each chunk is charged the length of its whitespace-normalized text,
 * plus 12 characters of separator overhead for every chunk after the first.
 * A chunk that does not fit is skipped so a smaller, lower-ranked one can
 * still be used.
 *
 * Diversity: at most 5 distinct sources (as identified by `sourceKey`) and at
 * most 2 chunks per source are kept. Only chunks that are actually kept count
 * towards these limits.
 *
 * Fallback: when no chunk fits, the top-ranked chunk is returned on its own
 * as a copy whose raw text is cut to `maxChars - 1` characters, right-trimmed,
 * and suffixed with an ellipsis. For `maxChars <= 0` that text is just the
 * ellipsis.
 *
 * Neither the input array nor its elements are mutated.
 *
 * @param chunks - Candidate chunks, in any order.
 * @param maxChars - Character budget for the selection.
 * @returns The kept chunks in ranked order; empty only when `chunks` is empty.
 */
export function fitToBudget(chunks: BudgetedChunk[], maxChars: number): BudgetedChunk[] {
  const separatorOverhead = 12;
  const maxSources = 5;
  const maxPerSource = 2;

  if (chunks.length === 0) return [];

  const sorted = chunks.slice().sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    const cA = a.confidence ?? 0;
    const cB = b.confidence ?? 0;
    if (cB !== cA) return cB - cA;
    return a.text.length - b.text.length;
  });
  const n = sorted.length;

  // Actual cost of every chunk. This must stay separate from the suffix
  // minimum below, otherwise a long chunk would be charged the length of a
  // shorter chunk that follows it.
  const lengths = sorted.map((chunk) => normalizeWhitespace(chunk.text).length);

  // minRemaining[i] is the smallest cost among sorted[i..]; it is used only
  // to stop early once nothing that is left can possibly fit.
  const minRemaining = new Array<number>(n);
  let runningMin = Infinity;
  for (let i = n - 1; i >= 0; i--) {
    runningMin = Math.min(runningMin, lengths[i]);
    minRemaining[i] = runningMin;
  }

  const kept: BudgetedChunk[] = [];
  const keptPerSource = new Map<string, number>();
  let total = 0;

  for (let i = 0; i < n; i++) {
    const overhead = kept.length === 0 ? 0 : separatorOverhead;

    // Not even the smallest remaining chunk fits any more.
    if (maxChars - total < minRemaining[i] + overhead) break;

    const cost = lengths[i] + overhead;
    if (total + cost > maxChars) continue;

    const key = sourceKey(sorted[i], i);
    const used = keptPerSource.get(key) ?? 0;
    if (used >= maxPerSource) continue;
    // A source takes up a slot only once one of its chunks has been kept.
    if (used === 0 && keptPerSource.size >= maxSources) continue;

    keptPerSource.set(key, used + 1);
    kept.push(sorted[i]);
    total += cost;
  }

  if (kept.length === 0) {
    const first = sorted[0];
    // Truncate only when the raw text is really too long; a text that already
    // fits must not lose its last character.
    const text =
      first.text.length > maxChars
        ? first.text.slice(0, Math.max(0, maxChars - 1)).trimEnd() + "\u2026"
        : first.text;
    kept.push({ ...first, text });
  }
  return kept;
}