// Project Files
// src/utils/textChunker.ts
/**
* Structure-aware text chunker.
*
* Splits text along the most semantically meaningful boundary that still lets
* the pieces fit in `chunkSize` words: paragraphs first, then lines, then
* sentence terminators, then words. Pieces are then greedily packed into
* chunks up to `chunkSize` words with `overlap` words of overlap at each
* boundary — the overlap is trimmed if it would cause the next chunk to
* exceed the size limit.
*
* The unit is words (whitespace-separated tokens), matching the rest of the
* codebase's `chunkSize` / `chunkOverlap` config. "Word" is a rough proxy for
* tokens — for nomic-embed-text-v1.5 the ratio is ~1.3 tokens per word, so
* a 512-word chunk lands comfortably inside the 2048-token embedding window.
*/
/**
 * Boundary markers tried from coarsest to finest. `recursiveSplit` walks this
 * list in order and only falls through to a finer separator when a piece is
 * still larger than the chunk size.
 */
const SEPARATORS: readonly string[] = [
"\n\n", // paragraphs
"\n", // lines (lists, tables, slides)
". ", // sentences
"? ",
"! ",
"; ",
", ",
" ", // words
];
/**
 * One output chunk. `startIndex`/`endIndex` are word offsets (not character
 * offsets) into the packed piece stream; when a non-zero overlap is used,
 * consecutive chunks have overlapping ranges.
 */
export interface TextChunk {
text: string;
startIndex: number;
endIndex: number;
}
/**
 * Split `text` into word-bounded chunks of at most `chunkSize` words with up
 * to `overlap` words carried across each chunk boundary.
 *
 * @param text      Input text; empty/whitespace-only input yields no chunks.
 * @param chunkSize Maximum words per chunk; values below 1 are clamped to 1.
 * @param overlap   Desired overlap in words; clamped to [0, chunkSize - 1].
 * @returns Chunks with word-offset start/end indices.
 */
export function chunkText(
text: string,
chunkSize: number,
overlap: number,
): TextChunk[] {
if (!text || text.trim() === "") return [];
// Guard against degenerate sizes: chunkSize <= 0 would make the hard-split
// loop in recursiveSplit (`i += chunkSize`) spin forever.
const safeSize = Math.max(1, chunkSize);
const safeOverlap = Math.min(Math.max(0, overlap), safeSize - 1);
const pieces = recursiveSplit(text, safeSize, 0)
.map((p) => p.trim())
.filter((p) => p.length > 0);
return packChunks(pieces, safeSize, safeOverlap);
}
/**
 * Number of whitespace-separated tokens in `s`; 0 for empty or
 * all-whitespace input.
 */
function wordCount(s: string): number {
// Each run of non-whitespace is one word; no matches (null) means zero.
return (s.match(/\S+/g) ?? []).length;
}
/**
 * Recursively split `text` along progressively finer separators until every
 * piece fits within `chunkSize` words.
 */
function recursiveSplit(text: string, chunkSize: number, sepIdx: number): string[] {
// Already small enough — return it whole.
if (wordCount(text) <= chunkSize) return [text];

// Ran out of separators: fall back to fixed-size word windows.
if (sepIdx >= SEPARATORS.length) {
const words = text.split(/\s+/);
const windows: string[] = [];
for (let start = 0; start < words.length; start += chunkSize) {
windows.push(words.slice(start, start + chunkSize).join(" "));
}
return windows;
}

const sep = SEPARATORS[sepIdx];
const parts = text.split(sep);
// Separator absent — move straight on to the next, finer one.
if (parts.length < 2) return recursiveSplit(text, chunkSize, sepIdx + 1);

return parts.flatMap((part, idx) => {
// Re-attach the separator to every part except the last so no
// characters are lost across the split.
const piece = idx < parts.length - 1 ? part + sep : part;
return wordCount(piece) <= chunkSize
? [piece]
: recursiveSplit(piece, chunkSize, sepIdx + 1);
});
}
/**
 * Greedily pack pieces into chunks, carrying `overlap` words across each
 * boundary. The overlap is reduced if keeping the full amount would push the
 * next chunk over `chunkSize`.
 *
 * `startIndex`/`endIndex` are word offsets into the concatenated piece
 * stream; overlapping chunks have overlapping ranges.
 */
function packChunks(
pieces: string[],
chunkSize: number,
overlap: number,
): TextChunk[] {
const chunks: TextChunk[] = [];
let currentWords: string[] = [];
let currentStart = 0;
let globalPos = 0;
const flush = () => {
if (currentWords.length === 0) return;
chunks.push({
text: currentWords.join(" "),
startIndex: currentStart,
endIndex: globalPos,
});
};
for (const piece of pieces) {
const pieceWords = piece.split(/\s+/).filter(Boolean);
if (pieceWords.length === 0) continue;
if (
currentWords.length > 0 &&
currentWords.length + pieceWords.length > chunkSize
) {
flush();
// Keep at most `overlap` trailing words — and at most `chunkSize - pieceSize`
// so appending the next piece still fits.
const maxOverlap = Math.max(0, chunkSize - pieceWords.length);
const keep = Math.min(overlap, maxOverlap, currentWords.length);
const tail = currentWords.slice(currentWords.length - keep);
currentStart = globalPos - tail.length;
currentWords = tail;
}
currentWords.push(...pieceWords);
globalPos += pieceWords.length;
// Defensive: a single piece larger than chunkSize (e.g. a 2000-word run
// of unbroken text) gets hard-split here.
while (currentWords.length > chunkSize) {
chunks.push({
text: currentWords.slice(0, chunkSize).join(" "),
startIndex: currentStart,
endIndex: currentStart + chunkSize,
});
const keep = Math.min(overlap, chunkSize - 1);
// BUG FIX: the overlap tail must stop at index `chunkSize`. The previous
// slice ran to the end of the array, so every word past `chunkSize` was
// duplicated when the remainder was concatenated on again below.
const tail = currentWords.slice(chunkSize - keep, chunkSize);
currentStart = currentStart + chunkSize - keep;
currentWords = tail.concat(currentWords.slice(chunkSize));
}
}
flush();
return chunks;
}
/**
 * Rough token-count estimate using the ~4-characters-per-token heuristic.
 */
export function estimateTokenCount(text: string): number {
const APPROX_CHARS_PER_TOKEN = 4;
return Math.ceil(text.length / APPROX_CHARS_PER_TOKEN);
}