Project Files
src / chunker.ts
/**
 * One piece of text produced by `chunkText`, together with its position in
 * the output and the character range it is reported to cover.
 */
export interface Chunk {
/** The chunk's content, with surrounding whitespace trimmed. */
text: string;
/** Zero-based position of this chunk in the array returned by `chunkText`. */
index: number;
/**
 * Start offset assigned by `chunkText`. It is derived from accumulated
 * paragraph lengths (each counted with a two-character separator) rather than
 * by locating the chunk in the raw input.
 * NOTE(review): confirm it matches real input positions before slicing with it.
 */
charStart: number;
/**
 * End offset paired with `charStart`, computed under the same accounting.
 * NOTE(review): appears intended as an exclusive bound — confirm with callers.
 */
charEnd: number;
}
/**
 * Splits `text` into overlapping chunks of at most `chunkSize` characters.
 *
 * The input is broken into paragraphs (runs of two or more newlines separate
 * them; each paragraph is trimmed and empty ones are dropped). Paragraphs are
 * packed greedily, joined by a blank line, until the next one would push the
 * chunk past `chunkSize`. Every chunk after the first is seeded with the last
 * `overlap` characters of the previous one. Any chunk that is still too long
 * (an oversized paragraph, or an overlap tail plus a paragraph) is cut into
 * fixed-size windows that advance by `chunkSize - overlap` characters.
 *
 * `charStart`/`charEnd` are offsets into the *normalized* text, i.e. the
 * trimmed paragraphs joined by `"\n\n"`, and always satisfy
 * `charEnd - charStart === text.length` for the chunk. When the input is
 * already in that form (no leading whitespace, paragraphs separated by exactly
 * one blank line), `text.slice(charStart, charEnd)` equals the chunk's text.
 *
 * @param text      The document to split.
 * @param chunkSize Maximum chunk length in characters; fractional values are floored.
 * @param overlap   Number of trailing characters carried into the next chunk.
 *                  Clamped to the range `[0, chunkSize - 1]` so that windowing
 *                  always makes forward progress.
 * @returns Chunks in document order with consecutive `index` values; an empty
 *          array when `text` is empty or whitespace-only.
 * @throws RangeError when `text` is non-blank and `chunkSize` is less than 1 or NaN.
 */
export function chunkText(text: string, chunkSize: number, overlap: number): Chunk[] {
  if (!text.trim()) return [];

  const size = Math.floor(chunkSize);
  if (!(size >= 1)) {
    throw new RangeError(`chunkSize must be at least 1, got ${chunkSize}`);
  }
  // An overlap >= size would give the windowing loop a non-positive step and
  // make it spin forever; a negative or NaN overlap is treated as no overlap.
  const keep = Math.min(Math.max(Math.floor(overlap) || 0, 0), size - 1);
  const separator = "\n\n";

  const paragraphs = text.split(/\n{2,}/).map(p => p.trim()).filter(Boolean);

  // Phase 1: pack whole paragraphs into chunks. `current` never carries
  // leading or trailing whitespace, so its length is the chunk's real length.
  const packed: Chunk[] = [];
  let current = "";
  let chunkStart = 0; // normalized offset of current[0]
  let charPos = 0; // normalized offset at which the next paragraph starts

  const flush = (): void => {
    if (current.length > 0) {
      packed.push({
        text: current,
        index: packed.length,
        charStart: chunkStart,
        charEnd: chunkStart + current.length,
      });
    }
  };

  for (const para of paragraphs) {
    const fits = current.length + separator.length + para.length <= size;
    if (current.length > 0 && !fits) {
      flush();
      // `current` ends where the previous paragraph ended, one separator
      // before the paragraph that is about to be added.
      const currentEnd = charPos - separator.length;
      // slice(-0) would return the whole string, so handle keep === 0 explicitly.
      const rawTail = keep > 0 ? current.slice(-keep) : "";
      // Drop leading whitespace from the tail and shift the start to match.
      const tail = rawTail.slice(Math.max(rawTail.search(/\S/), 0));
      if (tail.length > 0) {
        chunkStart = currentEnd - tail.length;
        current = tail + separator + para;
      } else {
        // No overlap: start cleanly at the paragraph so a dangling separator
        // does not eat into the size budget.
        chunkStart = charPos;
        current = para;
      }
    } else if (current.length > 0) {
      current += separator + para;
    } else {
      chunkStart = charPos;
      current = para;
    }
    charPos += para.length + separator.length;
  }
  flush();

  // Phase 2: hard-split anything that is still longer than the limit.
  const step = size - keep; // >= 1 because keep <= size - 1
  const result: Chunk[] = [];
  for (const chunk of packed) {
    if (chunk.text.length <= size) {
      result.push({ ...chunk, index: result.length });
      continue;
    }
    for (let pos = 0; pos < chunk.text.length; pos += step) {
      const end = Math.min(pos + size, chunk.text.length);
      const raw = chunk.text.slice(pos, end);
      const piece = raw.trim();
      if (piece) {
        // Offsets describe the trimmed piece, not the raw window.
        const start = chunk.charStart + pos + raw.search(/\S/);
        result.push({
          text: piece,
          index: result.length,
          charStart: start,
          charEnd: start + piece.length,
        });
      }
      // Once a window reaches the end, any further window would be fully
      // contained in it.
      if (end === chunk.text.length) break;
    }
  }
  return result;
}