// Forked from mindstudio/big-rag
// Project Files
// src / utils / textChunker.ts
/**
* Simple text chunker that splits text into overlapping chunks.
* chunkSize and overlap are configured in tokens, approximated as 4 chars/token.
*/
/** Approximate characters per token; used to convert token budgets into character counts. */
const CHARS_PER_TOKEN = 4;
/** Selects how chunk boundaries are chosen: character windows or whole sentences. */
export type ChunkingStrategy = "character" | "sentence";
/**
 * Split `text` into overlapping chunks.
 *
 * With the `"sentence"` strategy the work is delegated to `chunkBySentence`.
 * With the default `"character"` strategy a window of `chunkSize` tokens
 * (converted to characters via `CHARS_PER_TOKEN`) slides over the text in
 * steps of `chunkSize - overlap` tokens. Each window end is moved forward to
 * the next whitespace character when one lies within 40 characters, so words
 * are not cut in half.
 *
 * @param text - The text to split. Empty (or missing) input yields `[]`.
 * @param chunkSize - Target chunk size in tokens. Values that convert to less
 *   than one character (or NaN) yield `[]`.
 * @param overlap - Tokens shared between consecutive chunks. Negative or NaN
 *   values are treated as 0. When the overlap is at least the chunk size the
 *   window still advances by one character per step.
 * @param strategy - Boundary strategy; defaults to `"character"`.
 * @returns Chunks in document order. In character mode `text` is the trimmed
 *   window and `text.slice(startIndex, endIndex)` reproduces it exactly.
 */
export function chunkText(
  text: string,
  chunkSize: number,
  overlap: number,
  strategy: ChunkingStrategy = "character",
): Array<{ text: string; startIndex: number; endIndex: number }> {
  const chunks: Array<{ text: string; startIndex: number; endIndex: number }> = [];

  // Guard before dispatching so both strategies treat empty input alike.
  if (!text) {
    return chunks;
  }
  if (strategy === "sentence") {
    return chunkBySentence(text, chunkSize, overlap);
  }

  // Work in whole characters so the reported indices are always integers.
  const chunkChars = Math.floor(chunkSize * CHARS_PER_TOKEN);
  if (!(chunkChars >= 1)) {
    // Zero, negative or NaN window: no chunk can hold any text.
    return chunks;
  }
  // `|| 0` maps NaN to 0; a NaN step would otherwise end the loop after the
  // first chunk and silently drop the rest of the text.
  const overlapChars = Math.max(0, Math.floor(overlap * CHARS_PER_TOKEN) || 0);
  // Always advance by at least one character, even if overlap >= chunk size.
  const stepChars = Math.max(1, chunkChars - overlapChars);

  // How far past the nominal end we look for a whitespace boundary.
  const maxSnapChars = 40;
  const whitespace = /\s/;

  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + chunkChars, text.length);

    // Snap the end forward to the next whitespace character to avoid
    // mid-word cuts. The scan is bounded, so text without whitespace does
    // not cause a full rescan of the remainder for every chunk.
    if (end < text.length) {
      const snapLimit = Math.min(end + maxSnapChars, text.length);
      for (let probe = end; probe < snapLimit; probe++) {
        if (whitespace.test(text.charAt(probe))) {
          end = probe;
          break;
        }
      }
    }

    const rawSlice = text.slice(start, end);
    const trimmed = rawSlice.trim();
    if (trimmed.length > 0) {
      // Shift the reported start past the leading whitespace removed by
      // trim() so the indices delimit exactly the returned text.
      const chunkStart = start + rawSlice.indexOf(trimmed);
      chunks.push({
        text: trimmed,
        startIndex: chunkStart,
        endIndex: chunkStart + trimmed.length,
      });
    }

    // The window reached the end of the text: nothing left to cover.
    if (end >= text.length) {
      break;
    }
    start += stepChars;
  }

  return chunks;
}
/**
 * Break `text` into sentences.
 *
 * A boundary is any run of whitespace that directly follows `.`, `!` or `?`;
 * the whitespace itself is dropped. Pieces that are empty or contain only
 * whitespace are discarded; the remaining pieces are returned untrimmed.
 */
function splitSentences(text: string): string[] {
  const sentenceBoundary = /(?<=[.!?])\s+/;
  const sentences: string[] = [];
  for (const piece of text.split(sentenceBoundary)) {
    if (piece.trim().length > 0) {
      sentences.push(piece);
    }
  }
  return sentences;
}
/**
 * Split `text` into chunks made of whole sentences.
 *
 * Sentences are packed greedily while the estimated token count of the chunk
 * (including the single spaces that join the sentences) stays within
 * `chunkSize`. A sentence that exceeds `chunkSize` on its own is emitted as
 * its own chunk. After each chunk, trailing sentences are carried over into
 * the next chunk until at least `overlap` tokens are shared, while always
 * advancing by at least one sentence.
 *
 * @param text - The text to split.
 * @param chunkSize - Maximum estimated tokens per chunk.
 * @param overlap - Estimated tokens to repeat at the start of the next chunk.
 * @returns Chunks in document order. `text` holds the sentences joined by a
 *   single space and trimmed; `startIndex`/`endIndex` delimit the span of the
 *   source text that those sentences cover (outer whitespace excluded).
 */
function chunkBySentence(
  text: string,
  chunkSize: number,
  overlap: number,
): Array<{ text: string; startIndex: number; endIndex: number }> {
  const sentences = splitSentences(text);
  const chunks: Array<{ text: string; startIndex: number; endIndex: number }> = [];

  // Locate every sentence exactly once, scanning left to right. Searching
  // from the end of the previous sentence means repeated sentences, or a
  // sentence that also appears inside an earlier one, cannot match the
  // wrong occurrence.
  const spans: Array<{ start: number; end: number }> = [];
  let cursor = 0;
  for (const sentence of sentences) {
    const found = text.indexOf(sentence, cursor);
    const rawStart = found < 0 ? cursor : found;
    // Exclude whitespace kept on the sentence's outer edges from its span.
    const trimmed = sentence.trim();
    const start = rawStart + sentence.indexOf(trimmed);
    spans.push({ start, end: start + trimmed.length });
    cursor = rawStart + sentence.length;
  }

  let i = 0;
  // Number of leading sentences already contained in an emitted chunk.
  let covered = 0;
  while (i < sentences.length) {
    // Greedily pack sentences i..j-1 while the joined text fits the budget.
    let buf = "";
    let j = i;
    while (j < sentences.length) {
      const candidate = buf ? buf + " " + sentences[j] : sentences[j];
      const fits = estimateTokenCount(candidate) <= chunkSize;
      if (!fits) {
        break;
      }
      buf = candidate;
      j++;
    }
    // A sentence larger than the budget still gets a chunk of its own.
    if (j === i) {
      buf = sentences[i];
      j = i + 1;
    }

    // A chunk that started in the overlap region but could not take in any
    // new sentence is a strict suffix of the previous chunk. Skip it and
    // restart at the first sentence not yet covered.
    if (j <= covered) {
      i = covered;
      continue;
    }

    chunks.push({
      text: buf.trim(),
      startIndex: spans[i].start,
      endIndex: spans[j - 1].end,
    });
    covered = j;

    // The last sentence has been emitted; any further chunk would only
    // repeat the tail of this one.
    if (j >= sentences.length) {
      break;
    }

    // Walk back from the end of the chunk until enough tokens overlap,
    // never returning to the chunk's first sentence.
    let overlapTokens = 0;
    let next = j;
    while (next - 1 > i && overlapTokens < overlap) {
      next--;
      overlapTokens += estimateTokenCount(sentences[next]);
    }
    i = Math.max(i + 1, next);
  }

  return chunks;
}
/**
 * Roughly estimate how many tokens `text` contains.
 *
 * Uses the approximation of one token per four characters, rounded up, so
 * any non-empty string counts as at least one token.
 */
export function estimateTokenCount(text: string): number {
  const charsPerToken = 4;
  const charCount = text.length;
  return Math.ceil(charCount / charsPerToken);
}