Project Files
src / rag / chunker.ts
import type { Chunk } from "../types";
/**
 * Settings consumed by `TextChunker`.
 */
export interface ChunkerConfig {
/**
 * Target upper bound for a chunk, measured in characters (`string.length`).
 * Paragraphs are never split internally, so a single paragraph longer than
 * this value still ends up in one (oversized) chunk.
 */
chunkSize: number;
/**
 * Approximate number of trailing characters of a finished chunk that are
 * repeated at the start of the next one. `0` disables overlap.
 */
chunkOverlap: number;
/** Copied verbatim into `metadata.documentPath` of every produced chunk. */
documentPath: string;
/** Copied verbatim into `documentId` of every produced chunk. */
documentId: number;
}
/**
 * Splits text into overlapping, paragraph-aligned chunks for semantic retrieval.
 *
 * Paragraphs (separated by blank lines) are the atomic unit: they are packed
 * into chunks but never split internally, which keeps Markdown tables and
 * other multi-line constructs intact. As a consequence, a single paragraph
 * longer than `chunkSize` produces an oversized chunk.
 */
export class TextChunker {
  /** Blank-line paragraph separator; accepts both LF and CRLF line endings. */
  private static readonly PARAGRAPH_BREAK = /(?:\r?\n){2,}/;

  /**
   * Matches a paragraph that consists solely of Markdown image links, one per
   * line, in either plain `![alt](path)` or angle-bracket `![alt](<path>)` form.
   * Hoisted so the pattern is compiled once rather than once per paragraph.
   */
  private static readonly IMAGE_ONLY_PARAGRAPH =
    /^(!\[.*?\]\(<[^>]+>\)|!\[.*?\]\([^)]+\))(\s*\n\s*(!\[.*?\]\(<[^>]+>\)|!\[.*?\]\([^)]+\)))*$/;

  private readonly config: ChunkerConfig;

  /**
   * @param config Chunk sizing plus the document identity stamped on each chunk.
   */
  constructor(config: ChunkerConfig) {
    this.config = config;
  }

  /**
   * Chunks text with structure-aware splitting.
   *
   * - Paragraphs are accumulated until adding the next one (plus its `\n\n`
   *   joiner) would exceed `chunkSize`; the accumulated text is then emitted.
   * - The next chunk is seeded with the tail of the previous one. The tail
   *   starts at the last line break at or before `length - chunkOverlap`, so
   *   it always begins on a line boundary and may be longer than `chunkOverlap`.
   * - Image-only paragraphs are always appended to the current chunk, even if
   *   that exceeds `chunkSize`.
   * - After a table paragraph, the chunk is emitted early (flagged
   *   `isTable: true`) once it is larger than 70% of `chunkSize`.
   *
   * @param text Raw document text; LF and CRLF line endings are both accepted.
   * @returns Chunks in document order with consecutive `chunkIndex` values
   *          starting at 0; an empty array for empty or whitespace-only input.
   */
  chunk(text: string): Omit<Chunk, 'id' | 'embedding'>[] {
    const { chunkSize, chunkOverlap, documentPath, documentId } = this.config;

    if (!text || text.trim().length === 0) {
      return [];
    }

    const chunks: Omit<Chunk, 'id' | 'embedding'>[] = [];
    let currentChunk = '';

    // Emits the accumulated text (if any). Every emitted chunk is appended to
    // `chunks`, so `chunks.length` is always the next chunk index.
    const emit = (isTable: boolean): void => {
      const content = currentChunk.trim();
      if (!content) return;
      chunks.push(
        TextChunker.makeChunk(documentId, documentPath, content, chunks.length, isTable),
      );
    };

    // Split on blank lines (paragraphs) first.
    for (const para of text.split(TextChunker.PARAGRAPH_BREAK)) {
      const trimmedPara = para.trim();
      if (!trimmedPara) continue;

      // A paragraph is treated as a table if any of its lines starts with `|`.
      const isTable = trimmedPara.startsWith('|') || trimmedPara.includes('\n|');

      // Image-sticky: a paragraph consisting only of Markdown image links is
      // kept together with the preceding text chunk. This preserves the
      // text→image proximity that is essential for image retrieval in find_doc.
      const isImageOnly = TextChunker.IMAGE_ONLY_PARAGRAPH.test(trimmedPara);

      // "+ 2" accounts for the `\n\n` joiner between paragraphs.
      const wouldOverflow = currentChunk.length + trimmedPara.length + 2 > chunkSize;

      if (currentChunk && !isImageOnly && wouldOverflow) {
        emit(false);

        // Start the new chunk with overlap taken from the one just emitted.
        let seeded = trimmedPara;
        if (chunkOverlap > 0 && currentChunk.length > chunkOverlap) {
          // Snap backwards to a line boundary so the overlap never starts mid-line.
          const overlapStart = currentChunk.lastIndexOf('\n', currentChunk.length - chunkOverlap);
          if (overlapStart > 0) {
            seeded = currentChunk.slice(overlapStart + 1) + '\n\n' + trimmedPara;
          }
        }
        currentChunk = seeded;
      } else {
        // Image-only paragraphs always land here, appended to whatever text
        // came before them.
        currentChunk = currentChunk ? currentChunk + '\n\n' + trimmedPara : trimmedPara;
      }

      // If this is a table and the chunk is getting large, close the chunk
      // right after it so following prose starts a fresh chunk (no overlap).
      if (isTable && currentChunk.length > chunkSize * 0.7) {
        emit(true);
        currentChunk = '';
      }
    }

    // Don't forget the last chunk.
    emit(false);

    // Defensive fallback: the loop above is expected to yield at least one
    // chunk for any non-blank input, so this is not expected to trigger.
    if (chunks.length === 0 && text.length > 0) {
      return this.simpleSplit(text, chunkSize, chunkOverlap, documentPath, documentId);
    }

    return chunks;
  }

  /**
   * Simple fallback splitting: fixed-size sliding windows, no structure awareness.
   *
   * Always terminates: the window advances by a positive stride and the loop
   * stops as soon as a window reaches the end of the text. Invalid settings
   * are tolerated rather than looping forever:
   * - a non-positive / NaN `chunkSize` yields the whole text as one chunk;
   * - a negative / NaN `chunkOverlap` is treated as 0;
   * - a `chunkOverlap` that is not smaller than `chunkSize` is ignored.
   */
  private simpleSplit(
    text: string,
    chunkSize: number,
    chunkOverlap: number,
    documentPath: string,
    documentId: number
  ): Omit<Chunk, 'id' | 'embedding'>[] {
    const chunks: Omit<Chunk, 'id' | 'embedding'>[] = [];

    const windowSize = chunkSize > 0 ? chunkSize : text.length;
    const overlap = chunkOverlap > 0 ? chunkOverlap : 0;
    const stride = overlap < windowSize ? Math.max(1, windowSize - overlap) : windowSize;

    for (let start = 0; start < text.length; start += stride) {
      const end = Math.min(start + windowSize, text.length);
      const content = text.slice(start, end).trim();
      if (content) {
        // Whitespace-only windows are skipped and do not consume an index.
        chunks.push(
          TextChunker.makeChunk(documentId, documentPath, content, chunks.length, false),
        );
      }
      // The final window has been emitted; stepping back by the overlap here
      // would re-emit the same tail forever.
      if (end >= text.length) break;
    }

    return chunks;
  }

  /**
   * Builds one chunk record. `isTable` is only added to the metadata when
   * true, so non-table chunks carry no `isTable` key at all.
   */
  private static makeChunk(
    documentId: number,
    documentPath: string,
    content: string,
    chunkIndex: number,
    isTable: boolean
  ): Omit<Chunk, 'id' | 'embedding'> {
    return {
      documentId,
      content,
      metadata: isTable
        ? { documentPath, chunkIndex, isTable: true }
        : { documentPath, chunkIndex },
    };
  }

  /**
   * Estimate token count (rough approximation: 1 token ≈ 4 chars).
   *
   * @returns `ceil(text.length / 4)`; 0 for an empty string.
   */
  static estimateTokens(text: string): number {
    return Math.ceil(text.length / 4);
  }
}