// Project Files
// src/utils/textChunker.ts
/**
* Structure-aware text chunker.
*
* Splits text along the most semantically meaningful boundary that still lets
* the pieces fit in `chunkSize` words: paragraphs first, then lines, then
* sentence terminators, then words. Pieces are then greedily packed into
* chunks up to `chunkSize` words with `overlap` words of overlap at each
* boundary — the overlap is trimmed if it would cause the next chunk to
* exceed the size limit.
*
* The unit is words (whitespace-separated tokens), matching the rest of the
* codebase's `chunkSize` / `chunkOverlap` config. "Word" is a rough proxy for
* tokens — for nomic-embed-text-v1.5 the ratio is ~1.3 tokens per word, so
* a 512-word chunk lands comfortably inside the 2048-token embedding window.
*/
/**
 * Boundary markers tried from coarsest to finest. `recursiveSplit` walks this
 * list in order and only falls through to a finer separator when a piece is
 * still larger than the chunk size.
 */
const SEPARATORS: readonly string[] = [
"\n\n", // paragraphs
"\n", // lines (lists, tables, slides)
". ", // sentences
"? ",
"! ",
"; ",
", ",
" ", // words
];
/**
 * One output chunk. `startIndex`/`endIndex` are word offsets (not character
 * offsets) into the packed piece stream; when a non-zero overlap is used,
 * consecutive chunks have overlapping ranges.
 */
export interface TextChunk {
text: string;
startIndex: number;
endIndex: number;
}
/**
 * Split `text` into word-bounded chunks of at most `chunkSize` words with up
 * to `overlap` words carried across each chunk boundary.
 *
 * @param text      Input text; empty/whitespace-only input yields no chunks.
 * @param chunkSize Maximum words per chunk; values below 1 are clamped to 1.
 * @param overlap   Desired overlap in words; clamped to [0, chunkSize - 1].
 * @returns Chunks with word-offset start/end indices.
 */
export function chunkText(
text: string,
chunkSize: number,
overlap: number,
): TextChunk[] {
if (!text || text.trim() === "") return [];
// Guard against degenerate sizes: chunkSize <= 0 would make the hard-split
// loop in recursiveSplit (`i += chunkSize`) spin forever.
const safeSize = Math.max(1, chunkSize);
const safeOverlap = Math.min(Math.max(0, overlap), safeSize - 1);
const pieces = recursiveSplit(text, safeSize, 0)
.map((p) => p.trim())
.filter((p) => p.length > 0);
return packChunks(pieces, safeSize, safeOverlap);
}
/**
 * Number of whitespace-separated tokens in `s`; 0 for empty or
 * all-whitespace input.
 */
function wordCount(s: string): number {
// Each run of non-whitespace is one word; no matches (null) means zero.
return (s.match(/\S+/g) ?? []).length;
}
/**
 * Recursively split `text` along progressively finer separators until every
 * piece fits within `chunkSize` words.
 */
function recursiveSplit(text: string, chunkSize: number, sepIdx: number): string[] {
// Already small enough — return it whole.
if (wordCount(text) <= chunkSize) return [text];

// Ran out of separators: fall back to fixed-size word windows.
if (sepIdx >= SEPARATORS.length) {
const words = text.split(/\s+/);
const windows: string[] = [];
for (let start = 0; start < words.length; start += chunkSize) {
windows.push(words.slice(start, start + chunkSize).join(" "));
}
return windows;
}

const sep = SEPARATORS[sepIdx];
const parts = text.split(sep);
// Separator absent — move straight on to the next, finer one.
if (parts.length < 2) return recursiveSplit(text, chunkSize, sepIdx + 1);

return parts.flatMap((part, idx) => {
// Re-attach the separator to every part except the last so no
// characters are lost across the split.
const piece = idx < parts.length - 1 ? part + sep : part;
return wordCount(piece) <= chunkSize
? [piece]
: recursiveSplit(piece, chunkSize, sepIdx + 1);
});
}
/**
 * Greedily pack pieces into chunks, carrying `overlap` words across each
 * boundary. The overlap is reduced if keeping the full amount would push the
 * next chunk over `chunkSize`.
 *
 * `startIndex`/`endIndex` are word offsets into the concatenated piece
 * stream; overlapping chunks have overlapping ranges.
 */
function packChunks(
pieces: string[],
chunkSize: number,
overlap: number,
): TextChunk[] {
const chunks: TextChunk[] = [];
let currentWords: string[] = [];
let currentStart = 0;
let globalPos = 0;
const flush = () => {
if (currentWords.length === 0) return;
chunks.push({
text: currentWords.join(" "),
startIndex: currentStart,
endIndex: globalPos,
});
};
for (const piece of pieces) {
const pieceWords = piece.split(/\s+/).filter(Boolean);
if (pieceWords.length === 0) continue;
if (
currentWords.length > 0 &&
currentWords.length + pieceWords.length > chunkSize
) {
flush();
// Keep at most `overlap` trailing words — and at most `chunkSize - pieceSize`
// so appending the next piece still fits.
const maxOverlap = Math.max(0, chunkSize - pieceWords.length);
const keep = Math.min(overlap, maxOverlap, currentWords.length);
const tail = currentWords.slice(currentWords.length - keep);
currentStart = globalPos - tail.length;
currentWords = tail;
}
currentWords.push(...pieceWords);
globalPos += pieceWords.length;
// Defensive: a single piece larger than chunkSize (e.g. a 2000-word run
// of unbroken text) gets hard-split here.
while (currentWords.length > chunkSize) {
chunks.push({
text: currentWords.slice(0, chunkSize).join(" "),
startIndex: currentStart,
endIndex: currentStart + chunkSize,
});
const keep = Math.min(overlap, chunkSize - 1);
// BUG FIX: the overlap tail must stop at index `chunkSize`. The previous
// slice ran to the end of the array, so every word past `chunkSize` was
// duplicated when the remainder was concatenated on again below.
const tail = currentWords.slice(chunkSize - keep, chunkSize);
currentStart = currentStart + chunkSize - keep;
currentWords = tail.concat(currentWords.slice(chunkSize));
}
}
flush();
return chunks;
}
/**
 * Rough token-count estimate using the ~4-characters-per-token heuristic.
 */
export function estimateTokenCount(text: string): number {
const APPROX_CHARS_PER_TOKEN = 4;
return Math.ceil(text.length / APPROX_CHARS_PER_TOKEN);
}