/**
 * Simple text chunker that splits text into overlapping chunks.
 * chunkSize and overlap are configured in tokens, approximated as 4 chars/token.
 */
/** Approximate number of characters per token, used to turn token budgets into character counts. */
const CHARS_PER_TOKEN = 4;

/**
 * How `chunkText` splits its input: `"character"` cuts fixed-size windows,
 * `"sentence"` packs whole sentences via `chunkBySentence`.
 */
export type ChunkingStrategy = "character" | "sentence";

/**
 * Split `text` into overlapping chunks.
 *
 * With the `"character"` strategy each chunk nominally spans
 * `chunkSize * CHARS_PER_TOKEN` characters. The end of a chunk is pushed
 * forward to the next whitespace character (or to the end of the text) when
 * one lies fewer than 40 characters ahead, so words are not cut in half. The
 * following chunk starts `overlap * CHARS_PER_TOKEN` characters before the
 * (possibly extended) end of the previous one, so the configured overlap is
 * what consecutive chunks actually share.
 *
 * @param text - Text to split; an empty string yields no chunks.
 * @param chunkSize - Target chunk size in tokens. Values that convert to less
 *   than one character (zero, negative, NaN) are treated as one character.
 * @param overlap - Overlap between consecutive chunks in tokens. Negative or
 *   NaN values are treated as zero; an overlap at least as large as a chunk
 *   still advances by one character per chunk.
 * @param strategy - Splitting strategy, `"character"` by default.
 * @returns Chunks in document order. For the `"character"` strategy
 *   `text.slice(startIndex, endIndex)` equals the chunk's `text`.
 */
export function chunkText(
  text: string,
  chunkSize: number,
  overlap: number,
  strategy: ChunkingStrategy = "character",
): Array<{ text: string; startIndex: number; endIndex: number }> {
  if (strategy === "sentence") {
    return chunkBySentence(text, chunkSize, overlap);
  }

  const chunks: Array<{ text: string; startIndex: number; endIndex: number }> = [];
  if (!text) {
    return chunks;
  }

  // Work in whole characters so the reported indices are integers. `|| 0` maps
  // NaN to 0 before clamping; Infinity passes through and yields a single chunk.
  const chunkChars = Math.max(1, Math.floor(chunkSize * CHARS_PER_TOKEN) || 0);
  const overlapChars = Math.max(0, Math.floor(overlap * CHARS_PER_TOKEN) || 0);

  // Maximum distance (exclusive) a chunk end may be moved to reach a boundary.
  const lookaheadChars = 40;
  const whitespace = /\s/;

  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + chunkChars, text.length);

    if (end < text.length) {
      // Only inspect the lookahead window instead of scanning the rest of the text.
      const lookahead = text.slice(end, end + lookaheadChars);
      const gap = lookahead.search(whitespace);
      if (gap !== -1) {
        end += gap;
      } else if (lookahead.length < lookaheadChars) {
        // The text itself ends inside the window: its end is a boundary too.
        end = text.length;
      }
    }

    const raw = text.slice(start, end);
    const trimmed = raw.trim();
    if (trimmed.length > 0) {
      // Report the span of the trimmed text, not of the untrimmed window.
      const startIndex = start + (raw.length - raw.trimStart().length);
      chunks.push({ text: trimmed, startIndex, endIndex: startIndex + trimmed.length });
    }

    if (end >= text.length) {
      break;
    }
    // Step back from the actual end so the overlap is honoured even when the
    // end was extended; always advance by at least one character.
    start = Math.max(start + 1, end - overlapChars);
  }

  return chunks;
}

/**
 * Split text into sentences at whitespace that directly follows `.`, `!` or `?`.
 *
 * The punctuation stays attached to its sentence, the separating whitespace is
 * dropped, and whitespace-only pieces are discarded. This is a purely lexical
 * rule, so a period inside an abbreviation followed by a space also splits.
 */
function splitSentences(text: string): string[] {
  return text.split(/(?<=[.!?])\s+/).filter((s) => s.trim().length > 0);
}

/**
 * Pair every sentence of `text` with the offsets of its trimmed span in `text`.
 */
function locateSentences(text: string): Array<{ text: string; start: number; end: number }> {
  const located: Array<{ text: string; start: number; end: number }> = [];
  let cursor = 0;
  for (const sentence of splitSentences(text)) {
    // Sentences are consecutive pieces of `text`, so searching from the end of
    // the previous one finds the right occurrence even for repeated sentences.
    // `Math.max` keeps the cursor if the search unexpectedly fails.
    const at = Math.max(cursor, text.indexOf(sentence, cursor));
    const leading = sentence.length - sentence.trimStart().length;
    located.push({
      text: sentence,
      start: at + leading,
      end: at + sentence.trimEnd().length,
    });
    cursor = at + sentence.length;
  }
  return located;
}

/**
 * Pack whole sentences into chunks of at most `chunkSize` estimated tokens.
 *
 * Each chunk takes as many consecutive sentences as fit; a single sentence
 * larger than the budget becomes a chunk of its own. The next chunk re-uses
 * trailing sentences of the previous one until roughly `overlap` tokens are
 * shared, but never so many that no new sentence fits, and never after the
 * last sentence has been emitted. Every chunk therefore contributes at least
 * one sentence that earlier chunks did not contain.
 *
 * @param text - Text to split; text without any sentence yields no chunks.
 * @param chunkSize - Token budget per chunk (see `estimateTokenCount`).
 * @param overlap - Approximate number of tokens shared by consecutive chunks.
 * @returns Chunks in document order. `text` holds the sentences joined by a
 *   single space; `startIndex`/`endIndex` delimit the span in the source text
 *   from the first character of the first sentence to the end of the last.
 */
function chunkBySentence(
  text: string,
  chunkSize: number,
  overlap: number,
): Array<{ text: string; startIndex: number; endIndex: number }> {
  const sentences = locateSentences(text);
  const chunks: Array<{ text: string; startIndex: number; endIndex: number }> = [];

  let first = 0;
  while (first < sentences.length) {
    // Greedily add sentences while the budget holds; the first one is always
    // taken so an oversized sentence cannot stall the loop.
    let buf = "";
    let next = first;
    while (
      next < sentences.length &&
      (next === first || estimateTokenCount(buf + sentences[next].text) <= chunkSize)
    ) {
      buf += (buf ? " " : "") + sentences[next].text;
      next++;
    }

    chunks.push({
      text: buf.trim(),
      startIndex: sentences[first].start,
      endIndex: sentences[next - 1].end,
    });

    // Everything has been emitted; further chunks would only repeat the tail.
    if (next >= sentences.length) {
      break;
    }

    // Walk back over trailing sentences until the overlap budget is covered,
    // always leaving the chunk's first sentence behind so the loop advances.
    let overlapTokens = 0;
    let resume = next;
    while (resume - 1 > first && overlapTokens < overlap) {
      resume--;
      overlapTokens += estimateTokenCount(sentences[resume].text);
    }

    // Give up overlap sentences until the next new sentence fits beside them;
    // otherwise the following chunk would consist of repeated sentences only.
    while (resume < next) {
      const carried = sentences
        .slice(resume, next)
        .map((s) => s.text)
        .join(" ");
      if (estimateTokenCount(carried + sentences[next].text) <= chunkSize) {
        break;
      }
      resume++;
    }

    first = resume;
  }

  return chunks;
}

/**
 * Estimate the token count of `text` as its length divided by 4, rounded up.
 *
 * The divisor is the same 4 chars/token approximation as `CHARS_PER_TOKEN`;
 * keep the two in sync.
 *
 * @param text - Text to measure.
 * @returns Estimated number of tokens; 0 for an empty string.
 */
export function estimateTokenCount(text: string): number {
  return Math.ceil(text.length / 4);
}
// big-rag