// Forked from mindstudio/big-rag
// Project Files
// src/vectorstore/vectorStore.ts
import * as fs from "fs/promises";
import * as path from "path";
import { LocalIndex } from "vectra";
const MAX_ITEMS_PER_SHARD = 10000;
const SHARD_DIR_PREFIX = "shard_";
const SHARD_DIR_REGEX = /^shard_(\d+)$/;
export interface DocumentChunk {
id: string;
text: string;
vector: number[];
filePath: string;
fileName: string;
fileHash: string;
chunkIndex: number;
metadata: Record<string, any>;
}
export interface SearchResult {
text: string;
score: number;
filePath: string;
fileName: string;
chunkIndex: number;
shardName: string;
metadata: Record<string, any>;
}
export interface FilenameSearchResult {
filePath: string;
fileName: string;
matchedKeywords: string[];
}
type ChunkMetadata = {
text: string;
filePath: string;
fileName: string;
fileHash: string;
chunkIndex: number;
[key: string]: any;
};
/**
* Compute cosine similarity between two vectors.
*/
function cosineSimilarity(a: number[], b: number[]): number {
let dot = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dot += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
const denom = Math.sqrt(normA) * Math.sqrt(normB);
return denom === 0 ? 0 : dot / denom;
}
/**
 * Sharded on-disk vector store built on vectra LocalIndex directories.
 *
 * Chunks are written to the newest ("active") shard; once it reaches
 * MAX_ITEMS_PER_SHARD, writes rotate to a fresh shard_NNN directory.
 * Read paths open each shard on demand and drop the reference afterwards
 * so the parsed index data can be garbage-collected.
 *
 * All mutating operations are serialized through an internal promise queue
 * so concurrent callers cannot interleave vectra update transactions.
 */
export class VectorStore {
  /** Absolute root directory containing the shard_NNN subdirectories. */
  private dbPath: string;
  /** Shard directory names, sorted by shard number (oldest first). */
  private shardDirs: string[] = [];
  /** Shard currently accepting writes (last in shardDirs); null before initialize(). */
  private activeShard: LocalIndex | null = null;
  /** Approximate item count of the active shard; drives shard rotation. */
  private activeShardCount: number = 0;
  /** Tail of the serialized write queue; always kept resolved (see enqueue). */
  private updateMutex: Promise<void> = Promise.resolve();

  constructor(dbPath: string) {
    this.dbPath = path.resolve(dbPath);
  }

  /**
   * Queue a mutating task behind all previously queued tasks.
   *
   * Fix over the original inline chaining: the stored tail swallows
   * rejections so one failed write cannot poison the queue and silently
   * skip every later write. The caller still observes the failure through
   * the returned promise.
   */
  private enqueue<T>(task: () => Promise<T>): Promise<T> {
    const run = this.updateMutex.then(task);
    this.updateMutex = run.then(
      () => undefined,
      () => undefined,
    );
    return run;
  }

  /**
   * Open a shard by directory name (e.g. "shard_000"). Caller must not hold the reference
   * after use so GC can free the parsed index data.
   */
  private openShard(dir: string): LocalIndex {
    return new LocalIndex(path.join(this.dbPath, dir));
  }

  /**
   * Create shard_NNN for the given number, make it the active shard, and
   * reset the rotation counter. Shared by initialize() and shard rotation.
   */
  private async createShard(num: number): Promise<void> {
    const dir = `${SHARD_DIR_PREFIX}${String(num).padStart(3, "0")}`;
    const index = new LocalIndex(path.join(this.dbPath, dir));
    await index.createIndex({ version: 1 });
    this.shardDirs.push(dir);
    this.activeShard = index;
    this.activeShardCount = 0;
  }

  /**
   * Scan dbPath for shard_NNN directories and return them sorted by shard number.
   */
  private async discoverShardDirs(): Promise<string[]> {
    const entries = await fs.readdir(this.dbPath, { withFileTypes: true });
    const shardNumber = (name: string): number =>
      parseInt(name.match(SHARD_DIR_REGEX)![1], 10);
    return entries
      .filter((e) => e.isDirectory() && SHARD_DIR_REGEX.test(e.name))
      .map((e) => e.name)
      .sort((a, b) => shardNumber(a) - shardNumber(b));
  }

  /**
   * Map a stored item's metadata plus a score into a SearchResult.
   * Missing metadata fields degrade to empty strings / zero rather than throwing.
   */
  private toSearchResult(
    metadata: unknown,
    score: number,
    shardName: string,
  ): SearchResult {
    const m = metadata as ChunkMetadata;
    return {
      text: m?.text ?? "",
      score,
      filePath: m?.filePath ?? "",
      fileName: m?.fileName ?? "",
      chunkIndex: m?.chunkIndex ?? 0,
      shardName,
      metadata: (metadata as Record<string, any>) ?? {},
    };
  }

  /**
   * Return the subset of lowerKeywords found in the file's full path or
   * file name (case-insensitive substring match).
   */
  private matchKeywords(
    filePath: string,
    fileName: string,
    lowerKeywords: string[],
  ): string[] {
    const lowerPath = filePath.toLowerCase();
    const lowerName = fileName.toLowerCase();
    return lowerKeywords.filter(
      (kw) => lowerPath.includes(kw) || lowerName.includes(kw),
    );
  }

  /**
   * Initialize the vector store: discover or create shards, open the last as active.
   */
  async initialize(): Promise<void> {
    await fs.mkdir(this.dbPath, { recursive: true });
    this.shardDirs = await this.discoverShardDirs();
    if (this.shardDirs.length === 0) {
      await this.createShard(0);
    } else {
      const lastDir = this.shardDirs[this.shardDirs.length - 1];
      this.activeShard = this.openShard(lastDir);
      this.activeShardCount = (await this.activeShard.listItems()).length;
    }
    console.log("Vector store initialized successfully");
  }

  /**
   * Add document chunks to the active shard. Rotates to a new shard when full.
   * Writes are serialized behind any in-flight mutation.
   */
  async addChunks(chunks: DocumentChunk[]): Promise<void> {
    if (!this.activeShard) {
      throw new Error("Vector store not initialized");
    }
    if (chunks.length === 0) return;
    return this.enqueue(async () => {
      await this.activeShard!.beginUpdate();
      try {
        for (const chunk of chunks) {
          // Spread caller metadata FIRST so it can never clobber the
          // reserved fields every search/delete path depends on.
          const metadata: ChunkMetadata = {
            ...chunk.metadata,
            text: chunk.text,
            filePath: chunk.filePath,
            fileName: chunk.fileName,
            fileHash: chunk.fileHash,
            chunkIndex: chunk.chunkIndex,
          };
          await this.activeShard!.upsertItem({
            id: chunk.id,
            vector: chunk.vector,
            metadata,
          });
        }
        await this.activeShard!.endUpdate();
      } catch (e) {
        this.activeShard!.cancelUpdate();
        throw e;
      }
      // NOTE: upserts of existing ids are counted as additions, so the
      // counter can over-estimate and rotate a shard early — benign.
      this.activeShardCount += chunks.length;
      console.log(`Added ${chunks.length} chunks to vector store`);
      if (this.activeShardCount >= MAX_ITEMS_PER_SHARD) {
        await this.createShard(this.shardDirs.length);
      }
    });
  }

  /**
   * Search: query each shard in turn, merge results, sort by score, filter by threshold, return top limit.
   *
   * @param queryVector embedding of the query
   * @param limit maximum number of results to return
   * @param threshold minimum similarity score to keep a result
   */
  async search(
    queryVector: number[],
    limit: number = 5,
    threshold: number = 0.5,
  ): Promise<SearchResult[]> {
    // Request more candidates per shard to improve recall before threshold filtering
    const shardLimit = Math.max(limit * 3, 20);
    const merged: SearchResult[] = [];
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const results = await shard.queryItems(
        queryVector,
        "",
        shardLimit,
        undefined,
        false,
      );
      for (const r of results) {
        merged.push(this.toSearchResult(r.item.metadata, r.score, dir));
      }
    }
    // Sort by score descending for diagnostics and filtering
    merged.sort((a, b) => b.score - a.score);
    // Diagnostic logging: show top scores before threshold filtering
    if (merged.length > 0) {
      const topScores = merged
        .slice(0, Math.min(5, merged.length))
        .map((r) => `${r.score.toFixed(4)} (${path.basename(r.filePath)})`);
      const aboveThreshold = merged.filter((r) => r.score >= threshold).length;
      console.info(
        `[BigRAG] Vector search raw: top_scores=[${topScores.join(", ")}], ` +
          `above_threshold=${aboveThreshold}/${merged.length}, threshold=${threshold}`,
      );
    } else {
      console.warn("[BigRAG] Vector search returned 0 raw results from all shards.");
    }
    return merged.filter((r) => r.score >= threshold).slice(0, limit);
  }

  /**
   * Delete all chunks for a file (by hash) across all shards.
   * Serialized behind any in-flight mutation.
   */
  async deleteByFileHash(fileHash: string): Promise<void> {
    return this.enqueue(async () => {
      // Resolve the active shard dir INSIDE the queued task: a queued
      // addChunks may have rotated shards since this call was made.
      const lastDir = this.shardDirs[this.shardDirs.length - 1];
      for (const dir of this.shardDirs) {
        const shard = this.openShard(dir);
        const items = await shard.listItems();
        const toDelete = items.filter(
          (i) => (i.metadata as ChunkMetadata)?.fileHash === fileHash,
        );
        if (toDelete.length === 0) continue;
        await shard.beginUpdate();
        try {
          for (const item of toDelete) {
            await shard.deleteItem(item.id);
          }
          await shard.endUpdate();
        } catch (e) {
          // Don't leave the shard mid-transaction on failure (the original
          // addChunks did this, but the delete path did not).
          shard.cancelUpdate();
          throw e;
        }
        // Keep the rotation counter accurate if we shrank the active shard.
        if (dir === lastDir && this.activeShard) {
          this.activeShardCount = (await this.activeShard.listItems()).length;
        }
      }
      console.log(`Deleted chunks for file hash: ${fileHash}`);
    });
  }

  /**
   * Get file path -> set of file hashes currently in the store.
   */
  async getFileHashInventory(): Promise<Map<string, Set<string>>> {
    const inventory = new Map<string, Set<string>>();
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      for (const item of await shard.listItems()) {
        const m = item.metadata as ChunkMetadata;
        if (!m?.filePath || !m?.fileHash) continue;
        let set = inventory.get(m.filePath);
        if (!set) {
          set = new Set<string>();
          inventory.set(m.filePath, set);
        }
        set.add(m.fileHash);
      }
    }
    return inventory;
  }

  /**
   * Get total chunk count and unique file count (distinct file hashes).
   */
  async getStats(): Promise<{
    totalChunks: number;
    uniqueFiles: number;
  }> {
    let totalChunks = 0;
    const uniqueHashes = new Set<string>();
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const items = await shard.listItems();
      totalChunks += items.length;
      for (const item of items) {
        const h = (item.metadata as ChunkMetadata)?.fileHash;
        if (h) uniqueHashes.add(h);
      }
    }
    return { totalChunks, uniqueFiles: uniqueHashes.size };
  }

  /**
   * Check if any chunk exists for the given file hash (short-circuits on first match).
   */
  async hasFile(fileHash: string): Promise<boolean> {
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const items = await shard.listItems();
      if (items.some((i) => (i.metadata as ChunkMetadata)?.fileHash === fileHash)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Search for files whose names contain ALL the specified keywords (case-insensitive substring match).
   * Scans shard metadata to build a unique list of indexed file paths, then filters by keywords.
   * Falls back to OR semantics when the AND search matches nothing and
   * multiple keywords were given.
   */
  async searchByFilenames(keywords: string[]): Promise<FilenameSearchResult[]> {
    if (keywords.length === 0) return [];
    const lowerKeywords = keywords.map((k) => k.toLowerCase());
    // Build the unique inventory of indexed files from shard metadata.
    const seenPaths = new Map<string, string>(); // filePath -> fileName
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      for (const item of await shard.listItems()) {
        const m = item.metadata as ChunkMetadata;
        if (m?.filePath && m?.fileName && !seenPaths.has(m.filePath)) {
          seenPaths.set(m.filePath, m.fileName);
        }
      }
    }
    const results: FilenameSearchResult[] = [];
    for (const [filePath, fileName] of seenPaths) {
      const matchedKeywords = this.matchKeywords(filePath, fileName, lowerKeywords);
      // AND semantics: every keyword must appear in the path or name.
      if (matchedKeywords.length === lowerKeywords.length) {
        results.push({ filePath, fileName, matchedKeywords });
      }
    }
    // Fallback: if AND search returned nothing and we have multiple keywords, try OR
    if (results.length === 0 && lowerKeywords.length > 1) {
      console.info(
        `[BigRAG] Filename AND-search found 0 files for [${keywords.join(", ")}]. Trying OR-search...`,
      );
      for (const [filePath, fileName] of seenPaths) {
        const matchedKeywords = this.matchKeywords(filePath, fileName, lowerKeywords);
        // OR semantics: any single keyword is enough.
        if (matchedKeywords.length > 0) {
          results.push({ filePath, fileName, matchedKeywords });
        }
      }
    }
    console.log(
      `[BigRAG] Filename search for [${keywords.join(", ")}] found ${results.length} matching files`,
    );
    return results;
  }

  /**
   * Search for relevant chunks within specific files, ranked by vector similarity.
   * Used after a filename search to retrieve the most relevant content from matching files.
   */
  async searchInFiles(
    queryVector: number[],
    filePaths: string[],
    limit: number = 5,
    threshold: number = 0.3,
  ): Promise<SearchResult[]> {
    if (filePaths.length === 0) return [];
    const pathSet = new Set(filePaths);
    // Use queryItems() per shard (which has access to stored vectors),
    // then filter by file paths. Request more candidates for better recall.
    const shardQueryLimit = Math.max(limit * 5, 30);
    const candidates: SearchResult[] = [];
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const results = await shard.queryItems(
        queryVector,
        "",
        shardQueryLimit,
        undefined,
        false,
      );
      for (const r of results) {
        const m = r.item.metadata as ChunkMetadata;
        if (!m?.filePath || !pathSet.has(m.filePath)) continue;
        if (r.score >= threshold) {
          candidates.push(this.toSearchResult(r.item.metadata, r.score, dir));
        }
      }
    }
    console.info(
      `[BigRAG] searchInFiles: found ${candidates.length} candidates in ${filePaths.length} file(s), ` +
        `threshold=${threshold}`,
    );
    return candidates.sort((a, b) => b.score - a.score).slice(0, limit);
  }

  /**
   * Get up to `limit` chunks per file for the given paths, ordered by
   * chunkIndex within each file (score is fixed at 1.0 — no ranking).
   */
  async getChunksForFiles(
    filePaths: string[],
    limit: number = 5,
  ): Promise<SearchResult[]> {
    if (filePaths.length === 0) return [];
    const pathSet = new Set(filePaths);
    // Group matching chunks by file path (Map preserves insertion order).
    const byFile = new Map<string, SearchResult[]>();
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      for (const item of await shard.listItems()) {
        const m = item.metadata as ChunkMetadata;
        if (!m?.filePath || !pathSet.has(m.filePath)) continue;
        const result = this.toSearchResult(item.metadata, 1.0, dir);
        const arr = byFile.get(m.filePath);
        if (arr) {
          arr.push(result);
        } else {
          byFile.set(m.filePath, [result]);
        }
      }
    }
    // Sort chunks within each file by chunkIndex, take first `limit` per file
    const results: SearchResult[] = [];
    for (const fileChunks of byFile.values()) {
      fileChunks.sort((a, b) => a.chunkIndex - b.chunkIndex);
      results.push(...fileChunks.slice(0, limit));
    }
    return results;
  }

  /**
   * Get the number of chunks stored for a specific file path.
   */
  async getChunkCountForFile(filePath: string): Promise<number> {
    let count = 0;
    for (const dir of this.shardDirs) {
      const shard = this.openShard(dir);
      const items = await shard.listItems();
      count += items.filter(
        (i) => (i.metadata as ChunkMetadata)?.filePath === filePath,
      ).length;
    }
    return count;
  }

  /**
   * Wait for any queued writes to finish, then release the active shard reference.
   */
  async close(): Promise<void> {
    // updateMutex is always kept resolved (see enqueue), so this cannot reject.
    await this.updateMutex;
    this.activeShard = null;
  }
}