import { test } from "node:test";
import * as assert from "node:assert/strict";
import * as path from "path";
import * as fs from "fs";
import { parseImage } from "../parsers/imageParser";
import { parseDocument, DEFAULT_OCR_SETTINGS, type OcrSettings } from "../parsers/documentParser";
import { tryOcrWithPdfJs } from "../parsers/pdfParser";

/** Shape of the dynamically imported pdfjs-dist legacy build. */
type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs");

/** Module-level cache; stays `null` until the first successful dynamic import. */
let cachedPdfjsLib: PdfJsModule | null = null;

/**
 * Returns the pdfjs-dist legacy module, importing it on first use and
 * reusing the cached module on every later call.
 */
async function getPdfjsLib() {
  if (cachedPdfjsLib !== null) {
    return cachedPdfjsLib;
  }
  cachedPdfjsLib = await import("pdfjs-dist/legacy/build/pdf.mjs");
  return cachedPdfjsLib;
}

/**
 * Extracts the embedded text layer of a PDF by calling pdfjs-dist directly.
 *
 * The file is read fully into memory, every page is visited in order, and the
 * `str` of each text item on a page is joined with single spaces. Pages whose
 * trimmed text is empty are skipped.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The trimmed text of each non-empty page, joined with "\n".
 */
async function extractPdfText(filePath: string): Promise<string> {
  const pdfjsLib = await getPdfjsLib();
  const data = new Uint8Array(await fs.promises.readFile(filePath));
  const loadingTask = pdfjsLib.getDocument({
    data,
    verbosity: pdfjsLib.VerbosityLevel.ERRORS,
  });

  try {
    const pdfDocument = await loadingTask.promise;

    const textParts: string[] = [];
    for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
      const page = await pdfDocument.getPage(pageNum);
      try {
        const textContent = await page.getTextContent();
        const pageText = textContent.items
          // Items without a `str` property contribute an empty string.
          .map((item) => ("str" in item ? item.str : ""))
          .join(" ")
          .trim();
        if (pageText.length > 0) {
          textParts.push(pageText);
        }
      } finally {
        await page.cleanup();
      }
    }

    return textParts.join("\n");
  } finally {
    // Destroy the loading task (and the document it produced) even when
    // loading or extraction throws, so nothing is left open between calls.
    await loadingTask.destroy();
  }
}

/** Absolute path of the fixture directory, two levels above this file's directory. */
const FIXTURE_DIR = path.resolve(__dirname, "..", "..", "test-fixtures");

/**
 * Builds OCR settings for a test: a copy of the defaults with only the
 * `language` field replaced.
 *
 * @param lang - Language value to store in the `language` field (e.g. "eng", "eng+rus").
 */
function ocr(lang: string): OcrSettings {
  return Object.assign({}, DEFAULT_OCR_SETTINGS, { language: lang });
}

// Helper: normalises text for comparison — lowercases it, collapses every
// whitespace run into a single space and drops leading/trailing whitespace.
function normalize(text: string): string {
  const words = text
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 0);
  return words.join(" ");
}

// ============================================================
// Image OCR Tests (Tesseract)
// ============================================================

// parseImage on the English fixture image must yield text containing both "hello" and "world".
test("parseImage recognises English text from PNG (lang=eng)", async () => {
  const recognised = await parseImage(path.join(FIXTURE_DIR, "sample_eng.png"), ocr("eng"));

  console.log("[OCR ENG] Raw output:", JSON.stringify(recognised));

  assert.ok(recognised.length > 0, "OCR should produce non-empty text for English image");

  const haystack = normalize(recognised);
  const foundEveryWord = ["hello", "world"].every((word) => haystack.includes(word));
  assert.ok(foundEveryWord, `Expected "Hello World" in OCR output, got: ${recognised}`);
});

// parseImage on the Russian fixture image must yield text containing at least one of the expected words.
test("parseImage recognises Russian text from PNG (lang=rus)", async () => {
  const recognised = await parseImage(path.join(FIXTURE_DIR, "sample_rus.png"), ocr("rus"));

  console.log("[OCR RUS] Raw output:", JSON.stringify(recognised));

  assert.ok(recognised.length > 0, "OCR should produce non-empty text for Russian image");

  const haystack = normalize(recognised);
  const foundAnyWord = ["привет", "мир"].some((word) => haystack.includes(word));
  assert.ok(foundAnyWord, `Expected "Привет мир" in OCR output, got: ${recognised}`);
});

// parseDocument, given the English fixture image and English OCR settings,
// must succeed and return document text containing both "hello" and "world".
test("parseDocument with OCR extracts English text from PNG", async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_eng.png");
  const result = await parseDocument(imagePath, true, undefined, ocr("eng"));

  console.log("[parseDocument OCR ENG] Result:", JSON.stringify(result, null, 2));

  // The ternary narrows the result union, so the failure reason is read without an `any` cast.
  assert.equal(result.success, true, `Expected success, got: ${result.success ? "ok" : result.reason}`);
  // Not reached at runtime once the assertion above passes; kept so the
  // compiler narrows `result` to the success variant below.
  if (!result.success) return;

  const normalized = normalize(result.document.text);
  assert.ok(
    normalized.includes("hello") && normalized.includes("world"),
    `Expected "Hello World" in document text, got: ${result.document.text}`,
  );
});

// parseDocument, given the Russian fixture image and Russian OCR settings,
// must succeed and return document text containing at least one expected word.
test("parseDocument with OCR extracts Russian text from PNG", async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_rus.png");
  const result = await parseDocument(imagePath, true, undefined, ocr("rus"));

  console.log("[parseDocument OCR RUS] Result:", JSON.stringify(result, null, 2));

  // The ternary narrows the result union, so the failure reason is read without an `any` cast.
  assert.equal(result.success, true, `Expected success, got: ${result.success ? "ok" : result.reason}`);
  // Not reached at runtime once the assertion above passes; kept so the
  // compiler narrows `result` to the success variant below.
  if (!result.success) return;

  const normalized = normalize(result.document.text);
  assert.ok(
    normalized.includes("привет") || normalized.includes("мир"),
    `Expected "Привет мир" in document text, got: ${result.document.text}`,
  );
});

// ============================================================
// PDF Text Extraction Tests (pdfjs-dist, without LM Studio)
// ============================================================

// The text layer of the English fixture PDF must contain "english" or "document".
test("pdfjs extracts English text from PDF", async () => {
  const extracted = await extractPdfText(path.join(FIXTURE_DIR, "sample_eng.pdf"));

  console.log("[PDF ENG] Raw output:", JSON.stringify(extracted));

  assert.ok(extracted.length > 0, "pdfjs should extract text from English PDF");

  const haystack = normalize(extracted);
  const foundAnyWord = ["english", "document"].some((word) => haystack.includes(word));
  assert.ok(foundAnyWord, `Expected English text in PDF output, got: ${extracted}`);
});

// The text layer of the Russian fixture PDF must contain "русский" or "документ".
test("pdfjs extracts Russian text from PDF", async () => {
  const extracted = await extractPdfText(path.join(FIXTURE_DIR, "sample_rus.pdf"));

  console.log("[PDF RUS] Raw output:", JSON.stringify(extracted));

  assert.ok(extracted.length > 0, "pdfjs should extract text from Russian PDF");

  const haystack = normalize(extracted);
  const foundAnyWord = ["русский", "документ"].some((word) => haystack.includes(word));
  assert.ok(foundAnyWord, `Expected Russian text in PDF output, got: ${extracted}`);
});

// Calling parseDocument on a PDF with no client argument must fail with reason "pdf.missing-client".
test("parseDocument returns pdf.missing-client for PDF without LM Studio client", async () => {
  const outcome = await parseDocument(
    path.join(FIXTURE_DIR, "sample_eng.pdf"),
    false,
    undefined,
    ocr("eng"),
  );

  console.log("[parseDocument PDF no-client] Result:", JSON.stringify(outcome, null, 2));

  assert.equal(outcome.success, false, "Should fail without LM Studio client");
  if (outcome.success) {
    return;
  }
  assert.equal(outcome.reason, "pdf.missing-client", "Should report missing client");
});

// ============================================================
// Mixed Language Tests (EN + RU, lang="eng+rus")
// ============================================================

// OCR of the mixed-language fixture image must surface at least one English and one Russian word.
test("parseImage recognises mixed EN+RU text from PNG (lang=eng+rus)", async () => {
  const recognised = await parseImage(path.join(FIXTURE_DIR, "sample_mix.png"), ocr("eng+rus"));

  console.log("[OCR MIX PNG] Raw output:", JSON.stringify(recognised));

  assert.ok(recognised.length > 0, "OCR should produce non-empty text for mixed-language image");

  const haystack = normalize(recognised);
  const containsAny = (words: string[]): boolean => words.some((word) => haystack.includes(word));
  const englishFound = containsAny(["hello", "world"]);
  const russianFound = containsAny(["привет", "мир"]);
  assert.ok(
    englishFound && russianFound,
    `Expected both English and Russian text in OCR output, got: ${recognised}`,
  );
});

// The text layer of the mixed-language fixture PDF must contain at least one English and one Russian keyword.
test("pdfjs extracts mixed EN+RU text from PDF with text layer", async () => {
  const extracted = await extractPdfText(path.join(FIXTURE_DIR, "sample_mix_text.pdf"));

  console.log("[PDF MIX text-layer] Raw output:", JSON.stringify(extracted));

  assert.ok(extracted.length > 0, "pdfjs should extract text from mixed-language PDF");

  const haystack = normalize(extracted);
  const containsAny = (words: string[]): boolean => words.some((word) => haystack.includes(word));
  const englishFound = containsAny(["english", "document", "fox"]);
  const russianFound = containsAny(["документ", "языках", "булок"]);
  assert.ok(
    englishFound && russianFound,
    `Expected both English and Russian text in PDF output, got: ${extracted}`,
  );
});

// tryOcrWithPdfJs on the image-only mixed-language fixture PDF must succeed
// and return text with at least one English and one Russian word.
test("tryOcrWithPdfJs extracts mixed EN+RU text from image-only PDF (lang=eng+rus)", async () => {
  const outcome = await tryOcrWithPdfJs(path.join(FIXTURE_DIR, "sample_mix_ocr.pdf"), ocr("eng+rus"));

  console.log("[PDF MIX OCR] Result:", JSON.stringify(outcome, null, 2));

  assert.equal(outcome.success, true, `Expected success, got: ${outcome.success ? "ok" : (outcome as any).reason}`);
  if (!outcome.success) {
    return;
  }

  const haystack = normalize(outcome.text);
  const containsAny = (words: string[]): boolean => words.some((word) => haystack.includes(word));
  const englishFound = containsAny(["hello", "world"]);
  const russianFound = containsAny(["привет", "мир"]);
  assert.ok(
    englishFound && russianFound,
    `Expected both English and Russian text in OCR PDF output, got: ${outcome.text}`,
  );
});
big-rag-rus