// Forked from mindstudio/big-rag
// Project Files
// src / tests / languageRecognition.test.ts
import { test } from "node:test";
import * as assert from "node:assert/strict";
import * as path from "path";
import * as fs from "fs";
import { parseImage } from "../parsers/imageParser";
import { parseDocument, DEFAULT_OCR_SETTINGS, type OcrSettings } from "../parsers/documentParser";
import { tryOcrWithPdfJs } from "../parsers/pdfParser";
/** Shape of the pdfjs-dist legacy build module, derived from its dynamic import. */
type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs");

// Module-level memo so the pdfjs-dist import is performed at most once per successful load.
let cachedPdfjsLib: PdfJsModule | null = null;

/**
 * Returns the pdfjs-dist legacy build, importing it dynamically on first use
 * and serving the memoized module afterwards.
 *
 * @returns The loaded pdfjs-dist module.
 */
async function getPdfjsLib(): Promise<PdfJsModule> {
  if (cachedPdfjsLib !== null) {
    return cachedPdfjsLib;
  }
  const loadedModule = await import("pdfjs-dist/legacy/build/pdf.mjs");
  cachedPdfjsLib = loadedModule;
  return loadedModule;
}
/**
 * Extracts the text layer of a PDF with pdfjs-dist, page by page.
 *
 * Each page's text items are joined with a single space and trimmed; pages
 * that yield no text are skipped. The remaining pages are joined with "\n".
 *
 * @param filePath Path of the PDF file to read.
 * @returns The concatenated page texts (empty string when no page has text).
 */
async function extractPdfText(filePath: string): Promise<string> {
  const pdfjsLib = await getPdfjsLib();
  const data = new Uint8Array(await fs.promises.readFile(filePath));
  const pdfDocument = await pdfjsLib
    .getDocument({ data, verbosity: pdfjsLib.VerbosityLevel.ERRORS })
    .promise;

  const textParts: string[] = [];
  try {
    for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
      const page = await pdfDocument.getPage(pageNum);
      try {
        const textContent = await page.getTextContent();
        const pageText = textContent.items
          // Only items that carry a `str` contribute text; anything else maps to "".
          .map((item) => ("str" in item ? item.str || "" : ""))
          .join(" ")
          .trim();
        if (pageText.length > 0) {
          textParts.push(pageText);
        }
      } finally {
        // Release per-page resources even when text extraction throws.
        await page.cleanup();
      }
    }
  } finally {
    // Destroy the document so it is not leaked across calls, including on failure.
    await pdfDocument.destroy();
  }
  return textParts.join("\n");
}
/** Absolute path of the `test-fixtures` directory, two levels above this file's directory. */
const FIXTURE_DIR = path.resolve(__dirname, "..", "..", "test-fixtures");
/**
 * Builds OCR settings for a test: a copy of DEFAULT_OCR_SETTINGS with the
 * `language` field replaced.
 *
 * @param lang Language value to place in the settings (the tests pass "eng", "rus" and "eng+rus").
 * @returns A new settings object; DEFAULT_OCR_SETTINGS itself is not modified.
 */
function ocr(lang: string): OcrSettings {
  const languageOverride = { language: lang };
  return { ...DEFAULT_OCR_SETTINGS, ...languageOverride };
}
/**
 * Helper: normalizes text for comparison (lowercase, remove extra whitespace).
 *
 * @param text Raw text, e.g. OCR or PDF extraction output.
 * @returns The lowercased text with every whitespace run collapsed to one
 *          space and no leading or trailing whitespace.
 */
function normalize(text: string): string {
  const lowered = text.toLowerCase();
  // Trimming first means the split never yields empty edge tokens.
  const words = lowered.trim().split(/\s+/);
  return words.join(" ");
}
// ============================================================
// Image OCR Tests (Tesseract)
// ============================================================
// Runs parseImage on the sample_eng.png fixture with language "eng" and
// requires both "hello" and "world" in the normalized output.
test("parseImage recognises English text from PNG (lang=eng)", async () => {
  const fixture = path.join(FIXTURE_DIR, "sample_eng.png");
  const recognised = await parseImage(fixture, ocr("eng"));
  console.log("[OCR ENG] Raw output:", JSON.stringify(recognised));

  assert.ok(recognised.length > 0, "OCR should produce non-empty text for English image");

  const haystack = normalize(recognised);
  const expectedWords = ["hello", "world"];
  assert.ok(
    expectedWords.every((word) => haystack.includes(word)),
    `Expected "Hello World" in OCR output, got: ${recognised}`,
  );
});
// Runs parseImage on the sample_rus.png fixture with language "rus" and
// accepts the result when the normalized output contains at least one of
// the expected Cyrillic words ("привет" or "мир").
test("parseImage recognises Russian text from PNG (lang=rus)", async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_rus.png");
  const text = await parseImage(imagePath, ocr("rus"));
  console.log("[OCR RUS] Raw output:", JSON.stringify(text));
  assert.ok(text.length > 0, "OCR should produce non-empty text for Russian image");

  const normalized = normalize(text);
  assert.ok(
    normalized.includes("привет") || normalized.includes("мир"),
    `Expected "Привет мир" in OCR output, got: ${text}`,
  );
});
// Runs parseDocument on the sample_eng.png fixture with "eng" OCR settings
// and requires a successful result whose document text contains both
// "hello" and "world".
test("parseDocument with OCR extracts English text from PNG", async () => {
  const fixture = path.join(FIXTURE_DIR, "sample_eng.png");
  const outcome = await parseDocument(fixture, true, undefined, ocr("eng"));
  console.log("[parseDocument OCR ENG] Result:", JSON.stringify(outcome, null, 2));

  const failureHint = outcome.success ? "ok" : (outcome as any).reason;
  assert.equal(outcome.success, true, `Expected success, got: ${failureHint}`);
  // The assertion above already fails the test; this guard only narrows the result type.
  if (!outcome.success) return;

  const haystack = normalize(outcome.document.text);
  const expectedWords = ["hello", "world"];
  assert.ok(
    expectedWords.every((word) => haystack.includes(word)),
    `Expected "Hello World" in document text, got: ${outcome.document.text}`,
  );
});
// Runs parseDocument on the sample_rus.png fixture with "rus" OCR settings
// and requires a successful result whose document text contains at least
// one of the expected Cyrillic words ("привет" or "мир").
test("parseDocument with OCR extracts Russian text from PNG", async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_rus.png");
  const result = await parseDocument(imagePath, true, undefined, ocr("rus"));
  console.log("[parseDocument OCR RUS] Result:", JSON.stringify(result, null, 2));
  assert.equal(result.success, true, `Expected success, got: ${result.success ? "ok" : (result as any).reason}`);
  // The assertion above already fails the test; this guard only narrows the result type.
  if (!result.success) return;

  const normalized = normalize(result.document.text);
  assert.ok(
    normalized.includes("привет") || normalized.includes("мир"),
    `Expected "Привет мир" in document text, got: ${result.document.text}`,
  );
});
// ============================================================
// PDF Text Extraction Tests (pdfjs-dist, without LM Studio)
// ============================================================
// Extracts the text layer of sample_eng.pdf via extractPdfText and accepts
// the result when the normalized text contains "english" or "document".
test("pdfjs extracts English text from PDF", async () => {
  const fixture = path.join(FIXTURE_DIR, "sample_eng.pdf");
  const extracted = await extractPdfText(fixture);
  console.log("[PDF ENG] Raw output:", JSON.stringify(extracted));

  assert.ok(extracted.length > 0, "pdfjs should extract text from English PDF");

  const haystack = normalize(extracted);
  const foundKeyword = ["english", "document"].some((word) => haystack.includes(word));
  assert.ok(foundKeyword, `Expected English text in PDF output, got: ${extracted}`);
});
// Extracts the text layer of sample_rus.pdf via extractPdfText and accepts
// the result when the normalized text contains "русский" or "документ".
test("pdfjs extracts Russian text from PDF", async () => {
  const pdfPath = path.join(FIXTURE_DIR, "sample_rus.pdf");
  const text = await extractPdfText(pdfPath);
  console.log("[PDF RUS] Raw output:", JSON.stringify(text));
  assert.ok(text.length > 0, "pdfjs should extract text from Russian PDF");

  const normalized = normalize(text);
  assert.ok(
    normalized.includes("русский") || normalized.includes("документ"),
    `Expected Russian text in PDF output, got: ${text}`,
  );
});
// Calls parseDocument on sample_eng.pdf with `false` and `undefined` as the
// second and third arguments and expects a failure result whose reason is
// "pdf.missing-client".
test("parseDocument returns pdf.missing-client for PDF without LM Studio client", async () => {
  const fixture = path.join(FIXTURE_DIR, "sample_eng.pdf");
  const outcome = await parseDocument(fixture, false, undefined, ocr("eng"));
  console.log("[parseDocument PDF no-client] Result:", JSON.stringify(outcome, null, 2));

  assert.equal(outcome.success, false, "Should fail without LM Studio client");
  // Guard narrows the result to its failure shape so `reason` is accessible.
  if (outcome.success) return;
  assert.equal(outcome.reason, "pdf.missing-client", "Should report missing client");
});
// ============================================================
// Mixed Language Tests (EN + RU, lang="eng+rus")
// ============================================================
// Runs parseImage on the sample_mix.png fixture with language "eng+rus" and
// requires at least one English word ("hello"/"world") AND at least one
// Russian word ("привет"/"мир") in the normalized output.
test("parseImage recognises mixed EN+RU text from PNG (lang=eng+rus)", async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_mix.png");
  const text = await parseImage(imagePath, ocr("eng+rus"));
  console.log("[OCR MIX PNG] Raw output:", JSON.stringify(text));
  assert.ok(text.length > 0, "OCR should produce non-empty text for mixed-language image");

  const normalized = normalize(text);
  const hasEng = normalized.includes("hello") || normalized.includes("world");
  const hasRus = normalized.includes("привет") || normalized.includes("мир");
  assert.ok(
    hasEng && hasRus,
    `Expected both English and Russian text in OCR output, got: ${text}`,
  );
});
// Extracts the text layer of sample_mix_text.pdf via extractPdfText and
// requires at least one English keyword ("english"/"document"/"fox") AND at
// least one Russian keyword ("документ"/"языках"/"булок") in the normalized text.
test("pdfjs extracts mixed EN+RU text from PDF with text layer", async () => {
  const pdfPath = path.join(FIXTURE_DIR, "sample_mix_text.pdf");
  const text = await extractPdfText(pdfPath);
  console.log("[PDF MIX text-layer] Raw output:", JSON.stringify(text));
  assert.ok(text.length > 0, "pdfjs should extract text from mixed-language PDF");

  const normalized = normalize(text);
  const hasEng =
    normalized.includes("english") || normalized.includes("document") || normalized.includes("fox");
  const hasRus =
    normalized.includes("документ") || normalized.includes("языках") || normalized.includes("булок");
  assert.ok(
    hasEng && hasRus,
    `Expected both English and Russian text in PDF output, got: ${text}`,
  );
});
// Runs tryOcrWithPdfJs on the sample_mix_ocr.pdf fixture with language
// "eng+rus" and requires a successful result whose normalized text contains
// at least one English word ("hello"/"world") AND at least one Russian word
// ("привет"/"мир").
test("tryOcrWithPdfJs extracts mixed EN+RU text from image-only PDF (lang=eng+rus)", async () => {
  const pdfPath = path.join(FIXTURE_DIR, "sample_mix_ocr.pdf");
  const result = await tryOcrWithPdfJs(pdfPath, ocr("eng+rus"));
  console.log("[PDF MIX OCR] Result:", JSON.stringify(result, null, 2));
  assert.equal(result.success, true, `Expected success, got: ${result.success ? "ok" : (result as any).reason}`);
  // The assertion above already fails the test; this guard only narrows the result type.
  if (!result.success) return;

  const normalized = normalize(result.text);
  const hasEng = normalized.includes("hello") || normalized.includes("world");
  const hasRus = normalized.includes("привет") || normalized.includes("мир");
  assert.ok(
    hasEng && hasRus,
    `Expected both English and Russian text in OCR PDF output, got: ${result.text}`,
  );
});