// Forked from mindstudio/big-rag
// Project Files
// src / tests / parseDocument.test.ts
import { describe, test } from "node:test";
import assert from "node:assert/strict";
import * as path from "path";
import { parseDocument } from "../parsers/documentParser";
// Absolute path of the `test-fixtures` directory, two levels above this file's directory.
const FIXTURE_DIR = path.resolve(__dirname, "..", "..", "test-fixtures");
describe("parseDocument", () => {
// ─── HTML ────────────────────────────────────────────────────────────────
// Parses the HTML fixture and checks that a known phrase shows up in the extracted text.
test("parses HTML files and extracts clean text", async () => {
  const fixturePath = path.join(FIXTURE_DIR, "sample.html");
  const result = await parseDocument(fixturePath);

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // assert.ok has already thrown on failure; this guard only narrows the result type.
  if (!result.success) return;

  const { text } = result.document;
  assert.ok(text.length > 0, "Expected non-empty text");
  assert.ok(
    text.includes("sample HTML file"),
    `Expected text to contain "sample HTML file", got: ${text.substring(0, 200)}`,
  );
});
// ─── XHTML ───────────────────────────────────────────────────────────────
// Parses the XHTML fixture and checks that the phrase "Big RAG" is present in the output.
test("parses XHTML files and extracts clean text", async () => {
  const expectedPhrase = "Big RAG";
  const result = await parseDocument(path.join(FIXTURE_DIR, "sample.xhtml"));

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // Unreachable after a passing assert.ok; kept so the success-only fields are type-safe.
  if (!result.success) return;

  const extracted = result.document.text;
  assert.ok(extracted.length > 0, "Expected non-empty text");
  assert.ok(
    extracted.includes(expectedPhrase),
    `Expected text to contain "${expectedPhrase}", got: ${extracted.substring(0, 200)}`,
  );
});
// ─── Markdown ────────────────────────────────────────────────────────────
// Parses the Markdown fixture and checks English content, Russian content and the
// reported file extension.
test("parses Markdown files and flattens formatting", async () => {
  const result = await parseDocument(path.join(FIXTURE_DIR, "sample.md"));
  assert.ok(result.success, `Expected success, got: ${result.success ? "" : result.reason}`);
  if (result.success) {
    const { text } = result.document;
    assert.ok(text.length > 0, "Expected non-empty text");
    // Check English content (Markdown title keeps original casing)
    assert.ok(
      text.includes("Sample"),
      `Expected text to contain "Sample", got: ${text.substring(0, 200)}`,
    );
    // Check Russian content. The stem "русск" is a substring of every inflected form
    // (e.g. "русского"), so a single check covers both spellings.
    assert.ok(
      text.includes("русск"),
      `Expected text to contain Russian text, got: ${text.substring(0, 400)}`,
    );
    assert.equal(result.document.metadata.extension, ".md");
  }
});
// ─── MDX ─────────────────────────────────────────────────────────────────
// Parses the MDX fixture and checks for non-empty text and a ".mdx" extension in the metadata.
test("parses MDX files as Markdown", async () => {
  const fixturePath = path.join(FIXTURE_DIR, "sample.mdx");
  const result = await parseDocument(fixturePath);

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // assert.ok has already thrown on failure; this guard only narrows the result type.
  if (!result.success) return;

  const parsed = result.document;
  assert.ok(parsed.text.length > 0, "Expected non-empty text");
  assert.equal(parsed.metadata.extension, ".mdx");
});
// ─── Plain text ──────────────────────────────────────────────────────────
// Parses the TXT fixture and checks for non-empty text and a ".txt" extension in the metadata.
test("parses TXT files preserving paragraph spacing", async () => {
  const fixturePath = path.join(FIXTURE_DIR, "sample.txt");
  const result = await parseDocument(fixturePath);

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // Unreachable after a passing assert.ok; kept so the success-only fields are type-safe.
  if (!result.success) return;

  const parsed = result.document;
  assert.ok(parsed.text.length > 0, "Expected non-empty text");
  assert.equal(parsed.metadata.extension, ".txt");
});
// ─── DOCX ────────────────────────────────────────────────────────────────
// Parses the DOCX fixture and checks for the phrase "Big RAG" and a ".docx" extension.
test("parses DOCX files and extracts text paragraphs", async () => {
  const expectedPhrase = "Big RAG";
  const result = await parseDocument(path.join(FIXTURE_DIR, "sample.docx"));

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // assert.ok has already thrown on failure; this guard only narrows the result type.
  if (!result.success) return;

  const parsed = result.document;
  assert.ok(parsed.text.length > 0, "Expected non-empty text");
  assert.ok(
    parsed.text.includes(expectedPhrase),
    `Expected text to contain "${expectedPhrase}", got: ${parsed.text.substring(0, 200)}`,
  );
  assert.equal(parsed.metadata.extension, ".docx");
});
// ─── XLSX ────────────────────────────────────────────────────────────────
// Parses the XLSX fixture and checks for the cell value "Test Row 1" and a ".xlsx" extension.
test("parses XLSX files and extracts cell text", async () => {
  const expectedCell = "Test Row 1";
  const result = await parseDocument(path.join(FIXTURE_DIR, "sample.xlsx"));

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // Unreachable after a passing assert.ok; kept so the success-only fields are type-safe.
  if (!result.success) return;

  const parsed = result.document;
  assert.ok(parsed.text.length > 0, "Expected non-empty text");
  assert.ok(
    parsed.text.includes(expectedCell),
    `Expected text to contain "${expectedCell}", got: ${parsed.text.substring(0, 200)}`,
  );
  assert.equal(parsed.metadata.extension, ".xlsx");
});
// ─── CSV ─────────────────────────────────────────────────────────────────
// Parses the CSV fixture and checks for the cell value "CSV Row 1" and a ".csv" extension.
test("parses CSV files and extracts cell text", async () => {
  const expectedCell = "CSV Row 1";
  const result = await parseDocument(path.join(FIXTURE_DIR, "sample.csv"));

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // assert.ok has already thrown on failure; this guard only narrows the result type.
  if (!result.success) return;

  const parsed = result.document;
  assert.ok(parsed.text.length > 0, "Expected non-empty text");
  assert.ok(
    parsed.text.includes(expectedCell),
    `Expected text to contain "${expectedCell}", got: ${parsed.text.substring(0, 200)}`,
  );
  assert.equal(parsed.metadata.extension, ".csv");
});
// ─── PPTX ────────────────────────────────────────────────────────────────
// Parses the PPTX fixture and checks that text from two slides is present, plus the
// ".pptx" extension in the metadata.
test("parses PPTX files and extracts slide text", async () => {
  const fixturePath = path.join(FIXTURE_DIR, "sample.pptx");
  const result = await parseDocument(fixturePath);

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // Unreachable after a passing assert.ok; kept so the success-only fields are type-safe.
  if (!result.success) return;

  const slideText = result.document.text;
  assert.ok(slideText.length > 0, "Expected non-empty text");
  assert.ok(
    slideText.includes("slide one content"),
    `Expected text to contain "slide one content", got: ${slideText.substring(0, 200)}`,
  );
  // The second phrase may sit further into the text, so the preview is longer here.
  assert.ok(
    slideText.includes("Second slide"),
    `Expected text to contain "Second slide", got: ${slideText.substring(0, 300)}`,
  );
  assert.equal(result.document.metadata.extension, ".pptx");
});
// ─── EPUB ────────────────────────────────────────────────────────────────
// Parses the EPUB fixture and checks for non-empty text and a ".epub" extension in the metadata.
test("parses EPUB files and extracts text", async () => {
  const fixturePath = path.join(FIXTURE_DIR, "TestFormat.epub");
  const result = await parseDocument(fixturePath);

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // assert.ok has already thrown on failure; this guard only narrows the result type.
  if (!result.success) return;

  const parsed = result.document;
  assert.ok(parsed.text.length > 0, "Expected non-empty text from EPUB");
  assert.equal(parsed.metadata.extension, ".epub");
});
// ─── OCR English (PNG) ───────────────────────────────────────────────────
// On first run Tesseract auto-downloads traineddata from CDN (30-120s);
// the 180 000 ms test timeout leaves room for that.
test("parses PNG images with English OCR", { timeout: 180_000 }, async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_eng.png");
  const result = await parseDocument(
    imagePath,
    true, // enableOCR
    undefined, // no client needed for images
    {
      language: "eng",
      dataPath: "",
      pageSegMode: 3,
      minTextLength: 20,
      maxPages: 200,
      maxImagesPerPage: 10,
      minImageArea: 2500,
      maxImagePixels: 100_000_000,
      imageTimeoutMs: 60_000,
    },
  );

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // Unreachable after a passing assert.ok; kept so the success-only fields are type-safe.
  if (!result.success) return;

  const recognized = result.document.text;
  assert.ok(recognized.length > 0, "Expected non-empty OCR text from PNG");
  // Print a preview of the recognized text to the test output.
  console.log(`[Test OCR eng] Extracted: ${recognized.substring(0, 200)}`);
});
// ─── OCR Russian (PNG) ───────────────────────────────────────────────────
// Downloads rus.traineddata from CDN on first run.
test("parses PNG images with Russian OCR", { timeout: 180_000 }, async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_rus.png");
  const result = await parseDocument(
    imagePath,
    true,
    undefined,
    {
      language: "rus",
      dataPath: "",
      pageSegMode: 3,
      minTextLength: 20,
      maxPages: 200,
      maxImagesPerPage: 10,
      minImageArea: 2500,
      maxImagePixels: 100_000_000,
      imageTimeoutMs: 60_000,
    },
  );

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // assert.ok has already thrown on failure; this guard only narrows the result type.
  if (!result.success) return;

  const recognized = result.document.text;
  assert.ok(recognized.length > 0, "Expected non-empty OCR text from Russian PNG");
  // Print a preview of the recognized text to the test output.
  console.log(`[Test OCR rus] Extracted: ${recognized.substring(0, 200)}`);
});
// ─── OCR Mixed English+Russian (PNG) ─────────────────────────────────────
// Tests eng+rus language combination with the mixed-language image.
test("parses PNG images with mixed English+Russian OCR", { timeout: 180_000 }, async () => {
  const imagePath = path.join(FIXTURE_DIR, "sample_mix.png");
  const result = await parseDocument(
    imagePath,
    true,
    undefined,
    {
      language: "eng+rus",
      dataPath: "",
      pageSegMode: 3,
      minTextLength: 20,
      maxPages: 200,
      maxImagesPerPage: 10,
      minImageArea: 2500,
      maxImagePixels: 100_000_000,
      imageTimeoutMs: 60_000,
    },
  );

  const failureReason = result.success ? "" : result.reason;
  assert.ok(result.success, `Expected success, got: ${failureReason}`);
  // Unreachable after a passing assert.ok; kept so the success-only fields are type-safe.
  if (!result.success) return;

  const recognized = result.document.text;
  assert.ok(recognized.length > 0, "Expected non-empty OCR text from mixed PNG");
  // Print a preview of the recognized text to the test output.
  console.log(`[Test OCR eng+rus] Extracted: ${recognized.substring(0, 200)}`);
});
});