Project Files

stubs

napi-rs-canvas

index.d.ts

index.js

package.json

.clinerules

.gitignore

EXAMPLES.md

manifest.json

package-lock.json

package.json

QUICKSTART.md

README.md

TESTING.md

tsconfig.json

src / parsers / imageParser.ts

import { createWorker } from "tesseract.js";
import { type OcrSettings, DEFAULT_OCR_SETTINGS } from "./documentParser";
import { resolveOcrLangPath } from "../utils/ocrLangPath";

/**
 * Parse an image file using OCR (Tesseract) and return its text content.
 *
 * A fresh Tesseract worker is created for every call and is always terminated
 * before the function returns, including when parameter setup or recognition
 * fails.
 *
 * @param filePath - Path to the image file.
 * @param ocrSettings - OCR configuration parameters (language, traineddata
 *   location and page segmentation mode).
 * @returns The recognized text with every run of whitespace (line breaks
 *   included) collapsed to a single space and trimmed. Returns an empty string
 *   when OCR fails; the error is logged rather than thrown.
 */
export async function parseImage(
  filePath: string,
  ocrSettings: OcrSettings = DEFAULT_OCR_SETTINGS,
): Promise<string> {
  try {
    const { language, dataPath, pageSegMode } = ocrSettings;

    const { langPath, gzip } = resolveOcrLangPath(dataPath, language);
    console.log(`[BigRAG][ImageParser] Using OCR language path: ${langPath ?? "CDN (auto-download)"}, gzip: ${gzip}`);

    // Only pass langPath when one was resolved; otherwise let tesseract.js
    // fall back to its own default source for language data.
    const workerOptions = {
      ...(langPath ? { langPath } : {}),
      gzip,
    };

    const worker = await createWorker(language, undefined, workerOptions);
    try {
      await worker.setParameters({ tessedit_pageseg_mode: pageSegMode as any });

      const { data: { text } } = await worker.recognize(filePath);

      // `\s` also matches newlines, so this single pass produces exactly the
      // same output as the previous two-step normalization (whose second
      // `\n+` replacement could never match anything).
      return text.replace(/\s+/g, " ").trim();
    } finally {
      // Always release the worker, even when setParameters/recognize throws;
      // otherwise every failed image would leak a Tesseract worker.
      try {
        await worker.terminate();
      } catch (terminateError: unknown) {
        // A shutdown failure must not hide the recognition result or the
        // original recognition error.
        console.error(`Error terminating OCR worker for ${filePath}:`, terminateError);
      }
    }
  } catch (error: unknown) {
    console.error(`Error parsing image file ${filePath}:`, error);
    return "";
  }
}

big-rag-rus