// src/parsers/imageParser.ts — project file, forked from mindstudio/big-rag
import { createWorker } from "tesseract.js";
import { type OcrSettings, DEFAULT_OCR_SETTINGS } from "./documentParser";
import { resolveOcrLangPath } from "../utils/ocrLangPath";
/**
 * Parse an image file using OCR (Tesseract) and return its text content.
 *
 * A Tesseract worker is created for the call and is always terminated
 * before the function returns, including when parameter setup or
 * recognition fails, so a failing image does not leak a worker.
 *
 * This is a best-effort parser: any error is logged and an empty string is
 * returned instead of being thrown.
 *
 * @param filePath - Path to the image file.
 * @param ocrSettings - OCR configuration parameters (language, traineddata
 *   location, page segmentation mode).
 * @returns The recognized text with every run of whitespace (including line
 *   breaks) collapsed to a single space and trimmed, or `""` on failure.
 */
export async function parseImage(
  filePath: string,
  ocrSettings: OcrSettings = DEFAULT_OCR_SETTINGS,
): Promise<string> {
  try {
    const { language, dataPath, pageSegMode } = ocrSettings;
    const { langPath, gzip } = resolveOcrLangPath(dataPath, language);
    console.log(`[BigRAG][ImageParser] Using OCR language path: ${langPath ?? "CDN (auto-download)"}, gzip: ${gzip}`);

    // Only pass `langPath` when one was resolved; omitting the key leaves
    // the library's own default language-data location in effect.
    const worker = await createWorker(
      language,
      undefined,
      langPath ? { langPath, gzip } : { gzip },
    );

    try {
      await worker.setParameters({ tessedit_pageseg_mode: pageSegMode as any });
      const { data: { text } } = await worker.recognize(filePath);

      // `\s` already matches newlines, so this single pass flattens all
      // whitespace; a follow-up newline-collapsing replace would never match.
      return text.replace(/\s+/g, " ").trim();
    } finally {
      // Release the worker on both the success and the failure path.
      await worker.terminate();
    }
  } catch (error) {
    console.error(`Error parsing image file ${filePath}:`, error);
    return "";
  }
}