// Forked from mindstudio/big-rag
// Project Files
// src / utils / ocrLangPath.ts
import * as path from "path";
import * as fs from "fs";
/**
* Result of resolving the OCR language data directory.
*
* Produced by `resolveOcrLangPath`. The two fields belong together: `gzip`
* describes the file naming Tesseract should expect under `langPath`.
*/
export interface ResolvedLangPath {
/**
* Directory containing the `.traineddata` files, or `null` to let
* Tesseract.js auto-download from CDN. A user-configured path is passed
* through as given (only trimmed), so it is not necessarily absolute.
*/
langPath: string | null;
/**
* Whether Tesseract should look for `.gz`-compressed language files.
* Set to `true` when `langPath` is `null` (CDN download).
*/
gzip: boolean;
}
/**
 * Report whether a directory holds at least one gzip-compressed language
 * file, i.e. an entry whose name ends in `.traineddata.gz`.
 *
 * @param dir Directory to scan (top level only, no recursion).
 * @returns `true` if such an entry exists; `false` if none does or the
 *   directory cannot be read.
 */
function hasGzipFiles(dir: string): boolean {
  let names: string[];
  try {
    names = fs.readdirSync(dir);
  } catch {
    // Missing or unreadable directory counts as "no gzip files".
    return false;
  }
  for (const name of names) {
    if (name.endsWith(".traineddata.gz")) {
      return true;
    }
  }
  return false;
}
/**
 * Report whether a directory holds any Tesseract language file at all,
 * plain (`.traineddata`) or gzip-compressed (`.traineddata.gz`).
 *
 * @param dir Directory to scan (top level only, no recursion).
 * @returns `true` if at least one matching entry exists; `false` if none
 *   does or the directory cannot be read.
 */
function hasTraineddataFiles(dir: string): boolean {
  const suffixes = [".traineddata", ".traineddata.gz"];
  try {
    const names = fs.readdirSync(dir);
    return names.findIndex((name) => suffixes.some((suffix) => name.endsWith(suffix))) !== -1;
  } catch {
    // Missing or unreadable directory counts as "nothing there".
    return false;
  }
}
/**
 * Check that every language named in a `+`-separated language string has a
 * data file in the given directory.
 *
 * A language counts as present when either `<lang>.traineddata` or
 * `<lang>.traineddata.gz` exists. Empty segments (e.g. from `"eng++rus"`)
 * are ignored.
 *
 * @param dir Directory to scan (top level only, no recursion).
 * @param language Language string such as `"eng"`, `"eng+rus"`, `"rus+eng+deu"`.
 * @returns `true` only if at least one language is named and all of them
 *   are present; `false` otherwise, including when the directory cannot be read.
 */
function hasAllLanguageFiles(dir: string, language: string): boolean {
  const wanted = language.split("+").filter((code) => code !== "");
  if (wanted.length === 0) {
    return false;
  }

  let present: Set<string>;
  try {
    present = new Set(fs.readdirSync(dir));
  } catch {
    return false;
  }

  return wanted.every(
    (code) => present.has(`${code}.traineddata`) || present.has(`${code}.traineddata.gz`),
  );
}
/**
 * Decide whether the file at `manifestPath` looks like an LM Studio plugin
 * manifest.
 *
 * Two checks are applied, and either one is sufficient:
 * - the raw text contains `"type": "plugin"` or `"type":"plugin"`, which
 *   also works for files that are not strictly valid JSON;
 * - the file parses as JSON and its top-level `type` property is `"plugin"`,
 *   which covers any other whitespace or formatting of the same manifest.
 *
 * @param manifestPath Path of the candidate `manifest.json`.
 * @returns `true` if the file is readable and passes either check.
 */
function isPluginManifest(manifestPath: string): boolean {
  let content: string;
  try {
    content = fs.readFileSync(manifestPath, "utf-8");
  } catch {
    // Missing file, directory in its place, permission error: not a manifest.
    return false;
  }

  if (content.includes('"type": "plugin"') || content.includes('"type":"plugin"')) {
    return true;
  }

  try {
    const parsed: unknown = JSON.parse(content);
    return (
      typeof parsed === "object" &&
      parsed !== null &&
      (parsed as { type?: unknown }).type === "plugin"
    );
  } catch {
    // Not valid JSON and the textual check already failed.
    return false;
  }
}

/**
 * Find the plugin root directory by walking up from a starting directory
 * and looking for a `manifest.json` that identifies an LM Studio plugin.
 *
 * The search inspects the starting directory and then its ancestors, at most
 * ten directories in total, and stops early at the filesystem root.
 *
 * @param startDir Directory to start from; resolved to an absolute path first.
 * @returns The first directory whose `manifest.json` is a plugin manifest,
 *   or `null` if none is found within the search limit.
 */
function findPluginRootFrom(startDir: string): string | null {
  const maxLevels = 10;
  let dir = path.resolve(startDir);
  for (let level = 0; level < maxLevels; level++) {
    if (isPluginManifest(path.join(dir, "manifest.json"))) {
      return dir;
    }
    const parent = path.dirname(dir);
    if (parent === dir) break; // reached filesystem root
    dir = parent;
  }
  return null;
}
/**
 * Locate the plugin root directory, trying three strategies in order and
 * returning the first that yields a result:
 *
 * 1. Search upward from `__dirname` for the plugin's `manifest.json`
 *    (covers the `dist/` layout).
 * 2. Search upward from `process.cwd()` (covers `lms dev`).
 * 3. Fall back to `__dirname/../..` (legacy assumption of a `dist/utils/`
 *    or `dist/parsers/` layout).
 *
 * Together these are intended to cover running via:
 * - `node dist/...` (production)
 * - `lms dev` (esbuild bundled into a temp directory)
 * - `node --test dist/tests/...` (test runner)
 *
 * Later strategies are evaluated only if the earlier ones find nothing.
 */
function findPluginRoot(): string {
  return (
    findPluginRootFrom(__dirname) ||
    findPluginRootFrom(process.cwd()) ||
    path.resolve(__dirname, "..", "..")
  );
}
// Memoized plugin root; stays null until the first lookup.
let cachedPluginRoot: string | null = null;

/**
 * Return the plugin root directory, resolving it via `findPluginRoot` on the
 * first call and reusing the stored value afterwards.
 */
function getPluginRoot(): string {
  if (cachedPluginRoot) {
    return cachedPluginRoot;
  }
  const resolved = findPluginRoot();
  cachedPluginRoot = resolved;
  return resolved;
}
/**
 * Choose the `gzip` flag for loading the requested languages from `dir`.
 *
 * Tesseract.js applies a single `gzip` flag to every requested language when
 * it builds the file name (`<lang>.traineddata` vs `<lang>.traineddata.gz`),
 * so the flag has to be chosen from the files of the requested languages,
 * not from whatever else is in the directory.
 *
 * - every requested language has a `.traineddata.gz` file → `true`
 * - otherwise, every requested language has a plain `.traineddata` → `false`
 * - otherwise (mixed formats, missing files, unreadable directory, no
 *   languages named) → previous heuristic: `true` if the directory contains
 *   any `.traineddata.gz` file
 *
 * @param dir Directory holding the language data.
 * @param language `+`-separated language string, e.g. `"eng+rus"`.
 */
function pickGzipSetting(dir: string, language: string): boolean {
  const langs = language.split("+").filter((l) => l.length > 0);
  if (langs.length > 0) {
    let names: Set<string> | null = null;
    try {
      names = new Set(fs.readdirSync(dir));
    } catch {
      names = null; // unreadable: fall through to the heuristic below
    }
    if (names !== null) {
      const present = names;
      if (langs.every((l) => present.has(`${l}.traineddata.gz`))) return true;
      if (langs.every((l) => present.has(`${l}.traineddata`))) return false;
    }
  }
  return hasGzipFiles(dir);
}

/**
 * Resolve the OCR language data directory and determine the correct gzip setting.
 *
 * Priority:
 * 1. Explicit user-configured `dataPath` (non-empty after trimming)
 * 2. `<plugin_root>/` — only if ALL required language .traineddata files exist locally
 * 3. Tesseract.js auto-downloads from CDN (`langPath: null`, `gzip: true`)
 *
 * This ensures offline operation when `.traineddata` files are present in the
 * plugin root — just copy the project folder to an offline machine.
 *
 * For cases 1 and 2 the `gzip` flag is chosen from the files of the requested
 * languages, so an unrelated `.traineddata.gz` in the same directory no
 * longer makes Tesseract look for `.gz` files that do not exist.
 *
 * @param dataPath Optional user-configured directory; returned trimmed but
 *   otherwise unchanged.
 * @param language Optional `+`-separated language string; defaults to
 *   `"eng+rus"` when omitted or empty.
 * @returns The directory to hand to Tesseract (or `null` for CDN download)
 *   together with the matching `gzip` flag.
 */
export function resolveOcrLangPath(dataPath?: string, language?: string): ResolvedLangPath {
  // `||` rather than `??` on purpose: an empty string also selects the default.
  const lang = language || "eng+rus";

  // 1. Explicit user path
  if (dataPath && dataPath.trim().length > 0) {
    const langPath = dataPath.trim();
    return { langPath, gzip: pickGzipSetting(langPath, lang) };
  }

  // 2. Plugin root directory — use only if ALL required language files exist
  const pluginRoot = getPluginRoot();
  if (hasAllLanguageFiles(pluginRoot, lang)) {
    const gzip = pickGzipSetting(pluginRoot, lang);
    console.log(`[BigRAG] Using existing OCR language data from: ${pluginRoot}`);
    return { langPath: pluginRoot, gzip };
  }

  // 3. No local files or some languages missing — let Tesseract auto-download from CDN
  const existing = hasTraineddataFiles(pluginRoot);
  if (existing) {
    console.log(`[BigRAG] Some language files missing locally for "${lang}". Tesseract will auto-download from CDN.`);
  } else {
    console.log(`[BigRAG] No local .traineddata found in ${pluginRoot}. Tesseract will auto-download from CDN.`);
  }
  return { langPath: null, gzip: true };
}