// Forked from mindstudio/big-rag
// File: src/config.ts
import { createConfigSchematics } from "@lmstudio/sdk";
/** Language codes for the built-in RAG instruction text: `"ru"` (Russian) or `"en"` (English). */
export type ResponseLanguage = "ru" | "en";
/**
 * Built-in English prompt template.
 *
 * Contains the `{{rag_context}}` and `{{user_query}}` placeholders. The code
 * that substitutes them lives outside this section — presumably the prompt
 * builder; confirm against the caller.
 */
export const DEFAULT_PROMPT_TEMPLATE_EN = `{{rag_context}}
Use the citations above to respond to the user query, only if they are relevant. Otherwise, respond to the best of your ability without them.
User Query:
{{user_query}}`;
/**
 * Built-in Russian-language variant of the prompt template, using the same
 * `{{rag_context}}` and `{{user_query}}` placeholders as the English one.
 */
export const DEFAULT_PROMPT_TEMPLATE_RU = `{{rag_context}}
Используй приведённые выше цитаты для ответа на запрос пользователя, только если они релевантны. В противном случае ответь самостоятельно, опираясь на свои знания.
Запрос пользователя:
{{user_query}}`;
/**
 * Alias of the English template. Used in this file as the placeholder text of
 * the `promptTemplate` config field.
 */
export const DEFAULT_PROMPT_TEMPLATE = DEFAULT_PROMPT_TEMPLATE_EN;
/**
 * Picks the built-in prompt template for a response language.
 *
 * @param lang - Language code to select a template for.
 * @returns The Russian template when `lang` is `"ru"`; the English template
 *   for every other value.
 */
export function getDefaultPromptTemplate(lang: ResponseLanguage): string {
  if (lang === "ru") {
    return DEFAULT_PROMPT_TEMPLATE_RU;
  }
  return DEFAULT_PROMPT_TEMPLATE_EN;
}
/**
 * Configuration schema for the plugin, assembled with the LM Studio SDK
 * `createConfigSchematics()` builder.
 *
 * Each `.field(key, valueType, options, defaultValue)` call declares one
 * setting; the chain is finalized with `.build()`.
 */
export const configSchematics = createConfigSchematics()
  // ─── Language ─────────────────────────────────────────────────────
  .field(
    "responseLanguage",
    "string",
    {
      displayName: "Response Language / Язык ответа",
      subtitle: 'Language for RAG instructions sent to the model. "ru" = Русский, "en" = English.',
      placeholder: "ru",
    },
    "ru",
  )
  // ─── Directories ──────────────────────────────────────────────────
  .field(
    "documentsDirectory",
    "string",
    {
      displayName: "Documents Directory",
      subtitle: "Root directory containing documents to index. All subdirectories will be scanned.",
      placeholder: "/path/to/documents",
    },
    "",
  )
  .field(
    "vectorStoreDirectory",
    "string",
    {
      displayName: "Vector Store Directory",
      subtitle: "Directory where the vector database will be stored.",
      placeholder: "/path/to/vector/store",
    },
    "",
  )
  // ─── Retrieval ────────────────────────────────────────────────────
  .field(
    "retrievalLimit",
    "numeric",
    {
      displayName: "Retrieval Limit",
      subtitle: "Maximum number of chunks to return during retrieval.",
      int: true,
      min: 1,
      max: 20,
      slider: { min: 1, max: 20, step: 1 },
    },
    5,
  )
  .field(
    "retrievalAffinityThreshold",
    "numeric",
    {
      displayName: "Retrieval Affinity Threshold",
      subtitle: "Minimum similarity score for a chunk to be considered relevant.",
      min: 0,
      max: 1,
      slider: { min: 0, max: 1, step: 0.01 },
    },
    0.5,
  )
  // ─── Chunking ─────────────────────────────────────────────────────
  .field(
    "chunkSize",
    "numeric",
    {
      displayName: "Chunk Size",
      subtitle: "Size of text chunks for embedding (in tokens).",
      int: true,
      min: 128,
      max: 2048,
      slider: { min: 128, max: 2048, step: 128 },
    },
    512,
  )
  .field(
    "chunkOverlap",
    "numeric",
    {
      displayName: "Chunk Overlap",
      subtitle: "Overlap between consecutive chunks (in tokens).",
      int: true,
      min: 0,
      max: 512,
      slider: { min: 0, max: 512, step: 32 },
    },
    100,
  )
  // ─── Indexing pace ────────────────────────────────────────────────
  .field(
    "maxConcurrentFiles",
    "numeric",
    {
      displayName: "Max Concurrent Files",
      subtitle:
        "Maximum number of files to process concurrently during indexing. " +
        "Recommend 1 for large PDF datasets.",
      int: true,
      min: 1,
      max: 10,
      slider: { min: 1, max: 10, step: 1 },
    },
    1,
  )
  .field(
    "parseDelayMs",
    "numeric",
    {
      displayName: "Parser Delay (ms)",
      subtitle: "Wait time before parsing each document (helps avoid WebSocket throttling).",
      int: true,
      min: 0,
      max: 5000,
      slider: { min: 0, max: 5000, step: 100 },
    },
    500,
  )
  // ─── Filename Search ──────────────────────────────────────────────
  .field(
    "enableFilenameSearch",
    "boolean",
    {
      displayName: "Enable Filename Search",
      subtitle:
        'Extract keywords from the user query to find files by name. ' +
        'For example, "найди файлы с именем протокол" will list all indexed files matching "протокол". ' +
        'Works alongside normal vector content search.',
    },
    true,
  )
  // ─── OCR switch ───────────────────────────────────────────────────
  .field(
    "enableOCR",
    "boolean",
    {
      displayName: "Enable OCR",
      subtitle: "Enable OCR for image files and image-based PDFs using LM Studio's built-in document parser.",
    },
    true,
  )
  // ─── File Type Filters ────────────────────────────────────────────
  .field(
    "indexHTML",
    "boolean",
    {
      displayName: "Index HTML/XHTML",
      subtitle: "Index .htm, .html, .xhtml files.",
    },
    true,
  )
  .field(
    "indexPDF",
    "boolean",
    {
      displayName: "Index PDF",
      subtitle: "Index .pdf files.",
    },
    true,
  )
  .field(
    "indexEPUB",
    "boolean",
    {
      displayName: "Index EPUB",
      subtitle: "Index .epub files.",
    },
    true,
  )
  .field(
    "indexText",
    "boolean",
    {
      displayName: "Index Text/Markdown",
      subtitle: "Index .txt, .text, .md, .mdx, .markdown, .mdown, .mkd, .mkdn files.",
    },
    true,
  )
  .field(
    "indexDocx",
    "boolean",
    {
      displayName: "Index DOCX",
      subtitle: "Index .docx files.",
    },
    true,
  )
  .field(
    "indexXlsx",
    "boolean",
    {
      displayName: "Index Spreadsheets",
      subtitle: "Index .xlsx, .xls, .csv files.",
    },
    true,
  )
  .field(
    "indexPptx",
    "boolean",
    {
      displayName: "Index Presentations",
      subtitle: "Index .pptx files.",
    },
    true,
  )
  .field(
    "indexImages",
    "boolean",
    {
      displayName: "Index Images (OCR)",
      subtitle: "Index .bmp, .jpg, .jpeg, .png files via OCR.",
      // Declared as dependent on the `enableOCR` field being true.
      dependencies: [
        {
          key: "enableOCR",
          condition: { type: "equals", value: true },
        },
      ],
    },
    true,
  )
  // ─── OCR tuning ───────────────────────────────────────────────────
  .field(
    "ocrLanguage",
    "string",
    {
      displayName: "OCR Language",
      subtitle: 'Tesseract language code: "eng" (English), "rus" (Russian), "eng+rus" (both), etc.',
      placeholder: "eng+rus",
    },
    "eng+rus",
  )
  .field(
    "ocrDataPath",
    "string",
    {
      displayName: "OCR Data Path",
      subtitle:
        "Path to folder with .traineddata files. " +
        "Leave empty to auto-detect: the plugin looks for .traineddata files in its own root folder (for offline use). " +
        "If none found, Tesseract downloads them from CDN on first use. " +
        "For best quality, download best-traineddata from tesseract-ocr/tessdata_best.",
      placeholder: "",
    },
    "",
  )
  .field(
    "ocrPageSegMode",
    "numeric",
    {
      displayName: "OCR Page Segmentation Mode (PSM)",
      subtitle:
        "Tesseract PSM: 3=auto, 4=single column, 6=uniform block (tables/forms), 11=sparse text. Default 3.",
      int: true,
      min: 0,
      max: 13,
      slider: { min: 0, max: 13, step: 1 },
    },
    3,
  )
  .field(
    "ocrMinTextLength",
    "numeric",
    {
      displayName: "OCR Min Text Length",
      subtitle:
        "Minimum characters for PDF text to be considered valid. " +
        "Lower values catch short pages (stamps, forms).",
      int: true,
      min: 0,
      max: 10_000,
      slider: { min: 0, max: 10_000, step: 10 },
    },
    20,
  )
  .field(
    "ocrMaxPages",
    "numeric",
    {
      displayName: "OCR Max Pages",
      subtitle: "Maximum PDF pages to process with OCR. Increase for large documents.",
      int: true,
      min: 1,
      max: 50_000,
      slider: { min: 1, max: 50_000, step: 10 },
    },
    200,
  )
  .field(
    "ocrMaxImagesPerPage",
    "numeric",
    {
      displayName: "OCR Max Images Per Page",
      subtitle: "Maximum images per PDF page for OCR. Increase for pages with many diagrams/tables.",
      int: true,
      min: 1,
      max: 100,
      slider: { min: 1, max: 100, step: 1 },
    },
    10,
  )
  .field(
    "ocrMinImageArea",
    "numeric",
    {
      displayName: "OCR Min Image Area",
      subtitle:
        "Minimum image area (width×height in px) for OCR. " +
        "Lower values process smaller images (signatures, stamps).",
      int: true,
      min: 0,
      max: 100_000,
      slider: { min: 0, max: 100_000, step: 100 },
    },
    2500,
  )
  .field(
    "ocrMaxImagePixels",
    "numeric",
    {
      displayName: "OCR Max Image Pixels",
      subtitle: "Maximum image area (px²) to process. Prevents OOM on huge scans. ~100M = 10000×10000.",
      int: true,
      min: 1_000_000,
      max: 500_000_000,
      slider: { min: 1_000_000, max: 500_000_000, step: 1_000_000 },
    },
    100_000_000,
  )
  .field(
    "ocrImageTimeoutMs",
    "numeric",
    {
      displayName: "OCR Image Timeout (ms)",
      subtitle: "Timeout in ms for loading image data from PDF. Increase for slow systems.",
      int: true,
      min: 5000,
      max: 300_000,
      slider: { min: 5000, max: 300_000, step: 5000 },
    },
    60_000,
  )
  // ─── Embedding ────────────────────────────────────────────────────
  .field(
    "embeddingModel",
    "string",
    {
      displayName: "Embedding Model",
      subtitle:
        "Model ID for text embeddings. Must be loaded in LM Studio. " +
        "Examples: nomic-ai/nomic-embed-text-v1.5-GGUF, gpustack/text-embedding-bge-m3",
      placeholder: "gpustack/text-embedding-bge-m3",
    },
    "gpustack/text-embedding-bge-m3",
  )
  // ─── Manual reindex ───────────────────────────────────────────────
  .field(
    "manualReindex.trigger",
    "boolean",
    {
      displayName: "Manual Reindex Trigger",
      subtitle:
        "Toggle ON to request an immediate reindex. " +
        "The plugin resets this after running. " +
        "Use the “Skip Previously Indexed Files” option below to control whether unchanged files are skipped.",
    },
    false,
  )
  .field(
    "manualReindex.skipPreviouslyIndexed",
    "boolean",
    {
      displayName: "Skip Previously Indexed Files",
      subtitle: "Skip unchanged files for faster manual runs. Only indexes new files or changed files.",
      // Declared as dependent on the `manualReindex.trigger` field being true.
      dependencies: [
        {
          key: "manualReindex.trigger",
          condition: { type: "equals", value: true },
        },
      ],
    },
    true,
  )
  // ─── Prompt ───────────────────────────────────────────────────────
  .field(
    "promptTemplate",
    "string",
    {
      displayName: "Prompt Template",
      subtitle:
        "Supports {{rag_context}} (required) and {{user_query}} macros. " +
        "Leave empty to auto-select based on Response Language setting.",
      placeholder: DEFAULT_PROMPT_TEMPLATE,
      isParagraph: true,
    },
    "",
  )
  .build();