// Forked from mindstudio/big-rag
// Project Files
// src/utils/supportedExtensions.ts
// Extension groups, one per file-type category. Every entry is written in
// lowercase and includes the leading dot.

/** HTML and XHTML documents. */
const HTML_EXTENSIONS = [".htm", ".html", ".xhtml"];

/** Markdown documents, including MDX and the less common suffixes. */
const MARKDOWN_EXTENSIONS = [
  ".md",
  ".markdown",
  ".mdown",
  ".mdx",
  ".mkd",
  ".mkdn",
];

/** Plain-text files. */
const TEXT_EXTENSIONS = [".txt", ".text"];

/** PDF documents. */
const PDF_EXTENSIONS = [".pdf"];

/** EPUB e-books. */
const EPUB_EXTENSIONS = [".epub"];

/** Raster image formats. */
const IMAGE_EXTENSIONS = [".bmp", ".jpg", ".jpeg", ".png"];

/** Archive formats. */
const ARCHIVE_EXTENSIONS = [".rar"];

/** Word documents. */
const DOCX_EXTENSIONS = [".docx"];

/** Spreadsheet formats; CSV is grouped with the Excel formats. */
const XLSX_EXTENSIONS = [".xlsx", ".xls", ".csv"];

/** PowerPoint presentations. */
const PPTX_EXTENSIONS = [".pptx"];

/** Every extension group above; the order here fixes the Set's insertion order. */
const ALL_EXTENSION_GROUPS = [
  HTML_EXTENSIONS,
  MARKDOWN_EXTENSIONS,
  TEXT_EXTENSIONS,
  PDF_EXTENSIONS,
  EPUB_EXTENSIONS,
  IMAGE_EXTENSIONS,
  ARCHIVE_EXTENSIONS,
  DOCX_EXTENSIONS,
  XLSX_EXTENSIONS,
  PPTX_EXTENSIONS,
];

/**
 * The union of all extension groups, lowercased.
 * Look up with a lowercase extension that includes the leading dot.
 */
export const SUPPORTED_EXTENSIONS = new Set<string>(
  ALL_EXTENSION_GROUPS.flat().map((extension) => extension.toLowerCase()),
);
/** Membership set built from the HTML extension list. */
export const HTML_EXTENSION_SET = new Set<string>(HTML_EXTENSIONS);

/** Membership set built from the Markdown extension list. */
export const MARKDOWN_EXTENSION_SET = new Set<string>(MARKDOWN_EXTENSIONS);

/** Membership set built from the plain-text extension list. */
export const TEXT_EXTENSION_SET = new Set<string>(TEXT_EXTENSIONS);

/** Membership set built from the image extension list. */
export const IMAGE_EXTENSION_SET = new Set<string>(IMAGE_EXTENSIONS);
/**
 * Reports whether `ext` is one of the HTML extensions.
 *
 * @param ext - File extension including the leading dot; letter case is ignored.
 * @returns `true` when the lowercased extension is in `HTML_EXTENSION_SET`.
 */
export function isHtmlExtension(ext: string): boolean {
  const normalized = ext.toLowerCase();
  return HTML_EXTENSION_SET.has(normalized);
}
/**
 * Reports whether `ext` is one of the Markdown extensions.
 *
 * @param ext - File extension including the leading dot; letter case is ignored.
 * @returns `true` when the lowercased extension is in `MARKDOWN_EXTENSION_SET`.
 */
export function isMarkdownExtension(ext: string): boolean {
  const normalized = ext.toLowerCase();
  return MARKDOWN_EXTENSION_SET.has(normalized);
}
/**
 * Reports whether `ext` is one of the plain-text extensions.
 *
 * @param ext - File extension including the leading dot; letter case is ignored.
 * @returns `true` when the lowercased extension is in `TEXT_EXTENSION_SET`.
 */
export function isPlainTextExtension(ext: string): boolean {
  const normalized = ext.toLowerCase();
  return TEXT_EXTENSION_SET.has(normalized);
}
/**
 * Reports whether `ext` belongs to either text-like category:
 * Markdown or plain text. HTML is not part of this check.
 *
 * @param ext - File extension, forwarded unchanged to the two category checks.
 * @returns `true` when either `isMarkdownExtension` or `isPlainTextExtension` accepts it.
 */
export function isTextualExtension(ext: string): boolean {
  if (isMarkdownExtension(ext)) {
    return true;
  }
  return isPlainTextExtension(ext);
}
/**
 * Configuration for enabling/disabling file type categories.
 *
 * Each flag switches one category on (`true`) or off (`false`);
 * `getEnabledExtensions` turns the flags into the set of extensions to index.
 *
 * The original note says all fields default to `true` (index everything).
 * Every field is required by this interface, so those defaults are presumably
 * applied wherever the filter object is built — TODO confirm.
 *
 * NOTE(review): there is no flag for the archive group (`.rar`).
 */
export interface FileTypeFilter {
/** Enables the HTML group: `.htm`, `.html`, `.xhtml`. */
indexHTML: boolean;
/** Enables `.pdf`. */
indexPDF: boolean;
/** Enables `.epub`. */
indexEPUB: boolean;
/** Enables both the Markdown group and the plain-text group (`.txt`, `.text`). */
indexText: boolean;
/** Enables `.docx`. */
indexDocx: boolean;
/** Enables the spreadsheet group: `.xlsx`, `.xls`, `.csv`. */
indexXlsx: boolean;
/** Enables `.pptx`. */
indexPptx: boolean;
/** Enables the image group: `.bmp`, `.jpg`, `.jpeg`, `.png`. */
indexImages: boolean;
}
/**
 * Builds the set of file extensions that the given filter allows.
 *
 * Each flag of the filter maps to one or more extension groups; a group is
 * included only when its flag is truthy. `indexText` covers both the Markdown
 * and the plain-text groups.
 *
 * Note: the archive group (`.rar`) has no corresponding flag, so it is never
 * part of the result even though it is listed in `SUPPORTED_EXTENSIONS`.
 *
 * @param filter - Per-category switches.
 * @returns A new Set of lowercase extensions (with leading dot) for the enabled categories.
 */
export function getEnabledExtensions(filter: FileTypeFilter): Set<string> {
  // Order matters: it determines the insertion order of the returned Set.
  const categories: Array<[boolean, string[]]> = [
    [filter.indexHTML, HTML_EXTENSIONS],
    [filter.indexPDF, PDF_EXTENSIONS],
    [filter.indexEPUB, EPUB_EXTENSIONS],
    [filter.indexText, MARKDOWN_EXTENSIONS.concat(TEXT_EXTENSIONS)],
    [filter.indexDocx, DOCX_EXTENSIONS],
    [filter.indexXlsx, XLSX_EXTENSIONS],
    [filter.indexPptx, PPTX_EXTENSIONS],
    [filter.indexImages, IMAGE_EXTENSIONS],
  ];

  const enabled = new Set<string>();
  for (const [isOn, extensions] of categories) {
    if (!isOn) {
      continue;
    }
    for (const extension of extensions) {
      enabled.add(extension.toLowerCase());
    }
  }
  return enabled;
}
/**
 * Returns every supported extension as a sorted array.
 *
 * The array is a fresh copy, so callers may mutate it without affecting
 * `SUPPORTED_EXTENSIONS`. Sorting uses the default `Array.prototype.sort`
 * string ordering.
 *
 * @returns Sorted list of the entries in `SUPPORTED_EXTENSIONS`.
 */
export function listSupportedExtensions(): string[] {
  const extensions = Array.from(SUPPORTED_EXTENSIONS);
  extensions.sort();
  return extensions;
}