// Project file: src/documents/loader.ts
/**
* Document Loader
* Orchestrates loading and parsing of documents from various formats
*/
import { readFile } from 'fs/promises';
import { createHash } from 'crypto';
import path from 'path';
import type { DocumentFormat, ParsedDocument, ParserOptions } from '../types';
import { TextParser, MarkdownParser } from './parsers/textParser';
import { PdfParser } from './parsers/pdfParser';
/**
 * Entry point for turning a file on disk into a parsed document.
 *
 * The format is chosen purely from the file extension; the matching parser
 * is then invoked and the result is returned together with a SHA-256 hash of
 * the file's raw bytes.
 */
export class DocumentLoader {
  /**
   * Formats that `load()` has a parser for. Shared by `load()` and
   * `isSupported()` so the two can never disagree.
   */
  private static readonly supportedFormats: readonly DocumentFormat[] = [
    'txt',
    'md',
    'pdf',
  ];

  /**
   * Load and parse a document based on its format.
   *
   * @param filePath - Path of the file to load; its extension selects the parser.
   * @param options - Passed through unchanged to the selected parser.
   * @returns The parser's result plus `hash`, the hex SHA-256 of the file bytes.
   *   If the parser left `metadata.title` empty, it is set to the file name
   *   without its extension.
   * @throws Error with message `Unsupported file format: <format>` when no
   *   parser exists for the detected format. This is raised before any file
   *   I/O, so an unsupported path never triggers a read. Errors from reading
   *   the file or from the parser propagate unchanged.
   */
  static async load(
    filePath: string,
    options?: ParserOptions
  ): Promise<ParsedDocument & { hash: string }> {
    const format = this.detectFormat(filePath);

    // Fail fast: rejecting here avoids reading and hashing a whole file that
    // is about to be refused anyway, and guarantees the caller sees the
    // "unsupported format" error rather than a filesystem error.
    if (!this.supportedFormats.includes(format)) {
      throw new Error(`Unsupported file format: ${format}`);
    }

    const hash = await this.hashFile(filePath);

    let parsed: ParsedDocument;
    switch (format) {
      case 'txt':
        parsed = await TextParser.parse(filePath, options);
        break;
      case 'md':
        parsed = await MarkdownParser.parse(filePath, options);
        break;
      case 'pdf':
        parsed = await PdfParser.parse(filePath, options);
        break;
      default:
        // Unreachable while supportedFormats and the cases above agree; kept
        // so a format added to the list without a parser still fails loudly.
        throw new Error(`Unsupported file format: ${format}`);
    }

    // Add filename as title if not present
    if (!parsed.metadata.title) {
      parsed.metadata.title = path.basename(filePath, path.extname(filePath));
    }

    return {
      ...parsed,
      hash,
    };
  }

  /**
   * Detect document format from file extension.
   *
   * The comparison is case-insensitive. Only the final extension is
   * considered (as returned by `path.extname`).
   *
   * @param filePath - Path or file name to inspect; the file is not accessed.
   * @returns `'txt'`, `'md'` (for `.md` and `.markdown`), `'pdf'`, or
   *   `'unknown'` for any other or missing extension.
   */
  static detectFormat(filePath: string): DocumentFormat {
    const ext = path.extname(filePath).toLowerCase();
    switch (ext) {
      case '.txt':
        return 'txt';
      case '.md':
      case '.markdown':
        return 'md';
      case '.pdf':
        return 'pdf';
      default:
        return 'unknown';
    }
  }

  /**
   * Calculate SHA-256 hash of file content.
   *
   * @param filePath - Path of the file to read.
   * @returns Lowercase hex digest of the file's raw bytes.
   */
  private static async hashFile(filePath: string): Promise<string> {
    const buffer = await readFile(filePath);
    return createHash('sha256').update(buffer).digest('hex');
  }

  /**
   * Check if file format is supported.
   *
   * @param filePath - Path or file name to inspect; the file is not accessed.
   * @returns `true` when the extension maps to a format `load()` can parse.
   */
  static isSupported(filePath: string): boolean {
    return this.supportedFormats.includes(this.detectFormat(filePath));
  }
}