Project Files
src / utils / sanityChecks.ts
import * as fs from "fs";
import * as os from "os";
/**
* Outcome of the pre-indexing sanity checks returned by `performSanityChecks`.
*/
export interface SanityCheckResult {
/** True when no errors were recorded; warnings alone do not fail the check. */
passed: boolean;
/** Human-readable, non-fatal findings (e.g. low disk space, large directory). */
warnings: string[];
/** Human-readable, blocking findings (e.g. unreadable documents directory). */
errors: string[];
}
/**
 * Perform sanity checks before indexing large directories.
 *
 * Checks, in order:
 * - `documentsDir` is readable (error otherwise);
 * - `vectorStoreDir` is writable, creating it if necessary (error if it can
 *   neither be created nor written to);
 * - free disk space on the vector store volume (< 1 GB is an error,
 *   < 10 GB a warning; a failed lookup is only a warning);
 * - free system memory (< 0.5 GB is an error, downgraded to a warning on
 *   macOS; < 2 GB is a warning);
 * - estimated size of `documentsDir` (> 10 GB and > 100 GB produce warnings);
 * - whether the vector store already contains entries (warning).
 *
 * This function does not throw for failed checks; every finding is reported
 * through the returned object.
 *
 * @param documentsDir - Directory whose contents will be indexed.
 * @param vectorStoreDir - Directory where the vector store is persisted.
 * @returns Collected warnings and errors; `passed` is true when there are no errors.
 */
export async function performSanityChecks(
  documentsDir: string,
  vectorStoreDir: string,
): Promise<SanityCheckResult> {
  const bytesPerGB = 1024 * 1024 * 1024;
  const warnings: string[] = [];
  const errors: string[] = [];

  // Check that the documents directory exists and is readable
  try {
    await fs.promises.access(documentsDir, fs.constants.R_OK);
  } catch {
    errors.push(`Documents directory does not exist or is not readable: ${documentsDir}`);
  }

  // Check that the vector store directory is writable, creating it if needed
  try {
    await fs.promises.access(vectorStoreDir, fs.constants.W_OK);
  } catch {
    let directoryPresent = false;
    try {
      await fs.promises.mkdir(vectorStoreDir, { recursive: true });
      directoryPresent = true;
    } catch {
      errors.push(
        `Vector store directory does not exist and cannot be created: ${vectorStoreDir}`
      );
    }
    if (directoryPresent) {
      // A recursive mkdir also succeeds when the directory already exists, so
      // success alone does not prove the directory is writable. Re-check.
      try {
        await fs.promises.access(vectorStoreDir, fs.constants.W_OK);
      } catch {
        errors.push(`Vector store directory is not writable: ${vectorStoreDir}`);
      }
    }
  }

  // Check available disk space on the volume holding the vector store
  try {
    const stats = await fs.promises.statfs(vectorStoreDir);
    const availableGB = (stats.bavail * stats.bsize) / bytesPerGB;
    if (availableGB < 1) {
      errors.push(`Very low disk space available: ${availableGB.toFixed(2)} GB`);
    } else if (availableGB < 10) {
      warnings.push(`Low disk space available: ${availableGB.toFixed(2)} GB`);
    }
  } catch {
    // Best effort: the directory may be missing or statfs may be unsupported
    warnings.push("Could not check available disk space");
  }

  // Check available memory
  const freeMemoryGB = os.freemem() / bytesPerGB;
  const totalMemoryGB = os.totalmem() / bytesPerGB;
  const runningOnMac = process.platform === "darwin";
  const lowMemoryMessage =
    `Low free memory: ${freeMemoryGB.toFixed(2)} GB of ${totalMemoryGB.toFixed(2)} GB total. ` +
    "Consider reducing concurrent file processing.";
  const veryLowMemoryMessage =
    `Very low free memory: ${freeMemoryGB.toFixed(2)} GB. ` +
    (runningOnMac
      ? "macOS may be reporting cached pages as used; cached memory can usually be reclaimed automatically."
      : "Indexing may fail due to insufficient RAM.");
  if (freeMemoryGB < 0.5) {
    // On macOS the free-memory figure is treated as unreliable (see message),
    // so it only produces a warning there; elsewhere it is blocking.
    if (runningOnMac) {
      warnings.push(veryLowMemoryMessage);
    } else {
      errors.push(veryLowMemoryMessage);
    }
  } else if (freeMemoryGB < 2) {
    warnings.push(lowMemoryMessage);
  }

  // Estimate directory size (sample-based for performance)
  try {
    const estimatedBytes = await estimateDirectorySize(documentsDir);
    const estimatedGB = estimatedBytes / bytesPerGB;
    if (estimatedGB > 100) {
      warnings.push(
        `Large directory detected (~${estimatedGB.toFixed(1)} GB). Initial indexing may take several hours.`
      );
    } else if (estimatedGB > 10) {
      warnings.push(
        `Medium-sized directory detected (~${estimatedGB.toFixed(1)} GB). Initial indexing may take 30-60 minutes.`
      );
    }
  } catch {
    warnings.push("Could not estimate directory size");
  }

  // Check if the vector store already has data
  try {
    const files = await fs.promises.readdir(vectorStoreDir);
    if (files.length > 0) {
      warnings.push(
        "Vector store directory is not empty. Existing data will be used for incremental indexing."
      );
    }
  } catch {
    // Directory doesn't exist yet, that's fine
  }

  return {
    passed: errors.length === 0,
    warnings,
    errors,
  };
}
/**
 * Estimate the total size, in bytes, of the regular files under `dir`.
 *
 * The whole tree is walked to count regular files, but only the first
 * `maxSamples` files encountered are stat'ed. The estimate is the average
 * size of those sampled files multiplied by the total file count, so it is
 * a quick approximation rather than an exact figure.
 *
 * Unreadable directories and files that cannot be stat'ed are skipped
 * silently; entries that are neither directories nor regular files are
 * ignored.
 *
 * @param dir - Root directory to measure.
 * @param maxSamples - Maximum number of files to stat (default 100).
 * @returns Estimated size in bytes, or 0 when nothing could be sampled.
 */
async function estimateDirectorySize(dir: string, maxSamples: number = 100): Promise<number> {
  // Without any sample there is no average to extrapolate from.
  if (!(maxSamples > 0)) {
    return 0;
  }

  let fileCount = 0;
  let sampledSize = 0;
  let sampledCount = 0;

  async function walk(currentDir: string): Promise<void> {
    try {
      const entries = await fs.promises.readdir(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = `${currentDir}/${entry.name}`;
        if (entry.isDirectory()) {
          await walk(fullPath);
        } else if (entry.isFile()) {
          // Keep counting after the sample is full: the extrapolation below
          // needs the total number of files, not just the sampled ones.
          fileCount++;
          if (sampledCount < maxSamples) {
            try {
              const stats = await fs.promises.stat(fullPath);
              sampledSize += stats.size;
              sampledCount++;
            } catch {
              // Skip files we can't stat
            }
          }
        }
      }
    } catch {
      // Skip directories we can't read
    }
  }

  await walk(dir);

  // Extrapolate from the sample
  if (sampledCount === 0) {
    return 0;
  }
  const avgFileSize = sampledSize / sampledCount;
  return avgFileSize * fileCount;
}
/**
 * Derive indexing recommendations from the dataset size and free memory.
 *
 * Concurrency is driven by free memory: 1 below 2 GB, 2 below 4 GB,
 * 3 from 4 GB up to (but not including) 8 GB, and 5 from 8 GB. Datasets of
 * 100 GB or more cap concurrency at 3.
 *
 * Chunk size and the time estimate are driven by dataset size:
 * < 1 GB → 512 / "5-15 minutes", < 10 GB → 768 / "30-60 minutes",
 * < 100 GB → 1024 / "2-4 hours", otherwise 1024 / "4-12 hours".
 *
 * @param estimatedSizeGB - Estimated dataset size in GB. When this is NaN
 *   (size could not be determined) the defaults are kept: chunk size 512 and
 *   an estimated time of "unknown".
 * @param freeMemoryGB - Currently free system memory in GB.
 * @returns Recommended concurrency, chunk size, and a human-readable time estimate.
 */
export function getResourceRecommendations(
  estimatedSizeGB: number,
  freeMemoryGB: number,
): {
  recommendedConcurrency: number;
  recommendedChunkSize: number;
  estimatedTime: string;
} {
  let recommendedConcurrency = 3;
  let recommendedChunkSize = 512;
  let estimatedTime = "unknown";

  // Adjust based on available memory
  if (freeMemoryGB < 2) {
    recommendedConcurrency = 1;
  } else if (freeMemoryGB < 4) {
    recommendedConcurrency = 2;
  } else if (freeMemoryGB >= 8) {
    recommendedConcurrency = 5;
  }

  // Adjust based on dataset size
  if (Number.isNaN(estimatedSizeGB)) {
    // NaN fails every `<` comparison and would otherwise fall through to the
    // largest bucket; an unknown size keeps the defaults instead.
  } else if (estimatedSizeGB < 1) {
    estimatedTime = "5-15 minutes";
  } else if (estimatedSizeGB < 10) {
    estimatedTime = "30-60 minutes";
    recommendedChunkSize = 768;
  } else if (estimatedSizeGB < 100) {
    estimatedTime = "2-4 hours";
    recommendedChunkSize = 1024;
  } else {
    estimatedTime = "4-12 hours";
    recommendedChunkSize = 1024;
    // Very large datasets: limit parallelism regardless of free memory
    recommendedConcurrency = Math.min(recommendedConcurrency, 3);
  }

  return {
    recommendedConcurrency,
    recommendedChunkSize,
    estimatedTime,
  };
}