// Project Files
// packages/core/src/largeCorpus.ts
import type { RagDocument } from "./contracts";
import { buildHierarchicalDocumentIndex } from "./localRetrieval";
import type { RagExecutionRoute } from "./outputContracts";
import type {
FileExtensionCount,
FileInfoResponse,
RagCorpusAnalysis,
RagDirectoryManifest,
RagFileSynopsis,
RagFileSystemBrowser,
RagLoadedCorpus,
} from "./runtimeContracts";
// Files at or above this size (512 KiB) are flagged as oversized in directory manifests and file synopses.
const LARGE_FILE_BYTES = 512 * 1024;
// Size (2 MiB) at or above which a file synopsis reports the "bounded-head-tail" sample strategy.
const HUGE_FILE_BYTES = 2 * 1024 * 1024;
// Fraction of a directory's file count with binary-looking extensions at which it is labelled "binary-heavy".
const BINARY_HEAVY_THRESHOLD = 0.6;
// Maximum number of entries kept in largeCorpusAnalysisCache; the oldest-inserted entries are evicted beyond it.
const LARGE_CORPUS_CACHE_LIMIT = 32;
// Module-level cache of completed analyses, keyed by question scope plus per-path file metadata.
const largeCorpusAnalysisCache = new Map<string, RagCorpusAnalysis>();
/**
 * Query phrasings that signal a corpus-wide ("global") question, such as
 * requests for summaries, overviews, themes, or high-level descriptions.
 * A query is treated as global when any one of these patterns matches.
 */
const GLOBAL_QUERY_PATTERNS = [
  /\b(overall|summarize|summary|overview|what(?:'s| is) in|what does .* contain)\b/i,
  // The alternatives are grouped so both word boundaries apply to every one of
  // them; ungrouped, `\b` would bind only to the first and last alternative and
  // words like "predominantly" or "themed" would match by accident.
  /\b(?:themes?|topics?|patterns?|dominant|across the corpus|across the dataset|across the directory)\b/i,
  /\bkind(?:s)? of\b/i,
  /\bhigh[- ]level\b/i,
];
// Whole-word, case-insensitive cues that mark a query as targeted ("local").
// inferQuestionScope consults these only after no global pattern has matched.
const LOCAL_QUERY_PATTERNS = [
/\bfind\b/i,
/\bwhich\b/i,
/\bwhere\b/i,
/\bshow me\b/i,
/\bspecific\b/i,
/\bconversation\b/i,
/\bsession\b/i,
/\bmessage\b/i,
/\btool usage\b/i,
/\bwhen\b/i,
];
/**
 * Inspects the given paths and produces a coarse analysis of the corpus:
 * question scope, target type, dominant modality, a recommended execution
 * route, summary documents, and (for some local questions) a hierarchical
 * index over the loaded documents.
 *
 * Results are cached per question scope and per-path file metadata, so a
 * repeated call over unchanged paths returns a copy of the earlier analysis
 * with an extra "reused" note.
 *
 * NOTE(review): the cache key does not include anything derived from `corpus`,
 * so a cached `hierarchicalIndex` / `recommendedRoute` reflects the corpus
 * passed on the call that populated the cache — confirm this is acceptable.
 *
 * @param paths   Input paths; only the first few are inspected (see below).
 * @param query   User question, used only to infer local vs. global scope.
 * @param corpus  Loaded corpus; supplies documents for the hierarchical index
 *                and the token estimate used for route selection.
 * @param browser File-system access; when absent no analysis is possible.
 * @returns The analysis, or `undefined` when there is no browser, no paths,
 *          or none of the inspected paths could be analysed.
 */
export async function analyzeLargeCorpus(
  paths: Array<string> | undefined,
  query: string,
  corpus: RagLoadedCorpus,
  browser: RagFileSystemBrowser | undefined
): Promise<RagCorpusAnalysis | undefined> {
  if (!browser || !paths || paths.length === 0) {
    return undefined;
  }

  // Upper bound on how many input paths one analysis looks at.
  const maxInspectedPaths = 8;
  const inspectedPaths = paths.slice(0, maxInspectedPaths);
  // Surface the truncation instead of dropping the extra paths silently. The
  // note is attached to the returned copy only, never to the cached entry,
  // because the cache key covers just the inspected paths.
  const skippedNote =
    paths.length > inspectedPaths.length
      ? `Only the first ${inspectedPaths.length} of ${paths.length} path(s) were inspected.`
      : undefined;

  const questionScope = inferQuestionScope(query);

  const inspectedInfos: Array<FileInfoResponse> = [];
  for (const inputPath of inspectedPaths) {
    inspectedInfos.push(await browser.fileInfo({ path: inputPath }));
  }

  const cacheKey = buildLargeCorpusCacheKey(questionScope, inspectedInfos);
  const cached = largeCorpusAnalysisCache.get(cacheKey);
  if (cached) {
    // Re-insert so a hit counts as recent use and hot entries are evicted last.
    setLargeCorpusCache(cacheKey, cached);
    const reused = cloneAnalysis(
      cached,
      `Reused cached large-corpus analysis for ${inspectedInfos.length} path(s).`
    );
    return skippedNote ? cloneAnalysis(reused, skippedNote) : reused;
  }

  const manifests: Array<RagDirectoryManifest> = [];
  const synopses: Array<RagFileSynopsis> = [];
  const notes: Array<string> = [];
  // A Set keeps first-seen order and guarantees the count used in notes and
  // routing matches the de-duplicated list that is returned.
  const oversizedPathSet = new Set<string>();
  let sawDirectory = false;
  let sawFile = false;
  let textHeavyScore = 0;
  let binaryHeavyScore = 0;

  for (const [index, inputPath] of inspectedPaths.entries()) {
    const info = inspectedInfos[index];
    if (!info?.exists || !info.type) {
      notes.push(`Path could not be inspected: ${inputPath}.`);
      continue;
    }

    if (info.type === "directory") {
      sawDirectory = true;
      const manifest = await buildDirectoryManifest(inputPath, info, browser);
      manifests.push(manifest);
      for (const file of manifest.oversizedFiles) {
        oversizedPathSet.add(file.path);
      }
      if (manifest.dominantModality === "binary-heavy") {
        binaryHeavyScore += 2;
      } else if (manifest.dominantModality === "text-heavy") {
        textHeavyScore += 2;
      } else {
        // Mixed or unknown directories vote weakly for both sides.
        textHeavyScore += 1;
        binaryHeavyScore += 1;
      }
    } else {
      sawFile = true;
      const synopsis = await buildFileSynopsis(inputPath, info, browser);
      synopses.push(synopsis);
      if (synopsis.oversized) {
        oversizedPathSet.add(synopsis.path);
      }
      if (synopsis.textLike) {
        textHeavyScore += 2;
      } else {
        binaryHeavyScore += 2;
      }
    }
  }

  if (manifests.length === 0 && synopses.length === 0) {
    return undefined;
  }

  const oversizedPaths = [...oversizedPathSet];
  const targetType = sawDirectory && sawFile ? "mixed" : sawDirectory ? "directory" : "file";
  const modality =
    binaryHeavyScore === 0 && textHeavyScore === 0
      ? "unknown"
      : binaryHeavyScore > textHeavyScore
        ? "binary-heavy"
        : textHeavyScore > binaryHeavyScore
          ? "text-heavy"
          : "mixed";

  const summaryDocuments = buildSummaryDocuments(manifests, synopses);
  const hierarchicalIndex = shouldBuildHierarchicalIndex({
    questionScope,
    oversizedPaths,
    modality,
    targetType,
  })
    ? buildHierarchicalDocumentIndex(
        selectHierarchicalDocuments(corpus.documents, synopses, oversizedPaths)
      )
    : undefined;
  const recommendedRoute = recommendLargeCorpusRoute({
    questionScope,
    targetType,
    modality,
    oversizedFileCount: oversizedPaths.length,
    corpus,
  });

  notes.push(
    `Large-corpus analysis classified scope=${questionScope}, target=${targetType}, modality=${modality}, oversizedFiles=${oversizedPaths.length}.`
  );
  if (hierarchicalIndex) {
    notes.push(`Built hierarchical index with ${hierarchicalIndex.nodes.length} parent nodes.`);
  }

  const analysis = {
    questionScope,
    targetType,
    modality,
    recommendedRoute,
    notes,
    summaryDocuments,
    directoryManifests: manifests,
    largeFileSynopses: synopses,
    oversizedPaths,
    hierarchicalIndex,
  } satisfies RagCorpusAnalysis;

  setLargeCorpusCache(cacheKey, analysis);
  // Hand out a copy so callers cannot mutate the cached arrays.
  return cloneAnalysis(analysis, skippedNote);
}
/**
 * Derives the cache key for an analysis from the question scope and the
 * metadata of every inspected path.
 *
 * Each path contributes a `|`-joined fingerprint (resolved path, type,
 * modification time, size, file count, directory count, with placeholder
 * words for absent values); a path that does not exist contributes only its
 * resolved path and the word "missing". Fingerprints are joined with `::`
 * and prefixed by the scope.
 */
function buildLargeCorpusCacheKey(
  questionScope: "local" | "global",
  infos: Array<FileInfoResponse>
): string {
  const fingerprints: Array<string> = [];
  for (const info of infos) {
    const fields = info.exists
      ? [
          info.resolvedPath,
          info.type ?? "unknown",
          info.modifiedTimeMs ?? "nomtime",
          info.sizeBytes ?? "nosize",
          info.fileCount ?? "nofiles",
          info.directoryCount ?? "nodirs",
        ]
      : [info.resolvedPath, "missing"];
    fingerprints.push(fields.join("|"));
  }
  return `${questionScope}::${fingerprints.join("::")}`;
}
/**
 * Stores an analysis in the module-level cache and trims the cache back to
 * LARGE_CORPUS_CACHE_LIMIT entries.
 *
 * The key is removed before being set again so that it moves to the end of
 * the Map's insertion order; eviction then walks the keys from the front,
 * dropping the oldest-inserted entries first.
 */
function setLargeCorpusCache(key: string, analysis: RagCorpusAnalysis) {
  // delete() is a no-op for an absent key, so no has() check is needed.
  largeCorpusAnalysisCache.delete(key);
  largeCorpusAnalysisCache.set(key, analysis);

  for (const staleKey of largeCorpusAnalysisCache.keys()) {
    if (largeCorpusAnalysisCache.size <= LARGE_CORPUS_CACHE_LIMIT || !staleKey) {
      break;
    }
    largeCorpusAnalysisCache.delete(staleKey);
  }
}
/**
 * Returns a copy of an analysis whose top-level arrays are fresh arrays, so
 * pushing to or reordering them does not affect the source object.
 *
 * The copy is shallow: array elements and all other fields (including any
 * hierarchical index) are shared with the source.
 *
 * @param analysis  Analysis to copy.
 * @param extraNote Optional note appended to the copy's `notes` only.
 */
function cloneAnalysis(
  analysis: RagCorpusAnalysis,
  extraNote?: string
): RagCorpusAnalysis {
  const notes = analysis.notes.slice();
  if (extraNote) {
    notes.push(extraNote);
  }
  return {
    ...analysis,
    notes,
    summaryDocuments: analysis.summaryDocuments.slice(),
    directoryManifests: analysis.directoryManifests.slice(),
    largeFileSynopses: analysis.largeFileSynopses.slice(),
    oversizedPaths: analysis.oversizedPaths.slice(),
  };
}
/**
 * Classifies a query as corpus-wide ("global") or targeted ("local").
 *
 * Global patterns are checked first, then local patterns. When neither
 * matches, queries of at most eight whitespace-separated words are treated as
 * local and longer ones as global.
 */
export function inferQuestionScope(query: string): "local" | "global" {
  const trimmed = query.trim();
  const matchesAny = (patterns: ReadonlyArray<RegExp>): boolean =>
    patterns.some((pattern) => pattern.test(trimmed));

  if (matchesAny(GLOBAL_QUERY_PATTERNS)) {
    return "global";
  }
  if (matchesAny(LOCAL_QUERY_PATTERNS)) {
    return "local";
  }

  const wordCount = trimmed.split(/\s+/).length;
  return wordCount <= 8 ? "local" : "global";
}
/**
 * Builds a manifest for one directory from its file info and a bounded
 * recursive listing (depth 4, at most 200 entries, hidden entries excluded).
 *
 * Counts and top extensions prefer the values on `info` and fall back to the
 * listing. Representative files are the first eight files of the listing;
 * oversized files are the eight largest files at or above LARGE_FILE_BYTES.
 */
async function buildDirectoryManifest(
  inputPath: string,
  info: FileInfoResponse,
  browser: RagFileSystemBrowser
): Promise<RagDirectoryManifest> {
  const listing = await browser.browse({
    path: inputPath,
    recursive: true,
    maxDepth: 4,
    maxEntries: 200,
    includeHidden: false,
  });

  const files = listing.entries.filter((entry) => entry.type === "file");
  // filter() yields a new array, so sorting here leaves the listing untouched.
  const largestFirst = files
    .filter((file) => (file.sizeBytes ?? 0) >= LARGE_FILE_BYTES)
    .sort((a, b) => (b.sizeBytes ?? 0) - (a.sizeBytes ?? 0));

  const fileCount = info.fileCount ?? listing.fileCount ?? 0;
  const topExtensions = info.topExtensions ?? listing.topExtensions ?? [];

  return {
    path: inputPath,
    resolvedPath: info.resolvedPath,
    fileCount,
    directoryCount: info.directoryCount ?? listing.directoryCount ?? 0,
    topExtensions,
    representativeFiles: files.slice(0, 8).map((file) => file.path),
    oversizedFiles: largestFirst.slice(0, 8).map((file) => ({
      path: file.path,
      sizeBytes: file.sizeBytes ?? 0,
      extension: file.extension,
    })),
    dominantModality: inferDominantModality(fileCount, topExtensions),
    truncated: listing.truncated,
  };
}
/**
 * Builds a synopsis for a single file from its metadata and bounded samples.
 *
 * Text-like files get a head sample (80 lines / 8000 chars from line 0).
 * Text-like files at or above LARGE_FILE_BYTES also get a second sample
 * starting at line 400. Files that are not text-like are never read.
 *
 * NOTE(review): the second sample is labelled "tail" but starts at a fixed
 * line 400 rather than near the end of the file — confirm that is intended.
 * NOTE(review): `sampleStrategy` switches on HUGE_FILE_BYTES while the second
 * sample is taken from LARGE_FILE_BYTES upward (and only for text-like
 * files), so the label can disagree with what was actually read — confirm.
 */
async function buildFileSynopsis(
  inputPath: string,
  info: FileInfoResponse,
  browser: RagFileSystemBrowser
): Promise<RagFileSynopsis> {
  const sizeBytes = info.sizeBytes ?? 0;
  const oversized = sizeBytes >= LARGE_FILE_BYTES;
  const extension = normalizeExtension(info.extension);
  const format = detectFormat(extension);

  const head = info.textLike
    ? await browser.readFile({ path: inputPath, startLine: 0, maxLines: 80, maxChars: 8000 })
    : undefined;
  const tail =
    info.textLike && oversized
      ? await browser.readFile({ path: inputPath, startLine: 400, maxLines: 80, maxChars: 8000 })
      : undefined;

  const headText = head?.content;
  const tailText = tail?.content;

  return {
    path: inputPath,
    resolvedPath: info.resolvedPath,
    extension,
    sizeBytes,
    textLike: info.textLike ?? false,
    oversized,
    format,
    synopsis: summarizeSample(format, headText, tailText),
    sampleStrategy: sizeBytes >= HUGE_FILE_BYTES ? "bounded-head-tail" : "bounded-head",
    sampleHead: truncateForMetadata(headText),
    sampleTail: truncateForMetadata(tailText),
  };
}
/**
 * Maps a file extension to a synopsis format label.
 *
 * The comparison is exact and case-sensitive, so callers are expected to pass
 * a lower-cased, dot-prefixed extension. Anything unrecognised, including
 * `undefined`, is reported as "text".
 */
function detectFormat(extension?: string): RagFileSynopsis["format"] {
  switch (extension) {
    case ".jsonl":
      return "jsonl";
    case ".json":
      return "json";
    case ".html":
    case ".htm":
      return "html";
    case ".md":
    case ".markdown":
      return "markdown";
    default:
      return "text";
  }
}
/**
 * Composes a one-line synopsis from a format label and optional head/tail
 * text samples.
 *
 * The result starts with a structural sentence for jsonl, json, and html,
 * followed by the condensed head sample and, when it differs from the head,
 * the condensed tail sample. A fixed fallback sentence is returned when
 * nothing else is available.
 */
function summarizeSample(
  format: RagFileSynopsis["format"],
  head?: string,
  tail?: string
): string {
  const sentences: Array<string> = [];

  switch (format) {
    case "jsonl":
      sentences.push("Structured as line-delimited JSON.");
      break;
    case "json":
      sentences.push("Structured as JSON.");
      break;
    case "html":
      sentences.push("Structured as HTML with markup-heavy content.");
      break;
    default:
      break;
  }

  const headDigest = summarizeTextWindow(head);
  if (headDigest) {
    sentences.push(`Head sample: ${headDigest}`);
  }

  const tailDigest = summarizeTextWindow(tail);
  // Skip a tail that condenses to the same text as the head.
  if (tailDigest && tailDigest !== headDigest) {
    sentences.push(`Tail sample: ${tailDigest}`);
  }

  const combined = sentences.join(" ").trim();
  return combined || "No text synopsis available.";
}
/**
 * Condenses a text sample into at most 280 characters of plain text.
 *
 * `<script>` and `<style>` elements are removed together with their contents,
 * remaining tags are replaced by spaces, and whitespace runs are collapsed.
 * Returns `undefined` when the input is empty or nothing is left afterwards.
 */
function summarizeTextWindow(text?: string): string | undefined {
  if (!text) {
    return undefined;
  }

  const withoutScripts = text.replace(/<script[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  const condensed = withoutTags.replace(/\s+/g, " ").trim();

  return condensed ? truncateForMetadata(condensed, 280) : undefined;
}
/**
 * Collapses whitespace in `text` and shortens it to fit a metadata field.
 *
 * Text longer than `maxLength` is cut and terminated with a single "…"
 * character, so the result is at most `maxLength` UTF-16 code units (or just
 * "…" when `maxLength` is 1 or less).
 *
 * @param text      Raw text; `undefined` or an empty string yields `undefined`.
 * @param maxLength Maximum length of the result, including the ellipsis.
 * @returns The normalized, possibly truncated text.
 */
function truncateForMetadata(text?: string, maxLength = 500): string | undefined {
  if (!text) {
    return undefined;
  }
  const normalized = text.replace(/\s+/g, " ").trim();
  if (normalized.length <= maxLength) {
    return normalized;
  }

  // Reserve one slot for the ellipsis. Clamp at zero: a negative slice end
  // would count from the end of the string and keep almost all of the text.
  const keep = Math.max(0, maxLength - 1);
  let kept = normalized.slice(0, keep);

  // Do not leave half of a surrogate pair dangling before the ellipsis.
  const lastUnit = kept.charCodeAt(kept.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    kept = kept.slice(0, -1);
  }

  return `${kept}…`;
}
/**
 * Turns directory manifests and file synopses into retrievable summary
 * documents: one document per manifest followed by one per synopsis.
 *
 * Each document's content is a newline-separated list of labelled facts, and
 * its metadata records the source type and the original input path.
 */
function buildSummaryDocuments(
  manifests: Array<RagDirectoryManifest>,
  synopses: Array<RagFileSynopsis>
): Array<RagDocument> {
  const manifestDocuments = manifests.map((manifest): RagDocument => {
    const oversizedListing = manifest.oversizedFiles
      .map((file) => `${file.path} (${file.sizeBytes} bytes)`)
      .join(", ");
    const lines = [
      `Directory: ${manifest.path}`,
      `Files: ${manifest.fileCount}`,
      `Subdirectories: ${manifest.directoryCount}`,
      `Dominant modality: ${manifest.dominantModality}`,
      `Top extensions: ${formatTopExtensions(manifest.topExtensions)}`,
      `Representative files: ${manifest.representativeFiles.join(", ") || "none"}`,
      `Oversized files: ${oversizedListing || "none"}`,
    ];
    return {
      id: `manifest:${manifest.resolvedPath}`,
      name: `manifest:${manifest.path}`,
      content: lines.join("\n"),
      metadata: {
        sourceType: "directory-manifest",
        path: manifest.path,
      },
    };
  });

  const synopsisDocuments = synopses.map((synopsis): RagDocument => {
    const lines = [
      `File: ${synopsis.path}`,
      `Format: ${synopsis.format}`,
      `Text-like: ${synopsis.textLike}`,
      `Oversized: ${synopsis.oversized}`,
      `Sample strategy: ${synopsis.sampleStrategy}`,
      `Synopsis: ${synopsis.synopsis}`,
    ];
    // Sample lines are included only when a sample exists.
    if (synopsis.sampleHead) {
      lines.push(`Sample head: ${synopsis.sampleHead}`);
    }
    if (synopsis.sampleTail) {
      lines.push(`Sample tail: ${synopsis.sampleTail}`);
    }
    return {
      id: `synopsis:${synopsis.resolvedPath}`,
      name: `synopsis:${synopsis.path}`,
      content: lines.join("\n"),
      metadata: {
        sourceType: "file-synopsis",
        path: synopsis.path,
        format: synopsis.format,
      },
    };
  });

  return [...manifestDocuments, ...synopsisDocuments];
}
/**
 * Decides whether a hierarchical document index is worth building.
 *
 * Only local questions qualify. Among those, an index is built when any
 * oversized path was found, or when the target is not a pure directory and
 * the corpus is text-heavy.
 */
function shouldBuildHierarchicalIndex(input: {
  questionScope: "local" | "global";
  oversizedPaths: Array<string>;
  modality: RagCorpusAnalysis["modality"];
  targetType: RagCorpusAnalysis["targetType"];
}): boolean {
  const { questionScope, oversizedPaths, modality, targetType } = input;
  const hasOversizedFiles = oversizedPaths.length > 0;
  const isTextHeavyNonDirectory = targetType !== "directory" && modality === "text-heavy";
  return questionScope === "local" && (hasOversizedFiles || isTextHeavyNonDirectory);
}
/**
 * Picks the loaded documents that correspond to oversized paths or to files
 * that have a synopsis, for use in the hierarchical index.
 *
 * A document matches when its id, name, or one of the string metadata fields
 * `discoveredPath`, `absolutePath`, or `path` equals one of those paths,
 * compared case-insensitively. When nothing matches, the full document list
 * is returned unchanged.
 */
function selectHierarchicalDocuments(
  documents: Array<RagDocument>,
  synopses: Array<RagFileSynopsis>,
  oversizedPaths: Array<string>
): Array<RagDocument> {
  const wantedPaths = new Set<string>();
  for (const oversizedPath of oversizedPaths) {
    wantedPaths.add(oversizedPath.toLowerCase());
  }
  for (const synopsis of synopses) {
    wantedPaths.add(synopsis.path.toLowerCase());
  }

  // Empty and non-string candidates never match.
  const isWanted = (candidate: unknown): boolean =>
    typeof candidate === "string" &&
    candidate.length > 0 &&
    wantedPaths.has(candidate.toLowerCase());

  const matches = documents.filter((document) => {
    const metadata = document.metadata;
    return [
      document.id,
      document.name,
      metadata?.discoveredPath,
      metadata?.absolutePath,
      metadata?.path,
    ].some(isWanted);
  });

  return matches.length > 0 ? matches : documents;
}
/**
 * Chooses the execution route for a query from the analysis classification.
 *
 * - Global questions always use "global-summary".
 * - Pure directory targets use "sample" when oversized files exist or the
 *   directory is binary-heavy, otherwise "hierarchical-retrieval".
 * - Other targets with oversized files use "hierarchical-retrieval" when
 *   text-heavy, otherwise "sample".
 * - Everything else uses "full-context" when the corpus token estimate is at
 *   most 4000 (a missing estimate counts as 0), otherwise "retrieval".
 */
function recommendLargeCorpusRoute(input: {
  questionScope: "local" | "global";
  targetType: RagCorpusAnalysis["targetType"];
  modality: RagCorpusAnalysis["modality"];
  oversizedFileCount: number;
  corpus: RagLoadedCorpus;
}): RagExecutionRoute {
  const { questionScope, targetType, modality, oversizedFileCount, corpus } = input;
  const hasOversizedFiles = oversizedFileCount > 0;

  if (questionScope === "global") {
    return "global-summary";
  }

  if (targetType === "directory") {
    if (hasOversizedFiles || modality === "binary-heavy") {
      return "sample";
    }
    return "hierarchical-retrieval";
  }

  if (hasOversizedFiles) {
    if (modality === "text-heavy") {
      return "hierarchical-retrieval";
    }
    return "sample";
  }

  const estimatedTokens = corpus.estimatedTokens ?? 0;
  return estimatedTokens <= 4000 ? "full-context" : "retrieval";
}
/**
 * Labels a directory by the kind of files its top extensions suggest.
 *
 * Returns "unknown" when there are no files or no extension statistics,
 * "binary-heavy" when binary-looking extensions account for at least
 * BINARY_HEAVY_THRESHOLD of `fileCount`, "text-heavy" when text-looking files
 * are at least as numerous as binary-looking ones, and "mixed" otherwise.
 */
function inferDominantModality(
  fileCount: number,
  topExtensions: Array<FileExtensionCount>
): RagDirectoryManifest["dominantModality"] {
  if (fileCount <= 0 || topExtensions.length === 0) {
    return "unknown";
  }

  let binaryCount = 0;
  let textCount = 0;
  for (const { extension, count } of topExtensions) {
    if (isLikelyBinaryExtension(extension)) {
      binaryCount += count;
    }
    if (isLikelyTextExtension(extension)) {
      textCount += count;
    }
  }

  if (binaryCount / fileCount >= BINARY_HEAVY_THRESHOLD) {
    return "binary-heavy";
  }
  // Ties, including the case where neither kind was recognised, count as text-heavy.
  return textCount >= binaryCount ? "text-heavy" : "mixed";
}
// Normalized (lower-case, dot-prefixed) extensions treated as text content.
const LIKELY_TEXT_EXTENSIONS: ReadonlySet<string> = new Set([
  ".txt",
  ".md",
  ".markdown",
  ".json",
  ".jsonl",
  ".html",
  ".htm",
  ".csv",
  ".ts",
  ".js",
  ".tsx",
  ".jsx",
  ".py",
  ".log",
  ".xml",
  ".yaml",
  ".yml",
]);

/**
 * Reports whether an extension belongs to the known text-like set. The
 * extension is normalized first, so case and a missing leading dot do not
 * matter; an empty extension is never text-like.
 */
function isLikelyTextExtension(extension: string): boolean {
  const normalized = normalizeExtension(extension);
  return normalized !== undefined && LIKELY_TEXT_EXTENSIONS.has(normalized);
}
// Normalized (lower-case, dot-prefixed) extensions treated as binary content.
const LIKELY_BINARY_EXTENSIONS: ReadonlySet<string> = new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".heic",
  ".mp4",
  ".mp3",
  ".wav",
  ".mov",
  ".zip",
  ".pdf",
  ".docx",
  ".pptx",
  ".xlsx",
]);

/**
 * Reports whether an extension belongs to the known binary set (images,
 * audio/video, archives, office documents). The extension is normalized
 * first; an empty extension is never binary.
 */
function isLikelyBinaryExtension(extension: string): boolean {
  const normalized = normalizeExtension(extension);
  return normalized !== undefined && LIKELY_BINARY_EXTENSIONS.has(normalized);
}
/**
 * Normalizes a file extension to lower case with a leading dot.
 * Returns `undefined` for a missing or empty extension.
 */
function normalizeExtension(extension?: string): string | undefined {
  if (!extension) {
    return undefined;
  }
  const lowered = extension.toLowerCase();
  return lowered.startsWith(".") ? lowered : `.${lowered}`;
}
/**
 * Renders extension counts as a comma-separated `extension:count` list, or
 * the word "none" when the list is empty.
 */
function formatTopExtensions(topExtensions: Array<FileExtensionCount>): string {
  const rendered = topExtensions.map(({ extension, count }) => `${extension}:${count}`);
  return rendered.length > 0 ? rendered.join(", ") : "none";
}