Project Files
src / retrieval / compression.ts
import { HEADINGS_RE, LISTS_RE, NUM_LIST_RE, HEADER_RE, NEWLINE_RE, DOUBLE_NEWLINE_RE, normalizeWhitespace, truncate } from "../utils/text";
/**
 * Assigns a structural weight to a single line of text.
 *
 * The line is trimmed first, then classified by the first rule that matches:
 * - `0` — the trimmed line is empty
 * - `4` — matches `HEADINGS_RE`
 * - `3` — matches `LISTS_RE` or `NUM_LIST_RE`
 * - `2` — shorter than 80 characters and matches `HEADER_RE`
 * - `1` — anything else
 *
 * NOTE(review): assumes the imported regexes carry no `g`/`y` flag; `.test()`
 * on such a regex is stateful across calls — confirm in ../utils/text.
 *
 * @param line Raw line; surrounding whitespace is ignored.
 * @returns The weight of the line, between 0 and 4.
 */
function scoreLine(line: string): number {
  const content = line.trim();
  if (content.length === 0) return 0;
  if (HEADINGS_RE.test(content)) return 4;
  // Both list patterns carry the same weight; `||` keeps the original test order.
  if (LISTS_RE.test(content) || NUM_LIST_RE.test(content)) return 3;
  return content.length < 80 && HEADER_RE.test(content) ? 2 : 1;
}
/**
 * Condenses a chunk of text down to its most structural lines.
 *
 * The text is split into trimmed, non-empty lines. Scanning in order, each
 * distinct line (compared case-insensitively after `normalizeWhitespace`) that
 * `scoreLine` rates at 3 or higher is kept, up to six lines. When fewer than
 * two lines qualify, the first three paragraphs of the normalized text
 * (split on `DOUBLE_NEWLINE_RE`) are appended as a fallback; paragraphs whose
 * key equals an entry that is already kept are skipped so that, for example,
 * a lone heading that is also the first paragraph is not emitted twice.
 *
 * NOTE(review): the paragraph fallback splits the *normalized* text, so it only
 * yields several paragraphs if `normalizeWhitespace` preserves blank lines —
 * confirm in ../utils/text.
 *
 * @param text Raw chunk text.
 * @returns The kept entries joined with `"\n"` and passed through
 *   `truncate(…, 1400)`; if the chunk has no non-empty lines, the normalized
 *   text passed through `truncate(…, 1200)`.
 */
export function compressChunk(text: string): string {
  const maxKeptLines = 6;
  const minStructuralScore = 3;
  const fallbackThreshold = 2;
  const fallbackParagraphs = 3;
  const emptyLimit = 1200;
  const outputLimit = 1400;

  const normalized = normalizeWhitespace(text);
  const lines = text
    .split(NEWLINE_RE)
    .map((l) => l.trim())
    .filter(Boolean);
  if (lines.length === 0) return truncate(normalized, emptyLimit);

  const kept: string[] = [];
  // Keys of every line examined so far (kept or rejected) — repeated lines are skipped.
  const seen = new Set<string>();
  // Keys of entries actually kept — used so the fallback does not re-add them.
  const keptKeys = new Set<string>();

  for (const line of lines) {
    if (kept.length >= maxKeptLines) break;
    const key = normalizeWhitespace(line).toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    if (scoreLine(line) >= minStructuralScore) {
      kept.push(line);
      keptKeys.add(key);
    }
  }

  if (kept.length < fallbackThreshold) {
    const paragraphs = normalized
      .split(DOUBLE_NEWLINE_RE)
      .map((p) => p.trim())
      .filter(Boolean);
    for (const paragraph of paragraphs.slice(0, fallbackParagraphs)) {
      const key = normalizeWhitespace(paragraph).toLowerCase();
      // Skip a paragraph that is identical to something already in the output.
      if (keptKeys.has(key)) continue;
      keptKeys.add(key);
      kept.push(paragraph);
    }
  }

  return truncate(kept.join("\n"), outputLimit);
}
/**
 * Applies `compressChunk` to every chunk, preserving order.
 *
 * @param chunks Raw chunk texts.
 * @returns A new array holding the compressed form of each chunk at the same index.
 */
export function compressChunks(chunks: string[]): string[] {
  const compressed = chunks.map((chunk) => compressChunk(chunk));
  return compressed;
}