// src/markdown.ts
// Concatenates per-page transcriptions and applies the optional "clean" pass
// that strips lines repeated across ≥3 pages (running headers/footers).
/** Options accepted by `concatPages`. */
export type ConcatArgs = {
  /** Per-page transcriptions; they are joined in the order given. */
  pages: { pageNumber: number; markdown: string }[];
  /** When true, every page is introduced by a `---` rule and a `# Page N` heading. */
  includeSeparators: boolean;
  /** When true, the repeated-line cleaning pass runs before the pages are joined. */
  cleanRepeatedHeaders: boolean;
};
/**
 * Joins per-page markdown into a single document.
 *
 * When `cleanRepeatedHeaders` is set, the pages first go through
 * `stripRepeatedLines`. Without separators the page bodies are joined by a
 * blank line; with separators every page is prefixed by a horizontal rule and
 * a `# Page N` heading. The result is trimmed, so the output never starts or
 * ends with whitespace.
 *
 * @param args - Pages plus the two formatting switches.
 * @returns The concatenated, trimmed markdown.
 */
export function concatPages(args: ConcatArgs): string {
  const { pages, includeSeparators, cleanRepeatedHeaders } = args;
  const source = cleanRepeatedHeaders ? stripRepeatedLines(pages) : pages;

  if (!includeSeparators) {
    return source
      .map((page) => page.markdown)
      .join("\n\n")
      .trim();
  }

  // Each chunk carries its own leading blank lines, so no extra joiner is
  // needed; the final trim drops the ones in front of the first page.
  return source
    .map((page) => `\n\n---\n\n# Page ${page.pageNumber}\n\n${page.markdown}`)
    .join("")
    .trim();
}
/**
 * Removes running headers/footers, i.e. lines that repeat across pages.
 *
 * A line is a strip candidate when, after trimming, it is non-empty, at most
 * 80 characters long, and is not structural markdown. Structural lines are
 * never counted and never removed:
 *  - lines starting with `#`, `-` or `*` (headings, list items, rules),
 *  - table rows (lines starting with `|`), whose header and delimiter rows
 *    naturally recur on every page that contains a similar table,
 *  - fenced code block delimiters (three or more backticks or tildes) and
 *    every line between them, so short code lines such as `}` survive.
 *
 * Candidates seen on at least 3 pages (counted once per page) are removed
 * from every page. Each page is then collapsed to at most one consecutive
 * blank line and trimmed.
 *
 * @param pages - Per-page transcriptions.
 * @returns The input array itself when there are fewer than 3 pages or no
 *   line qualifies; otherwise a new array of new page objects.
 */
function stripRepeatedLines(
  pages: Array<{ pageNumber: number; markdown: string }>,
): Array<{ pageNumber: number; markdown: string }> {
  const minPages = 3;
  const maxLineLength = 80;
  const fenceStart = /^(?:`{3,}|~{3,})/;
  const structuralStart = /^[#\-*|]/;

  if (pages.length < minPages) return pages;

  // Splits a page into lines and tags each one with the key it is counted and
  // removed under, or null when the line must always be preserved.
  const tagLines = (
    markdown: string,
  ): Array<{ raw: string; key: string | null }> => {
    const tagged: Array<{ raw: string; key: string | null }> = [];
    // Fence state is per page: an unclosed fence protects the rest of the page.
    let openFence: string | null = null;
    for (const raw of markdown.split("\n")) {
      const line = raw.trim();
      const fenceRun = fenceStart.exec(line)?.[0] ?? null;
      let key: string | null = null;
      if (openFence !== null) {
        // A closing fence is a bare run of the opening character that is at
        // least as long as the opening run.
        if (fenceRun === line && line.startsWith(openFence)) openFence = null;
      } else if (fenceRun !== null) {
        openFence = fenceRun;
      } else if (
        line !== "" &&
        line.length <= maxLineLength &&
        !structuralStart.test(line)
      ) {
        key = line;
      }
      tagged.push({ raw, key });
    }
    return tagged;
  };

  const taggedPages = pages.map((p) => ({
    pageNumber: p.pageNumber,
    lines: tagLines(p.markdown),
  }));

  // Count on how many distinct pages each candidate appears.
  const counts = new Map<string, number>();
  for (const page of taggedPages) {
    const seenInPage = new Set<string>();
    for (const { key } of page.lines) {
      if (key === null || seenInPage.has(key)) continue;
      seenInPage.add(key);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
  }

  const repeated = new Set<string>();
  for (const [line, n] of counts) {
    if (n >= minPages) repeated.add(line);
  }
  if (repeated.size === 0) return pages;

  return taggedPages.map((page) => ({
    pageNumber: page.pageNumber,
    markdown: page.lines
      .filter(({ key }) => key === null || !repeated.has(key))
      .map(({ raw }) => raw)
      .join("\n")
      // Removing a line can leave a run of blank lines behind; collapse it.
      .replace(/\n{3,}/g, "\n\n")
      .trim(),
  }));
}