Project Files
src / utils / formatPage.ts
/**
 * Tuning knobs accepted by `formatCrawledPage`. Every field is optional.
 */
export interface FormattedPageOptions {
  /** When truthy, a trailing "RAW HTML:" section containing the page markup is appended. */
  includeRawHtml?: boolean;

  /** Upper bound on listed links; omitted or non-positive values fall back to 120. */
  maxLinks?: number;

  /** Upper bound on rows printed per infobox; omitted or non-positive values fall back to 40. */
  maxInfoboxRows?: number;

  /**
   * Upper bound on the length of the content section. Defaults to 12,000 when
   * omitted; zero or a negative value leaves the content untruncated.
   */
  maxContentChars?: number;

  /**
   * Upper bound on the length of the raw HTML section. The markup is emitted
   * in full unless this is a positive number.
   */
  maxRawHtmlChars?: number;
}
/**
 * Shortens `text` so that it is at most `maxChars` UTF-16 code units long,
 * replacing the removed tail with a single "…".
 *
 * The text is returned unchanged when `maxChars` is omitted, zero, negative,
 * NaN, or when the text already fits. Trailing whitespace left at the cut
 * point is trimmed before the ellipsis is appended, so the result may be
 * shorter than `maxChars`.
 *
 * @param text - The string to shorten.
 * @param maxChars - Maximum length of the result, measured in UTF-16 code units.
 * @returns The original text, or a shortened copy ending in "…".
 */
function truncate(text: string, maxChars?: number): string {
  if (!maxChars || maxChars <= 0 || text.length <= maxChars) {
    return text;
  }

  // Reserve one code unit for the ellipsis. Math.trunc mirrors how slice()
  // would coerce a fractional index.
  let end = Math.trunc(Math.max(0, maxChars - 1));

  // Cutting between the two halves of a surrogate pair would leave a lone
  // high surrogate (an invalid character) in the output, so back off by one.
  if (end > 0) {
    const before = text.charCodeAt(end - 1);
    const after = text.charCodeAt(end);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      end -= 1;
    }
  }

  return `${text.slice(0, end).trimEnd()}…`;
}
/**
 * Renders a crawled page as a newline-separated plain-text report.
 *
 * Sections appear in this order, each preceded by a blank line:
 * TITLE/URL header, RECENT UPDATES (only if any), INFOBOXES (only if any),
 * CONTENT (always), LINKS (only if any), RAW HTML (only when
 * `options.includeRawHtml` is truthy).
 *
 * @param page - The crawled page data to render.
 * @param options - Optional limits; see `FormattedPageOptions`.
 * @returns The assembled report as a single string joined with "\n".
 */
export function formatCrawledPage(
  page: {
    title: string;
    url: string;
    content: string;
    rawHtml: string;
    links: string[];
    infoboxes: Array<{ title?: string; rows: Record<string, string> }>;
    recentUpdates: Array<{ title: string; url: string }>;
  },
  options: FormattedPageOptions = {}
): string {
  // Uses the requested limit only when it is a positive number.
  const positiveOr = (requested: number | undefined, fallback: number): number =>
    requested !== undefined && requested > 0 ? requested : fallback;

  const out: string[] = [];
  out.push(`TITLE: ${page.title || "Untitled"}`);
  out.push(`URL: ${page.url}`);

  if (page.recentUpdates.length > 0) {
    out.push("", "RECENT UPDATES:");
    for (const entry of page.recentUpdates) {
      out.push(`- ${entry.title} (${entry.url})`);
    }
  }

  if (page.infoboxes.length > 0) {
    const rowLimit = positiveOr(options.maxInfoboxRows, 40);
    out.push("", "INFOBOXES:");

    let ordinal = 0;
    for (const infobox of page.infoboxes) {
      ordinal += 1;
      // Untitled (or whitespace-only titled) boxes get a positional label.
      const label = infobox.title?.trim() || `Infobox ${ordinal}`;
      out.push(`- ${label}`);

      for (const [field, text] of Object.entries(infobox.rows).slice(0, rowLimit)) {
        out.push(` - ${field}: ${text}`);
      }
    }
  }

  const body = truncate(page.content, options.maxContentChars ?? 12_000);
  out.push("", "CONTENT:", body);

  if (page.links.length > 0) {
    const linkLimit = positiveOr(options.maxLinks, 120);
    out.push("", "LINKS:");
    for (const href of page.links.slice(0, linkLimit)) {
      out.push(href);
    }
  }

  if (options.includeRawHtml) {
    // Raw HTML has no default cap: it is shortened only on a positive limit.
    let markup = page.rawHtml;
    if (options.maxRawHtmlChars && options.maxRawHtmlChars > 0) {
      markup = truncate(page.rawHtml, options.maxRawHtmlChars);
    }
    out.push("", "RAW HTML:", markup);
  }

  return out.join("\n");
}