Project Files
src / sources / normalizer.ts
import { createHash } from "crypto";
import path from "path";
import { fileURLToPath } from "url";
import type { ImageRef, IndexedDocument, SourceDocument } from "./types.js";
import { resolveUrl } from "./http.js";
/**
 * Builds the indexed representation of a fetched source document.
 *
 * HTML sources are flattened to Markdown-like text and scanned for `<img>` /
 * social-card images; every other content type keeps its raw text and is
 * scanned for Markdown image syntax. The content hash is always computed over
 * the untouched raw content, not the converted text.
 *
 * @param source The document as fetched.
 * @returns The document with converted content, a SHA-256 hex digest of the
 *   raw content, and image references (also mirrored into `metadata`).
 */
export function normalizeSourceDocument(source: SourceDocument): IndexedDocument {
  let content: string;
  let imageRefs: ImageRef[];
  if (source.rawContentType === "html") {
    content = htmlToMarkdownishText(source.rawContent);
    imageRefs = extractHtmlImageRefs(source);
  } else {
    content = source.rawContent;
    imageRefs = extractMarkdownImageRefs(source);
  }

  const hasher = createHash("sha256");
  hasher.update(source.rawContent);
  const contentHash = hasher.digest("hex");

  return {
    sourceId: source.sourceId,
    sourceKind: source.sourceKind,
    canonicalUrl: source.canonicalUrl,
    title: source.title,
    content,
    contentHash,
    baseUrl: source.baseUrl,
    version: source.version,
    imageRefs,
    metadata: {
      // Caller-supplied metadata comes first so the normalized fields below win on key clashes.
      ...(source.metadata ?? {}),
      sourceKind: source.sourceKind,
      canonicalUrl: source.canonicalUrl,
      baseUrl: source.baseUrl,
      version: source.version,
      fetchedAt: source.fetchedAt,
      imageRefs,
    },
  };
}
/**
 * Collects the images referenced with Markdown `![alt](target "title")` syntax
 * in the source's raw content.
 *
 * Targets may be bare or wrapped in angle brackets. Each target is resolved
 * against the source's base URL (falling back to its canonical URL); targets
 * that cannot be resolved and repeated URLs are skipped.
 *
 * @param source The document whose raw content is scanned.
 * @returns One reference per distinct resolved URL, in order of first
 *   appearance; empty when the source has neither a base nor a canonical URL.
 */
function extractMarkdownImageRefs(source: SourceDocument): ImageRef[] {
  const baseUrl = source.baseUrl ?? source.canonicalUrl;
  if (!baseUrl) return [];

  // A Map keeps insertion order, so it doubles as the de-dup set and the result list.
  const byUrl = new Map<string, ImageRef>();
  const imagePattern = /!\[([^\]]*)\]\((<([^>]+)>|[^)\s]+)(?:\s+"[^"]*")?\)/g;

  for (
    let hit = imagePattern.exec(source.rawContent);
    hit !== null;
    hit = imagePattern.exec(source.rawContent)
  ) {
    // Group 3 is the target inside <...>; group 2 is the whole target token.
    const target = (hit[3] ?? hit[2]).replace(/^<|>$/g, "").trim();
    const url = resolveMarkdownImageRef(target, baseUrl, source.sourceKind);
    if (!url || byUrl.has(url)) continue;

    const altText = hit[1]?.trim() || undefined;
    byUrl.set(url, {
      url,
      altText,
      sourceId: source.sourceId,
      sourceKind: source.sourceKind,
      baseUrl,
    });
  }

  return Array.from(byUrl.values());
}
/**
 * Resolves a single Markdown image target to an absolute URL or filesystem path.
 *
 * Resolution order:
 *  1. Blank targets and `data:` URIs are rejected.
 *  2. `file://` URLs are converted to a local filesystem path.
 *  3. For "conversation" and "file" sources, an absolute path is returned as-is,
 *     and a relative target is resolved against an absolute-path base.
 *  4. Everything else is delegated to `resolveUrl`.
 *
 * @param ref The raw target text taken from the Markdown.
 * @param baseUrl The URL or path that relative targets are resolved against.
 * @param sourceKind The kind of source the Markdown came from.
 * @returns The resolved location, or null when the target is unusable.
 */
function resolveMarkdownImageRef(ref: string, baseUrl: string, sourceKind: SourceDocument["sourceKind"]): string | null {
  const raw = ref.trim();
  // URL schemes are case-insensitive, so match `data:` the same way as `file:` below.
  if (!raw || /^data:/i.test(raw)) return null;

  if (/^file:\/\//i.test(raw)) {
    try {
      // fileURLToPath percent-decodes the path and applies platform rules
      // (drive letters, separators); URL#pathname would leave "%20" in place.
      return fileURLToPath(raw);
    } catch {
      // Malformed URL, or one that does not name a local file.
      return null;
    }
  }

  const isLocalSource = sourceKind === "conversation" || sourceKind === "file";
  if (isLocalSource && path.isAbsolute(raw)) return raw;
  if (isLocalSource && path.isAbsolute(baseUrl)) {
    return path.resolve(baseUrl, raw);
  }
  return resolveUrl(raw, baseUrl);
}
/**
 * Collects image references from an HTML source.
 *
 * `<img>` tags are read from the primary content region (`<article>`, `<main>`
 * or `role="main"`) when one exists, otherwise from the whole document. For
 * each tag the `src` (or lazy-loading `data-src`) and the first `srcset` (or
 * `data-srcset`) candidate are resolved against the base URL. Only when no
 * `<img>` yields a reference does it fall back to `og:image` / `twitter:image`
 * meta tags, which are searched in the full raw content and labelled with the
 * source title.
 *
 * @param source The document whose raw HTML is scanned.
 * @returns One reference per distinct resolved URL, in order of first
 *   appearance; empty when the source has neither a base nor a canonical URL.
 */
function extractHtmlImageRefs(source: SourceDocument): ImageRef[] {
  const baseUrl = source.baseUrl ?? source.canonicalUrl;
  if (!baseUrl) return [];
  const html = extractPrimaryHtml(source.rawContent) ?? source.rawContent;
  const refs: ImageRef[] = [];
  const seen = new Set<string>();
  // Records a reference once per URL; null/empty URLs are ignored.
  const add = (url: string | null, altText?: string) => {
    if (!url || seen.has(url)) return;
    seen.add(url);
    refs.push({ url, altText, sourceId: source.sourceId, sourceKind: source.sourceKind, baseUrl });
  };

  const imgTag = /<img\b[^>]*>/gi;
  let img: RegExpExecArray | null;
  while ((img = imgTag.exec(html)) !== null) {
    const tag = img[0];
    // Name the lazy-loading fallbacks explicitly (as the text conversion does)
    // rather than depending on how loosely attribute names are matched.
    const src = attr(tag, "src") ?? attr(tag, "data-src");
    const alt = attr(tag, "alt") || undefined;
    add(src ? resolveUrl(src, baseUrl) : null, alt);
    const srcset = attr(tag, "srcset") ?? attr(tag, "data-srcset");
    // Each srcset candidate is "<url> <descriptor>"; only the first URL is kept.
    const firstSrcset = srcset?.split(",")[0]?.trim().split(/\s+/)[0];
    if (firstSrcset) add(resolveUrl(firstSrcset, baseUrl), alt);
  }
  if (refs.length > 0) return refs;

  // No inline images: fall back to the page's social-card image(s).
  const ogImage = /<meta\b[^>]*(?:property|name)=["'](?:og:image|twitter:image)["'][^>]*>/gi;
  let meta: RegExpExecArray | null;
  while ((meta = ogImage.exec(source.rawContent)) !== null) {
    const content = attr(meta[0], "content");
    add(content ? resolveUrl(content, baseUrl) : null, source.title);
  }
  return refs;
}
/**
 * Flattens an HTML document into Markdown-like plain text.
 *
 * Works on the primary content region when one is found, otherwise on the
 * whole input. Script, style, nav and video elements are dropped; images
 * become Markdown image syntax; h1–h3 and list items get Markdown prefixes;
 * links become "label (href)"; all remaining tags are removed, entities are
 * decoded and whitespace is collapsed.
 *
 * @param html Raw HTML markup.
 * @returns Trimmed text with at most one blank line between blocks.
 */
function htmlToMarkdownishText(html: string): string {
  let text = extractPrimaryHtml(html) ?? html;

  // Remove elements whose contents are never readable text.
  text = text.replace(/<script\b[\s\S]*?<\/script>/gi, "\n");
  text = text.replace(/<style\b[\s\S]*?<\/style>/gi, "\n");
  text = text.replace(/<nav\b[\s\S]*?<\/nav>/gi, "\n");
  text = text.replace(/<video\b[\s\S]*?<\/video>/gi, "\n");

  // Keep images as Markdown so the alt text and URL survive tag stripping.
  text = text.replace(/<img\b[^>]*>/gi, (tag) => {
    const src = attr(tag, "src") ?? attr(tag, "data-src");
    const alt = attr(tag, "alt") ?? "";
    return src ? `\n\n![${alt}](${src})\n\n` : "\n";
  });

  // <br> is a void element (written <br> or <br/>), so it has to be handled
  // as an opening tag; the closing-tag rule below only catches </br>.
  text = text.replace(/<br\b[^>]*>/gi, "\n");
  text = text.replace(/<\/(h[1-6]|p|li|section|article|div|br)>/gi, "\n");
  text = text.replace(/<h1\b[^>]*>/gi, "\n# ");
  text = text.replace(/<h2\b[^>]*>/gi, "\n## ");
  text = text.replace(/<h3\b[^>]*>/gi, "\n### ");
  text = text.replace(/<li\b[^>]*>/gi, "\n- ");
  text = text.replace(/<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, (_m, href, label) => {
    return `${stripTags(label).trim()} (${href})`;
  });

  // Anything still in angle brackets is markup with no text equivalent.
  text = text.replace(/<[^>]+>/g, " ");
  text = decodeHtmlEntities(text);
  text = text.replace(/[ \t]+/g, " ");
  text = text.replace(/\n{3,}/g, "\n\n");
  return text.trim();
}
/**
 * Finds the inner markup of the document's main content region.
 *
 * Tries the first `<article>`, then the first `<main>`, and finally the first
 * element carrying `role="main"`. A region containing only whitespace is
 * treated as absent.
 *
 * @param html Raw HTML markup.
 * @returns The region's inner HTML (untrimmed for `<article>`/`<main>`,
 *   trimmed for `role="main"`), or null when no non-empty region is found.
 */
function extractPrimaryHtml(html: string): string | null {
  const landmarkTags = ["article", "main"];
  for (const name of landmarkTags) {
    const pattern = new RegExp(`<${name}\\b[^>]*>([\\s\\S]*?)<\\/${name}>`, "i");
    const inner = pattern.exec(html)?.[1];
    if (inner !== undefined && inner.trim()) return inner;
  }

  // The back-reference makes the closing tag match whichever element carried the role.
  const byRole = /<([a-z0-9-]+)\b[^>]*role=["']main["'][^>]*>([\s\S]*?)<\/\1>/i.exec(html);
  if (!byRole) return null;
  const body = byRole[2]?.trim();
  return body ? body : null;
}
/**
 * Reads the value of a quoted attribute from a single HTML tag string.
 *
 * The name must start at an attribute boundary, so asking for `src` does not
 * match inside `data-src`. The value ends at the same quote character that
 * opened it, so `alt="it's"` is returned whole. Matching is case-insensitive
 * and tolerates whitespace around `=`.
 *
 * @param tag The tag's source text, e.g. `<img src="a.png">`.
 * @param name The attribute name to look up.
 * @returns The attribute value, or null when the attribute is missing,
 *   unquoted, or empty.
 */
function attr(tag: string, name: string): string | null {
  // Escape the name so characters such as "." or "$" are matched literally.
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`(?:^|[\\s"'])${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i");
  const match = re.exec(tag);
  const value = match?.[1] ?? match?.[2];
  return value ? value : null;
}
/**
 * Replaces every `<...>` tag in the input with a single space, leaving the
 * text between tags untouched.
 *
 * @param input Markup fragment.
 * @returns The fragment with each tag swapped for one space.
 */
function stripTags(input: string): string {
  const tagPattern = /<[^>]+>/g;
  const withoutTags = input.replace(tagPattern, " ");
  return withoutTags;
}
/**
 * Decodes numeric character references and a small set of named entities
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`).
 *
 * Decoding is a single pass, so already-escaped text such as `&amp;lt;`
 * yields `&lt;` rather than being decoded twice. Numeric references outside
 * the Unicode range, and unknown named entities, are left as written.
 * `&nbsp;` becomes a regular space.
 *
 * @param input Text that may contain HTML entities.
 * @returns The text with recognised entities replaced by their characters.
 */
function decodeHtmlEntities(input: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    nbsp: " ",
  };
  const entityPattern = /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|(amp|lt|gt|quot|apos|nbsp));/g;

  return input.replace(
    entityPattern,
    (entity: string, hex: string | undefined, dec: string | undefined, name: string | undefined) => {
      if (name !== undefined) return named[name] ?? entity;
      const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
      // String.fromCodePoint throws RangeError above U+10FFFF; keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}