Project Files
src / sources / adapters / staticHtmlSourceAdapter.ts
import type { SourceAdapter, SourceAdapterContext, SourceDocument } from "../types.js";
import { fetchTextWithLimits, resolveUrl } from "../http.js";
/**
 * Source adapter that loads a static HTML page and the in-scope pages it links to.
 *
 * The crawl is one level deep: only links found on the start page are followed,
 * and only when they stay on the start URL's hostname and under its path.
 */
export class StaticHtmlSourceAdapter implements SourceAdapter {
  /**
   * Reports whether `source` is a URL this adapter is willing to fetch.
   *
   * @param source - Candidate source string; surrounding whitespace is ignored.
   * @returns `true` for any `https:` URL, or for an `http:` URL whose host is
   *   `localhost` or `127.0.0.1`; `false` otherwise, including unparseable input.
   */
  canHandle(source: string): boolean {
    let url: URL;
    try {
      url = new URL(source.trim());
    } catch {
      return false;
    }
    if (url.protocol === "https:") return true;
    // Plain HTTP is only accepted for loopback hosts. Checking the scheme here
    // keeps non-web schemes such as ftp://localhost or file://localhost out.
    const isLoopback = url.hostname === "localhost" || url.hostname === "127.0.0.1";
    return isLoopback && url.protocol === "http:";
  }

  /**
   * Fetches the start page and the in-scope pages linked from it.
   *
   * A page that fails to load is logged with `console.warn` and skipped; it
   * does not abort the crawl.
   *
   * @param source - Start URL; surrounding whitespace is ignored.
   * @param context - Supplies `maxPages` (upper bound on returned documents)
   *   and the fetch limits forwarded to each page load.
   * @returns The successfully loaded documents, start page first.
   * @throws TypeError if `source` is not a parseable URL.
   */
  async load(source: string, context: SourceAdapterContext): Promise<SourceDocument[]> {
    const start = new URL(source.trim());
    const startUrl = start.toString();
    // Scope prefix without its trailing slash; "" when starting at the site root.
    const scopePath = start.pathname.replace(/\/$/, "");
    const docs: SourceDocument[] = [];
    // Fragment-less form of every URL that has been queued, so `page#a` and
    // `page#b` are treated as the same document.
    const seen = new Set<string>([StaticHtmlSourceAdapter.withoutFragment(start)]);
    const queue = [startUrl];

    while (queue.length > 0 && docs.length < context.maxPages) {
      const url = queue.shift();
      if (url === undefined) break;
      try {
        const doc = await this.loadPage(url, context);
        docs.push(doc);
        // Links are only followed from the start page.
        if (url !== startUrl) continue;

        for (const link of extractDocsLinks(doc.rawContent, url)) {
          // Stop queueing once loaded + pending pages reach the budget.
          if (docs.length + queue.length >= context.maxPages) break;
          const target = StaticHtmlSourceAdapter.parseUrl(link);
          if (target === undefined) continue;
          if (target.hostname !== start.hostname) continue;
          // Compare on a path-segment boundary so "/docs" does not admit "/docs-old".
          const inScope =
            target.pathname === scopePath || target.pathname.startsWith(`${scopePath}/`);
          if (!inScope) continue;
          const key = StaticHtmlSourceAdapter.withoutFragment(target);
          if (seen.has(key)) continue;
          seen.add(key);
          queue.push(key);
        }
      } catch (err) {
        console.warn(`[sources/html] failed to load ${url}:`, String(err));
      }
    }
    return docs;
  }

  /**
   * Fetches one page and wraps it in a `SourceDocument`.
   *
   * The title comes from `extractTitle`, falling back to the path of the final
   * URL. The canonical URL comes from `extractCanonical`, falling back to the
   * final URL, and is also used as `sourceId`. `version` is the response's
   * `etag`, or `lastModified` when there is no `etag`.
   *
   * @param url - URL to fetch.
   * @param context - Supplies `fetchTimeoutMs` and `maxBytes` for the fetch.
   * @returns The document describing the fetched page.
   */
  private async loadPage(url: string, context: SourceAdapterContext): Promise<SourceDocument> {
    const { text, finalUrl, etag, lastModified } = await fetchTextWithLimits(url, {
      timeoutMs: context.fetchTimeoutMs,
      maxBytes: context.maxBytes,
      headers: { "Accept": "text/html,application/xhtml+xml" },
    });
    const title = extractTitle(text) ?? new URL(finalUrl).pathname;
    const canonicalUrl = extractCanonical(text, finalUrl) ?? finalUrl;
    const baseUrl = StaticHtmlSourceAdapter.directoryOf(finalUrl);
    return {
      sourceId: canonicalUrl,
      sourceKind: "https",
      canonicalUrl,
      title,
      rawContent: text,
      rawContentType: "html",
      baseUrl,
      fetchedAt: new Date().toISOString(),
      version: etag ?? lastModified,
      metadata: {
        finalUrl,
      },
    };
  }

  /** Parses `value` as an absolute URL, returning `undefined` instead of throwing. */
  private static parseUrl(value: string): URL | undefined {
    try {
      return new URL(value);
    } catch {
      return undefined;
    }
  }

  /** Serializes `url` with its `#fragment` removed; `url` itself is not modified. */
  private static withoutFragment(url: URL): string {
    const copy = new URL(url.toString());
    copy.hash = "";
    return copy.toString();
  }

  /**
   * Returns the directory portion of `finalUrl`, ending in "/".
   *
   * Resolving "." against the URL drops the last path segment along with the
   * query and fragment, so a "/" inside `?next=/a/b` cannot shift the result.
   */
  private static directoryOf(finalUrl: string): string {
    try {
      return new URL(".", finalUrl).toString();
    } catch {
      // Not an absolute URL: keep the plain "up to the last slash" behaviour.
      return finalUrl.substring(0, finalUrl.lastIndexOf("/") + 1);
    }
  }
}
/**
 * Collects the distinct targets of `<a href="...">` tags found in `html`.
 *
 * Each href is passed through `resolveUrl` together with `baseUrl`; falsy
 * results are dropped. Hrefs that consist only of a fragment (such as `#top`)
 * never match, because the pattern requires at least one character before `#`.
 *
 * @param html - Markup to scan.
 * @param baseUrl - Base handed to `resolveUrl` for every href.
 * @returns Resolved links in order of first appearance, without duplicates.
 */
function extractDocsLinks(html: string, baseUrl: string): string[] {
  const hrefPattern = /<a\b[^>]*href=["']([^"'#]+(?:#[^"']*)?)["'][^>]*>/gi;
  // A Set keeps insertion order, so it handles both de-duplication and ordering.
  const unique = new Set<string>();
  for (let hit = hrefPattern.exec(html); hit !== null; hit = hrefPattern.exec(html)) {
    const absolute = resolveUrl(hit[1], baseUrl);
    if (absolute) {
      unique.add(absolute);
    }
  }
  return Array.from(unique);
}
/**
 * Extracts a human-readable title from an HTML document.
 *
 * An `og:title` meta tag is preferred; the `<title>` element is the fallback.
 * The value is passed through `decodeBasic` and trimmed.
 *
 * @param html - Markup to scan.
 * @returns The title, or `undefined` when none is found or it is blank, so a
 *   caller's `??` fallback applies.
 */
function extractTitle(html: string): string | undefined {
  // The content value is captured per quote style, so an apostrophe inside a
  // double-quoted value (content="Bob's Guide") does not cut the title short.
  // Both attribute orders are accepted, as extractCanonical does for <link>.
  const keyFirst =
    /<meta\b[^>]*(?:property|name)=["']og:title["'][^>]*content=(?:"([^"]+)"|'([^']+)')[^>]*>/i;
  const contentFirst =
    /<meta\b[^>]*content=(?:"([^"]+)"|'([^']+)')[^>]*(?:property|name)=["']og:title["'][^>]*>/i;
  const og = keyFirst.exec(html) ?? contentFirst.exec(html);
  const ogRaw = og?.[1] ?? og?.[2];
  if (ogRaw) {
    const ogTitle = decodeBasic(ogRaw).trim();
    // A blank og:title falls through to <title> instead of returning "".
    if (ogTitle) return ogTitle;
  }

  const title = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html);
  if (!title?.[1]) return undefined;
  // Nested tags inside <title> are replaced by a space before decoding.
  const text = decodeBasic(title[1].replace(/<[^>]+>/g, " ")).trim();
  return text || undefined;
}
/**
 * Finds the document's `<link rel="canonical">` target.
 *
 * Both attribute orders are recognised: `rel` before `href` is tried first,
 * then `href` before `rel`. The href is passed through `resolveUrl` together
 * with `fallbackBase`.
 *
 * @param html - Markup to scan.
 * @param fallbackBase - Base handed to `resolveUrl` for the href.
 * @returns The resolved canonical URL, or `undefined` when there is no
 *   canonical link or `resolveUrl` yields a nullish result.
 */
function extractCanonical(html: string, fallbackBase: string): string | undefined {
  const relFirst = /<link\b[^>]*rel=["']canonical["'][^>]*href=["']([^"']+)["'][^>]*>/i;
  const hrefFirst = /<link\b[^>]*href=["']([^"']+)["'][^>]*rel=["']canonical["'][^>]*>/i;

  const hit = relFirst.exec(html) ?? hrefFirst.exec(html);
  const href = hit?.[1];
  if (!href) {
    return undefined;
  }
  return resolveUrl(href, fallbackBase) ?? undefined;
}
/**
 * Decodes a basic subset of HTML character references.
 *
 * Handles hexadecimal (`&#x41;`) and decimal (`&#65;`) numeric references and
 * the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;`.
 * Everything else is left as-is.
 *
 * @param input - Text that may contain character references.
 * @returns The text with the supported references replaced.
 */
function decodeBasic(input: string): string {
  const named: Record<string, string> = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  // One pass over the input: text produced by a replacement is never rescanned,
  // so "&amp;lt;" decodes to "&lt;" rather than being decoded twice into "<".
  return input.replace(
    /&(?:#x([0-9a-f]+)|#([0-9]+)|(amp|lt|gt|quot|apos));/gi,
    (entity: string, hex?: string, dec?: string, name?: string): string => {
      if (name !== undefined) {
        return named[name.toLowerCase()] ?? entity;
      }
      const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
      // String.fromCodePoint throws RangeError above U+10FFFF; keep such
      // references verbatim instead of failing the whole decode.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}