Project Files
src / sources / adapters / huggingFaceMarkdownSourceAdapter.ts
import path from "path";
import type { SourceAdapter, SourceAdapterContext, SourceDocument } from "../types.js";
import { fetchTextWithLimits } from "../http.js";
/** Kinds of Hugging Face repository that a parsed target in this file can refer to. */
type HuggingFaceRepoType = "model" | "dataset" | "space";
/**
 * A parsed reference to a Hugging Face repository, optionally narrowed to one file.
 * Produced by parseHuggingFaceTarget and consumed by the URL builders in this file.
 */
interface HuggingFaceTarget {
/** Repository kind; the URL builders pick their path prefix from it. */
repoType: HuggingFaceRepoType;
/** Namespace of the repository; left unset when the source names a repo without an owner. */
owner?: string;
/** Repository name. */
repo: string;
/** Revision to read from; the parser falls back to "main" when the source names none. */
ref: string;
/** File path inside the repository; when unset the adapter enumerates Markdown files itself. */
path?: string;
}
/**
 * Source adapter that loads Markdown files from Hugging Face repositories.
 *
 * A source is accepted when `parseHuggingFaceTarget` can parse it. A target that names a
 * Markdown file yields that single document; a target without a file path yields the
 * Markdown files reported by the repository tree listing, falling back to `README.md`
 * when the listing fails or contains no Markdown file.
 */
export class HuggingFaceMarkdownSourceAdapter implements SourceAdapter {
  /** Reports whether `source` (ignoring surrounding whitespace) parses as a Hugging Face target. */
  canHandle(source: string): boolean {
    return parseHuggingFaceTarget(source.trim()) !== null;
  }

  /**
   * Loads the Markdown document(s) addressed by `source`.
   *
   * @param source Source string; trimmed before parsing.
   * @param context Supplies `fetchTimeoutMs`, `maxBytes`, `maxPages` and an optional token.
   * @returns An empty array when the source cannot be parsed or names a non-Markdown file;
   *   otherwise the loaded documents.
   * @throws Whatever `loadMarkdownFile` throws when an explicitly named file cannot be loaded.
   *   Failures for enumerated files are logged with `console.warn` and skipped instead.
   */
  async load(source: string, context: SourceAdapterContext): Promise<SourceDocument[]> {
    const target = parseHuggingFaceTarget(source.trim());
    if (!target) return [];

    if (target.path) {
      // An explicit file was requested: only Markdown files are loadable.
      if (!isMarkdownPath(target.path)) return [];
      return [await this.loadMarkdownFile({ ...target, path: target.path }, context)];
    }

    const paths = await this.enumerateMarkdownPaths(target, context);
    const docs: SourceDocument[] = [];
    for (const filePath of paths.slice(0, context.maxPages)) {
      try {
        docs.push(await this.loadMarkdownFile({ ...target, path: filePath }, context));
      } catch (err) {
        // Best effort: one unreadable file must not abort the rest of the repository.
        console.warn(`[sources/huggingface] failed to load ${repoId(target)}/${filePath}:`, String(err));
      }
    }
    return docs;
  }

  /**
   * Lists the Markdown file paths of the target repository via the tree listing URL.
   *
   * Collects at most `context.maxPages` unique paths. Any fetch or JSON error is logged and
   * swallowed; if no path was collected, `README.md` is returned as the only candidate.
   */
  private async enumerateMarkdownPaths(target: HuggingFaceTarget, context: SourceAdapterContext): Promise<string[]> {
    const headers = huggingFaceHeaders(context);
    const apiUrl = huggingFaceTreeApiUrl(target);
    const paths: string[] = [];
    const seen = new Set<string>();

    try {
      const { text } = await fetchTextWithLimits(apiUrl, {
        timeoutMs: context.fetchTimeoutMs,
        maxBytes: context.maxBytes,
        headers,
      });
      // Treat the response as untrusted: anything that is not an array of objects is ignored.
      const payload: unknown = JSON.parse(text);
      const entries: unknown[] = Array.isArray(payload) ? payload : [];
      for (const entry of entries) {
        if (typeof entry !== "object" || entry === null) continue;
        const { type, path: entryPath } = entry as { type?: unknown; path?: unknown };
        if (type !== "file" || typeof entryPath !== "string") continue;
        if (!isMarkdownPath(entryPath) || seen.has(entryPath)) continue;
        seen.add(entryPath);
        paths.push(entryPath);
        if (paths.length >= context.maxPages) break;
      }
    } catch (err) {
      console.warn(`[sources/huggingface] failed to enumerate ${apiUrl}:`, String(err));
    }

    if (paths.length === 0) paths.push("README.md");
    return paths;
  }

  /**
   * Fetches one Markdown file from its raw URL and wraps it as a `SourceDocument`.
   *
   * The document version is the response `etag` when present, otherwise `lastModified`.
   * Errors from `fetchTextWithLimits` propagate to the caller.
   */
  private async loadMarkdownFile(target: HuggingFaceTarget & { path: string }, context: SourceAdapterContext): Promise<SourceDocument> {
    const rawUrl = huggingFaceRawUrl(target);
    const headers = huggingFaceHeaders(context);
    const { text, finalUrl, etag, lastModified } = await fetchTextWithLimits(rawUrl, {
      timeoutMs: context.fetchTimeoutMs,
      maxBytes: context.maxBytes,
      headers,
    });

    const sourceId = `huggingface://${target.repoType}/${repoId(target)}/${target.ref}/${target.path}`;
    // Repository paths always use "/" separators, so use POSIX rules on every platform
    // (win32 rules would treat backslashes and drive-style prefixes specially).
    const title = path.posix.basename(target.path, path.posix.extname(target.path));

    return {
      sourceId,
      sourceKind: "huggingface",
      canonicalUrl: huggingFaceBlobUrl(target),
      title,
      rawContent: text,
      rawContentType: "markdown",
      // Directory part of the final URL, including the trailing slash.
      baseUrl: finalUrl.substring(0, finalUrl.lastIndexOf("/") + 1),
      fetchedAt: new Date().toISOString(),
      version: etag ?? lastModified,
      metadata: {
        repoType: target.repoType,
        owner: target.owner,
        repo: target.repo,
        ref: target.ref,
        path: target.path,
        rawUrl: finalUrl,
      },
    };
  }
}
/**
 * Parses a source string into a Hugging Face target, or returns null when it is not one.
 *
 * Recognized forms:
 *  - `huggingface://<repo>`: owner-less model repository on `main`.
 *  - `huggingface://<type>/<repo>`: owner-less repository of the given type on `main`,
 *    where `<type>` is model(s), dataset(s) or space(s).
 *  - `huggingface://[<type>/]<owner>/<repo>[/<ref>[/<path>]]`.
 *  - `https://huggingface.co/[datasets|spaces/]<owner>/<repo>` with an optional
 *    `blob|raw|resolve/<ref>/<path>` or `tree/<ref>` suffix, plus the owner-less model
 *    variants of the same routes.
 *
 * @param source Already-trimmed source string.
 * @returns The parsed target, or null for anything else, including URLs whose path
 *   contains malformed percent-escapes.
 */
function parseHuggingFaceTarget(source: string): HuggingFaceTarget | null {
  const repoTypeKeyword = /^(models?|datasets?|spaces?)$/i;

  const modelAliasScheme = /^huggingface:\/\/([^/]+)$/i.exec(source);
  if (modelAliasScheme && !repoTypeKeyword.test(modelAliasScheme[1])) {
    return {
      repoType: "model",
      repo: modelAliasScheme[1],
      ref: "main",
    };
  }

  const scheme = /^huggingface:\/\/(?:(models?|datasets?|spaces?)\/)?([^/]+)\/([^/]+)(?:\/([^/]+)(?:\/(.+))?)?$/i.exec(source);
  if (scheme) {
    // With exactly two segments the optional type group cannot match, so a leading type
    // keyword ends up in the owner slot. The keywords are reserved (the alias form above
    // rejects them too), so read this as "<type>/<repo>" rather than owner "datasets".
    if (scheme[1] === undefined && repoTypeKeyword.test(scheme[2])) {
      return {
        repoType: parseRepoType(scheme[2]),
        repo: scheme[3],
        ref: "main",
      };
    }
    const repoType = parseRepoType(scheme[1]);
    return {
      repoType,
      owner: scheme[2],
      repo: scheme[3],
      ref: scheme[4] ?? "main",
      path: scheme[5],
    };
  }

  let url: URL;
  try {
    url = new URL(source);
  } catch {
    return null;
  }
  if (url.protocol !== "https:" || url.hostname !== "huggingface.co") return null;

  let parts: string[];
  try {
    parts = url.pathname.split("/").filter(Boolean).map((segment) => decodeURIComponent(segment));
  } catch {
    // decodeURIComponent throws URIError on malformed percent-escapes; such a URL is
    // simply not a target we can handle.
    return null;
  }
  if (parts.length < 1) return null;

  const first = parts[0]?.toLowerCase();
  let repoType: HuggingFaceRepoType = "model";
  let offset = 0;
  if (first === "datasets" || first === "spaces") {
    repoType = first === "datasets" ? "dataset" : "space";
    offset = 1;
  }

  const owner = parts[offset];
  const repo = parts[offset + 1];

  // Owner-less model URLs: the segment after the repo name is either missing or a route marker.
  if (repoType === "model" && owner && (!repo || isRepoRouteMarker(repo))) {
    const marker = repo?.toLowerCase();
    if (marker === "blob" || marker === "raw" || marker === "resolve") {
      const ref = parts[offset + 2] ?? "main";
      const filePath = parts.slice(offset + 3).join("/");
      return { repoType, repo: owner, ref, path: filePath || undefined };
    }
    if (marker === "tree") {
      const ref = parts[offset + 2] ?? "main";
      return { repoType, repo: owner, ref };
    }
    return { repoType, repo: owner, ref: "main" };
  }

  if (!owner || !repo) return null;

  const marker = parts[offset + 2]?.toLowerCase();
  if (marker === "blob" || marker === "raw" || marker === "resolve") {
    const ref = parts[offset + 3] ?? "main";
    const filePath = parts.slice(offset + 4).join("/");
    return { repoType, owner, repo, ref, path: filePath || undefined };
  }
  if (marker === "tree") {
    const ref = parts[offset + 3] ?? "main";
    return { repoType, owner, repo, ref };
  }
  if (!marker) {
    return { repoType, owner, repo, ref: "main" };
  }
  // Any other sub-route is not a repository location this parser understands.
  return null;
}
/**
 * Maps a repo-type keyword (singular or plural, any letter case) to a repo type.
 * Anything else, including undefined, is treated as "model".
 */
function parseRepoType(value: string | undefined): HuggingFaceRepoType {
  switch (value?.toLowerCase()) {
    case "dataset":
    case "datasets":
      return "dataset";
    case "space":
    case "spaces":
      return "space";
    default:
      return "model";
  }
}
/** Returns the unencoded repository id: `owner/repo`, or just `repo` when there is no owner. */
function repoId(target: HuggingFaceTarget): string {
  const { owner, repo } = target;
  if (!owner) return repo;
  return `${owner}/${repo}`;
}
/**
 * Builds the recursive tree-listing API URL for the target's repository and revision.
 * The revision is percent-encoded as a single path segment.
 */
function huggingFaceTreeApiUrl(target: HuggingFaceTarget): string {
  let collection = "models";
  if (target.repoType === "dataset") {
    collection = "datasets";
  } else if (target.repoType === "space") {
    collection = "spaces";
  }
  const repository = repoPath(target);
  const revision = encodeURIComponent(target.ref);
  return `https://huggingface.co/api/${collection}/${repository}/tree/${revision}?recursive=true`;
}
/**
 * Builds the `raw` URL of a file in the target repository.
 * The revision is encoded as one segment; the file path is encoded segment by segment so
 * its "/" separators survive.
 */
function huggingFaceRawUrl(target: HuggingFaceTarget & { path: string }): string {
  let prefix = "";
  if (target.repoType === "dataset") {
    prefix = "/datasets";
  } else if (target.repoType === "space") {
    prefix = "/spaces";
  }
  const repository = repoPath(target);
  const revision = encodeURIComponent(target.ref);
  const encodedFilePath = target.path
    .split("/")
    .map((segment) => encodeURIComponent(segment))
    .join("/");
  return [`https://huggingface.co${prefix}`, repository, "raw", revision, encodedFilePath].join("/");
}
/**
 * Builds the `blob` (web view) URL of a file in the target repository.
 * The revision is encoded as one segment; each file path segment is encoded separately.
 */
function huggingFaceBlobUrl(target: HuggingFaceTarget & { path: string }): string {
  const { repoType, ref, path: filePath } = target;
  let prefix: string;
  switch (repoType) {
    case "dataset":
      prefix = "/datasets";
      break;
    case "space":
      prefix = "/spaces";
      break;
    default:
      prefix = "";
  }
  const repository = repoPath(target);
  const revision = encodeURIComponent(ref);
  const encodedSegments: string[] = [];
  for (const segment of filePath.split("/")) {
    encodedSegments.push(encodeURIComponent(segment));
  }
  return `https://huggingface.co${prefix}/${repository}/blob/${revision}/${encodedSegments.join("/")}`;
}
/**
 * Returns the URL path form of the repository id: `owner/repo` with each part
 * percent-encoded, or the encoded `repo` alone when there is no owner.
 */
function repoPath(target: HuggingFaceTarget): string {
  if (!target.owner) {
    return encodeURIComponent(target.repo);
  }
  const encodedParts = [target.owner, target.repo].map((part) => encodeURIComponent(part));
  return encodedParts.join("/");
}
/** True when `value` is exactly one of the route markers blob, raw, resolve or tree (any letter case). */
function isRepoRouteMarker(value: string | undefined): boolean {
  if (value == null) return false;
  const routeMarker = /^(blob|raw|resolve|tree)$/i;
  return routeMarker.test(value);
}
/** True when `filePath` ends in `.md` or `.markdown` (any letter case). */
function isMarkdownPath(filePath: string): boolean {
  const markdownExtension = /\.(md|markdown)$/i;
  return markdownExtension.exec(filePath) !== null;
}
/**
 * Builds the request headers for Hugging Face fetches.
 * Always sets `Accept` and `User-Agent`; adds a bearer `Authorization` header only when
 * the context carries a token that is non-empty after trimming.
 */
function huggingFaceHeaders(context: SourceAdapterContext): Record<string, string> {
  const token = context.huggingFaceToken?.trim();
  const baseHeaders: Record<string, string> = {
    "Accept": "text/markdown, text/plain;q=0.9, application/json;q=0.8, */*;q=0.7",
    "User-Agent": "ceveyne-user-docs",
  };
  return token ? { ...baseHeaders, Authorization: `Bearer ${token}` } : baseHeaders;
}