Project Files

docs

initial-docs

ANNOTATE-IMAGE-DETECTION.md

MEMORIZE-DOC-IMAGES.md

SETUP.md

USER-DOCS.md

CHANGELOG.md

DEPLOYMENT.md

USER_GUIDE.md

python

docling_parser.py

extract_image_page.py

pymupdf_parser.py

requirements.txt

src

documents

parsers

pdfParser.ts

textParser.ts

fileWatcher.ts

loader.ts

fpzip

decompress.ts

fpzip_bridge.cpp

fpzip_loader.ts

fpzip_wasm.js

fpzip_wasm.wasm

package.json

helpers

documentImages.ts

drawBboxesOnImage.ts

embedLocalImages.ts

frontmatter.ts

globalConfigReader.ts

pngMetadata.ts

readPngMetadata.ts

sequenceExtractor.ts

toolProgress.ts

videoAssembler.ts

visionCapabilityPrimer.ts

visionPromotionLog.ts

rag

bm25.ts

chunker.ts

embeddings.ts

retriever.ts

retrieverSingleton.ts

vectorStore.ts

services

chatExporter.ts

lmStudioVisionAnalyzer.ts

toolResultHarvester.ts

userDocsGuidePrimer.ts

sources

adapters

githubMarkdownSourceAdapter.ts

huggingFaceMarkdownSourceAdapter.ts

lmStudioConversationSourceAdapter.ts

staticHtmlSourceAdapter.ts

http.ts

lmStudioConversationMarkdown.ts

normalizer.ts

registry.ts

remoteImageResolver.ts

types.ts

tools

analyse_image.ts

annotate_image.ts

detect_object.ts

export_doc.ts

extract_image.ts

fetch_image.ts

find_doc.ts

forget_doc.ts

memorize_doc.ts

read_config.ts

read_doc.ts

rewrite_doc.ts

show_image.ts

skip_doc.ts

types

external-shims.d.ts

utils

language.ts

pythonRunner.ts

ragLogger.ts

ragVenvSetup.ts

config.ts

core-bundle.mjs

index.ts

orchestrator.ts

promptPreprocessor.ts

thinkingToolCallParser.ts

toolsProvider.ts

types.ts

.gitignore

.lmsignore

.npmignore

.swcrc

build.mjs

LICENSE

manifest.json

mcp-shims.d.ts

package-lock.json

package.json

README.md

rollup.config.mjs

tsconfig.json

src / sources / adapters / staticHtmlSourceAdapter.ts

import type { SourceAdapter, SourceAdapterContext, SourceDocument } from "../types.js";
import { fetchTextWithLimits, resolveUrl } from "../http.js";

/**
 * Source adapter for plain HTML pages.
 *
 * Accepts `https:` URLs anywhere, and `http:` URLs only on the local machine
 * (`localhost` / `127.0.0.1`). Loading fetches the start page and then, one
 * level deep, the same-host links found on it that live under the start path.
 */
export class StaticHtmlSourceAdapter implements SourceAdapter {
  /**
   * Reports whether `source` is a URL this adapter is willing to fetch.
   *
   * @param source - Candidate source string; surrounding whitespace is ignored.
   * @returns `true` for any `https:` URL, or an `http:` URL on localhost/127.0.0.1.
   */
  canHandle(source: string): boolean {
    let url: URL;
    try {
      url = new URL(source.trim());
    } catch {
      return false;
    }
    if (url.protocol === "https:") return true;
    // Loopback hosts are allowed over plain HTTP only; other schemes
    // (ftp:, ws:, ...) on localhost are not fetchable HTML pages.
    const isLoopback = url.hostname === "localhost" || url.hostname === "127.0.0.1";
    return url.protocol === "http:" && isLoopback;
  }

  /**
   * Loads the start page plus the in-scope pages it links to.
   *
   * Only links found on the start page are followed (no deeper crawl), and at
   * most `context.maxPages` pages are scheduled. Pages that fail to load are
   * logged with `console.warn` and skipped.
   *
   * @param source - Start URL; must be parseable by `URL` (otherwise this throws).
   * @param context - Limits and fetch settings (`maxPages`, `fetchTimeoutMs`, `maxBytes`).
   * @returns The successfully loaded documents, start page first.
   */
  async load(source: string, context: SourceAdapterContext): Promise<SourceDocument[]> {
    const start = new URL(source.trim());
    // The fragment never reaches the server, so drop it to keep URL comparison
    // and de-duplication page-based rather than anchor-based.
    start.hash = "";
    const startUrl = start.toString();
    const pathPrefix = start.pathname.replace(/\/$/, "");

    const docs: SourceDocument[] = [];
    const seen = new Set<string>();
    const queued = new Set<string>([startUrl]);
    const queue = [startUrl];

    while (queue.length > 0 && docs.length < context.maxPages) {
      const url = queue.shift();
      if (url === undefined || seen.has(url)) continue;
      seen.add(url);

      try {
        const doc = await this.loadPage(url, context);
        docs.push(doc);

        // Link discovery happens on the start page only.
        if (url !== startUrl) continue;

        for (const link of extractDocsLinks(doc.rawContent, url)) {
          if (docs.length + queue.length >= context.maxPages) break;

          let parsed: URL;
          try {
            parsed = new URL(link);
          } catch {
            // One malformed link must not abort discovery of the others.
            continue;
          }
          parsed.hash = "";
          const target = parsed.toString();

          if (parsed.hostname !== start.hostname) continue;
          if (!parsed.pathname.startsWith(pathPrefix)) continue;
          if (seen.has(target) || queued.has(target)) continue;

          queued.add(target);
          queue.push(target);
        }
      } catch (err) {
        console.warn(`[sources/html] failed to load ${url}:`, String(err));
      }
    }

    return docs;
  }

  /**
   * Fetches one page and wraps it as a `SourceDocument`.
   *
   * The title falls back to the final URL's pathname, and the canonical URL
   * falls back to the final URL, when the HTML does not declare them.
   */
  private async loadPage(url: string, context: SourceAdapterContext): Promise<SourceDocument> {
    const { text, finalUrl, etag, lastModified } = await fetchTextWithLimits(url, {
      timeoutMs: context.fetchTimeoutMs,
      maxBytes: context.maxBytes,
      headers: { "Accept": "text/html,application/xhtml+xml" },
    });
    const title = extractTitle(text) ?? new URL(finalUrl).pathname;
    const canonicalUrl = extractCanonical(text, finalUrl) ?? finalUrl;
    // Resolve "." against the final URL so query strings or fragments that
    // contain "/" cannot leak into the directory used for relative references.
    const baseUrl = new URL(".", finalUrl).toString();
    return {
      sourceId: canonicalUrl,
      sourceKind: "https",
      canonicalUrl,
      title,
      rawContent: text,
      rawContentType: "html",
      baseUrl,
      fetchedAt: new Date().toISOString(),
      version: etag ?? lastModified,
      metadata: {
        finalUrl,
      },
    };
  }
}

/**
 * Collects the distinct link targets of `<a href="...">` tags in `html`,
 * resolved against `baseUrl`, in document order.
 *
 * The href value is read up to its own closing quote, so a value containing
 * the other quote character is not cut short. Only a real `href` attribute is
 * used (not e.g. `data-href`), and `&amp;` is decoded before resolution. Empty
 * hrefs, pure same-page fragments (`#...`) and links `resolveUrl` rejects are
 * skipped.
 *
 * @param html - Raw HTML of the page.
 * @param baseUrl - URL that relative hrefs are resolved against.
 * @returns Resolved link URLs without duplicates.
 */
function extractDocsLinks(html: string, baseUrl: string): string[] {
  const links: string[] = [];
  const seen = new Set<string>();
  // Built per call: a /g regex carries lastIndex state between exec() calls.
  const anchor = /<a\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
  let match: RegExpExecArray | null;
  while ((match = anchor.exec(html)) !== null) {
    const href = (match[1] ?? match[2] ?? "").replace(/&amp;/g, "&").trim();
    if (href === "" || href.startsWith("#")) continue;
    const resolved = resolveUrl(href, baseUrl);
    if (!resolved || seen.has(resolved)) continue;
    seen.add(resolved);
    links.push(resolved);
  }
  return links;
}

// og:title <meta> tag with the property/name attribute before content.
const OG_TITLE_PROPERTY_FIRST =
  /<meta\b[^>]*?\s(?:property|name)\s*=\s*(?:"og:title"|'og:title')[^>]*?\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
// Same tag with the attributes in the opposite order.
const OG_TITLE_CONTENT_FIRST =
  /<meta\b[^>]*?\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*?\s(?:property|name)\s*=\s*(?:"og:title"|'og:title')/i;

/**
 * Picks a human-readable title out of an HTML document.
 *
 * Prefers the `og:title` meta tag (either attribute order) and falls back to
 * the `<title>` element with any nested tags replaced by spaces. The attribute
 * value is read up to its own closing quote, so a title containing the other
 * quote character (e.g. an apostrophe inside double quotes) is not truncated.
 *
 * @param html - Raw HTML of the page.
 * @returns The decoded, trimmed title, or `undefined` when none is present or
 *   it is empty after trimming.
 */
function extractTitle(html: string): string | undefined {
  const og = OG_TITLE_PROPERTY_FIRST.exec(html) ?? OG_TITLE_CONTENT_FIRST.exec(html);
  const ogTitle = decodeBasic(og?.[1] ?? og?.[2] ?? "").trim();
  if (ogTitle !== "") return ogTitle;

  const title = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html);
  const text = decodeBasic((title?.[1] ?? "").replace(/<[^>]+>/g, " ")).trim();
  // An empty string would defeat callers that use `??` to supply a fallback.
  return text !== "" ? text : undefined;
}

// <link rel="canonical" href="..."> with rel before href.
const CANONICAL_REL_FIRST =
  /<link\b[^>]*?\srel\s*=\s*(?:"canonical"|'canonical')[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
// Same tag with href before rel.
const CANONICAL_HREF_FIRST =
  /<link\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*?\srel\s*=\s*(?:"canonical"|'canonical')/i;

/**
 * Finds the page's declared canonical URL (`<link rel="canonical">`).
 *
 * The href is read up to its own closing quote, so a URL containing the other
 * quote character is not truncated, and `&amp;` is decoded before the value is
 * resolved.
 *
 * @param html - Raw HTML of the page.
 * @param fallbackBase - URL that a relative canonical href is resolved against.
 * @returns The resolved canonical URL, or `undefined` when the page declares
 *   none, the href is empty, or `resolveUrl` rejects it.
 */
function extractCanonical(html: string, fallbackBase: string): string | undefined {
  const match = CANONICAL_REL_FIRST.exec(html) ?? CANONICAL_HREF_FIRST.exec(html);
  const href = (match?.[1] ?? match?.[2] ?? "").replace(/&amp;/g, "&");
  if (href === "") return undefined;
  return resolveUrl(href, fallbackBase) ?? undefined;
}

// Named character references understood by decodeBasic.
const BASIC_NAMED_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decodes a small set of HTML character references: decimal (`&#39;`) and
 * hexadecimal (`&#x27;`) numeric references plus `&amp;`, `&lt;`, `&gt;`,
 * `&quot;` and `&apos;`.
 *
 * Decoding is a single pass, so the output of one replacement is never decoded
 * again (`&amp;lt;` yields the text `&lt;`, not `<`). Numeric references that
 * do not name a Unicode scalar value (zero, surrogates, or above U+10FFFF)
 * become U+FFFD instead of throwing. Unrecognised references are left as-is.
 *
 * @param input - Text that may contain character references.
 * @returns The text with recognised references replaced.
 */
function decodeBasic(input: string): string {
  return input.replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|(amp|lt|gt|quot|apos));/g,
    (match: string, hex?: string, dec?: string, name?: string): string => {
      if (name !== undefined) return BASIC_NAMED_ENTITIES[name] ?? match;
      const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
      // String.fromCodePoint throws RangeError above U+10FFFF, so validate first.
      const isScalarValue =
        codePoint > 0 && codePoint <= 0x10ffff && (codePoint < 0xd800 || codePoint > 0xdfff);
      return isScalarValue ? String.fromCodePoint(codePoint) : "\uFFFD";
    },
  );
}

user-docs

user-docs