Project Files

docs

initial-docs

ANNOTATE-IMAGE-DETECTION.md

MEMORIZE-DOC-IMAGES.md

SETUP.md

USER-DOCS.md

CHANGELOG.md

DEPLOYMENT.md

USER_GUIDE.md

python

docling_parser.py

extract_image_page.py

pymupdf_parser.py

requirements.txt

src

documents

parsers

pdfParser.ts

textParser.ts

fileWatcher.ts

loader.ts

fpzip

decompress.ts

fpzip_bridge.cpp

fpzip_loader.ts

fpzip_wasm.js

fpzip_wasm.wasm

package.json

helpers

documentImages.ts

drawBboxesOnImage.ts

embedLocalImages.ts

frontmatter.ts

globalConfigReader.ts

pngMetadata.ts

readPngMetadata.ts

sequenceExtractor.ts

toolProgress.ts

videoAssembler.ts

visionCapabilityPrimer.ts

visionPromotionLog.ts

rag

bm25.ts

chunker.ts

embeddings.ts

retriever.ts

retrieverSingleton.ts

vectorStore.ts

services

chatExporter.ts

lmStudioVisionAnalyzer.ts

toolResultHarvester.ts

userDocsGuidePrimer.ts

sources

adapters

githubMarkdownSourceAdapter.ts

huggingFaceMarkdownSourceAdapter.ts

lmStudioConversationSourceAdapter.ts

staticHtmlSourceAdapter.ts

http.ts

lmStudioConversationMarkdown.ts

normalizer.ts

registry.ts

remoteImageResolver.ts

types.ts

tools

analyse_image.ts

annotate_image.ts

detect_object.ts

export_doc.ts

extract_image.ts

fetch_image.ts

find_doc.ts

forget_doc.ts

memorize_doc.ts

read_config.ts

read_doc.ts

rewrite_doc.ts

show_image.ts

skip_doc.ts

types

external-shims.d.ts

utils

language.ts

pythonRunner.ts

ragLogger.ts

ragVenvSetup.ts

config.ts

core-bundle.mjs

index.ts

orchestrator.ts

promptPreprocessor.ts

thinkingToolCallParser.ts

toolsProvider.ts

types.ts

.gitignore

.lmsignore

.npmignore

.swcrc

build.mjs

LICENSE

manifest.json

mcp-shims.d.ts

package-lock.json

package.json

README.md

rollup.config.mjs

tsconfig.json

src / sources / normalizer.ts

import { createHash } from "crypto";
import path from "path";
import { fileURLToPath } from "url";
import type { ImageRef, IndexedDocument, SourceDocument } from "./types.js";
import { resolveUrl } from "./http.js";

/**
 * Builds the indexed representation of a raw source document.
 *
 * HTML content is flattened to markdown-like text and its image references
 * are read from the markup; any other content type is kept verbatim and
 * scanned for markdown image syntax. The content hash is the SHA-256 hex
 * digest of the raw content, i.e. it is computed before any conversion.
 *
 * @param source - The raw source document to normalize.
 * @returns The normalized document. `imageRefs` is exposed both at the top
 *   level and inside `metadata`; the explicit metadata fields override any
 *   same-named keys coming from `source.metadata`.
 */
export function normalizeSourceDocument(source: SourceDocument): IndexedDocument {
  const { sourceId, sourceKind, canonicalUrl, title, baseUrl, version, rawContent } = source;

  let content: string;
  let imageRefs: ImageRef[];
  if (source.rawContentType === "html") {
    content = htmlToMarkdownishText(rawContent);
    imageRefs = extractHtmlImageRefs(source);
  } else {
    content = rawContent;
    imageRefs = extractMarkdownImageRefs(source);
  }

  // Hash the untouched input so the digest is independent of the conversion.
  const hasher = createHash("sha256");
  hasher.update(rawContent);
  const contentHash = hasher.digest("hex");

  // Caller-supplied metadata goes first so the well-known fields below win.
  const metadata = {
    ...(source.metadata ?? {}),
    sourceKind,
    canonicalUrl,
    baseUrl,
    version,
    fetchedAt: source.fetchedAt,
    imageRefs,
  };

  return {
    sourceId,
    sourceKind,
    canonicalUrl,
    title,
    content,
    contentHash,
    baseUrl,
    version,
    imageRefs,
    metadata,
  };
}

/**
 * Collects the images referenced with inline markdown syntax
 * (`![alt](target "title")`) in the raw content of a source document.
 *
 * The target may be written bare or wrapped in angle brackets. An optional
 * title is accepted in any of the three CommonMark forms (double quotes,
 * single quotes, parentheses) and is ignored; whitespace padding inside the
 * parentheses is tolerated. Each target is resolved with
 * `resolveMarkdownImageRef`; targets that resolve to `null` and duplicates of
 * an already-seen resolved URL are skipped.
 *
 * @param source - The source document whose `rawContent` is scanned.
 * @returns Image references in order of first appearance; an empty array when
 *   the source has neither `baseUrl` nor `canonicalUrl`.
 */
function extractMarkdownImageRefs(source: SourceDocument): ImageRef[] {
  const baseUrl = source.baseUrl ?? source.canonicalUrl;
  if (!baseUrl) return [];

  const refs: ImageRef[] = [];
  const seen = new Set<string>();
  const add = (url: string | null, altText?: string): void => {
    if (!url || seen.has(url)) return;
    seen.add(url);
    refs.push({ url, altText, sourceId: source.sourceId, sourceKind: source.sourceKind, baseUrl });
  };

  // Groups: 1 = alt text, 2 = target as written, 3 = target inside <...>.
  // The trailing non-capturing group swallows an optional title.
  const markdownImage = /!\[([^\]]*)\]\(\s*(<([^>]+)>|[^)\s]+)(?:\s+(?:"[^"]*"|'[^']*'|\([^)]*\)))?\s*\)/g;
  let match: RegExpExecArray | null;
  while ((match = markdownImage.exec(source.rawContent)) !== null) {
    const target = (match[3] ?? match[2] ?? "").replace(/^<|>$/g, "").trim();
    // An empty alt text is reported as "no alt text" rather than "".
    add(resolveMarkdownImageRef(target, baseUrl, source.sourceKind), match[1]?.trim() || undefined);
  }

  return refs;
}

/**
 * Resolves a single markdown image target to an absolute URL or filesystem path.
 *
 * Resolution order:
 * 1. Empty targets and `data:` URIs are rejected (`null`).
 * 2. `file://` URLs are converted to a filesystem path; an invalid file URL
 *    yields `null`.
 * 3. For `"conversation"` and `"file"` sources, an absolute path is returned
 *    as-is, and a relative one is resolved against `baseUrl` when `baseUrl`
 *    is itself an absolute path.
 * 4. Everything else is delegated to `resolveUrl`.
 *
 * @param ref - The image target as written in the markdown.
 * @param baseUrl - Base URL or base path used for relative targets.
 * @param sourceKind - Kind of the owning source; enables local path handling.
 * @returns The resolved URL or path, or `null` when the target is unusable.
 */
function resolveMarkdownImageRef(ref: string, baseUrl: string, sourceKind: SourceDocument["sourceKind"]): string | null {
  const raw = ref.trim();
  // URI schemes are case-insensitive, so "DATA:" must be rejected as well.
  if (!raw || /^data:/i.test(raw)) return null;

  if (/^file:\/\//i.test(raw)) {
    try {
      // `URL.pathname` keeps percent-escapes ("/my%20image.png") and a leading
      // slash before Windows drive letters; fileURLToPath decodes the path and
      // applies the platform's rules.
      return fileURLToPath(raw);
    } catch {
      return null;
    }
  }

  const isLocalSource = sourceKind === "conversation" || sourceKind === "file";
  if (isLocalSource) {
    if (path.isAbsolute(raw)) return raw;
    // NOTE(review): this treats baseUrl as a directory — confirm callers do not pass a file path.
    if (path.isAbsolute(baseUrl)) return path.resolve(baseUrl, raw);
  }

  return resolveUrl(raw, baseUrl);
}

/**
 * Collects image references from an HTML source document.
 *
 * `<img>` tags are read from the primary content region when
 * `extractPrimaryHtml` finds one, otherwise from the whole document. For each
 * tag the `src` attribute (falling back to `data-src`, as
 * `htmlToMarkdownishText` does) and the first `srcset` candidate are recorded.
 * Only when no `<img>` produced a reference are the `og:image` /
 * `twitter:image` meta tags of the full document used, with the document
 * title as alt text.
 *
 * Attribute values are HTML-entity-decoded before resolution, so a value such
 * as `a.png?x=1&amp;y=2` resolves to a URL with a real `&`, consistent with
 * the decoded content text.
 *
 * @param source - The HTML source document.
 * @returns De-duplicated image references in document order; an empty array
 *   when the source has neither `baseUrl` nor `canonicalUrl`.
 */
function extractHtmlImageRefs(source: SourceDocument): ImageRef[] {
  const baseUrl = source.baseUrl ?? source.canonicalUrl;
  if (!baseUrl) return [];

  const html = extractPrimaryHtml(source.rawContent) ?? source.rawContent;
  const refs: ImageRef[] = [];
  const seen = new Set<string>();

  // Decodes, resolves and records one raw attribute value; unusable or
  // already-seen values are ignored.
  const add = (rawUrl: string | null | undefined, altText?: string): void => {
    const candidate = rawUrl ? decodeHtmlEntities(rawUrl).trim() : "";
    if (!candidate) return;
    const url = resolveUrl(candidate, baseUrl);
    if (!url || seen.has(url)) return;
    seen.add(url);
    refs.push({ url, altText, sourceId: source.sourceId, sourceKind: source.sourceKind, baseUrl });
  };

  const imgTag = /<img\b[^>]*>/gi;
  let img: RegExpExecArray | null;
  while ((img = imgTag.exec(html)) !== null) {
    const tag = img[0];
    const rawAlt = attr(tag, "alt");
    const alt = rawAlt ? decodeHtmlEntities(rawAlt) : undefined;

    add(attr(tag, "src") ?? attr(tag, "data-src"), alt);

    // A srcset is a comma-separated list of "url descriptor" pairs; only the
    // URL of the first candidate is recorded.
    const firstSrcset = attr(tag, "srcset")?.split(",")[0]?.trim().split(/\s+/)[0];
    add(firstSrcset, alt);
  }

  if (refs.length > 0) return refs;

  // Fallback: social-preview images declared in the document head.
  const ogImage = /<meta\b[^>]*(?:property|name)=["'](?:og:image|twitter:image)["'][^>]*>/gi;
  let meta: RegExpExecArray | null;
  while ((meta = ogImage.exec(source.rawContent)) !== null) {
    add(attr(meta[0], "content"), source.title);
  }

  return refs;
}

/**
 * Flattens an HTML document into markdown-like plain text.
 *
 * Processing steps, in order:
 * 1. Narrow to the primary content region when `extractPrimaryHtml` finds one.
 * 2. Drop `<script>`, `<style>`, `<nav>` and `<video>` elements entirely.
 * 3. Rewrite `<img>` tags as markdown images (`src`, falling back to `data-src`).
 * 4. Turn `<br>` and block-level closing tags into newlines, headings
 *    `h1`–`h6` into `#`-prefixed lines, and list items into `- ` lines.
 * 5. Rewrite links as `label (href)`.
 * 6. Strip every remaining tag, then decode HTML entities. Decoding happens
 *    after stripping so that escaped markup such as `&lt;b&gt;` survives as text.
 * 7. Collapse runs of spaces/tabs, trim spaces around newlines, and limit
 *    consecutive blank lines to one.
 *
 * @param html - Raw HTML.
 * @returns The flattened, trimmed text.
 */
function htmlToMarkdownishText(html: string): string {
  let text = extractPrimaryHtml(html) ?? html;

  // Remove elements whose content is never useful as document text.
  text = text.replace(/<script\b[\s\S]*?<\/script>/gi, "\n");
  text = text.replace(/<style\b[\s\S]*?<\/style>/gi, "\n");
  text = text.replace(/<nav\b[\s\S]*?<\/nav>/gi, "\n");
  text = text.replace(/<video\b[\s\S]*?<\/video>/gi, "\n");

  text = text.replace(/<img\b[^>]*>/gi, (tag) => {
    const src = attr(tag, "src") ?? attr(tag, "data-src");
    const alt = attr(tag, "alt") ?? "";
    return src ? `\n\n![${stripTags(alt).trim()}](${src})\n\n` : "\n";
  });

  // <br> is a void element: it has no closing tag, so it has to be matched in
  // its opening / self-closing form to produce a line break.
  text = text.replace(/<br\b[^>]*>/gi, "\n");
  text = text.replace(/<\/(h[1-6]|p|li|section|article|div|br)>/gi, "\n");

  // Every heading level gets its marker, mirroring the h1–h6 closing-tag rule above.
  text = text.replace(/<h([1-6])\b[^>]*>/gi, (_m, level: string) => `\n${"#".repeat(Number(level))} `);
  text = text.replace(/<li\b[^>]*>/gi, "\n- ");

  text = text.replace(/<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, (_m, href, label) => {
    return `${stripTags(label).trim()} (${href})`;
  });

  text = text.replace(/<[^>]+>/g, " ");
  text = decodeHtmlEntities(text);

  text = text.replace(/[ \t]+/g, " ");
  // Stripped tags leave single spaces behind; without trimming them, lines
  // holding only a space would keep the blank-line collapse below from matching.
  text = text.replace(/ ?\n ?/g, "\n");
  text = text.replace(/\n{3,}/g, "\n\n");
  return text.trim();
}

/**
 * Returns the inner HTML of the document's primary content element.
 *
 * Candidates are tried in order: the first `<article>`, the first `<main>`,
 * then the first element carrying `role="main"`. A candidate whose content is
 * missing, unbalanced or whitespace-only is skipped.
 *
 * Closing tags are matched by nesting depth, so an element that contains
 * further elements of the same name (e.g. `<div role="main">` wrapping other
 * `<div>`s) is returned in full instead of being cut at the first inner
 * closing tag.
 *
 * @param html - Raw HTML document.
 * @returns The inner HTML (untrimmed for `<article>`/`<main>`, trimmed for
 *   `role="main"`), or `null` when no primary element is found.
 */
function extractPrimaryHtml(html: string): string | null {
  for (const tag of ["article", "main"]) {
    // The lookahead keeps custom elements such as <article-card> from matching.
    const opening = new RegExp(`<${tag}(?=[\\s/>])[^>]*>`, "i").exec(html);
    if (!opening) continue;
    const inner = innerHtmlOfElement(html, tag, opening.index + opening[0].length);
    if (inner?.trim()) return inner;
  }

  const roleMain = /<([a-z0-9-]+)\b[^>]*role=["']main["'][^>]*>/i.exec(html);
  const tagName = roleMain?.[1];
  if (!roleMain || !tagName) return null;
  const inner = innerHtmlOfElement(html, tagName, roleMain.index + roleMain[0].length);
  return inner?.trim() || null;
}

/**
 * Returns the HTML between `contentStart` (the position just after an opening
 * `<tagName ...>`) and that element's matching closing tag, counting nested
 * elements of the same name. Returns `null` when the element is never closed.
 */
function innerHtmlOfElement(html: string, tagName: string, contentStart: number): string | null {
  const sameTag = new RegExp(`<(/?)${tagName}(?=[\\s/>])[^>]*>`, "gi");
  sameTag.lastIndex = contentStart;

  let depth = 1;
  let match: RegExpExecArray | null;
  while ((match = sameTag.exec(html)) !== null) {
    if (match[1] === "/") {
      depth -= 1;
      if (depth === 0) return html.slice(contentStart, match.index);
    } else if (!match[0].endsWith("/>")) {
      // Self-closing occurrences do not open a new nesting level.
      depth += 1;
    }
  }
  return null;
}

/**
 * Reads the value of one attribute from the text of a single HTML tag.
 *
 * The tag is tokenized attribute by attribute, so:
 * - the name must match a whole attribute name (`src` does not match `data-src`);
 * - text inside another attribute's quoted value is never mistaken for an attribute;
 * - a value may contain the quote character it is not delimited by (`alt="it's"`);
 * - unquoted values (`src=a.png`) are supported.
 *
 * Names are compared case-insensitively.
 *
 * @param tag - The full tag text, e.g. `<img src="a.png" alt="A">`.
 * @param name - Attribute name to look up.
 * @returns The first non-empty value of the attribute (entities are not
 *   decoded), or `null` when the attribute is absent or empty.
 */
function attr(tag: string, name: string): string | null {
  const wanted = name.toLowerCase();
  // Drop "<tagname" so the element name is not parsed as an attribute.
  const attributes = tag.replace(/^<\/?[^\s>\/]+/, "");

  // Groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
  const attribute = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  let match: RegExpExecArray | null;
  while ((match = attribute.exec(attributes)) !== null) {
    if (match[1]?.toLowerCase() !== wanted) continue;
    const value = match[2] ?? match[3] ?? match[4];
    if (value) return value;
  }
  return null;
}

/**
 * Replaces every tag-like `<...>` sequence in the input with a single space.
 *
 * @param input - Text that may contain markup.
 * @returns The text with each tag substituted by one space; whitespace is not
 *   collapsed or trimmed.
 */
function stripTags(input: string): string {
  const anyTag = /<[^>]+>/g;
  const withoutTags = input.replace(anyTag, " ");
  return withoutTags;
}

/**
 * Decodes the HTML character references this module understands: decimal
 * (`&#39;`) and hexadecimal (`&#x27;`) numeric references plus the named
 * entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and `&nbsp;`
 * (`&nbsp;` becomes a regular space).
 *
 * Decoding is done in a single pass, so the output of one replacement is never
 * decoded again: `&amp;lt;` yields the text `&lt;`, not `<`. Numeric
 * references outside the Unicode range are left untouched instead of raising
 * a `RangeError`. Named entities are matched case-sensitively; anything
 * unrecognized is left as written.
 *
 * @param input - Text containing character references.
 * @returns The decoded text.
 */
function decodeHtmlEntities(input: string): string {
  const named = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);
  const maxCodePoint = 0x10ffff;

  return input.replace(
    /&(?:#x([0-9a-f]+)|#([0-9]+)|([a-z]+));/gi,
    (entity: string, hex?: string, dec?: string, name?: string): string => {
      if (name !== undefined) return named.get(name) ?? entity;

      const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
      // String.fromCodePoint throws for values beyond U+10FFFF; keep such
      // references verbatim so malformed input cannot abort normalization.
      if (!Number.isInteger(codePoint) || codePoint > maxCodePoint) return entity;
      return String.fromCodePoint(codePoint);
    },
  );
}

user-docs

user-docs