// Result ranking, filtering, and noise reduction. Pure functions, no I/O.
// Shared between the LM Studio plugin and the standalone MCP server.

import type { SearchResult } from "./engines";

// ---------- URL normalization ----------

// Matches query-parameter NAMES (never values) that normalizeUrl treats as
// tracking noise and removes. Case-insensitive and anchored at the start:
//   - prefix matches: "utm_…", "mc_…"
//   - exact matches:  "fbclid", "gclid", "ref", "ref_", "source", "spm"
// NOTE(review): "ref_$" only matches the literal name "ref_"; if names such as
// "ref_src" were meant to be stripped, the "$" is probably unintended — confirm.
const TRACKING_PARAM_RE = /^(utm_|fbclid$|gclid$|mc_|ref$|ref_$|source$|spm$)/i;

// Some engines return wrapper / redirect URLs that aren't real destinations.
// Drop these outright.
//
// Each entry is "<host>/<path prefix>" in lowercase. An entry matches a URL
// whose host is that host (or a subdomain of it) and whose path starts with
// the given prefix.
const REDIRECT_HOST_PREFIXES = [
  "duckduckgo.com/l/",
  "duckduckgo.com/y.js",
  "www.google.com/url",
  "www.bing.com/ck/",
  "r.jina.ai/",
];

// Returns true when `url` points at one of the known redirect/wrapper
// endpoints listed in REDIRECT_HOST_PREFIXES.
//
// The comparison is made against the parsed host and path, not the raw
// string, so a normal page is not rejected merely because a wrapper address
// shows up inside its query string or path, or because its host happens to
// end with a wrapper host (e.g. "notduckduckgo.com").
//
// Scheme-less ("duckduckgo.com/l/…") and protocol-relative ("//host/…")
// inputs are accepted. Anything that still cannot be parsed is reported as
// not-a-redirect.
export function isRedirectUrl(url: string): boolean {
  const trimmed = url.trim();

  // Give `new URL` something absolute to parse.
  let candidate = trimmed;
  if (trimmed.startsWith("//")) {
    candidate = `https:${trimmed}`;
  } else if (!/^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)) {
    candidate = `https://${trimmed}`;
  }

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    return false;
  }

  const host = parsed.hostname.toLowerCase();
  const path = parsed.pathname.toLowerCase();

  return REDIRECT_HOST_PREFIXES.some((entry) => {
    const slash = entry.indexOf("/");
    const entryHost = slash === -1 ? entry : entry.slice(0, slash);
    const entryPath = slash === -1 ? "/" : entry.slice(slash);
    const hostMatches = host === entryHost || host.endsWith(`.${entryHost}`);
    return hostMatches && path.startsWith(entryPath);
  });
}

// Canonicalize a URL so that equivalent links compare equal:
//   - fragment removed
//   - host lowercased, leading "www." removed
//   - query parameters whose name matches TRACKING_PARAM_RE removed
//     (remaining parameters keep their order and are re-serialized)
//   - one trailing "/" removed from the path, except for the root path
// Input that cannot be parsed as an absolute URL is returned unchanged.
export function normalizeUrl(raw: string): string {
  try {
    const parsed = new URL(raw);

    parsed.hash = "";
    parsed.hostname = parsed.hostname.toLowerCase().replace(/^www\./, "");

    // Snapshot the non-tracking parameters, wipe the query, then re-add them.
    const retained = Array.from(parsed.searchParams).filter(
      ([name]) => !TRACKING_PARAM_RE.test(name),
    );
    parsed.search = "";
    retained.forEach(([name, value]) => {
      parsed.searchParams.append(name, value);
    });

    // "/docs/" -> "/docs"; a bare "/" stays as it is.
    const path = parsed.pathname;
    if (path !== "/" && path.endsWith("/")) {
      parsed.pathname = path.slice(0, -1);
    }

    return parsed.toString();
  } catch {
    return raw;
  }
}

// Extract the host of `url`, lowercased and without a leading "www.".
// Returns "" when `url` cannot be parsed as an absolute URL.
export function hostOf(url: string): string {
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return "";
  }
  return hostname.toLowerCase().replace(/^www\./, "");
}

// ---------- Host trust ----------

// Coarse trust score per host suffix. Higher = more reliable / authoritative.
// Used as one of several signals in scoring; not a hard filter.
//
// ORDER MATTERS: hostTrust returns the score of the FIRST matching pattern, so
// a more specific host must be listed before any broader pattern that would
// also match it (e.g. developer.mozilla.org before mozilla.org, and named
// .gov hosts before the generic ".gov" rule).
const HOST_TRUST: Array<[RegExp, number]> = [
  [/(^|\.)wikipedia\.org$/, 0.95],
  [/(^|\.)wikimedia\.org$/, 0.9],
  [/(^|\.)arxiv\.org$/, 0.9],
  [/(^|\.)nature\.com$/, 0.9],
  [/(^|\.)science\.org$/, 0.9],
  [/(^|\.)ncbi\.nlm\.nih\.gov$/, 0.9],
  [/(^|\.)nih\.gov$/, 0.9],
  [/\.gov$/, 0.85],
  [/\.edu$/, 0.8],
  [/(^|\.)github\.com$/, 0.8],
  [/(^|\.)stackoverflow\.com$/, 0.8],
  [/(^|\.)stackexchange\.com$/, 0.75],
  [/(^|\.)developer\.mozilla\.org$/, 0.85],
  [/(^|\.)mozilla\.org$/, 0.8],
  [/(^|\.)readthedocs\.io$/, 0.75],
  [/(^|\.)microsoft\.com$/, 0.7],
  [/(^|\.)apple\.com$/, 0.7],
  [/(^|\.)bbc\.(co\.uk|com)$/, 0.75],
  [/(^|\.)reuters\.com$/, 0.75],
  [/(^|\.)apnews\.com$/, 0.75],
  [/(^|\.)nytimes\.com$/, 0.7],
  [/(^|\.)theguardian\.com$/, 0.7],
  [/(^|\.)reddit\.com$/, 0.55],
  [/(^|\.)medium\.com$/, 0.5],
  [/(^|\.)quora\.com$/, 0.45],
  // Any "pinterest.<something>" host (the ".+" alternative already covers "com").
  [/(^|\.)pinterest\.(com|.+)$/, 0.2],
  [/(^|\.)answers\.com$/, 0.2],
];

// Look up the trust score for `host` (a bare hostname such as
// "en.wikipedia.org"). The host is lowercased before matching because the
// patterns are case-sensitive. Returns the score of the first matching
// HOST_TRUST entry, or 0.5 when nothing matches.
export function hostTrust(host: string): number {
  const normalized = host.toLowerCase();
  for (const [re, score] of HOST_TRUST) {
    if (re.test(normalized)) return score;
  }
  return 0.5; // unknown host
}

// ---------- Query/text matching ----------

// English function words that carry no search intent; tokenize drops them.
const STOPWORDS = new Set([
  "a", "an", "the", "of", "in", "on", "at", "for", "to", "and", "or", "but",
  "is", "are", "was", "were", "be", "by", "with", "from", "as", "it", "its",
  "this", "that", "these", "those", "i", "you", "we", "they", "he", "she",
  "me", "him", "her", "us", "them", "my", "your", "our", "their",
  "how", "what", "when", "where", "why", "who", "which", "do", "does", "did",
  "can", "could", "should", "would", "will", "shall", "may", "might", "must",
  "about", "into", "over", "under", "than", "then", "so", "if", "not", "no",
]);

// Split `s` into lowercase content tokens.
//
// Steps: Unicode-normalize (NFKC) so composed and decomposed spellings of the
// same text produce the same tokens, lowercase, turn every character that is
// not a letter, combining mark, digit or whitespace into a space, split on
// whitespace, then drop tokens shorter than 2 characters and stopwords.
//
// Combining marks (\p{M}) are kept on purpose: treating them as separators
// splits words in scripts that rely on them (e.g. Devanagari vowel signs)
// into one-character fragments that the length filter would then discard.
export function tokenize(s: string): string[] {
  return s
    .normalize("NFKC")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, " ")
    .split(/\s+/)
    .filter((t) => t.length >= 2 && !STOPWORDS.has(t));
}

// Tokenize `query` and drop repeated tokens, keeping first-occurrence order.
export function queryTokens(query: string): string[] {
  const unique = new Set<string>(tokenize(query));
  return [...unique];
}

// Fraction (0..1) of the entries in `qTokens` that occur among the tokens of
// `text`. An empty `qTokens` scores 0.
export function overlapScore(qTokens: string[], text: string): number {
  const total = qTokens.length;
  if (total === 0) return 0;

  const present = new Set(tokenize(text));
  const matched = qTokens.filter((token) => present.has(token)).length;
  return matched / total;
}

// ---------- Result scoring ----------

// A search result after ranking: the SearchResult fields plus ranking
// metadata filled in by rankAndFilter.
export interface ScoredResult extends SearchResult {
  // Final relevance score: a weighted blend of the signals listed in
  // score_breakdown, rounded to 3 decimals. Higher ranks first.
  score: number;
  // Host of the result's normalized URL as returned by hostOf (lowercase,
  // leading "www." removed; "" if the URL could not be parsed).
  host: string;
  // Unweighted per-signal values behind `score`. Only present when
  // RankOptions.includeScoreBreakdown is truthy.
  score_breakdown?: {
    // Trust prior for `host`, from hostTrust.
    host_trust: number;
    // Fraction of query tokens found in the title (3-decimal rounding).
    title_overlap: number;
    // Fraction of query tokens found in the snippet (3-decimal rounding).
    snippet_overlap: number;
    // min(0.3, snippet length / 600) (3-decimal rounding).
    snippet_length_bonus: number;
  };
}

// Inputs that control rankAndFilter.
export interface RankOptions {
  // The user's search query; tokenized to measure title/snippet overlap.
  query: string;
  // How many of the highest-scoring results to return.
  topK: number;
  // Length limit handed to trimSnippet for each returned snippet.
  snippetMaxChars: number;
  // When truthy, every returned result includes a score_breakdown.
  includeScoreBreakdown?: boolean;
}

// Collapse all whitespace runs in `s` to single spaces, trim the ends, and cap
// the result at `max` characters (UTF-16 code units). When the text has to be
// shortened, the last kept position is taken by "…", so the returned string
// is never longer than `max`.
//
// Returns "" for an empty snippet and for any `max` below 1 (including NaN):
// without that guard `slice(0, max - 1)` would count from the END of the
// string and return almost the whole snippet.
function trimSnippet(s: string, max: number): string {
  const limit = Math.floor(max);
  if (!s || !(limit >= 1)) return "";

  const cleaned = s.replace(/\s+/g, " ").trim();
  if (cleaned.length <= limit) return cleaned;

  // Leave room for the ellipsis. If the cut would land between the two halves
  // of a surrogate pair, back up one unit so no lone surrogate is emitted.
  let cut = limit - 1;
  if (cut > 0) {
    const lastKept = cleaned.charCodeAt(cut - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  }
  return cleaned.slice(0, cut) + "…";
}

// A result is worth ranking only if it has a non-empty URL and title, the URL
// is an http(s) link, the URL is not a known redirect wrapper, and the title
// has at least 2 characters once trimmed.
function isUsable(r: SearchResult): boolean {
  return (
    Boolean(r.url) &&
    Boolean(r.title) &&
    /^https?:\/\//i.test(r.url) &&
    !isRedirectUrl(r.url) &&
    r.title.trim().length >= 2
  );
}

// Turn raw engine results into a ranked, de-duplicated, trimmed list.
//
// Pipeline: drop unusable entries (isUsable) -> normalize the URL -> score ->
// keep the best-scoring entry per normalized URL (ties keep the first seen) ->
// sort by score, highest first -> keep the first `opts.topK`.
//
// Score = 0.45 * title overlap + 0.30 * host trust + 0.20 * snippet overlap
//       + 0.05 * snippet-length bonus, rounded to 3 decimals.
//
// Returned entries carry the NORMALIZED url, the trimmed title and a snippet
// cleaned and capped by trimSnippet. `results` itself is not modified.
// A negative `opts.topK` yields an empty list.
export function rankAndFilter(
  results: SearchResult[],
  opts: RankOptions,
): ScoredResult[] {
  const qTokens = queryTokens(opts.query);
  const seen = new Map<string, ScoredResult>();

  for (const r of results) {
    if (!isUsable(r)) continue;
    const normUrl = normalizeUrl(r.url);
    const host = hostOf(normUrl);
    const trust = hostTrust(host);
    const titleOverlap = overlapScore(qTokens, r.title);
    const snippetOverlap = overlapScore(qTokens, r.snippet || "");
    const snipLen = (r.snippet || "").length;
    // Small bonus for having snippet text at all: grows linearly with the raw
    // snippet length and saturates at 0.3 (reached at 180 characters).
    const snipBonus = Math.min(0.3, snipLen / 600);

    // Weighted blend. Title overlap dominates; trust is a meaningful prior;
    // snippet evidence is secondary; length is a tiebreaker.
    const score =
      0.45 * titleOverlap +
      0.30 * trust +
      0.20 * snippetOverlap +
      0.05 * snipBonus;

    const scored: ScoredResult = {
      title: r.title.trim(),
      snippet: trimSnippet(r.snippet || "", opts.snippetMaxChars),
      url: normUrl,
      source: r.source,
      host,
      score: Math.round(score * 1000) / 1000,
      ...(opts.includeScoreBreakdown
        ? {
            score_breakdown: {
              host_trust: trust,
              title_overlap: Math.round(titleOverlap * 1000) / 1000,
              snippet_overlap: Math.round(snippetOverlap * 1000) / 1000,
              snippet_length_bonus: Math.round(snipBonus * 1000) / 1000,
            },
          }
        : {}),
    };

    // Dedupe: keep the higher-scoring instance per normalized URL.
    const existing = seen.get(normUrl);
    if (!existing || scored.score > existing.score) {
      seen.set(normUrl, scored);
    }
  }

  const sorted = Array.from(seen.values()).sort((a, b) => b.score - a.score);

  // slice() interprets a negative end as "count from the end of the array", so
  // topK = -1 would return everything except the lowest-ranked result. Treat
  // any negative topK as "no results" instead.
  const limit = opts.topK < 0 ? 0 : opts.topK;
  return sorted.slice(0, limit);
}
multi-search

multi-search