// src/scorer.ts

/** One named heuristic result together with the weight it carries in the overall score. */
export interface ScoringSignal {
  /** Identifier of the heuristic, e.g. "recency". */
  name: string;
  /** Outcome of the heuristic; the scorers in this file produce values from 0 to 100. */
  score: number;
  /** Factor `score` is multiplied by when the signals are summed into the overall score. */
  weight: number;
}

/** Result returned by `scoreSource` for a single source. */
export interface SourceScore {
  /** Rounded weighted sum of the individual signal scores. */
  score: number;
  /** Bucket derived from `score`. */
  verdict: "high" | "moderate" | "low" | "very_low";
  /** The individual signals that were combined into `score`. */
  signals: ScoringSignal[];
  /** Human-readable disclaimer attached to every result. */
  caveat: string;
}

/**
 * Phrases treated as sensationalist wording.
 * They are looked up in lower-cased body text, so every entry must stay lower-case.
 */
const SENSATIONALISM = [
  // hype adjectives
  "shocking",
  "you won't believe",
  "incredible",
  "unbelievable",
  "mind-blowing",
  "bombshell",
  "explosive",
  "stunning",
  "outrageous",
  "jaw-dropping",
  // urgency prefixes and "hidden truth" framing
  "breaking:",
  "urgent:",
  "alert:",
  "exposed",
  "revealed",
  "secret",
  // classic clickbait hooks
  "they don't want you to know",
  "doctors hate",
  "trick",
];

/**
 * Phrases that signal a claim is attributed to a source or backed by data.
 * They are looked up in lower-cased body text, so every entry must stay lower-case.
 */
const EVIDENCE_MARKERS = [
  "according to",
  "study found",
  "reported by",
  "research shows",
  "data shows",
  "statistics show",
  "survey found",
  "analysis found",
  "cited in",
  "published in",
  "peer-reviewed",
  "researchers found",
];

/**
 * Domain markers that earn the larger authority bonus in `scoreAuthorityAccountability`.
 * ".ac." keeps its trailing dot — presumably it targets academic second-level
 * domains such as "ox.ac.uk" rather than a top-level domain.
 */
const AUTHORITY_TLDS = [
  ".edu",
  ".gov",
  ".ac.",
  ".mil",
];

/** Domain endings that earn the smaller authority bonus in `scoreAuthorityAccountability`. */
const MODERATE_TLDS = [
  ".org",
  ".int",
];

/**
 * Domain fragments for which `scoreRecency` takes the article's age into account.
 * A source qualifies when its domain contains any of these fragments.
 */
const TIME_SENSITIVE_DOMAINS = [
  // technology
  "techcrunch",
  "wired",
  "arstechnica",
  "theverge",
  "zdnet",
  "engadget",
  // health
  "webmd",
  "mayoclinic",
  "nih.gov",
  "cdc.gov",
  // finance and news
  "wsj",
  "bloomberg",
  "reuters",
  "ft.com",
  "economist",
  "forbes",
];

/** URL path segments whose presence costs a source its "clean URL" points in `scoreStructuralClarity`. */
const CLICKBAIT_URL_PATTERNS = [
  "/top-10/",
  "/top-5/",
  "/shocking/",
  "/you-wont-believe/",
  "/viral/",
];

/**
 * Returns the host name of `url` without a leading "www.".
 *
 * Absolute URLs are parsed as-is. Input without a "scheme://" prefix
 * ("example.com/path", "example.com:8080/x") is parsed a second time with an
 * assumed "https://" scheme, so the result is the bare host rather than the
 * raw string or an empty host name.
 *
 * @param url Absolute URL, or a scheme-less host optionally followed by port/path.
 * @returns The host name as normalised by the URL parser, or `url` unchanged
 *          when no host can be extracted from it.
 */
function extractDomain(url: string): string {
  // Callers are typed, but stay as forgiving as the original for stray runtime values.
  if (typeof url !== "string") return url;

  const hostnameOf = (candidate: string): string | undefined => {
    try {
      return new URL(candidate).hostname.replace(/^www\./, "");
    } catch {
      return undefined;
    }
  };

  const direct = hostnameOf(url);
  // With an explicit "scheme://" the first parse is authoritative, even if it
  // failed or produced an empty host (e.g. "file:///tmp/x").
  const hasSchemeAndAuthority = /^\s*[a-z][a-z0-9+.-]*:\/\//i.test(url);
  if (direct || hasSchemeAndAuthority) return direct ?? url;

  // "example.com/path" throws and "example.com:8080/x" parses as a bogus scheme
  // with an empty host; retrying with an assumed scheme recovers the host.
  const assumed = hostnameOf(`https://${url.trim()}`);
  return assumed || (direct ?? url);
}

/**
 * Scores how many other domains accompany this source.
 *
 * @param url URL of the source being scored.
 * @param otherSourceUrls URLs of the other sources under consideration.
 * @returns 0 when no other source lives on a different domain, 50 when exactly
 *          one distinct other domain is present, 100 when there are two or more.
 */
function scoreCrossSourceCorroboration(url: string, otherSourceUrls: string[]): number {
  if (otherSourceUrls.length === 0) return 0;

  const ownDomain = extractDomain(url);
  const independentDomains = new Set<string>();
  for (const otherUrl of otherSourceUrls) {
    const otherDomain = extractDomain(otherUrl);
    // Sources on the same domain as this one do not count as corroboration.
    if (otherDomain !== ownDomain) independentDomains.add(otherDomain);
  }

  switch (independentDomains.size) {
    case 0:
      return 0;
    case 1:
      return 50;
    default:
      return 100;
  }
}

/**
 * Scores the wording of the body text, starting from a baseline of 60.
 *
 * - Each distinct SENSATIONALISM phrase found costs 15 points (at most 60).
 * - Each distinct EVIDENCE_MARKERS phrase found adds 7 points and each numeric
 *   mention adds 3 (only the first 5 count); the combined bonus is capped at 40.
 *
 * Phrases are matched case-insensitively as substrings and count once no
 * matter how often they repeat.
 *
 * @returns A value clamped to the range 0–100.
 */
function scoreLinguisticObjectivity(bodyText: string): number {
  const haystack = bodyText.toLowerCase();
  const countPresent = (phrases: readonly string[]): number =>
    phrases.reduce((hits, phrase) => (haystack.includes(phrase) ? hits + 1 : hits), 0);

  const sensationalHits = countPresent(SENSATIONALISM);
  const evidenceHits = countPresent(EVIDENCE_MARKERS);
  // Numbers are searched in the original text; optional "%", "million" or "billion" suffixes belong to the same mention.
  const numericMentions =
    bodyText.match(/\b\d+(?:\.\d+)?(?:\s*%|\s*million|\s*billion)?\b/g)?.length ?? 0;

  const bonus = Math.min(evidenceHits * 7 + Math.min(numericMentions, 5) * 3, 40);
  const penalty = Math.min(sensationalHits * 15, 60);

  const unclamped = 60 - penalty + bonus;
  if (unclamped < 0) return 0;
  return unclamped > 100 ? 100 : unclamped;
}

/**
 * Scores how substantial the body text is from its length and citation cues.
 *
 * Length contributes 60 points at 800+ words, 40 at 400+, 20 at 200+, else 0.
 * Each distinct citation cue found (case-insensitive substring) adds 10 points,
 * up to 40.
 *
 * @returns A value from 0 to 100.
 */
function scoreContentDepth(bodyText: string): number {
  const words = bodyText.split(/\s+/).filter(Boolean).length;
  const lengthPoints = words >= 800 ? 60 : words >= 400 ? 40 : words >= 200 ? 20 : 0;

  const haystack = bodyText.toLowerCase();
  let cuesFound = 0;
  for (const cue of ["according to", "cited in", "published in", "source:", "[1]", "[2]", "ibid", "footnote"]) {
    if (haystack.includes(cue)) cuesFound += 1;
  }
  const citationPoints = Math.min(cuesFound * 10, 40);

  return Math.min(lengthPoints + citationPoints, 100);
}

/**
 * Reports whether `host` is registered under the suffix named by `marker`
 * (an entry such as ".edu" or ".ac.").
 *
 * Matching is by whole DNS label at the end of the host, never by substring,
 * so "cdc.gov.evil.com" or "my.education.com" do not qualify:
 *   - a marker without a trailing dot matches when its label is the TLD
 *     ("harvard.edu");
 *   - any marker matches when its label sits directly above a two-letter
 *     country code ("ox.ac.uk", "service.gov.uk").
 * At least one further label must precede the suffix, so the bare suffix
 * ("gov.uk") does not count.
 */
function hostHasSuffixMarker(host: string, marker: string): boolean {
  const label = marker.replace(/^\.+|\.+$/g, "");
  if (label === "") return false;

  const labels = host.split(".");
  const tld = labels[labels.length - 1] ?? "";
  const secondLevel = labels[labels.length - 2] ?? "";

  const isTld = !marker.endsWith(".") && labels.length >= 2 && tld === label;
  const isUnderCountryCode =
    labels.length >= 3 && secondLevel === label && /^[a-z]{2}$/.test(tld);
  return isTld || isUnderCountryCode;
}

/**
 * Scores how accountable the publisher and author appear, starting from 30.
 *
 * - +35 when the host is under an AUTHORITY_TLDS suffix, otherwise +10 when it
 *   ends with a MODERATE_TLDS suffix.
 * - +20 when a byline cue ("by ", "author:", …) appears in the first 500
 *   characters of the text (case-insensitive).
 * - +10 when the first 200 characters contain two consecutive capitalised
 *   words — presumably a proxy for a personal name.
 *
 * @param url URL of the source; only its host is inspected.
 * @param bodyText Text of the source.
 * @returns A value from 30 to 100.
 */
function scoreAuthorityAccountability(url: string, bodyText: string): number {
  let score = 30;
  // Reduce to a bare lower-case host: drop any port/path/query left over when
  // the URL could not be parsed, and a trailing root dot ("nih.gov.").
  const host = (extractDomain(url).toLowerCase().split(/[/?#:]/)[0] ?? "").replace(/\.$/, "");
  if (AUTHORITY_TLDS.some(t => hostHasSuffixMarker(host, t))) score += 35;
  else if (MODERATE_TLDS.some(t => host.endsWith(t))) score += 10;
  const opening = bodyText.toLowerCase().slice(0, 500);
  if (["by ","author:","written by","reporter:","published by"].some(p => opening.includes(p))) score += 20;
  if (/[A-Z][a-z]+\s+[A-Z][a-z]+/.test(bodyText.slice(0, 200))) score += 10;
  return Math.min(score, 100);
}

/**
 * Scores structural hygiene of the page, starting from 30.
 *
 * @param url URL of the source, checked against CLICKBAIT_URL_PATTERNS.
 * @param hasSchema Adds 35 points when true.
 * @param hasHeadings Adds 20 points when true.
 * @returns The total, which also includes 15 points when the URL contains none
 *          of the clickbait path patterns; never more than 100.
 */
function scoreStructuralClarity(url: string, hasSchema: boolean, hasHeadings: boolean): number {
  const looksLikeClickbait = CLICKBAIT_URL_PATTERNS.some(pattern => url.includes(pattern));

  const schemaPoints = hasSchema ? 35 : 0;
  const headingPoints = hasHeadings ? 20 : 0;
  const cleanUrlPoints = looksLikeClickbait ? 0 : 15;

  return Math.min(30 + schemaPoints + headingPoints + cleanUrlPoints, 100);
}

/**
 * Scores freshness, but only for domains where age matters.
 *
 * A neutral 70 is returned when no date is given, when the date cannot be
 * parsed, or when `domain` contains none of the TIME_SENSITIVE_DOMAINS
 * fragments (evergreen content is not judged by age). For time-sensitive
 * domains the score depends on age in 365-day years: up to 1 year → 100,
 * up to 2 → 80, up to 3 → 60, older → 20.
 *
 * NOTE(review): a date in the future produces a negative age and therefore
 * scores 100 — confirm that this is intended.
 */
function scoreRecency(publishedDate: string, domain: string): number {
  const NEUTRAL = 70;
  if (!publishedDate) return NEUTRAL;

  const isTimeSensitive = TIME_SENSITIVE_DOMAINS.some(fragment => domain.includes(fragment));
  if (!isTimeSensitive) return NEUTRAL;

  const publishedMs = new Date(publishedDate).getTime();
  if (Number.isNaN(publishedMs)) return NEUTRAL;

  const msPerYear = 1000 * 60 * 60 * 24 * 365;
  const ageYears = (Date.now() - publishedMs) / msPerYear;

  // [maximum age in years, score], checked from freshest to oldest.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [1, 100],
    [2, 80],
    [3, 60],
  ];
  for (const [maxAgeYears, points] of tiers) {
    if (ageYears <= maxAgeYears) return points;
  }
  return 20;
}

/**
 * Maps an overall score to its verdict bucket.
 *
 * @returns "high" from 75, "moderate" from 50, "low" from 25, and "very_low"
 *          for anything below (including NaN, which passes no threshold).
 */
function getVerdict(score: number): "high" | "moderate" | "low" | "very_low" {
  // Lower bounds, highest first; the first one the score reaches wins.
  const floors = [
    [75, "high"],
    [50, "moderate"],
    [25, "low"],
  ] as const;

  for (const [floor, verdict] of floors) {
    if (score >= floor) return verdict;
  }
  return "very_low";
}

/**
 * Computes the overall reliability score of a source from six weighted signals.
 *
 * The weights (0.25, 0.20, 0.20, 0.15, 0.10, 0.10) add up to 1, so the
 * rounded weighted sum stays on the same 0–100 scale as the signals.
 * When `otherSourceUrls` is empty the corroboration signal is 0, which caps
 * the overall score at 75.
 *
 * @param url URL of the source.
 * @param bodyText Text of the source.
 * @param publishedDate Publication date string; may be empty.
 * @param hasSchema Structural flag passed through to the structural-clarity signal.
 * @param hasHeadings Structural flag passed through to the structural-clarity signal.
 * @param otherSourceUrls URLs of the other sources used for corroboration.
 * @returns The overall score, its verdict, the individual signals and a fixed caveat.
 */
export function scoreSource(
  url: string,
  bodyText: string,
  publishedDate: string,
  hasSchema: boolean,
  hasHeadings: boolean,
  otherSourceUrls: string[] = [],
): SourceScore {
  const domain = extractDomain(url);
  const signal = (name: string, weight: number, score: number): ScoringSignal => ({ name, score, weight });

  const signals: ScoringSignal[] = [
    signal("cross_source_corroboration", 0.25, scoreCrossSourceCorroboration(url, otherSourceUrls)),
    signal("linguistic_objectivity", 0.20, scoreLinguisticObjectivity(bodyText)),
    signal("authority_accountability", 0.20, scoreAuthorityAccountability(url, bodyText)),
    signal("content_depth", 0.15, scoreContentDepth(bodyText)),
    signal("structural_clarity", 0.10, scoreStructuralClarity(url, hasSchema, hasHeadings)),
    signal("recency", 0.10, scoreRecency(publishedDate, domain)),
  ];

  let weightedTotal = 0;
  for (const entry of signals) {
    weightedTotal += entry.score * entry.weight;
  }
  const score = Math.round(weightedTotal);

  return {
    score,
    verdict: getVerdict(score),
    signals,
    caveat: "Score measures reliability proxies, not factual correctness.",
  };
}