// Forked from altra/research
// src/scorer.ts
/** One named, weighted component of a source's overall score. */
export interface ScoringSignal {
// Identifier of the signal, e.g. "recency" or "content_depth".
name: string;
// Value produced by the signal's scoring function; the scorers in this file keep it within 0–100.
score: number;
// Multiplier applied to `score` when the signals are summed into the overall score.
weight: number;
}
/** Result returned by `scoreSource`: the overall score plus the signals it was built from. */
export interface SourceScore {
// Rounded weighted sum of every signal's `score * weight`.
score: number;
// Band derived from `score` by `getVerdict`: >= 75 "high", >= 50 "moderate", >= 25 "low", otherwise "very_low".
verdict: "high" | "moderate" | "low" | "very_low";
// The individual signals that were combined into `score`.
signals: ScoringSignal[];
// Disclaimer text shipped with every result.
caveat: string;
}
// Lowercase phrases treated as sensationalist language. scoreLinguisticObjectivity checks each one
// as a plain substring of the lowercased body text and deducts points per distinct phrase found.
// Because matching is by substring, short entries also hit longer words
// (e.g. "secret" inside "secretary", "trick" inside "tricky").
const SENSATIONALISM = [
"shocking","you won't believe","incredible","unbelievable","mind-blowing",
"bombshell","explosive","stunning","outrageous","jaw-dropping",
"breaking:","urgent:","alert:","exposed","revealed","secret",
"they don't want you to know","doctors hate","trick",
];
// Lowercase phrases that signal attribution or evidence. scoreLinguisticObjectivity checks each one
// as a substring of the lowercased body text and adds a capped bonus per distinct phrase found.
const EVIDENCE_MARKERS = [
"according to","study found","reported by","research shows",
"data shows","statistics show","survey found","analysis found",
"cited in","published in","peer-reviewed","researchers found",
];
// Domain fragments that earn the larger authority bonus in scoreAuthorityAccountability.
// ".ac." carries a trailing dot, so it can only match when another label follows it (as in "ox.ac.uk").
const AUTHORITY_TLDS = [".edu",".gov",".ac.",".mil"];
// Domain suffixes that earn the smaller bonus when no authority fragment matched.
const MODERATE_TLDS = [".org",".int"];
// Domain fragments for which scoreRecency applies age-based scoring; a domain containing none of
// these gets the neutral recency score. Entries are matched as substrings of the domain.
const TIME_SENSITIVE_DOMAINS = [
"techcrunch","wired","arstechnica","theverge","zdnet","engadget",
"webmd","mayoclinic","nih.gov","cdc.gov","wsj","bloomberg",
"reuters","ft.com","economist","forbes",
];
// URL path fragments regarded as clickbait. scoreStructuralClarity withholds its 15-point
// "clean URL" bonus when the URL contains any of these (case-sensitive substring match).
const CLICKBAIT_URL_PATTERNS = [
"/top-10/","/top-5/","/shocking/","/you-wont-believe/","/viral/",
];
/**
 * Extracts the hostname from a URL, with any leading "www." removed.
 *
 * Input without a scheme (e.g. "example.com/page") cannot be parsed by `URL`
 * on its own, so it is retried with an "https://" prefix. Without that retry
 * the whole string, path included, would be returned, and two pages on the
 * same site would look like two different domains.
 *
 * @param url Absolute URL, or a scheme-less "host/path" string.
 * @returns The hostname without "www.", or the input unchanged when it cannot
 *          be parsed either way.
 */
function extractDomain(url: string): string {
  const stripWww = (host: string): string => host.replace(/^www\./, "");

  try {
    return stripWww(new URL(url).hostname);
  } catch {
    // Not an absolute URL; fall through and retry with an assumed scheme.
  }

  try {
    return stripWww(new URL(`https://${url.trim()}`).hostname);
  } catch {
    // Still unparseable (or not a string at all): hand the input back untouched.
    return url;
  }
}
/**
 * Scores corroboration by counting how many distinct domains, other than the
 * source's own, appear among the other source URLs.
 *
 * @param url URL of the source being scored.
 * @param otherSourceUrls URLs of the other sources in the same result set.
 * @returns 0 when no other domain is present, 50 for exactly one, 100 for two or more.
 */
function scoreCrossSourceCorroboration(url: string, otherSourceUrls: string[]): number {
  const ownDomain = extractDomain(url);

  // Collect each foreign domain once; repeats of the same site add nothing.
  const foreignDomains = new Set<string>();
  for (const candidate of otherSourceUrls) {
    const candidateDomain = extractDomain(candidate);
    if (candidateDomain !== ownDomain) {
      foreignDomains.add(candidateDomain);
    }
  }

  switch (foreignDomains.size) {
    case 0:
      return 0;
    case 1:
      return 50;
    default:
      return 100;
  }
}
/**
 * Scores how measured the language of a text is.
 *
 * Starts from a baseline of 60, subtracts 15 per distinct sensationalist
 * phrase found (at most 60), and adds a bonus of 7 per distinct evidence
 * phrase plus 3 per numeric mention (first five only), the bonus capped at 40.
 * Phrases are matched as substrings of the lowercased text.
 *
 * @param bodyText Full text of the source.
 * @returns A score clamped to 0–100.
 */
function scoreLinguisticObjectivity(bodyText: string): number {
  const haystack = bodyText.toLowerCase();

  // Counts how many phrases of a list occur at least once in the text.
  const countPresent = (phrases: readonly string[]): number =>
    phrases.reduce((hits, phrase) => (haystack.includes(phrase) ? hits + 1 : hits), 0);

  const sensationalHits = countPresent(SENSATIONALISM);
  const evidenceHits = countPresent(EVIDENCE_MARKERS);
  const numericMentions =
    bodyText.match(/\b\d+(?:\.\d+)?(?:\s*%|\s*million|\s*billion)?\b/g)?.length ?? 0;

  const penalty = Math.min(sensationalHits * 15, 60);
  const bonus = Math.min(evidenceHits * 7 + Math.min(numericMentions, 5) * 3, 40);

  const raw = 60 - penalty + bonus;
  return Math.max(0, Math.min(100, raw));
}
/**
 * Scores how substantial a text is from its length and its citation markers.
 *
 * Length contributes 60 points at 800+ words, 40 at 400+, 20 at 200+, else 0.
 * Each distinct citation marker found (case-insensitive substring) adds 10,
 * up to 40.
 *
 * @param bodyText Full text of the source.
 * @returns A score between 0 and 100.
 */
function scoreContentDepth(bodyText: string): number {
  const words = bodyText.split(/\s+/).filter(Boolean).length;
  const lengthPoints = words >= 800 ? 60 : words >= 400 ? 40 : words >= 200 ? 20 : 0;

  const haystack = bodyText.toLowerCase();
  let markersFound = 0;
  for (const marker of ["according to", "cited in", "published in", "source:", "[1]", "[2]", "ibid", "footnote"]) {
    if (haystack.includes(marker)) {
      markersFound += 1;
    }
  }
  const citationPoints = Math.min(markersFound * 10, 40);

  return Math.min(lengthPoints + citationPoints, 100);
}
/**
 * Scores institutional authority and author accountability.
 *
 * Baseline is 30. An authority domain adds 35; failing that, a moderate
 * suffix adds 10. A byline-like phrase in the first 500 characters adds 20,
 * and two adjacent capitalised words in the first 200 characters add 10.
 *
 * Authority entries are matched against whole DNS labels, not substrings: the
 * label must be the top-level domain ("mit.edu") or sit directly before a
 * two-letter country code ("ox.ac.uk", "service.gov.uk"). A plain substring
 * test would also reward hosts such as "news.govtech.com", "shop.milk.com" or
 * "x.gov.attacker.com". An entry with a trailing dot (".ac.") is only accepted
 * in the country-code position.
 *
 * @param url URL of the source being scored.
 * @param bodyText Full text of the source.
 * @returns A score between 30 and 100.
 */
function scoreAuthorityAccountability(url: string, bodyText: string): number {
  let score = 30;

  const domain = extractDomain(url).toLowerCase();
  // extractDomain may hand back an unparsed string; keep only the host-looking
  // prefix and drop a trailing root dot so label positions are reliable.
  const host = (domain.split(/[/?#:]/)[0] ?? "").replace(/\.$/, "");
  const labels = host.split(".");
  const tld = labels[labels.length - 1] ?? "";
  const secondLevel = labels.length >= 2 ? (labels[labels.length - 2] ?? "") : "";
  const underCountryCode = /^[a-z]{2}$/.test(tld);

  const isAuthority = AUTHORITY_TLDS.some(entry => {
    const label = entry.replace(/\./g, "");
    const asSecondLevel = underCountryCode && secondLevel === label;
    return entry.endsWith(".") ? asSecondLevel : tld === label || asSecondLevel;
  });

  if (isAuthority) score += 35;
  else if (MODERATE_TLDS.some(t => host.endsWith(t))) score += 10;

  // Byline heuristic: a loose substring check, so "by " also matches ordinary prose.
  const opening = bodyText.toLowerCase().slice(0, 500);
  if (["by ","author:","written by","reporter:","published by"].some(p => opening.includes(p))) score += 20;

  // Two adjacent capitalised words near the top are taken as a possible author name.
  if (/[A-Z][a-z]+\s+[A-Z][a-z]+/.test(bodyText.slice(0, 200))) score += 10;

  return Math.min(score, 100);
}
/**
 * Scores structural signals of a page.
 *
 * Baseline is 30; structured-data markup adds 35, headings add 20, and a URL
 * free of every clickbait path fragment adds 15.
 *
 * @param url URL of the source being scored.
 * @param hasSchema Whether the page carries schema markup.
 * @param hasHeadings Whether the page has headings.
 * @returns A score between 30 and 100.
 */
function scoreStructuralClarity(url: string, hasSchema: boolean, hasHeadings: boolean): number {
  const looksLikeClickbait = CLICKBAIT_URL_PATTERNS.some(fragment => url.includes(fragment));

  const total =
    30 +
    (hasSchema ? 35 : 0) +
    (hasHeadings ? 20 : 0) +
    (looksLikeClickbait ? 0 : 15);

  return Math.min(total, 100);
}
/**
 * Scores how recent a source is, but only for time-sensitive domains
 * (tech / medical / finance / news). Evergreen domains (history, law,
 * academic) get a neutral 70 regardless of age, as do sources with a missing
 * or unparseable date.
 *
 * For time-sensitive domains: up to 1 year old scores 100, up to 2 years 80,
 * up to 3 years 60, older 20. A year is taken as 365 days. A date in the
 * future yields a negative age and therefore scores 100.
 *
 * @param publishedDate Date string accepted by the `Date` constructor, or "".
 * @param domain Domain of the source; checked for time-sensitive fragments by substring.
 * @returns 70 (neutral), or one of 100 / 80 / 60 / 20.
 */
function scoreRecency(publishedDate: string, domain: string): number {
  const NEUTRAL = 70;
  const MS_PER_YEAR = 1000 * 60 * 60 * 24 * 365;

  if (!publishedDate) return NEUTRAL;

  const isTimeSensitive = TIME_SENSITIVE_DOMAINS.some(fragment => domain.includes(fragment));
  if (!isTimeSensitive) return NEUTRAL;

  const publishedAt = new Date(publishedDate).getTime();
  if (Number.isNaN(publishedAt)) return NEUTRAL;

  const ageYears = (Date.now() - publishedAt) / MS_PER_YEAR;

  // Age ceilings in ascending order; the first ceiling that fits decides the score.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [1, 100],
    [2, 80],
    [3, 60],
  ];
  for (const [maxAgeYears, tierScore] of tiers) {
    if (ageYears <= maxAgeYears) return tierScore;
  }
  return 20;
}
/**
 * Maps a numeric score onto a verdict band.
 *
 * @param score Overall score, nominally 0–100.
 * @returns "high" at 75 or above, "moderate" at 50 or above, "low" at 25 or
 *          above, otherwise "very_low" (which also covers NaN).
 */
function getVerdict(score: number): "high" | "moderate" | "low" | "very_low" {
  // Band floors in descending order; the first floor the score reaches wins.
  const bands = [
    [75, "high"],
    [50, "moderate"],
    [25, "low"],
  ] as const;

  for (const [floor, label] of bands) {
    if (score >= floor) return label;
  }
  return "very_low";
}
/**
 * Computes a reliability score for a source from six weighted signals.
 *
 * Weights: cross-source corroboration 0.25, linguistic objectivity 0.20,
 * authority/accountability 0.20, content depth 0.15, structural clarity 0.10,
 * recency 0.10. The overall score is the rounded weighted sum.
 *
 * @param url URL of the source being scored.
 * @param bodyText Full text of the source.
 * @param publishedDate Publication date string, or "" when unknown.
 * @param hasSchema Whether the page carries schema markup.
 * @param hasHeadings Whether the page has headings.
 * @param otherSourceUrls URLs of other sources, used for corroboration; defaults to none.
 * @returns The overall score, its verdict band, the individual signals and a fixed caveat.
 */
export function scoreSource(
  url: string,
  bodyText: string,
  publishedDate: string,
  hasSchema: boolean,
  hasHeadings: boolean,
  otherSourceUrls: string[] = [],
): SourceScore {
  const domain = extractDomain(url);

  const signal = (name: string, score: number, weight: number): ScoringSignal => ({ name, score, weight });

  const signals: ScoringSignal[] = [
    signal("cross_source_corroboration", scoreCrossSourceCorroboration(url, otherSourceUrls), 0.25),
    signal("linguistic_objectivity", scoreLinguisticObjectivity(bodyText), 0.20),
    signal("authority_accountability", scoreAuthorityAccountability(url, bodyText), 0.20),
    signal("content_depth", scoreContentDepth(bodyText), 0.15),
    signal("structural_clarity", scoreStructuralClarity(url, hasSchema, hasHeadings), 0.10),
    signal("recency", scoreRecency(publishedDate, domain), 0.10),
  ];

  let weightedTotal = 0;
  for (const entry of signals) {
    weightedTotal += entry.score * entry.weight;
  }
  const overall = Math.round(weightedTotal);

  return {
    score: overall,
    verdict: getVerdict(overall),
    signals,
    caveat: "Score measures reliability proxies, not factual correctness.",
  };
}