Project Files
src / retrieval / fusion.py
"""
retrieval/fusion.py — Result fusion strategies for hybrid retrieval.
Implemented
───────────
RRF Reciprocal Rank Fusion — rank-based, no score normalization needed.
The gold standard for combining heterogeneous rankers.
Reference: Cormack et al. (2009).
CombSUM Min-max normalized score summation.
Good when both rankers produce comparable magnitude scores.
"""
from __future__ import annotations
from collections import defaultdict
from typing import Any
def reciprocal_rank_fusion(
ranked_lists: list[list[tuple[Any, float]]],
k: int = 60,
) -> list[tuple[Any, float]]:
"""
Merge N ranked lists into one using RRF.
Args:
ranked_lists: Each list is [(id, score), …] sorted by score DESC.
Scores are ignored; only rank position matters.
k: RRF smoothing constant (60 is standard).
Returns:
[(id, rrf_score), …] sorted by rrf_score DESC.
"""
rrf: dict[Any, float] = defaultdict(float)
for ranked in ranked_lists:
for rank, (doc_id, _) in enumerate(ranked, start=1):
rrf[doc_id] += 1.0 / (k + rank)
return sorted(rrf.items(), key=lambda x: x[1], reverse=True)
def comb_sum(
ranked_lists: list[list[tuple[Any, float]]],
weights: list[float] | None = None,
) -> list[tuple[Any, float]]:
"""
Min-max normalize each list's scores, then sum (optionally weighted).
Args:
ranked_lists: Each list is [(id, score), …].
weights: Optional per-list weights. Defaults to equal weighting.
Returns:
[(id, combined_score), …] sorted DESC.
"""
if weights is None:
weights = [1.0] * len(ranked_lists)
combined: dict[Any, float] = defaultdict(float)
for ranked, w in zip(ranked_lists, weights):
if not ranked:
continue
scores = [s for _, s in ranked]
lo, hi = min(scores), max(scores)
span = hi - lo if hi != lo else 1.0
for doc_id, score in ranked:
combined[doc_id] += w * (score - lo) / span
return sorted(combined.items(), key=lambda x: x[1], reverse=True)