# Project Files
# src / search / providers.py
"""
search/providers.py — Web search provider implementations.

Supported providers
───────────────────
  duckduckgo  No API key required. Good for general queries.
  brave       Requires BRAVE_API_KEY. Excellent quality, privacy-focused.
  tavily      Requires TAVILY_API_KEY. Best for AI/RAG use-cases; returns
              cleaned, LLM-friendly snippets.
  serper      Requires SERPER_API_KEY. Google results via proxy API.

Each provider returns a uniform list of:
    {"title": str, "url": str, "snippet": str}
"""
from __future__ import annotations
import httpx
from src.utils.logging import get_logger
# Module-level logger; WebSearchEngine reports the selected provider and any
# search errors through it.
log = get_logger("search.providers")
class WebSearchEngine:
    """
    Unified web search interface. Instantiate once; call search() per query.

    Every provider is normalised to the same result shape:
        {"title": str, "url": str, "snippet": str}
    and every provider returns at most ``max_results`` entries.
    """

    # Timeout (seconds) applied to every outbound HTTP request.
    _HTTP_TIMEOUT = 20
    # Largest result count the hosted APIs document for a single request;
    # asking for more is rejected, so requests are clamped to these values.
    _BRAVE_MAX_COUNT = 20
    _TAVILY_MAX_RESULTS = 20

    def __init__(
        self,
        provider: str = "duckduckgo",
        brave_api_key: str | None = None,
        tavily_api_key: str | None = None,
        serper_api_key: str | None = None,
    ) -> None:
        """
        Store the provider name and the optional per-provider API keys.

        The provider name is not validated here; an unknown name is reported
        (and yields an empty result list) when search() is called.
        """
        self.provider = provider
        self.brave_api_key = brave_api_key
        self.tavily_api_key = tavily_api_key
        self.serper_api_key = serper_api_key
        log.info("Web search provider: %s", provider)

    async def search(self, query: str, max_results: int = 5) -> list[dict]:
        """
        Run a web search and return uniform result dicts.

        Returns [] on provider error (never raises). A blank query or a
        non-positive ``max_results`` also returns [] without contacting the
        provider.
        """
        try:
            # Nothing useful can come back for these inputs; skip the
            # network round-trip instead of letting the backend reject them.
            if max_results <= 0 or not query or not query.strip():
                return []

            backends = {
                "duckduckgo": self._duckduckgo,
                "brave": self._brave,
                "tavily": self._tavily,
                "serper": self._serper,
            }
            backend = backends.get(self.provider)
            if backend is None:
                log.error("Unknown provider: %s", self.provider)
                return []
            return await backend(query, max_results)
        except Exception as exc:
            # Deliberate best-effort boundary: callers get [] rather than an
            # exception, whatever the provider or transport did.
            log.error("Web search error (%s): %s", self.provider, exc)
            return []

    @staticmethod
    def _normalize(
        items: list[dict] | None,
        url_key: str,
        snippet_key: str,
        limit: int,
    ) -> list[dict]:
        """
        Map provider-specific result dicts onto the uniform schema.

        ``url_key`` / ``snippet_key`` name the provider's fields for the link
        and the text excerpt; the title field is "title" for every provider.
        Missing or null fields become "" so each value is always a str, and
        at most ``limit`` entries are returned.
        """
        return [
            {
                "title": item.get("title") or "",
                "url": item.get(url_key) or "",
                "snippet": item.get(snippet_key) or "",
            }
            for item in (items or [])[:limit]
        ]

    # ── DuckDuckGo ────────────────────────────────────────────────────────────
    async def _duckduckgo(self, query: str, max_results: int) -> list[dict]:
        """
        Search through the optional ``duckduckgo_search`` package.

        Returns [] (after logging) when the package is not installed.
        """
        try:
            from duckduckgo_search import DDGS
        except ImportError:
            log.error("duckduckgo-search not installed.")
            return []
        import asyncio

        def _sync_search() -> list[dict]:
            with DDGS() as ddgs:
                return list(ddgs.text(query, max_results=max_results) or [])

        # DDGS is synchronous: run it on the default executor so the event
        # loop is not blocked. get_running_loop() is the supported way to
        # obtain the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        hits = await loop.run_in_executor(None, _sync_search)
        return self._normalize(hits, "href", "body", max_results)

    # ── Brave ─────────────────────────────────────────────────────────────────
    async def _brave(self, query: str, max_results: int) -> list[dict]:
        """
        Search via the Brave Search API. Requires ``brave_api_key``.

        Raises httpx errors on transport failure or a non-2xx response;
        search() turns those into [].
        """
        if not self.brave_api_key:
            log.error("Brave search requires --brave-api-key.")
            return []
        async with httpx.AsyncClient(timeout=self._HTTP_TIMEOUT) as client:
            resp = await client.get(
                "https://api.search.brave.com/res/v1/web/search",
                params={
                    "q": query,
                    "count": min(max_results, self._BRAVE_MAX_COUNT),
                },
                headers={
                    "Accept": "application/json",
                    "Accept-Encoding": "gzip",
                    "X-Subscription-Token": self.brave_api_key,
                },
            )
            resp.raise_for_status()
            data = resp.json()
        # "web" can be absent or null when there are no web results.
        hits = (data.get("web") or {}).get("results")
        return self._normalize(hits, "url", "description", max_results)

    # ── Tavily ────────────────────────────────────────────────────────────────
    async def _tavily(self, query: str, max_results: int) -> list[dict]:
        """
        Search via the Tavily API. Requires ``tavily_api_key``.

        Raises httpx errors on transport failure or a non-2xx response;
        search() turns those into [].
        """
        if not self.tavily_api_key:
            log.error("Tavily search requires --tavily-api-key.")
            return []
        async with httpx.AsyncClient(timeout=self._HTTP_TIMEOUT) as client:
            resp = await client.post(
                "https://api.tavily.com/search",
                json={
                    "api_key": self.tavily_api_key,
                    "query": query,
                    "max_results": min(max_results, self._TAVILY_MAX_RESULTS),
                    "search_depth": "advanced",  # Tavily-specific: better quality
                    "include_answer": False,
                },
            )
            resp.raise_for_status()
            data = resp.json()
        return self._normalize(data.get("results"), "url", "content", max_results)

    # ── Serper (Google via proxy) ─────────────────────────────────────────────
    async def _serper(self, query: str, max_results: int) -> list[dict]:
        """
        Search Google through the Serper proxy API. Requires ``serper_api_key``.

        Raises httpx errors on transport failure or a non-2xx response;
        search() turns those into [].
        """
        if not self.serper_api_key:
            log.error("Serper search requires --serper-api-key.")
            return []
        async with httpx.AsyncClient(timeout=self._HTTP_TIMEOUT) as client:
            resp = await client.post(
                "https://google.serper.dev/search",
                json={"q": query, "num": max_results},
                headers={
                    "X-API-KEY": self.serper_api_key,
                    "Content-Type": "application/json",
                },
            )
            resp.raise_for_status()
            data = resp.json()
        return self._normalize(data.get("organic"), "link", "snippet", max_results)