Project Files
src / fetcher.ts
import fetch from "node-fetch";
import { WebFetcherConfig } from "./config";
// Module-level rate-limit state: the Date.now() timestamp (ms) recorded by fetchWebpage
// just before it starts issuing a request. Stays 0 until the first call, and is shared
// by every caller of this module.
let lastRequestTime = 0;
/**
 * Pauses the calling async flow for (at least) the given number of milliseconds.
 *
 * @param ms - How long to wait, in milliseconds, before the returned promise settles.
 * @returns A promise that fulfils with no value once the timer has fired.
 */
async function delay(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Downloads the raw body of a web page, honouring a module-wide minimum interval
 * between requests, a per-attempt timeout, and a bounded number of retries.
 *
 * This function never rejects: every failure (malformed URL, network error,
 * timeout, non-2xx status) is reported through the returned object.
 *
 * @param url - Absolute URL to fetch. Anything `new URL()` rejects produces a
 *   failure result without any request being made.
 * @param config - Reads `requestDelaySeconds` (default 2), `maxRetries`
 *   (default 2) and `userAgent` (default: a desktop browser string). An explicit
 *   `0` for the delay or the retry count is respected rather than replaced by
 *   the default.
 * @returns On a 2xx response: `success: true` with `html` and `statusCode`.
 *   On a non-2xx response: `success: false` with `statusCode` and `error`.
 *   When no response could be obtained: `success: false` with `error` only.
 */
export async function fetchWebpage(
  url: string,
  config: WebFetcherConfig
): Promise<{
  success: boolean;
  html?: string;
  statusCode?: number;
  error?: string;
}> {
  // Upper bound for one attempt, covering both the headers and the body download.
  const attemptTimeoutMs = 10000;

  try {
    // Validate up front so a malformed URL fails immediately instead of being
    // retried; the constructor throws on invalid input.
    new URL(url);

    // Rate limiting: wait until the configured interval since the previous
    // request has passed. `??` (not `||`) so that 0 means "no delay".
    const minIntervalMs = (config.requestDelaySeconds ?? 2) * 1000;
    const elapsedMs = Date.now() - lastRequestTime;
    if (elapsedMs < minIntervalMs) {
      await delay(minIntervalMs - elapsedMs);
    }
    lastRequestTime = Date.now();

    // `??` so that 0 means "no retries"; clamp negatives so at least one
    // attempt is always made.
    const maxRetries = Math.max(0, config.maxRetries ?? 2);
    let lastError: Error | null = null;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      if (attempt > 0) {
        // Linear backoff before each retry: 1s, 2s, 3s, ...
        await delay(1000 * attempt);
      }

      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), attemptTimeoutMs);
      try {
        const response = await fetch(url, {
          headers: {
            "User-Agent": config.userAgent || "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
          },
          signal: controller.signal,
        });

        if (!response.ok) {
          if (response.status >= 500 && attempt < maxRetries) {
            // Server errors are treated as transient: try again.
            continue;
          }
          // Client errors, or a server error on the final attempt, are final.
          return {
            success: false,
            statusCode: response.status,
            error: `HTTP ${response.status}: ${response.statusText}`,
          };
        }

        // Read the body while the abort timer is still armed so a stalled
        // download cannot hang forever.
        const html = await response.text();
        return {
          success: true,
          html: html,
          statusCode: response.status,
        };
      } catch (error) {
        // Network failure or timeout: remember it and let the loop retry.
        lastError = error instanceof Error ? error : new Error(String(error));
      } finally {
        // Always disarm the timer, including on thrown errors and early returns.
        clearTimeout(timeoutId);
      }
    }

    return {
      success: false,
      error: lastError?.message || "Failed to fetch webpage after retries",
    };
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : String(error),
    };
  }
}
/**
 * Fetches a page with `fetchWebpage` and reduces it to a title plus plain,
 * whitespace-normalised text.
 *
 * `<script>`, `<style>` and `<noscript>` elements are removed before text
 * extraction. The text is capped at 8000 characters, with "..." appended when
 * it was cut.
 *
 * @param url - Absolute URL of the page to fetch.
 * @param config - Passed through unchanged to `fetchWebpage`.
 * @returns On success: `success: true` with `text`, `title` (empty string if
 *   the page has none) and `statusCode`. On failure, including a successful
 *   response whose body is empty: `success: false` with `statusCode` (when a
 *   response was received) and a non-empty `error`.
 */
export async function fetchAndClean(
  url: string,
  config: WebFetcherConfig
): Promise<{
  success: boolean;
  text?: string;
  title?: string;
  statusCode?: number;
  error?: string;
}> {
  const maxTextLength = 8000;
  // Collapses every whitespace run (newlines included) to a single space.
  const collapseWhitespace = (value: string): string => value.replace(/\s+/g, " ").trim();

  const result = await fetchWebpage(url, config);
  if (!result.success || !result.html) {
    return {
      success: false,
      statusCode: result.statusCode,
      // A 2xx response with an empty body carries no error of its own, so
      // supply one rather than returning a failure with no explanation.
      error: result.error ?? "Empty response body",
    };
  }

  const cheerio = await import("cheerio");
  const $ = cheerio.load(result.html);

  // Use only the first <title>: documents with inline SVG may contain several,
  // and .text() on the full selection would concatenate them all.
  const documentTitle = collapseWhitespace($("title").first().text());
  const openGraphTitle = collapseWhitespace($('meta[property="og:title"]').attr("content") ?? "");
  const title = documentTitle || openGraphTitle;

  // Drop elements whose contents are not human-readable page text.
  $("script, style, noscript").remove();

  // Prefer the body; fall back to the whole document when the body has no text.
  let text = collapseWhitespace($("body").text() || $.text());

  if (text.length > maxTextLength) {
    let cut = maxTextLength;
    // Do not cut between the two halves of a surrogate pair, which would leave
    // a lone high surrogate at the end of the text.
    const lastUnit = text.charCodeAt(cut - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      cut -= 1;
    }
    text = text.substring(0, cut) + "...";
  }

  return {
    success: true,
    text: text,
    title: title,
    statusCode: result.statusCode,
  };
}