Project Files
lib / fetcher.js
"use strict";
// Module-interop helper (appears to be the standard helper emitted by the TypeScript
// compiler - confirm before hand-editing). Exposes property `k` of module `m` on object
// `o` under the name `k2` (which defaults to `k`). A helper already present on `this`
// is reused instead of defining a new one.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Use a live getter (so later changes to m[k] stay visible through o[k2]) when `m` has
// no own descriptor for `k`, when the descriptor is an accessor and `m` is not flagged
// __esModule, or when it is a writable/configurable data property. Otherwise the
// original descriptor is re-defined on `o` unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Variant selected when Object.create is missing: a one-time copy of the current value.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores the module value `moduleValue` as the "default" member
// of `target`. A helper already present on `this` wins; otherwise the implementation is
// picked once, at definition time, depending on whether Object.create exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        // Defined as an enumerable data property (not writable, not configurable).
        return function (target, moduleValue) {
            Object.defineProperty(target, "default", { value: moduleValue, enumerable: true });
        };
    }
    // Plain assignment when Object.create is unavailable.
    return function (target, moduleValue) {
        target["default"] = moduleValue;
    };
})();
// Module-interop helper used for namespace-style imports (see its use with
// require("cheerio") further down). Returns `mod` itself when it is flagged __esModule;
// otherwise builds a wrapper object that re-exposes the module's own properties and
// carries the module value as `default`. A helper already present on `this` is reused.
var __importStar = (this && this.__importStar) || (function () {
// Lists the own property names of `o`. On its first call it replaces itself with
// Object.getOwnPropertyNames (or, if that is missing, a for-in + hasOwnProperty
// fallback) and then delegates to the replacement, so the feature test runs only once.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Already an ES-module-shaped object: hand it back untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key except "default" onto the wrapper via __createBinding;
// null/undefined modules contribute no keys.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// The module value itself becomes result.default.
__setModuleDefault(result, mod);
return result;
};
})();
// Module-interop helper for default imports: objects flagged __esModule are returned
// as-is, anything else (including null/undefined) is wrapped as { default: value }.
// A helper already present on `this` is reused.
var __importDefault = (this && this.__importDefault) || function (mod) {
    var alreadyEsModule = mod && mod.__esModule;
    if (alreadyEsModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchWebpage = fetchWebpage;
exports.fetchAndClean = fetchAndClean;
const node_fetch_1 = __importDefault(require("node-fetch"));
// Rate-limiter state shared by every fetchWebpage call in this module: the
// epoch-millisecond time associated with the most recent request. Starts at 0,
// meaning no request has been made yet.
let lastRequestTime = 0;
/**
 * Pauses the calling async function.
 *
 * @param {number} ms - Timer duration in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Settles (with undefined) once the timer fires.
 */
async function delay(ms) {
    await new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
/**
 * Downloads a page with module-wide rate limiting, a per-attempt timeout and retries.
 * Never throws: every failure is reported through the returned object.
 *
 * @param {string} url - Absolute URL to fetch; must be parseable by `new URL`.
 * @param {object} [config] - Optional settings.
 * @param {number} [config.requestDelaySeconds=2] - Minimum spacing between requests
 *   made through this module. `0` disables throttling.
 * @param {number} [config.maxRetries=2] - Extra attempts after the first one for
 *   network errors, timeouts and 5xx responses. `0` disables retries.
 * @param {string} [config.userAgent] - User-Agent header value; a browser-like
 *   default is used when empty or missing.
 * @returns {Promise<{success: boolean, html?: string, statusCode?: number, error?: string}>}
 *   `html` and `statusCode` on success; `error` (plus `statusCode` for HTTP errors)
 *   on failure.
 */
async function fetchWebpage(url, config) {
    const ATTEMPT_TIMEOUT_MS = 10000;
    // Accept any finite, non-negative number - including 0, which `||` used to discard.
    // Anything else (missing, NaN, negative) falls back to the default.
    const numberOr = (value, fallback) => {
        if (value == null) {
            return fallback;
        }
        const parsed = Number(value);
        return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
    };
    try {
        // Validate URL: the constructor throws on malformed input and the outer
        // catch turns that into a failure result before any request is made.
        new URL(url);
        const settings = config ?? {};

        // Apply rate limiting. The slot is reserved *before* awaiting so that
        // concurrent callers queue behind each other instead of all waking together.
        const delayMs = numberOr(settings.requestDelaySeconds, 2) * 1000;
        const now = Date.now();
        const slot = Math.max(now, lastRequestTime + delayMs);
        lastRequestTime = slot;
        if (slot > now) {
            await delay(slot - now);
        }

        // Fetch with retries
        const maxRetries = numberOr(settings.maxRetries, 2);
        const userAgent = settings.userAgent || "Mozilla/5.0 (Windows NT 10.0; Win64; x64)";
        let lastError = null;
        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), ATTEMPT_TIMEOUT_MS);
            try {
                const response = await (0, node_fetch_1.default)(url, {
                    headers: {
                        "User-Agent": userAgent,
                    },
                    signal: controller.signal,
                });
                if (response.ok) {
                    // Read the body while the abort timer is still armed so a stalled
                    // download is covered by the timeout too.
                    const html = await response.text();
                    return {
                        success: true,
                        html,
                        statusCode: response.status,
                    };
                }
                if (response.status < 500 || attempt >= maxRetries) {
                    // Client errors are final; server errors are final once retries run out.
                    return {
                        success: false,
                        statusCode: response.status,
                        error: `HTTP ${response.status}: ${response.statusText}`,
                    };
                }
                // Otherwise: retryable server error, fall through to the backoff below.
            }
            catch (error) {
                // Network error or timeout abort.
                lastError = error;
                if (attempt >= maxRetries) {
                    break;
                }
            }
            finally {
                // Always release the timer, including when the request itself threw.
                clearTimeout(timeoutId);
            }
            // Linear backoff: 1s, 2s, 3s, ...
            await delay(1000 * (attempt + 1));
        }
        return {
            success: false,
            error: lastError?.message || "Failed to fetch webpage after retries",
        };
    }
    catch (error) {
        return {
            success: false,
            error: error instanceof Error ? error.message : String(error),
        };
    }
}
/**
 * Fetches a page via fetchWebpage and reduces it to a title plus plain text.
 *
 * @param {string} url - Page URL, forwarded to fetchWebpage.
 * @param {object} config - Fetch settings, forwarded to fetchWebpage unchanged.
 * @returns {Promise<{success: boolean, text?: string, title?: string, statusCode?: number, error?: string}>}
 *   On success: whitespace-collapsed `text` (at most 8000 UTF-16 units plus a "..."
 *   marker when truncated), `title` and `statusCode`. On failure: `error` and, when
 *   known, `statusCode`.
 */
async function fetchAndClean(url, config) {
    const MAX_TEXT_LENGTH = 8000;
    const collapseWhitespace = (value) => value.replace(/\s+/g, " ").trim();

    const result = await fetchWebpage(url, config);
    if (!result.success || !result.html) {
        return {
            success: false,
            statusCode: result.statusCode,
            // A successful response with an empty body carries no error of its own;
            // say why this call is reported as a failure.
            error: result.error ?? "Empty response body",
        };
    }
    const cheerio = await Promise.resolve().then(() => __importStar(require("cheerio")));
    const $ = cheerio.load(result.html);

    // Extract title. Only the first <title> is used: documents with inline SVG can
    // contain several, and .text() on the whole set would glue them together.
    const title = collapseWhitespace($("title").first().text()) ||
        collapseWhitespace($('meta[property="og:title"]').attr("content") || "");

    // Remove elements whose content is not readable page text.
    $("script, style, noscript").remove();

    // Extract and clean text, falling back to the whole document when <body> is empty.
    let text = $("body").text();
    if (!text) {
        text = $.text();
    }
    text = collapseWhitespace(text);

    // Limit to 8000 characters without cutting a surrogate pair in half: a lone high
    // surrogate is invalid UTF-16 and can break downstream encoders.
    if (text.length > MAX_TEXT_LENGTH) {
        let cut = MAX_TEXT_LENGTH;
        const lastUnit = text.charCodeAt(cut - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            cut -= 1;
        }
        text = `${text.slice(0, cut)}...`;
    }
    return {
        success: true,
        text,
        title,
        statusCode: result.statusCode,
    };
}
//# sourceMappingURL=fetcher.js.map