Project Files
lib / parser.js
"use strict";
// Module-interop helper (appears to be the standard TypeScript-emitted helper).
// Re-exposes property `k` of module `m` on object `o` under the name `k2`
// (`k2` defaults to `k`). An already-installed `this.__createBinding` is reused
// when present.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with an enumerable forwarding getter when the property
// is missing, is an accessor on a non-ES module, or is a writable/configurable
// data property; otherwise the original descriptor is reused unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback used when Object.create is unavailable: copy the current value.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: installs `v` as the "default" property of `o`.
// An already-installed `this.__setModuleDefault` is reused when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
// Enumerable data property; writable/configurable are left at their defaults.
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
// Fallback used when Object.create is unavailable: plain assignment.
o["default"] = v;
});
// Module-interop helper: wraps a required module in a namespace-style object.
// An already-installed `this.__importStar` is reused when present.
var __importStar = (this && this.__importStar) || (function () {
// On first call, `ownKeys` replaces itself with Object.getOwnPropertyNames
// (or a for-in/hasOwnProperty fallback) and then delegates to the replacement.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules already flagged with __esModule are returned as-is.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own property except "default" onto the wrapper, then expose the
// original module itself as `default`.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Flag this CommonJS module with `__esModule` so interop helpers treat it as
// an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. The functions are declared further down with function
// declarations, which are hoisted, so they can be assigned here.
exports.parseHtml = parseHtml;
exports.extractElementsBySelector = extractElementsBySelector;
exports.searchInHtml = searchInHtml;
const cheerio = __importStar(require("cheerio"));
/**
 * Parses an HTML string and extracts commonly used page data.
 *
 * The function is declared `async` although nothing is awaited; this is kept
 * so existing callers that `await` it or chain `.then()` keep working.
 *
 * @param {string} html - Raw HTML markup to parse.
 * @returns {Promise<{
 *   title: string,
 *   description: string,
 *   ogImage: string,
 *   headings: string[],
 *   paragraphs: string[],
 *   links: Array<{href: string, text: string}>,
 *   metadata: Object<string, string>
 * }>} Extracted page data. String fields default to "" when nothing is found.
 */
async function parseHtml(html) {
    const $ = cheerio.load(html);
    // `content` attribute of the first element matching `selector`, or "".
    const metaContent = (selector) => $(selector).attr("content") || "";
    // Trimmed, non-empty text of every element matching `selector`.
    const collectText = (selector) => {
        const texts = [];
        $(selector).each((_, elem) => {
            const text = $(elem).text().trim();
            if (text) {
                texts.push(text);
            }
        });
        return texts;
    };
    // Use only the first <title>: `.text()` on the whole match set concatenates
    // every <title> in the document (e.g. titles inside inline SVG icons).
    // Trimming lets a whitespace-only <title> fall through to og:title.
    const title = $("title").first().text().trim() ||
        metaContent('meta[property="og:title"]');
    // Prefer the standard description, then the Open Graph one.
    const description = metaContent('meta[name="description"]') ||
        metaContent('meta[property="og:description"]');
    const ogImage = metaContent('meta[property="og:image"]');
    const headings = collectText("h1, h2, h3, h4, h5, h6");
    const paragraphs = collectText("p");
    // Keep only links that have both a target and visible text.
    const links = [];
    $("a").each((_, elem) => {
        const href = $(elem).attr("href") || "";
        const text = $(elem).text().trim();
        if (href && text) {
            links.push({ href, text });
        }
    });
    // Map each <meta> to its content, keyed by `name` (preferred) or `property`.
    // Later tags with the same key overwrite earlier ones.
    const metadata = {};
    $("meta").each((_, elem) => {
        const name = $(elem).attr("name") || $(elem).attr("property");
        const content = $(elem).attr("content");
        if (name && content) {
            metadata[name] = content;
        }
    });
    return {
        title,
        description,
        ogImage,
        headings,
        paragraphs,
        links,
        metadata,
    };
}
/**
 * Collects a summary of every element in `html` that matches `selector`.
 *
 * Errors raised while selecting or visiting elements are logged with
 * `console.error` and not rethrown; whatever was gathered before the error is
 * still returned.
 *
 * @param {string} html - Raw HTML markup to parse.
 * @param {string} selector - Selector passed to the loaded cheerio instance.
 * @returns {Array<{tag: string, content: string, attributes: Object<string, string>}>}
 *   One entry per matched element: its tag name ("unknown" when absent), its
 *   inner HTML capped at 500 characters, and a copy of its attributes.
 */
function extractElementsBySelector(html, selector) {
    const $ = cheerio.load(html);
    const results = [];
    // Builds the summary record for a single matched node.
    const summarize = (node) => {
        const tag = node.name ? node.name : "unknown";
        const innerHtml = $(node).html() || "";
        const attributes = {};
        if (node.attribs) {
            for (const attrName of Object.keys(node.attribs)) {
                attributes[attrName] = node.attribs[attrName];
            }
        }
        return {
            tag,
            // Cap the stored markup at 500 characters.
            content: innerHtml.slice(0, 500),
            attributes,
        };
    };
    try {
        $(selector).each((_, node) => {
            results.push(summarize(node));
        });
    }
    catch (error) {
        console.error("Error extracting elements:", error);
    }
    return results;
}
/**
 * Counts case-insensitive occurrences of each keyword in the text content of
 * an HTML document and collects a few sentences that contain it.
 *
 * Keywords are matched literally: regular-expression metacharacters such as
 * `+`, `(` or `.` are escaped before counting, so the count agrees with the
 * literal substring search used for the context sentences. An empty keyword
 * yields zero matches and no context.
 *
 * @param {string} html - Raw HTML markup to parse.
 * @param {string[]} keywords - Literal keywords to look for.
 * @returns {Array<{keyword: string, matches: number, context: string[]}>}
 *   One entry per keyword, in input order. `context` holds up to three
 *   sentences (trimmed, at most 200 characters each) containing the keyword.
 */
function searchInHtml(html, keywords) {
    const $ = cheerio.load(html);
    const text = $.root().text();
    // The sentence split does not depend on the keyword, so do it once.
    const sentences = text.split(/[.!?]+/);
    return keywords.map((keyword) => {
        // An empty pattern would match at every position of the text.
        if (keyword === "") {
            return { keyword, matches: 0, context: [] };
        }
        // Escape regex metacharacters so the keyword is matched literally and
        // inputs like "c++" or "(" cannot throw a SyntaxError.
        const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        const matches = (text.match(new RegExp(escaped, "gi")) || []).length;
        // Extract context around matches
        const context = [];
        const lowerKeyword = keyword.toLowerCase();
        for (const sentence of sentences) {
            if (context.length >= 3) {
                break;
            }
            if (sentence.toLowerCase().includes(lowerKeyword)) {
                context.push(sentence.trim().substring(0, 200));
            }
        }
        return {
            keyword,
            matches,
            context,
        };
    });
}
//# sourceMappingURL=parser.js.map