Project Files
src / parser.ts
import * as cheerio from "cheerio";
/**
 * Structured content extracted from an HTML document by `parseHtml`.
 */
export interface ParsedContent {
/** Page title: text of the `<title>` element, falling back to the `og:title` meta tag. */
title?: string;
/** Content of the `description` meta tag, falling back to `og:description`. */
description?: string;
/** Content of the `og:image` meta tag. */
ogImage?: string;
/** Trimmed, non-empty text of every `h1`–`h6` element. */
headings: string[];
/** Trimmed, non-empty text of every `p` element. */
paragraphs: string[];
/** Anchors that have both a non-empty `href` and non-empty trimmed text. */
links: Array<{ href: string; text: string }>;
/**
 * Meta tag contents keyed by the tag's `name` attribute, or by `property`
 * when `name` is absent. Tags lacking a key or a `content` value are
 * skipped; a repeated key keeps the last value seen.
 */
metadata: Record<string, string>;
}
/**
 * Parses an HTML string and extracts its title, description, Open Graph
 * image, headings, paragraphs, links and meta tags.
 *
 * Title resolution order:
 *  1. the first `<title>` that is a direct child of `<head>`,
 *  2. the `og:title` meta tag,
 *  3. the first `<title>` found anywhere in the document.
 *
 * `title`, `description` and `ogImage` are always set; they are empty strings
 * when nothing suitable is found.
 *
 * @param html - Raw HTML markup to parse.
 * @returns The extracted content. The function performs no asynchronous work;
 *   it is `async` only so that callers receive a promise.
 */
export async function parseHtml(html: string): Promise<ParsedContent> {
  const $ = cheerio.load(html);

  // A bare `$("title").text()` concatenates the text of *every* <title> in the
  // document, including <title> elements nested inside inline SVG. Look at the
  // document title first and only take the first match, trimmed the same way
  // headings and paragraphs are.
  const headTitle: string = $("head > title").first().text().trim();
  const title: string =
    headTitle ||
    $('meta[property="og:title"]').attr("content") ||
    $("title").first().text().trim() ||
    "";

  const description: string =
    $('meta[name="description"]').attr("content") ||
    $('meta[property="og:description"]').attr("content") ||
    "";

  const ogImage: string = $('meta[property="og:image"]').attr("content") || "";

  // Collects the trimmed, non-empty text of every element matching `selector`.
  const collectText = (selector: string): string[] => {
    const texts: string[] = [];
    $(selector).each((_, elem) => {
      const text = $(elem).text().trim();
      if (text) {
        texts.push(text);
      }
    });
    return texts;
  };

  const headings = collectText("h1, h2, h3, h4, h5, h6");
  const paragraphs = collectText("p");

  // Only anchors that have both a target and visible text are kept.
  const links: Array<{ href: string; text: string }> = [];
  $("a").each((_, elem) => {
    const href = $(elem).attr("href") || "";
    const text = $(elem).text().trim();
    if (href && text) {
      links.push({ href, text });
    }
  });

  // Meta tags are keyed by `name`, or by `property` when `name` is absent.
  // A repeated key keeps the last value seen.
  const metadata: Record<string, string> = {};
  $("meta").each((_, elem) => {
    const name = $(elem).attr("name") || $(elem).attr("property");
    const content = $(elem).attr("content");
    if (name && content) {
      metadata[name] = content;
    }
  });

  return {
    title,
    description,
    ogImage,
    headings,
    paragraphs,
    links,
    metadata,
  };
}
/**
 * Finds every element matching a CSS selector and reports its tag name, inner
 * HTML (capped at 500 characters) and attributes.
 *
 * Extraction is best-effort: if the selector lookup or the iteration throws,
 * the error is logged and whatever was gathered up to that point is returned.
 *
 * @param html - Raw HTML markup to search.
 * @param selector - CSS selector handed to cheerio.
 * @returns One entry per matched element.
 */
export function extractElementsBySelector(
  html: string,
  selector: string
): Array<{ tag: string; content: string; attributes: Record<string, string> }> {
  type ExtractedElement = {
    tag: string;
    content: string;
    attributes: Record<string, string>;
  };

  const $ = cheerio.load(html);
  const collected: ExtractedElement[] = [];

  // Builds the summary entry for a single matched node.
  const summarize = (node: any): ExtractedElement => {
    const tag = node.name || "unknown";
    const innerHtml = $(node).html() || "";

    const attributes: Record<string, string> = {};
    if (node.attribs) {
      for (const attrName of Object.keys(node.attribs)) {
        attributes[attrName] = node.attribs[attrName];
      }
    }

    // Inner HTML is truncated so large subtrees do not bloat the result.
    return { tag, content: innerHtml.substring(0, 500), attributes };
  };

  try {
    $(selector).each((_, elem: any) => {
      collected.push(summarize(elem));
    });
  } catch (error) {
    console.error("Error extracting elements:", error);
  }

  return collected;
}
/**
 * Counts case-insensitive occurrences of each keyword in the text content of
 * an HTML document and collects up to three sentences that contain it.
 *
 * Keywords are matched literally: regular-expression metacharacters such as
 * `+`, `.` or `(` are escaped, so a keyword like `"c++"` neither throws nor
 * matches unintended text. An empty keyword yields zero matches and no
 * context instead of matching at every position.
 *
 * @param html - Raw HTML markup to search.
 * @param keywords - Literal strings to look for.
 * @returns One entry per keyword, in input order. Each context item is a
 *   trimmed sentence (split on `.`, `!`, `?`) cut to at most 200 characters.
 */
export function searchInHtml(
  html: string,
  keywords: string[]
): Array<{ keyword: string; matches: number; context: string[] }> {
  const maxContextEntries = 3;
  const maxContextLength = 200;

  const $ = cheerio.load(html);
  const text: string = $.root().text();

  // Splitting and lowercasing do not depend on the keyword, so do them once
  // rather than once per keyword.
  const sentences = text.split(/[.!?]+/).map((raw: string) => ({
    raw,
    lower: raw.toLowerCase(),
  }));

  return keywords.map((keyword) => {
    const context: string[] = [];

    // An empty pattern would match between every character and inside every
    // sentence, which is never what a caller means.
    if (keyword === "") {
      return { keyword, matches: 0, context };
    }

    // Escape metacharacters so the keyword is treated as literal text.
    const escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const matches = (text.match(new RegExp(escapedKeyword, "gi")) || []).length;

    const lowerKeyword = keyword.toLowerCase();
    for (const { raw, lower } of sentences) {
      if (context.length >= maxContextEntries) {
        break;
      }
      if (lower.includes(lowerKeyword)) {
        context.push(raw.trim().substring(0, maxContextLength));
      }
    }

    return {
      keyword,
      matches,
      context,
    };
  });
}