Project Files
src / parsers / htmlParser.ts
import * as cheerio from "cheerio";
import * as fs from "fs";
/**
 * Parse an HTML/HTM file and extract its text content.
 *
 * `<script>`, `<style>` and `<noscript>` elements are removed before the text
 * is extracted. Text is taken from `<body>`; if that yields an empty string,
 * the text of the whole document is used instead. Every run of whitespace
 * (spaces, tabs and line breaks alike) is collapsed to a single space and the
 * result is trimmed, so the returned string never contains a newline.
 *
 * @param filePath - Path of the HTML file to read; it is decoded as UTF-8.
 * @returns The extracted text, or an empty string when the file cannot be
 *   read or parsed. Failures are logged with `console.error` and are not
 *   rethrown.
 */
export async function parseHTML(filePath: string): Promise<string> {
  try {
    const content = await fs.promises.readFile(filePath, "utf-8");
    const $ = cheerio.load(content);

    // Remove elements whose content should not be part of the extracted text.
    $("script, style, noscript").remove();

    // Prefer the body text; fall back to the whole document when it is empty.
    const text = $("body").text() || $.text();

    // `\s` matches line terminators too, so this single pass normalizes all
    // whitespace. A follow-up `/\n+/` replacement could never match anything
    // and was therefore dropped.
    return text.replace(/\s+/g, " ").trim();
  } catch (error) {
    // Best-effort contract: report the failure and return an empty result
    // instead of propagating the error to the caller.
    console.error(`Error parsing HTML file ${filePath}:`, error);
    return "";
  }
}