// Project Files
// src/services/structuredExtraction.ts
import * as cheerio from "cheerio";
/**
 * Rule-based structural extraction: pulls headed sections, lists, paragraphs
 * and table rows out of raw HTML. Designed to work on any site, not just
 * patch-note wikis.
 *
 * The content root is the first element matching
 * `article, #mw-content-text, .mw-page-body, main, [role='main']`, falling
 * back to `<body>`. Its `h2`–`h4`, `p`, `ul`, `ol` and `table` descendants are
 * visited as one flat sequence (regardless of nesting depth) and grouped into
 * sections:
 *
 * - an `h2`/`h3`/`h4` starts a new section titled `## <heading text>`;
 * - any other visited element whose text starts with "patch" or contains
 *   "[additions and changes]" (case-insensitive) also starts a new section;
 * - list items become ` • <text>` bullets (nested items are indented two
 *   extra spaces per level and carry only their own text);
 * - paragraphs longer than 10 characters are kept verbatim;
 * - each table row becomes its non-empty cell texts joined by ` | `.
 *
 * Content that an enclosing list or table has already emitted (nested lists,
 * paragraphs/tables inside list items, tables inside tables) is not emitted a
 * second time.
 *
 * @param rawHtml - HTML markup to analyse. An empty string yields `[]`.
 * @returns One string per section, its parts joined by blank lines. Sections
 *   whose trimmed length is 25 characters or fewer are dropped.
 */
export function extractStructuredElements(rawHtml: string): string[] {
  if (!rawHtml) return [];

  const $ = cheerio.load(rawHtml);

  // Collapse every whitespace run to a single space and trim the ends.
  const normalize = (value: string): string => value.replace(/\s+/g, " ").trim();

  let $mainContainer = $("article, #mw-content-text, .mw-page-body, main, [role='main']").first();
  if (!$mainContainer.length) {
    $mainContainer = $("body");
  }

  const structuredSections: string[] = [];
  let currentSection: string[] = [];

  // Flush whatever has been collected so far and open a new titled section.
  const startSection = (title: string): void => {
    if (currentSection.length > 0) {
      structuredSections.push(currentSection.join("\n\n"));
    }
    currentSection = [`## ${title}`];
  };

  // Lists and tables whose content has actually been written out. Only these
  // "cover" their descendants; an ancestor that was consumed as a section
  // marker does not, so its nested content is still emitted on its own.
  const emittedContainers = new WeakSet<object>();
  const anyEmitted = (ancestors: { toArray(): object[] }): boolean =>
    ancestors.toArray().some((node) => emittedContainers.has(node));

  // Content blocks and structural markers are collected as one flat sequence
  // rather than by walking the tree, so headings that are not ancestors or
  // siblings of their content still delimit sections.
  $mainContainer.find("h2, h3, h4, p, ul, ol, table").each((_, el) => {
    const tagName = el.tagName.toLowerCase();
    const $el = $(el);
    const text = normalize($el.text());
    if (!text) return;

    // Real headings always open a new section.
    if (["h2", "h3", "h4"].includes(tagName)) {
      startSection(text);
      return;
    }

    // Independent update markers that are not mapped to standard headings.
    // NOTE(review): this applies to every non-heading element visited here
    // (p, ul, ol, table), not only to short paragraphs — confirm that is intended.
    const lowered = text.toLowerCase();
    if (lowered.startsWith("patch") || lowered.includes("[additions and changes]")) {
      startSection(text);
      return;
    }

    if (tagName === "ul" || tagName === "ol") {
      // `find("li")` below is deep, so an emitted ancestor list has already
      // produced this list's items; emitting them again would duplicate them.
      if (anyEmitted($el.parentsUntil($mainContainer, "ul, ol"))) return;
      emittedContainers.add(el);

      $el.find("li").each((__, li) => {
        const $li = $(li);
        // Use only the item's own text: nested lists are stripped from a copy
        // because their items get bullets of their own.
        const $own = $li.clone();
        $own.find("ul, ol").remove();
        const liText = normalize($own.text());
        if (liText.length <= 2) return;

        // Number of list elements strictly between this item and the list
        // being emitted, i.e. the nesting level relative to it.
        const depth = $li.parentsUntil($el, "ul, ol").length;
        currentSection.push(`${"  ".repeat(depth)} • ${liText}`);
      });
      return;
    }

    // A paragraph or table inside an item of an emitted list is already part
    // of that item's bullet text.
    const $listsAroundItems = $el
      .parentsUntil($mainContainer, "li")
      .parentsUntil($mainContainer, "ul, ol");
    if (anyEmitted($listsAroundItems)) return;

    if (tagName === "p") {
      if (text.length > 10) currentSection.push(text);
      return;
    }

    if (tagName === "table") {
      // `find("tr")` below is deep, so an emitted ancestor table has already
      // produced this table's rows.
      if (anyEmitted($el.parentsUntil($mainContainer, "table"))) return;
      emittedContainers.add(el);

      $el.find("tr").each((__, row) => {
        const cells = $(row)
          .find("td, th")
          .map((___, cell) => normalize($(cell).text()))
          .get()
          .filter(Boolean);
        if (cells.length) currentSection.push(cells.join(" | "));
      });
    }
  });

  if (currentSection.length > 0) {
    structuredSections.push(currentSection.join("\n\n"));
  }

  // Drop near-empty artifacts (e.g. a lone short heading with no content).
  return structuredSections.filter((section) => section.trim().length > 25);
}