// Project Files
// tools / clean-text.js
/**
 * clean-text.js — Script for cleaning junk out of text files
 *
 * Usage:
 *   node clean-text.js <input-file> [output-file]
 *
 * Examples:
 *   node clean-text.js dirty.html
 *   node clean-text.js dirty.html clean.md
 *   node clean-text.js messy.txt
 */
import fs from "fs";
import path from "path";
// --- Utilities ---
/**
 * Synchronously load a file and return its contents decoded as UTF-8.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string} The decoded file contents.
 */
function readFile(filePath) {
  const contents = fs.readFileSync(filePath, { encoding: "utf-8" });
  return contents;
}
/**
 * Synchronously write a string to disk, encoded as UTF-8.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Text to store.
 * @returns {void}
 */
function writeFile(filePath, content) {
  const options = { encoding: "utf-8" };
  fs.writeFileSync(filePath, content, options);
}
/**
 * Return the extension of a path (leading dot included) in lower case.
 *
 * @param {string} filePath - Path to inspect.
 * @returns {string} Lower-cased extension, or "" when the path has none.
 */
function getFileExt(filePath) {
  const rawExtension = path.extname(filePath);
  return rawExtension.toLowerCase();
}
// --- HTML cleaning ---
// Named character references understood by decodeHtmlEntities, mapped to the
// text they are replaced with. `nbsp` becomes a plain space and `hellip`
// three ASCII dots.
const NAMED_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
  ["mdash", "\u2014"],
  ["ndash", "\u2013"],
  ["hellip", "..."],
  ["laquo", "\u00AB"],
  ["raquo", "\u00BB"],
]);

/**
 * Decode named, decimal (`&#169;`) and hexadecimal (`&#x1F600;`) character
 * references in a single pass.
 *
 * A single pass means already-decoded output is never decoded again, so
 * `&amp;lt;` yields the literal text `&lt;` rather than `<`. Unknown names and
 * out-of-range code points are left untouched.
 *
 * @param {string} text - Text that may contain character references.
 * @returns {string} Text with recognised references replaced.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (match, body) => {
    if (body[0] !== "#") {
      return NAMED_ENTITIES.has(body) ? NAMED_ENTITIES.get(body) : match;
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * Convert an HTML string into rough Markdown-flavoured plain text.
 *
 * This is a regex-based best effort, not an HTML parser. Every tag pattern
 * uses `\b` after the tag name so that, for example, the `<b>` rule cannot
 * match `<body>` and the `<p>` rule cannot match `<pre>`.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Trimmed text with headings, lists, emphasis, links, code
 *   and tables rendered in a Markdown-like form and all other tags removed.
 */
function cleanHtml(html) {
  let text = html
    // Remove content that never belongs in the readable output.
    .replace(/<script\b[\s\S]*?<\/script>/gi, "")
    .replace(/<style\b[\s\S]*?<\/style>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, "")
    // Remove navigation, footers, headers and sidebars.
    .replace(/<(nav|footer|header|aside|sidebar)\b[\s\S]*?<\/\1>/gi, "");

  // Headings <h1>..<h6> become "#".."######" lines.
  text = text.replace(
    /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_, level, inner) => `\n${"#".repeat(Number(level))} ${inner}\n`,
  );

  text = text
    // Paragraphs, list items and explicit line breaks.
    .replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, "\n$1\n")
    .replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, "- $1\n")
    .replace(/<(br|hr)\b[^>]*>/gi, "\n")
    // Bold and italic.
    .replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, "**$1**")
    .replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, "**$1**")
    .replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, "*$1*")
    .replace(/<i\b[^>]*>([\s\S]*?)<\/i>/gi, "*$1*")
    // Links (only double-quoted href attributes are recognised).
    .replace(/<a\b[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, "[$2]($1)")
    // Inline code and preformatted blocks.
    .replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, "`$1`")
    .replace(/<pre\b[^>]*>([\s\S]*?)<\/pre>/gi, "\n```\n$1\n```\n")
    // Tables: wrap in [TABLE] markers, one row per line, cells prefixed by "| ".
    .replace(/<table\b[^>]*>([\s\S]*?)<\/table>/gi, "\n[TABLE]\n$1\n[/TABLE]\n")
    .replace(/<tr\b[^>]*>([\s\S]*?)<\/tr>/gi, "$1\n")
    .replace(/<th\b[^>]*>([\s\S]*?)<\/th>/gi, "| $1 ")
    .replace(/<td\b[^>]*>([\s\S]*?)<\/td>/gi, "| $1 ")
    // Strip every remaining tag.
    .replace(/<[^>]+>/g, "");

  // Decode entities only after tag stripping, so that an escaped "&lt;b&gt;"
  // survives as literal text instead of being removed as a tag.
  text = decodeHtmlEntities(text);

  // Collapse runs of spaces/tabs and limit consecutive blank lines.
  return text
    .replace(/[ \t]+/g, " ")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
// --- Plain text cleaning ---
/**
 * Tidy plain text without touching its wording.
 *
 * Steps, in order: strip non-printable control characters (tab, LF and CR are
 * kept), drop a leading byte-order mark, normalise CRLF/CR line endings to LF,
 * blank out "decoration" lines made of three or more characters that are
 * neither letters, digits nor underscores (e.g. "-----" or "*****"), cap runs
 * of newlines at three, and remove trailing spaces/tabs from every line.
 *
 * Letters and digits are detected with Unicode property escapes, so lines
 * written in any script are preserved.
 *
 * @param {string} text - Raw text.
 * @returns {string} Cleaned, trimmed text.
 */
function cleanText(text) {
  return text
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "")
    .replace(/^\uFEFF/, "")
    .replace(/\r\n?/g, "\n")
    // \p{L}/\p{N} need the `u` flag; `_` is kept to mirror the old \w test.
    .replace(/^[^\p{L}\p{N}_\n]{3,}$/gmu, "")
    .replace(/\n{4,}/g, "\n\n\n")
    .replace(/[ \t]+$/gm, "")
    .trim();
}
// --- Code cleaning ---
/**
 * Apply whitespace-only tidying to source code.
 *
 * The input is trimmed at both ends, CRLF and lone CR line endings become LF,
 * and trailing spaces/tabs are removed from every line.
 *
 * @param {string} code - Source text.
 * @param {string} ext - File extension; accepted but not used by this function.
 * @returns {string} The tidied source text.
 */
function cleanCode(code, ext) {
  const trimmed = code.trim();
  const unixNewlines = trimmed.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
  return withoutTrailingBlanks;
}
// --- Main function ---
// Extensions whose content is routed to cleanCode rather than the text/HTML cleaners.
const CODE_EXTENSIONS = new Set([".js", ".ts", ".py", ".jsx", ".tsx", ".css", ".java", ".go", ".rs"]);

/**
 * Clean a single file, write the result to disk and log a short size report.
 *
 * The cleaner is picked from the input's lower-cased extension: `.html`/`.htm`
 * go through cleanHtml, extensions in CODE_EXTENSIONS through cleanCode, and
 * everything else through cleanText.
 *
 * @param {string} inputPath - Path of the file to read.
 * @param {string|null} [outputPath] - Destination path. When falsy, the result
 *   is written next to the input as `<name>-clean<ext>` (`.md` for HTML input).
 * @returns {void}
 */
function processFile(inputPath, outputPath) {
  const ext = getFileExt(inputPath);
  const content = readFile(inputPath);
  const isHtml = ext === ".html" || ext === ".htm";
  const isCode = CODE_EXTENSIONS.has(ext);

  console.log(`π Processing: ${inputPath}`);
  console.log(` Type: ${ext || "text"}`);
  console.log(` Input size: ${content.length.toLocaleString()} chars`);

  let result;
  if (isHtml) {
    result = cleanHtml(content);
  } else if (isCode) {
    result = cleanCode(content, ext);
  } else {
    result = cleanText(content);
  }

  // Drop one-character lines as noise; blank lines are kept for structure.
  // Code is exempt: lines holding only "}" or "{" are load-bearing there.
  if (!isCode) {
    result = result
      .split("\n")
      .filter((line) => line.trim().length !== 1)
      .join("\n");
  }

  // Final normalisation: at most one blank line in a row, no outer whitespace.
  result = result.replace(/\n{3,}/g, "\n\n").trim();

  let targetPath = outputPath;
  if (!targetPath) {
    // path.parse strips the extension as written on disk, so "PAGE.HTML"
    // yields "PAGE" even though `ext` has been lower-cased.
    const { name, ext: originalExt } = path.parse(inputPath);
    const targetExt = isHtml ? ".md" : originalExt;
    targetPath = path.join(path.dirname(inputPath), `${name}-clean${targetExt}`);
  }
  writeFile(targetPath, result);

  // Guard the ratio so an empty input reports 0.0% instead of NaN%.
  const reduction = content.length === 0 ? 0 : (1 - result.length / content.length) * 100;
  console.log(` Output size: ${result.length.toLocaleString()} chars`);
  console.log(` Saved to: ${targetPath}`);
  console.log(` Reduction: ${reduction.toFixed(1)}%\n`);
}
// --- CLI ---
// Command-line entry point: the first positional argument is the input file,
// the optional second one is the output file.
const args = process.argv.slice(2);

if (args.length === 0) {
  // No arguments: print usage and exit with a non-zero status.
  const usageLines = [
    "Usage: node clean-text.js <input-file> [output-file]",
    "\nExamples:",
    " node clean-text.js page.html",
    " node clean-text.js page.html clean.md",
    " node clean-text.js messy.txt",
  ];
  for (const usageLine of usageLines) {
    console.log(usageLine);
  }
  process.exit(1);
}

const inputFile = args[0];
// A missing or empty second argument lets processFile derive the output path.
const outputFile = args[1] || null;

if (fs.existsSync(inputFile)) {
  processFile(inputFile, outputFile);
} else {
  console.error(`β File not found: ${inputFile}`);
  process.exit(1);
}