Project Files
src / utils / cleanText.ts
/**
 * Sanitizes and normalizes scraped text content.
 *
 * Processing order:
 *  1. CRLF / lone CR line endings are normalized to `\n`.
 *  2. Runs of non-breaking spaces (U+00A0) and tabs become a single space.
 *  3. C0/C1 control characters (and DEL) are removed, except `\n`, which is
 *     kept so that line and paragraph structure survives.
 *  4. Runs of spaces are collapsed to one space.
 *  5. Every line is trimmed on both ends.
 *  6. Three or more consecutive newlines are collapsed to exactly two, so
 *     paragraphs stay separated by at most one blank line.
 *  7. The whole result is trimmed.
 *
 * @param text - Raw text; `null`, `undefined` and `""` all yield `""`.
 * @returns The cleaned text, with `\n` as the only line separator.
 */
export function cleanText(text: string | null | undefined): string {
  if (!text) {
    return "";
  }

  return (
    text
      // Normalize Windows (\r\n) and old-Mac (\r) line endings up front so a
      // carriage return is treated as a line break rather than stripped.
      .replace(/\r\n?/g, "\n")
      // Replace non-breaking spaces (\u00A0) and tabs with a standard space.
      .replace(/[\u00A0\t]+/g, " ")
      // Strip control characters / null bytes. \n (\x0A) is deliberately
      // excluded from the class; removing it would fuse all lines together
      // and make the newline handling below a no-op.
      .replace(/[\x00-\x09\x0B-\x1F\x7F-\x9F]/g, "")
      // Collapse back-to-back spaces (including ones that became adjacent
      // after a control character between them was removed).
      .replace(/ {2,}/g, " ")
      // Trim both ends of each individual line. This runs before the newline
      // collapse so whitespace-only lines count as blank lines.
      .split("\n")
      .map((line) => line.trim())
      .join("\n")
      // Collapse three or more consecutive newlines down to two.
      .replace(/\n{3,}/g, "\n\n")
      // Final trim for the entire string.
      .trim()
  );
}