All files / storage html-to-text.ts

100% Statements 18/18
83.33% Branches 5/6
100% Functions 4/4
100% Lines 15/15

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67                  32x                                                     130x   25x     25x 25x     25x     25x     25x 1x   25x     25x     25x 25x 25x   25x    
/**
 * Lightweight HTML-to-text extraction for FTS indexing.
 *
 * Used as a fallback when an email has no plain-text MIME part — strips tags,
 * decodes common HTML entities, and collapses whitespace so FTS5 has something
 * meaningful to index. Not intended for display — just search.
 */
 
/** Common HTML entities. Covers the vast majority of email content. */
const ENTITIES: Record<string, string> = {
	amp: "&",
	lt: "<",
	gt: ">",
	quot: '"',
	apos: "'",
	nbsp: " ",
	mdash: "\u2014",
	ndash: "\u2013",
	lsquo: "\u2018",
	rsquo: "\u2019",
	ldquo: "\u201C",
	rdquo: "\u201D",
	bull: "\u2022",
	hellip: "\u2026",
	copy: "\u00A9",
	reg: "\u00AE",
	trade: "\u2122",
};

/** Largest valid Unicode code point; `String.fromCodePoint` throws a RangeError above it. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Matches a single entity reference. Exactly one capture group is set:
 * 1 = hex digits (`&#x1F;`), 2 = decimal digits (`&#123;`), 3 = name (`&amp;`).
 */
const ENTITY_PATTERN = /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/g;

/**
 * Replacement callback for ENTITY_PATTERN.
 *
 * Returns the decoded character(s), or the original reference text unchanged
 * when the entity is unknown or its code point is outside the Unicode range.
 */
function decodeEntity(match: string, hex?: string, dec?: string, name?: string): string {
	if (name !== undefined) {
		const key = name.toLowerCase();
		// Own-property check: a plain index would also find inherited members,
		// so "&constructor;" would otherwise resolve to Object.prototype.constructor.
		const decoded = Object.prototype.hasOwnProperty.call(ENTITIES, key) ? ENTITIES[key] : undefined;
		return decoded ?? match;
	}

	const codePoint = hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec ?? "", 10);

	// Guard the range ourselves: untrusted email HTML may contain references
	// such as "&#99999999;" and fromCodePoint would throw on them.
	const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= MAX_CODE_POINT;
	return isValid ? String.fromCodePoint(codePoint) : match;
}

/**
 * Extract searchable plain text from an HTML string.
 *
 * Steps: drop `<style>`/`<script>` blocks, turn block-level tags into
 * newlines, strip the remaining tags, decode entities, collapse whitespace.
 *
 * Entity references that cannot be decoded (unknown names, numeric values
 * beyond U+10FFFF) are left in the output verbatim rather than throwing.
 *
 * @param html - Raw HTML, or null/undefined when the message has no HTML part.
 * @returns The extracted text, or null if input is null/undefined or if the
 * result is empty after stripping (e.g. an HTML email that's just images with
 * no alt text).
 */
export function htmlToText(html: string | null | undefined): string | null {
	if (html == null) return null;

	let text = html;

	// Remove <style> and <script> blocks entirely (content + tags)
	text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
	text = text.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "");

	// Replace block-level tags (opening and closing) with a newline so adjacent
	// blocks do not run together into one token
	text = text.replace(/<\/?(?:p|div|br|h[1-6]|li|tr|blockquote|hr)[^>]*\/?>/gi, "\n");

	// Strip all remaining HTML tags
	text = text.replace(/<[^>]+>/g, "");

	// Decode numeric and named entities in ONE pass, so text produced by one
	// reference is never re-interpreted as part of another ("&#x26;lt;" must
	// become "&lt;", not "<")
	text = text.replace(ENTITY_PATTERN, decodeEntity);

	// Collapse whitespace: runs of spaces/tabs → single space, 3+ newlines → 2
	text = text.replace(/[ \t]+/g, " ");
	text = text.replace(/\n{3,}/g, "\n\n");
	text = text.trim();

	return text.length > 0 ? text : null;
}