/**
* Lightweight HTML-to-text extraction for FTS indexing.
*
* Used as a fallback when an email has no plain-text MIME part — strips tags,
* decodes common HTML entities, and collapses whitespace so FTS5 has something
* meaningful to index. Not intended for display — just search.
*/
/** Common HTML entities. Covers the vast majority of email content. */
const ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
  mdash: "\u2014",
  ndash: "\u2013",
  lsquo: "\u2018",
  rsquo: "\u2019",
  ldquo: "\u201C",
  rdquo: "\u201D",
  bull: "\u2022",
  hellip: "\u2026",
  copy: "\u00A9",
  reg: "\u00AE",
  trade: "\u2122",
};

/**
 * Lookup view of {@link ENTITIES}. A Map is used for lookups so that names
 * inherited from Object.prototype (e.g. `&constructor;`) are never treated as
 * known entities.
 */
const ENTITY_LOOKUP: ReadonlyMap<string, string> = new Map(Object.entries(ENTITIES));

/**
 * Matches a single character reference: hexadecimal (`&#x41;`), decimal
 * (`&#65;`), or named (`&amp;`). Capture groups: 1 = hex digits, 2 = decimal
 * digits, 3 = entity name.
 */
const ENTITY_PATTERN = /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi;

/** Highest valid Unicode code point. */
const MAX_CODE_POINT = 0x10ffff;

/** U+FFFD, substituted for numeric references that do not name a valid character. */
const REPLACEMENT_CHARACTER = "\uFFFD";

/**
 * Convert the numeric value of a character reference to a string.
 *
 * NUL, surrogates, and values above U+10FFFF become U+FFFD (as the HTML
 * parsing spec prescribes) instead of letting String.fromCodePoint throw a
 * RangeError or emit an unpaired surrogate.
 */
function decodeCodePoint(codePoint: number): string {
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  const isValid = codePoint > 0 && codePoint <= MAX_CODE_POINT && !isSurrogate;
  return isValid ? String.fromCodePoint(codePoint) : REPLACEMENT_CHARACTER;
}

/**
 * Extract searchable plain text from an HTML string.
 *
 * Steps: drop `<style>`/`<script>` blocks, turn block-level tags into
 * newlines, strip the remaining tags, decode character references, then
 * collapse whitespace.
 *
 * @param html - Raw HTML markup, or null/undefined when the message has none.
 * @returns The extracted text, or null if input is null/undefined or if the
 * result is empty after stripping (e.g. an HTML email that's just images with
 * no alt text).
 */
export function htmlToText(html: string | null | undefined): string | null {
  if (html == null) return null;

  let text = html;

  // Remove <style> and <script> blocks entirely (content + tags)
  text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
  text = text.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "");

  // Insert newlines before block-level elements for readability
  text = text.replace(/<\/?(?:p|div|br|h[1-6]|li|tr|blockquote|hr)[^>]*\/?>/gi, "\n");

  // Strip all remaining HTML tags
  text = text.replace(/<[^>]+>/g, "");

  // Decode numeric (&#123; and &#x7B; forms) and named entities in ONE pass so
  // the output of one decode is never re-interpreted as another reference
  // (e.g. "&#38;lt;" must yield the literal text "&lt;", not "<").
  text = text.replace(
    ENTITY_PATTERN,
    (match: string, hex?: string, dec?: string, name?: string): string => {
      if (hex !== undefined) return decodeCodePoint(Number.parseInt(hex, 16));
      if (dec !== undefined) return decodeCodePoint(Number.parseInt(dec, 10));
      if (name !== undefined) return ENTITY_LOOKUP.get(name.toLowerCase()) ?? match;
      return match;
    },
  );

  // Collapse whitespace: runs of spaces/tabs → single space, 3+ newlines → 2
  text = text.replace(/[ \t]+/g, " ");
  text = text.replace(/\n{3,}/g, "\n\n");
  text = text.trim();

  return text.length > 0 ? text : null;
}
|