Remove HTML entity parsing (#274)

## Summary * Remove HTML entity parsing * This has been completely useless since the introduction of expat * expat tries to parse all entities in the document, but only knows of HTML ones * Parsing will never end with HTML entities in the text, so the additional step to parse them that we had went completely unused * We should figure out the best way to parse that content in the future, but for now remove that module as it generates a lot of heap allocations with its map and strings
2026-01-07 22:08:43 +10:00
parent 46fa186b82
commit 2b12a65011
3 changed files with 3 additions and 174 deletions
--- a/lib/Epub/Epub/htmlEntities.cpp
+++ b/lib/Epub/Epub/htmlEntities.cpp
@@ -1,163 +0,0 @@
-// from
-// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
-
-#include "htmlEntities.h"
-
-#include <cstring>
-#include <unordered_map>
-
-const int MAX_ENTITY_LENGTH = 10;
-
-// Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
-// Note the supported keys are only in lowercase
-// Store the mappings in a unordered hash map
-static std::unordered_map<std::string, std::string> entity_lookup(
-    {{"&quot;", "\""},  {"&frasl;", "⁄"},   {"&amp;", "&"},      {"&lt;", "<"},     {"&gt;", ">"},
-     {"&Agrave;", "À"}, {"&Aacute;", "Á"},  {"&Acirc;", "Â"},    {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
-     {"&Aring;", "Å"},  {"&AElig;", "Æ"},   {"&Ccedil;", "Ç"},   {"&Egrave;", "È"}, {"&Eacute;", "É"},
-     {"&Ecirc;", "Ê"},  {"&Euml;", "Ë"},    {"&Igrave;", "Ì"},   {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
-     {"&Iuml;", "Ï"},   {"&ETH;", "Ð"},     {"&Ntilde;", "Ñ"},   {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
-     {"&Ocirc;", "Ô"},  {"&Otilde;", "Õ"},  {"&Ouml;", "Ö"},     {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
-     {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"},   {"&Uuml;", "Ü"},     {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
-     {"&szlig;", "ß"},  {"&agrave;", "à"},  {"&aacute;", "á"},   {"&acirc;", "â"},  {"&atilde;", "ã"},
-     {"&auml;", "ä"},   {"&aring;", "å"},   {"&aelig;", "æ"},    {"&ccedil;", "ç"}, {"&egrave;", "è"},
-     {"&eacute;", "é"}, {"&ecirc;", "ê"},   {"&euml;", "ë"},     {"&igrave;", "ì"}, {"&iacute;", "í"},
-     {"&icirc;", "î"},  {"&iuml;", "ï"},    {"&eth;", "ð"},      {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
-     {"&oacute;", "ó"}, {"&ocirc;", "ô"},   {"&otilde;", "õ"},   {"&ouml;", "ö"},   {"&oslash;", "ø"},
-     {"&ugrave;", "ù"}, {"&uacute;", "ú"},  {"&ucirc;", "û"},    {"&uuml;", "ü"},   {"&yacute;", "ý"},
-     {"&thorn;", "þ"},  {"&yuml;", "ÿ"},    {"&nbsp;", " "},     {"&iexcl;", "¡"},  {"&cent;", "¢"},
-     {"&pound;", "£"},  {"&curren;", "¤"},  {"&yen;", "¥"},      {"&brvbar;", "¦"}, {"&sect;", "§"},
-     {"&uml;", "¨"},    {"&copy;", "©"},    {"&ordf;", "ª"},     {"&laquo;", "«"},  {"&not;", "¬"},
-     {"&shy;", ""},    {"&reg;", "®"},     {"&macr;", "¯"},     {"&deg;", "°"},    {"&plusmn;", "±"},
-     {"&sup2;", "²"},   {"&sup3;", "³"},    {"&acute;", "´"},    {"&micro;", "µ"},  {"&para;", "¶"},
-     {"&cedil;", "¸"},  {"&sup1;", "¹"},    {"&ordm;", "º"},     {"&raquo;", "»"},  {"&frac14;", "¼"},
-     {"&frac12;", "½"}, {"&frac34;", "¾"},  {"&iquest;", "¿"},   {"&times;", "×"},  {"&divide;", "÷"},
-     {"&forall;", "∀"}, {"&part;", "∂"},    {"&exist;", "∃"},    {"&empty;", "∅"},  {"&nabla;", "∇"},
-     {"&isin;", "∈"},   {"&notin;", "∉"},   {"&ni;", "∋"},       {"&prod;", "∏"},   {"&sum;", "∑"},
-     {"&minus;", "−"},  {"&lowast;", "∗"},  {"&radic;", "√"},    {"&prop;", "∝"},   {"&infin;", "∞"},
-     {"&ang;", "∠"},    {"&and;", "∧"},     {"&or;", "∨"},       {"&cap;", "∩"},    {"&cup;", "∪"},
-     {"&int;", "∫"},    {"&there4;", "∴"},  {"&sim;", "∼"},      {"&cong;", "≅"},   {"&asymp;", "≈"},
-     {"&ne;", "≠"},     {"&equiv;", "≡"},   {"&le;", "≤"},       {"&ge;", "≥"},     {"&sub;", "⊂"},
-     {"&sup;", "⊃"},    {"&nsub;", "⊄"},    {"&sube;", "⊆"},     {"&supe;", "⊇"},   {"&oplus;", "⊕"},
-     {"&otimes;", "⊗"}, {"&perp;", "⊥"},    {"&sdot;", "⋅"},     {"&Alpha;", "Α"},  {"&Beta;", "Β"},
-     {"&Gamma;", "Γ"},  {"&Delta;", "Δ"},   {"&Epsilon;", "Ε"},  {"&Zeta;", "Ζ"},   {"&Eta;", "Η"},
-     {"&Theta;", "Θ"},  {"&Iota;", "Ι"},    {"&Kappa;", "Κ"},    {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
-     {"&Nu;", "Ν"},     {"&Xi;", "Ξ"},      {"&Omicron;", "Ο"},  {"&Pi;", "Π"},     {"&Rho;", "Ρ"},
-     {"&Sigma;", "Σ"},  {"&Tau;", "Τ"},     {"&Upsilon;", "Υ"},  {"&Phi;", "Φ"},    {"&Chi;", "Χ"},
-     {"&Psi;", "Ψ"},    {"&Omega;", "Ω"},   {"&alpha;", "α"},    {"&beta;", "β"},   {"&gamma;", "γ"},
-     {"&delta;", "δ"},  {"&epsilon;", "ε"}, {"&zeta;", "ζ"},     {"&eta;", "η"},    {"&theta;", "θ"},
-     {"&iota;", "ι"},   {"&kappa;", "κ"},   {"&lambda;", "λ"},   {"&mu;", "μ"},     {"&nu;", "ν"},
-     {"&xi;", "ξ"},     {"&omicron;", "ο"}, {"&pi;", "π"},       {"&rho;", "ρ"},    {"&sigmaf;", "ς"},
-     {"&sigma;", "σ"},  {"&tau;", "τ"},     {"&upsilon;", "υ"},  {"&phi;", "φ"},    {"&chi;", "χ"},
-     {"&psi;", "ψ"},    {"&omega;", "ω"},   {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"},  {"&piv;", "ϖ"},
-     {"&OElig;", "Œ"},  {"&oelig;", "œ"},   {"&Scaron;", "Š"},   {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
-     {"&fnof;", "ƒ"},   {"&circ;", "ˆ"},    {"&tilde;", "˜"},    {"&ensp;", ""},    {"&emsp;", ""},
-     {"&thinsp;", ""},  {"&zwnj;", "‌"},  {"&zwj;", "‍"},    {"&lrm;", "‎"},  {"&rlm;", "‏"},
-     {"&ndash;", "–"},  {"&mdash;", "—"},   {"&lsquo;", "‘"},    {"&rsquo;", "’"},  {"&sbquo;", "‚"},
-     {"&ldquo;", "“"},  {"&rdquo;", "”"},   {"&bdquo;", "„"},    {"&dagger;", "†"}, {"&Dagger;", "‡"},
-     {"&bull;", "•"},   {"&hellip;", "…"},  {"&permil;", "‰"},   {"&prime;", "′"},  {"&Prime;", "″"},
-     {"&lsaquo;", "‹"}, {"&rsaquo;", "›"},  {"&oline;", "‾"},    {"&euro;", "€"},   {"&trade;", "™"},
-     {"&larr;", "←"},   {"&uarr;", "↑"},    {"&rarr;", "→"},     {"&darr;", "↓"},   {"&harr;", "↔"},
-     {"&crarr;", "↵"},  {"&lceil;", "⌈"},   {"&rceil;", "⌉"},    {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
-     {"&loz;", "◊"},    {"&spades;", "♠"},  {"&clubs;", "♣"},    {"&hearts;", "♥"}, {"&diams;", "♦"}});
-
-// converts from a unicode code point to the utf8 equivalent
-void convert_to_utf8(const int code, std::string& res) {
-  // convert to a utf8 sequence
-  if (code < 0x80) {
-    res += static_cast<char>(code);
-  } else if (code < 0x800) {
-    res += static_cast<char>(0xc0 | (code >> 6));
-    res += static_cast<char>(0x80 | (code & 0x3f));
-  } else if (code < 0x10000) {
-    res += static_cast<char>(0xe0 | (code >> 12));
-    res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
-    res += static_cast<char>(0x80 | (code & 0x3f));
-  } else if (code < 0x200000) {
-    res += static_cast<char>(0xf0 | (code >> 18));
-    res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
-    res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
-    res += static_cast<char>(0x80 | (code & 0x3f));
-  } else if (code < 0x4000000) {
-    res += static_cast<char>(0xf8 | (code >> 24));
-    res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
-    res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
-    res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
-    res += static_cast<char>(0x80 | (code & 0x3f));
-  } else if (code < 0x80000000) {
-    res += static_cast<char>(0xfc | (code >> 30));
-    res += static_cast<char>(0x80 | ((code >> 24) & 0x3f));
-    res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
-    res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
-    res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
-  }
-}
-
-// handles numeric entities - e.g. &#1234; or &#x1234;
-bool process_numeric_entity(const std::string& entity, std::string& res) {
-  int code = 0;
-  // is it hex?
-  if (entity[2] == 'x' || entity[2] == 'X') {
-    // parse the hex code
-    code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16);
-  } else {
-    code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10);
-  }
-  if (code != 0) {
-    // special handling for nbsp
-    if (code == 0xA0) {
-      res += " ";
-    } else {
-      convert_to_utf8(code, res);
-    }
-    return true;
-  }
-  return false;
-}
-
-// handles named entities - e.g. &amp;
-bool process_string_entity(const std::string& entity, std::string& res) {
-  // it's a named entity - find it in the lookup table
-  // find it in the map
-  const auto it = entity_lookup.find(entity);
-  if (it != entity_lookup.end()) {
-    res += it->second;
-    return true;
-  }
-  return false;
-}
-
-// replace all the entities in the string
-std::string replaceHtmlEntities(const char* text) {
-  std::string res;
-  res.reserve(strlen(text));
-  for (int i = 0; i < strlen(text); ++i) {
-    bool flag = false;
-    // do we have a potential entity?
-    if (text[i] == '&') {
-      // find the end of the entity
-      int j = i + 1;
-      while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) {
-        j++;
-      }
-      if (j - i > 2) {
-        char entity[j - i + 1];
-        strncpy(entity, text + i, j - i);
-        // is it a numeric code?
-        if (entity[1] == '#') {
-          flag = process_numeric_entity(entity, res);
-        } else {
-          flag = process_string_entity(entity, res);
-        }
-        // skip past the entity if we successfully decoded it
-        if (flag) {
-          i = j;
-        }
-      }
-    }
-    if (!flag) {
-      res += text[i];
-    }
-  }
-  return res;
-}
--- a/lib/Epub/Epub/htmlEntities.h
+++ b/lib/Epub/Epub/htmlEntities.h
@@ -1,7 +0,0 @@
-// from
-// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
-
-#pragma once
-#include <string>
-
-std::string replaceHtmlEntities(const char* text);
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -6,7 +6,6 @@
 #include <expat.h>

 #include "../Page.h"
-#include "../htmlEntities.h"

 const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
 constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
@@ -130,7 +129,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        self->partWordBuffer[self->partWordBufferIndex] = '\0';
-        self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
+        self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
        self->partWordBufferIndex = 0;
      }
      // Skip the whitespace char
@@ -155,7 +154,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
    // If we're about to run out of space, then cut the word off and start a new one
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      self->partWordBuffer[self->partWordBufferIndex] = '\0';
-      self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
+      self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
      self->partWordBufferIndex = 0;
    }

@@ -197,7 +196,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
      }

      self->partWordBuffer[self->partWordBufferIndex] = '\0';
-      self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
+      self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
      self->partWordBufferIndex = 0;
    }
  }