Remove HTML entity parsing (#274)

## Summary
* Remove HTML entity parsing
  * This has been completely useless since the introduction of expat
  * expat tries to parse all entities in the document, but only knows the predefined XML ones
  * Parsing never completes successfully when HTML entities are present in the text, so the additional step we had to replace them went completely unused
  * We should figure out the best way to parse that content in the future, but for now remove that module, as it generates a lot of heap allocations with its map and strings
2026-01-07 22:08:43 +10:00
parent 46fa186b82
commit 2b12a65011
3 changed files with 3 additions and 174 deletions
--- a/lib/Epub/Epub/htmlEntities.cpp
+++ b/lib/Epub/Epub/htmlEntities.cpp
@@ -1,163 +0,0 @@
 // from
 // https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
 #include "htmlEntities.h"
 #include <cstring>
 #include <unordered_map>
 // Upper bound on how far replaceHtmlEntities() scans past an '&' while
 // looking for the terminating ';' of a candidate entity.
 const int MAX_ENTITY_LENGTH = 10;
 // Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
 // Lookup table from a named HTML entity to its replacement text (UTF-8).
 // Keys are matched exactly (case-sensitively) and include both the leading
 // '&' and the trailing ';', so "&Agrave;" and "&agrave;" are distinct entries.
 // The mappings are stored in an unordered hash map that is built once during
 // static initialisation.
 static std::unordered_map<std::string, std::string> entity_lookup(
    {{"&quot;", "\""},  {"&frasl;", "⁄"},   {"&amp;", "&"},      {"&lt;", "<"},     {"&gt;", ">"},
     {"&Agrave;", "À"}, {"&Aacute;", "Á"},  {"&Acirc;", "Â"},    {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
     {"&Aring;", "Å"},  {"&AElig;", "Æ"},   {"&Ccedil;", "Ç"},   {"&Egrave;", "È"}, {"&Eacute;", "É"},
     {"&Ecirc;", "Ê"},  {"&Euml;", "Ë"},    {"&Igrave;", "Ì"},   {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
     {"&Iuml;", "Ï"},   {"&ETH;", "Ð"},     {"&Ntilde;", "Ñ"},   {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
     {"&Ocirc;", "Ô"},  {"&Otilde;", "Õ"},  {"&Ouml;", "Ö"},     {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
     {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"},   {"&Uuml;", "Ü"},     {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
     {"&szlig;", "ß"},  {"&agrave;", "à"},  {"&aacute;", "á"},   {"&acirc;", "â"},  {"&atilde;", "ã"},
     {"&auml;", "ä"},   {"&aring;", "å"},   {"&aelig;", "æ"},    {"&ccedil;", "ç"}, {"&egrave;", "è"},
     {"&eacute;", "é"}, {"&ecirc;", "ê"},   {"&euml;", "ë"},     {"&igrave;", "ì"}, {"&iacute;", "í"},
     {"&icirc;", "î"},  {"&iuml;", "ï"},    {"&eth;", "ð"},      {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
     {"&oacute;", "ó"}, {"&ocirc;", "ô"},   {"&otilde;", "õ"},   {"&ouml;", "ö"},   {"&oslash;", "ø"},
     {"&ugrave;", "ù"}, {"&uacute;", "ú"},  {"&ucirc;", "û"},    {"&uuml;", "ü"},   {"&yacute;", "ý"},
     {"&thorn;", "þ"},  {"&yuml;", "ÿ"},    {"&nbsp;", " "},     {"&iexcl;", "¡"},  {"&cent;", "¢"},
     {"&pound;", "£"},  {"&curren;", "¤"},  {"&yen;", "¥"},      {"&brvbar;", "¦"}, {"&sect;", "§"},
     {"&uml;", "¨"},    {"&copy;", "©"},    {"&ordf;", "ª"},     {"&laquo;", "«"},  {"&not;", "¬"},
     {"&shy;", "­"},    {"&reg;", "®"},     {"&macr;", "¯"},     {"&deg;", "°"},    {"&plusmn;", "±"},
     {"&sup2;", "²"},   {"&sup3;", "³"},    {"&acute;", "´"},    {"&micro;", "µ"},  {"&para;", "¶"},
     {"&cedil;", "¸"},  {"&sup1;", "¹"},    {"&ordm;", "º"},     {"&raquo;", "»"},  {"&frac14;", "¼"},
     {"&frac12;", "½"}, {"&frac34;", "¾"},  {"&iquest;", "¿"},   {"&times;", "×"},  {"&divide;", "÷"},
     {"&forall;", "∀"}, {"&part;", "∂"},    {"&exist;", "∃"},    {"&empty;", "∅"},  {"&nabla;", "∇"},
     {"&isin;", "∈"},   {"&notin;", "∉"},   {"&ni;", "∋"},       {"&prod;", "∏"},   {"&sum;", "∑"},
     {"&minus;", "−"},  {"&lowast;", "∗"},  {"&radic;", "√"},    {"&prop;", "∝"},   {"&infin;", "∞"},
     {"&ang;", "∠"},    {"&and;", "∧"},     {"&or;", "∨"},       {"&cap;", "∩"},    {"&cup;", "∪"},
     {"&int;", "∫"},    {"&there4;", "∴"},  {"&sim;", "∼"},      {"&cong;", "≅"},   {"&asymp;", "≈"},
     {"&ne;", "≠"},     {"&equiv;", "≡"},   {"&le;", "≤"},       {"&ge;", "≥"},     {"&sub;", "⊂"},
     {"&sup;", "⊃"},    {"&nsub;", "⊄"},    {"&sube;", "⊆"},     {"&supe;", "⊇"},   {"&oplus;", "⊕"},
     {"&otimes;", "⊗"}, {"&perp;", "⊥"},    {"&sdot;", "⋅"},     {"&Alpha;", "Α"},  {"&Beta;", "Β"},
     {"&Gamma;", "Γ"},  {"&Delta;", "Δ"},   {"&Epsilon;", "Ε"},  {"&Zeta;", "Ζ"},   {"&Eta;", "Η"},
     {"&Theta;", "Θ"},  {"&Iota;", "Ι"},    {"&Kappa;", "Κ"},    {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
     {"&Nu;", "Ν"},     {"&Xi;", "Ξ"},      {"&Omicron;", "Ο"},  {"&Pi;", "Π"},     {"&Rho;", "Ρ"},
     {"&Sigma;", "Σ"},  {"&Tau;", "Τ"},     {"&Upsilon;", "Υ"},  {"&Phi;", "Φ"},    {"&Chi;", "Χ"},
     {"&Psi;", "Ψ"},    {"&Omega;", "Ω"},   {"&alpha;", "α"},    {"&beta;", "β"},   {"&gamma;", "γ"},
     {"&delta;", "δ"},  {"&epsilon;", "ε"}, {"&zeta;", "ζ"},     {"&eta;", "η"},    {"&theta;", "θ"},
     {"&iota;", "ι"},   {"&kappa;", "κ"},   {"&lambda;", "λ"},   {"&mu;", "μ"},     {"&nu;", "ν"},
     {"&xi;", "ξ"},     {"&omicron;", "ο"}, {"&pi;", "π"},       {"&rho;", "ρ"},    {"&sigmaf;", "ς"},
     {"&sigma;", "σ"},  {"&tau;", "τ"},     {"&upsilon;", "υ"},  {"&phi;", "φ"},    {"&chi;", "χ"},
     {"&psi;", "ψ"},    {"&omega;", "ω"},   {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"},  {"&piv;", "ϖ"},
     {"&OElig;", "Œ"},  {"&oelig;", "œ"},   {"&Scaron;", "Š"},   {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
     {"&fnof;", "ƒ"},   {"&circ;", "ˆ"},    {"&tilde;", "˜"},    {"&ensp;", ""},    {"&emsp;", ""},
     {"&thinsp;", ""},  {"&zwnj;", "‌"},  {"&zwj;", "‍"},    {"&lrm;", "‎"},  {"&rlm;", "‏"},
     {"&ndash;", "–"},  {"&mdash;", "—"},   {"&lsquo;", "‘"},    {"&rsquo;", "’"},  {"&sbquo;", "‚"},
     {"&ldquo;", "“"},  {"&rdquo;", "”"},   {"&bdquo;", "„"},    {"&dagger;", "†"}, {"&Dagger;", "‡"},
     {"&bull;", "•"},   {"&hellip;", "…"},  {"&permil;", "‰"},   {"&prime;", "′"},  {"&Prime;", "″"},
     {"&lsaquo;", "‹"}, {"&rsaquo;", "›"},  {"&oline;", "‾"},    {"&euro;", "€"},   {"&trade;", "™"},
     {"&larr;", "←"},   {"&uarr;", "↑"},    {"&rarr;", "→"},     {"&darr;", "↓"},   {"&harr;", "↔"},
     {"&crarr;", "↵"},  {"&lceil;", "⌈"},   {"&rceil;", "⌉"},    {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
     {"&loz;", "◊"},    {"&spades;", "♠"},  {"&clubs;", "♣"},    {"&hearts;", "♥"}, {"&diams;", "♦"}});
 // Appends the UTF-8 encoding of the Unicode code point `code` to `res`.
 //
 // Valid scalar values (U+0000..U+10FFFF, excluding the surrogate range
 // U+D800..U+DFFF) are encoded in one to four bytes. Anything else - negative
 // values, surrogates, or values above U+10FFFF - has no well-formed UTF-8
 // representation, so U+FFFD REPLACEMENT CHARACTER is appended instead of
 // emitting an invalid byte sequence.
 void convert_to_utf8(const int code, std::string& res) {
   const bool isSurrogate = code >= 0xD800 && code <= 0xDFFF;
   if (code < 0 || code > 0x10FFFF || isSurrogate) {
     // UTF-8 encoding of U+FFFD
     res += "\xEF\xBF\xBD";
     return;
   }
   if (code < 0x80) {
     // 1 byte: 0xxxxxxx
     res += static_cast<char>(code);
   } else if (code < 0x800) {
     // 2 bytes: 110xxxxx 10xxxxxx
     res += static_cast<char>(0xc0 | (code >> 6));
     res += static_cast<char>(0x80 | (code & 0x3f));
   } else if (code < 0x10000) {
     // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
     res += static_cast<char>(0xe0 | (code >> 12));
     res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
     res += static_cast<char>(0x80 | (code & 0x3f));
   } else {
     // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     res += static_cast<char>(0xf0 | (code >> 18));
     res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
     res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
     res += static_cast<char>(0x80 | (code & 0x3f));
   }
 }
 // Handles numeric entities - e.g. &#1234; or &#x1234;
 //
 // `entity` must start with "&#" and may or may not include the trailing ';'
 // (both forms are parsed to the same value). On success the decoded character
 // is appended to `res` and true is returned; otherwise `res` is left untouched
 // and false is returned. The entity is rejected when it has no digits, has
 // anything other than a single final ';' after the digits, evaluates to 0, or
 // exceeds U+10FFFF.
 bool process_numeric_entity(const std::string& entity, std::string& res) {
   // need at least "&#" plus one more character
   if (entity.size() < 3 || entity[0] != '&' || entity[1] != '#') {
     return false;
   }
   const bool isHex = entity[2] == 'x' || entity[2] == 'X';
   const int base = isHex ? 16 : 10;
   const std::size_t firstDigit = isHex ? 3 : 2;
   // Parse the digits by hand rather than with strtol: strtol also accepts
   // leading whitespace, a sign and a "0x" prefix, and its long result would be
   // silently truncated when stored in an int.
   int code = 0;
   std::size_t pos = firstDigit;
   for (; pos < entity.size(); ++pos) {
     const char c = entity[pos];
     int digit = 0;
     if (c >= '0' && c <= '9') {
       digit = c - '0';
     } else if (isHex && c >= 'a' && c <= 'f') {
       digit = c - 'a' + 10;
     } else if (isHex && c >= 'A' && c <= 'F') {
       digit = c - 'A' + 10;
     } else {
       break;
     }
     code = code * base + digit;
     // bail out before the accumulator can overflow
     if (code > 0x10FFFF) {
       return false;
     }
   }
   if (pos == firstDigit) {
     // no digits at all
     return false;
   }
   const bool atEnd = pos == entity.size();
   const bool atFinalSemicolon = pos + 1 == entity.size() && entity[pos] == ';';
   if (!atEnd && !atFinalSemicolon) {
     return false;
   }
   if (code == 0) {
     return false;
   }
   // special handling for nbsp: emit a regular space
   if (code == 0xA0) {
     res += " ";
   } else {
     convert_to_utf8(code, res);
   }
   return true;
 }
 // Handles named entities - e.g. &amp;
 // Looks `entity` up in the entity table; when present, its replacement text is
 // appended to `res` and true is returned. Unknown entities yield false and
 // leave `res` unchanged.
 bool process_string_entity(const std::string& entity, std::string& res) {
   const auto match = entity_lookup.find(entity);
   if (match == entity_lookup.end()) {
     return false;
   }
   res += match->second;
   return true;
 }
 // Replaces all the entities in the string.
 //
 // Scans `text` for candidates of the form "&...;" and replaces each one that
 // process_numeric_entity() / process_string_entity() can decode. Anything that
 // is not a recognised, ';'-terminated entity is copied through unchanged.
 // A null `text` yields an empty string.
 std::string replaceHtmlEntities(const char* text) {
   std::string res;
   if (text == nullptr) {
     return res;
   }
   // measure once instead of calling strlen on every loop iteration
   const std::size_t length = strlen(text);
   res.reserve(length);
   const std::size_t maxEntityLength = static_cast<std::size_t>(MAX_ENTITY_LENGTH);
   for (std::size_t i = 0; i < length; ++i) {
     bool decoded = false;
     // do we have a potential entity?
     if (text[i] == '&') {
       // find the end of the entity, giving up after maxEntityLength characters
       std::size_t j = i + 1;
       while (j < length && text[j] != ';' && j - i < maxEntityLength) {
         ++j;
       }
       // only treat it as an entity if a terminating ';' was actually found
       if (j < length && text[j] == ';' && j - i > 2) {
         // include the ';' - the named-entity table keys end with it
         const std::string entity(text + i, j - i + 1);
         // is it a numeric code?
         if (entity[1] == '#') {
           decoded = process_numeric_entity(entity, res);
         } else {
           decoded = process_string_entity(entity, res);
         }
         // skip past the entity if we successfully decoded it; the loop
         // increment then steps over the ';'
         if (decoded) {
           i = j;
         }
       }
     }
     if (!decoded) {
       res += text[i];
     }
   }
   return res;
 }
--- a/lib/Epub/Epub/htmlEntities.h
+++ b/lib/Epub/Epub/htmlEntities.h
@@ -1,7 +0,0 @@
 // from
 // https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
 #pragma once
 #include <string>
 // Returns a copy of the NUL-terminated string `text` in which the HTML
 // entities that the implementation in htmlEntities.cpp can decode (named
 // entities such as &amp; and numeric ones such as &#1234; / &#x1234;) have
 // been replaced; all other characters are copied through unchanged.
 std::string replaceHtmlEntities(const char* text);
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -6,7 +6,6 @@
 #include <expat.h>
 #include "../Page.h"
 #include "../htmlEntities.h"
 const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
 constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
@@ -130,7 +129,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        self->partWordBuffer[self->partWordBufferIndex] = '\0';
-        self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
+        self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
        self->partWordBufferIndex = 0;
      }
      // Skip the whitespace char
@@ -155,7 +154,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
    // If we're about to run out of space, then cut the word off and start a new one
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      self->partWordBuffer[self->partWordBufferIndex] = '\0';
-      self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
+      self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
      self->partWordBufferIndex = 0;
    }
@@ -197,7 +196,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
      }
      self->partWordBuffer[self->partWordBufferIndex] = '\0';
-      self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
+      self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
      self->partWordBufferIndex = 0;
    }
  }