From 2b12a65011293f4cf8b75094027664f2fe0a314b Mon Sep 17 00:00:00 2001 From: Dave Allie Date: Wed, 7 Jan 2026 22:08:43 +1000 Subject: [PATCH] Remove HTML entity parsing (#274) ## Summary * Remove HTML entity parsing * This has been completely useless since the introduction of expat * expat tries to parse all entities in the document, but only knows of HTML ones * Parsing will never end with HTML entities in the text, so the additional step to parse them that we had went completely unused * We should figure out the best way to parse that content in the future, but for now remove that module as it generates a lot of heap allocations with its map and strings --- lib/Epub/Epub/htmlEntities.cpp | 163 ------------------ lib/Epub/Epub/htmlEntities.h | 7 - .../Epub/parsers/ChapterHtmlSlimParser.cpp | 7 +- 3 files changed, 3 insertions(+), 174 deletions(-) delete mode 100644 lib/Epub/Epub/htmlEntities.cpp delete mode 100644 lib/Epub/Epub/htmlEntities.h diff --git a/lib/Epub/Epub/htmlEntities.cpp b/lib/Epub/Epub/htmlEntities.cpp deleted file mode 100644 index f44a158..0000000 --- a/lib/Epub/Epub/htmlEntities.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// from -// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp - -#include "htmlEntities.h" - -#include -#include - -const int MAX_ENTITY_LENGTH = 10; - -// Use book: entities_ww2.epub to test this (Page 7: Entities parser test) -// Note the supported keys are only in lowercase -// Store the mappings in a unordered hash map -static std::unordered_map entity_lookup( - {{""", "\""}, {"⁄", "⁄"}, {"&", "&"}, {"<", "<"}, {">", ">"}, - {"À", "À"}, {"Á", "Á"}, {"Â", "Â"}, {"Ã", "Ã"}, {"Ä", "Ä"}, - {"Å", "Å"}, {"Æ", "Æ"}, {"Ç", "Ç"}, {"È", "È"}, {"É", "É"}, - {"Ê", "Ê"}, {"Ë", "Ë"}, {"Ì", "Ì"}, {"Í", "Í"}, {"Î", "Î"}, - {"Ï", "Ï"}, {"Ð", "Ð"}, {"Ñ", "Ñ"}, {"Ò", "Ò"}, {"Ó", "Ó"}, - {"Ô", "Ô"}, {"Õ", "Õ"}, {"Ö", "Ö"}, {"Ø", "Ø"}, {"Ù", "Ù"}, - {"Ú", "Ú"}, {"Û", "Û"}, {"Ü", "Ü"}, {"Ý", "Ý"}, {"Þ", "Þ"}, - {"ß", "ß"}, {"à", "à"}, {"á", "á"}, {"â", "â"}, {"ã", "ã"}, - {"ä", "ä"}, {"å", "å"}, {"æ", "æ"}, {"ç", "ç"}, {"è", "è"}, - {"é", "é"}, {"ê", "ê"}, {"ë", "ë"}, {"ì", "ì"}, {"í", "í"}, - {"î", "î"}, {"ï", "ï"}, {"ð", "ð"}, {"ñ", "ñ"}, {"ò", "ò"}, - {"ó", "ó"}, {"ô", "ô"}, {"õ", "õ"}, {"ö", "ö"}, {"ø", "ø"}, - {"ù", "ù"}, {"ú", "ú"}, {"û", "û"}, {"ü", "ü"}, {"ý", "ý"}, - {"þ", "þ"}, {"ÿ", "ÿ"}, {" ", " "}, {"¡", "¡"}, {"¢", "¢"}, - {"£", "£"}, {"¤", "¤"}, {"¥", "¥"}, {"¦", "¦"}, {"§", "§"}, - {"¨", "¨"}, {"©", "©"}, {"ª", "ª"}, {"«", "«"}, {"¬", "¬"}, - {"­", "­"}, {"®", "®"}, {"¯", "¯"}, {"°", "°"}, {"±", "±"}, - {"²", "²"}, {"³", "³"}, {"´", "´"}, {"µ", "µ"}, {"¶", "¶"}, - {"¸", "¸"}, {"¹", "¹"}, {"º", "º"}, {"»", "»"}, {"¼", "¼"}, - {"½", "½"}, {"¾", "¾"}, {"¿", "¿"}, {"×", "×"}, {"÷", "÷"}, - {"∀", "∀"}, {"∂", "∂"}, {"∃", "∃"}, {"∅", "∅"}, {"∇", "∇"}, - {"∈", "∈"}, {"∉", "∉"}, {"∋", "∋"}, {"∏", "∏"}, {"∑", "∑"}, - {"−", "−"}, {"∗", "∗"}, {"√", "√"}, {"∝", "∝"}, {"∞", "∞"}, - {"∠", "∠"}, {"∧", "∧"}, {"∨", "∨"}, {"∩", "∩"}, {"∪", "∪"}, - {"∫", "∫"}, {"∴", "∴"}, {"∼", "∼"}, {"≅", "≅"}, {"≈", "≈"}, - {"≠", "≠"}, {"≡", "≡"}, {"≤", "≤"}, {"≥", "≥"}, {"⊂", "⊂"}, - {"⊃", "⊃"}, {"⊄", "⊄"}, {"⊆", "⊆"}, {"⊇", "⊇"}, {"⊕", "⊕"}, - {"⊗", "⊗"}, {"⊥", "⊥"}, {"⋅", "⋅"}, {"Α", "Α"}, {"Β", "Β"}, - {"Γ", "Γ"}, {"Δ", "Δ"}, {"Ε", "Ε"}, {"Ζ", "Ζ"}, {"Η", "Η"}, - {"Θ", "Θ"}, {"Ι", "Ι"}, {"Κ", "Κ"}, {"Λ", "Λ"}, {"Μ", "Μ"}, - {"Ν", "Ν"}, {"Ξ", "Ξ"}, {"Ο", "Ο"}, {"Π", "Π"}, {"Ρ", "Ρ"}, - {"Σ", "Σ"}, {"Τ", "Τ"}, {"Υ", "Υ"}, {"Φ", "Φ"}, {"Χ", "Χ"}, - {"Ψ", "Ψ"}, {"Ω", "Ω"}, {"α", "α"}, {"β", "β"}, {"γ", "γ"}, - {"δ", "δ"}, {"ε", "ε"}, {"ζ", "ζ"}, {"η", "η"}, {"θ", "θ"}, - {"ι", "ι"}, {"κ", "κ"}, {"λ", "λ"}, {"μ", "μ"}, {"ν", "ν"}, - {"ξ", "ξ"}, {"ο", "ο"}, {"π", "π"}, {"ρ", "ρ"}, {"ς", "ς"}, - {"σ", "σ"}, {"τ", "τ"}, {"υ", "υ"}, {"φ", "φ"}, {"χ", "χ"}, - {"ψ", "ψ"}, {"ω", "ω"}, {"ϑ", "ϑ"}, {"ϒ", "ϒ"}, {"ϖ", "ϖ"}, - {"Œ", "Œ"}, {"œ", "œ"}, {"Š", "Š"}, {"š", "š"}, {"Ÿ", "Ÿ"}, - {"ƒ", "ƒ"}, {"ˆ", "ˆ"}, {"˜", "˜"}, {" ", ""}, {" ", ""}, - {" ", ""}, {"‌", "‌"}, {"‍", "‍"}, {"‎", "‎"}, {"‏", "‏"}, - {"–", "–"}, {"—", "—"}, {"‘", "‘"}, {"’", "’"}, {"‚", "‚"}, - {"“", "“"}, {"”", "”"}, {"„", "„"}, {"†", "†"}, {"‡", "‡"}, - {"•", "•"}, {"…", "…"}, {"‰", "‰"}, {"′", "′"}, {"″", "″"}, - {"‹", "‹"}, {"›", "›"}, {"‾", "‾"}, {"€", "€"}, {"™", "™"}, - {"←", "←"}, {"↑", "↑"}, {"→", "→"}, {"↓", "↓"}, {"↔", "↔"}, - {"↵", "↵"}, {"⌈", "⌈"}, {"⌉", "⌉"}, {"⌊", "⌊"}, {"⌋", "⌋"}, - {"◊", "◊"}, {"♠", "♠"}, {"♣", "♣"}, {"♥", "♥"}, {"♦", "♦"}}); - -// converts from a unicode code point to the utf8 equivalent -void convert_to_utf8(const int code, std::string& res) { - // convert to a utf8 sequence - if (code < 0x80) { - res += static_cast(code); - } else if (code < 0x800) { - res += static_cast(0xc0 | (code >> 6)); - res += static_cast(0x80 | (code & 0x3f)); - } else if (code < 0x10000) { - res += static_cast(0xe0 | (code >> 12)); - res += static_cast(0x80 | ((code >> 6) & 0x3f)); - res += static_cast(0x80 | (code & 0x3f)); - } else if (code < 0x200000) { - res += static_cast(0xf0 | (code >> 18)); - res += static_cast(0x80 | ((code >> 12) & 0x3f)); - res += static_cast(0x80 | ((code >> 6) & 0x3f)); - res += static_cast(0x80 | (code & 0x3f)); - } else if (code < 0x4000000) { - res += static_cast(0xf8 | (code >> 24)); - res += static_cast(0x80 | ((code >> 18) & 0x3f)); - res += static_cast(0x80 | ((code >> 12) & 0x3f)); - res += static_cast(0x80 | ((code >> 6) & 0x3f)); - res += static_cast(0x80 | (code & 0x3f)); - } else if (code < 0x80000000) { - res += static_cast(0xfc | (code >> 30)); - res += static_cast(0x80 | ((code >> 24) & 0x3f)); - res += static_cast(0x80 | ((code >> 18) & 0x3f)); - res += static_cast(0x80 | ((code >> 12) & 0x3f)); - res += static_cast(0x80 | ((code >> 6) & 0x3f)); - } -} - -// handles numeric entities - e.g. Ӓ or ሴ -bool process_numeric_entity(const std::string& entity, std::string& res) { - int code = 0; - // is it hex? - if (entity[2] == 'x' || entity[2] == 'X') { - // parse the hex code - code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16); - } else { - code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10); - } - if (code != 0) { - // special handling for nbsp - if (code == 0xA0) { - res += " "; - } else { - convert_to_utf8(code, res); - } - return true; - } - return false; -} - -// handles named entities - e.g. & -bool process_string_entity(const std::string& entity, std::string& res) { - // it's a named entity - find it in the lookup table - // find it in the map - const auto it = entity_lookup.find(entity); - if (it != entity_lookup.end()) { - res += it->second; - return true; - } - return false; -} - -// replace all the entities in the string -std::string replaceHtmlEntities(const char* text) { - std::string res; - res.reserve(strlen(text)); - for (int i = 0; i < strlen(text); ++i) { - bool flag = false; - // do we have a potential entity? - if (text[i] == '&') { - // find the end of the entity - int j = i + 1; - while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) { - j++; - } - if (j - i > 2) { - char entity[j - i + 1]; - strncpy(entity, text + i, j - i); - // is it a numeric code? - if (entity[1] == '#') { - flag = process_numeric_entity(entity, res); - } else { - flag = process_string_entity(entity, res); - } - // skip past the entity if we successfully decoded it - if (flag) { - i = j; - } - } - } - if (!flag) { - res += text[i]; - } - } - return res; -} diff --git a/lib/Epub/Epub/htmlEntities.h b/lib/Epub/Epub/htmlEntities.h deleted file mode 100644 index 109f717..0000000 --- a/lib/Epub/Epub/htmlEntities.h +++ /dev/null @@ -1,7 +0,0 @@ -// from -// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp - -#pragma once -#include - -std::string replaceHtmlEntities(const char* text); diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index e5eb4d1..b96d28f 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -6,7 +6,6 @@ #include #include "../Page.h" -#include "../htmlEntities.h" const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"}; constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]); @@ -130,7 +129,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it if (self->partWordBufferIndex > 0) { self->partWordBuffer[self->partWordBufferIndex] = '\0'; - self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle); + self->currentTextBlock->addWord(self->partWordBuffer, fontStyle); self->partWordBufferIndex = 0; } // Skip the whitespace char @@ -155,7 +154,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char // If we're about to run out of space, then cut the word off and start a new one if (self->partWordBufferIndex >= MAX_WORD_SIZE) { self->partWordBuffer[self->partWordBufferIndex] = '\0'; - self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle); + self->currentTextBlock->addWord(self->partWordBuffer, fontStyle); self->partWordBufferIndex = 0; } @@ -197,7 +196,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n } self->partWordBuffer[self->partWordBufferIndex] = '\0'; - self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle); + self->currentTextBlock->addWord(self->partWordBuffer, fontStyle); self->partWordBufferIndex = 0; } }