Public release

lib/Epub/Epub.cpp (new file)
@@ -0,0 +1,383 @@
#include "Epub.h"
|
||||
|
||||
#include <HardwareSerial.h>
|
||||
#include <SD.h>
|
||||
#include <ZipFile.h>
|
||||
#include <tinyxml2.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
|
||||
// open up the meta data to find where the content.opf file lives
|
||||
size_t s;
|
||||
const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s);
|
||||
if (!metaInfo) {
|
||||
Serial.println("Could not find META-INF/container.xml");
|
||||
return false;
|
||||
}
|
||||
|
||||
// parse the meta data
|
||||
tinyxml2::XMLDocument metaDataDoc;
|
||||
const auto result = metaDataDoc.Parse(metaInfo);
|
||||
free(metaInfo);
|
||||
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Could not parse META-INF/container.xml. Error: %d\n", result);
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto container = metaDataDoc.FirstChildElement("container");
|
||||
if (!container) {
|
||||
Serial.println("Could not find container element in META-INF/container.xml");
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto rootfiles = container->FirstChildElement("rootfiles");
|
||||
if (!rootfiles) {
|
||||
Serial.println("Could not find rootfiles element in META-INF/container.xml");
|
||||
return false;
|
||||
}
|
||||
|
||||
// find the root file that has the media-type="application/oebps-package+xml"
|
||||
auto rootfile = rootfiles->FirstChildElement("rootfile");
|
||||
while (rootfile) {
|
||||
const char* mediaType = rootfile->Attribute("media-type");
|
||||
if (mediaType && strcmp(mediaType, "application/oebps-package+xml") == 0) {
|
||||
const char* full_path = rootfile->Attribute("full-path");
|
||||
if (full_path) {
|
||||
contentOpfFile = full_path;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
rootfile = rootfile->NextSiblingElement("rootfile");
|
||||
}
|
||||
|
||||
Serial.println("Could not get path to content.opf file");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
|
||||
// read in the content.opf file and parse it
|
||||
auto contents = zip.readTextFileToMemory(content_opf_file.c_str());
|
||||
|
||||
// parse the contents
|
||||
tinyxml2::XMLDocument doc;
|
||||
auto result = doc.Parse(contents);
|
||||
free(contents);
|
||||
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Error parsing content.opf - %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto package = doc.FirstChildElement("package");
|
||||
if (!package) package = doc.FirstChildElement("opf:package");
|
||||
|
||||
if (!package) {
|
||||
Serial.println("Could not find package element in content.opf");
|
||||
return false;
|
||||
}
|
||||
|
||||
// get the metadata - title and cover image
|
||||
auto metadata = package->FirstChildElement("metadata");
|
||||
if (!metadata) metadata = package->FirstChildElement("opf:metadata");
|
||||
if (!metadata) {
|
||||
Serial.println("Missing metadata");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto titleEl = metadata->FirstChildElement("dc:title");
|
||||
if (!titleEl) {
|
||||
Serial.println("Missing title");
|
||||
return false;
|
||||
}
|
||||
const auto titleText = titleEl->GetText();
this->title = titleText ? titleText : "";
|
||||
|
||||
auto cover = metadata->FirstChildElement("meta");
|
||||
if (!cover) cover = metadata->FirstChildElement("opf:meta");
|
||||
while (cover && cover->Attribute("name") && strcmp(cover->Attribute("name"), "cover") != 0) {
|
||||
cover = cover->NextSiblingElement("meta");
|
||||
}
|
||||
if (!cover) {
|
||||
Serial.println("Missing cover");
|
||||
}
|
||||
auto coverItem = cover ? cover->Attribute("content") : nullptr;
|
||||
|
||||
// read the manifest and spine
|
||||
// the manifest gives us the names of the files
|
||||
// the spine gives us the order of the files
|
||||
// we can then read the files in the order they are in the spine
|
||||
auto manifest = package->FirstChildElement("manifest");
|
||||
if (!manifest) manifest = package->FirstChildElement("opf:manifest");
|
||||
if (!manifest) {
|
||||
Serial.println("Missing manifest");
|
||||
return false;
|
||||
}
|
||||
|
||||
// create a mapping from id to file name
|
||||
auto item = manifest->FirstChildElement("item");
|
||||
if (!item) item = manifest->FirstChildElement("opf:item");
|
||||
std::map<std::string, std::string> items;
|
||||
|
||||
while (item) {
|
||||
std::string itemId = item->Attribute("id");
|
||||
std::string href = contentBasePath + item->Attribute("href");
|
||||
|
||||
// grab the cover image
|
||||
if (coverItem && itemId == coverItem) {
|
||||
coverImageItem = href;
|
||||
}
|
||||
|
||||
// grab the ncx file
|
||||
if (itemId == "ncx" || itemId == "ncxtoc") {
|
||||
tocNcxItem = href;
|
||||
}
|
||||
|
||||
items[itemId] = href;
|
||||
auto nextItem = item->NextSiblingElement("item");
|
||||
if (!nextItem) nextItem = item->NextSiblingElement("opf:item");
|
||||
item = nextItem;
|
||||
}
|
||||
|
||||
// find the spine
|
||||
auto spineEl = package->FirstChildElement("spine");
|
||||
if (!spineEl) spineEl = package->FirstChildElement("opf:spine");
|
||||
if (!spineEl) {
|
||||
Serial.println("Missing spine");
|
||||
return false;
|
||||
}
|
||||
|
||||
// read the spine
|
||||
auto itemref = spineEl->FirstChildElement("itemref");
|
||||
if (!itemref) itemref = spineEl->FirstChildElement("opf:itemref");
|
||||
while (itemref) {
|
||||
auto id = itemref->Attribute("idref");
|
||||
if (items.find(id) != items.end()) {
|
||||
spine.emplace_back(id, items[id]);
|
||||
}
|
||||
auto nextItemRef = itemref->NextSiblingElement("itemref");
|
||||
if (!nextItemRef) nextItemRef = itemref->NextSiblingElement("opf:itemref");
|
||||
itemref = nextItemRef;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Epub::parseTocNcxFile(ZipFile& zip) {
|
||||
// the ncx file should have been specified in the content.opf file
|
||||
if (tocNcxItem.empty()) {
|
||||
Serial.println("No ncx file specified");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str());
|
||||
if (!ncxData) {
|
||||
Serial.printf("Could not find %s\n", tocNcxItem.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse the Toc contents
|
||||
tinyxml2::XMLDocument doc;
|
||||
auto result = doc.Parse(ncxData);
|
||||
free(ncxData);
|
||||
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Error parsing toc %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto ncx = doc.FirstChildElement("ncx");
|
||||
if (!ncx) {
|
||||
Serial.println("Could not find first child ncx in toc");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto navMap = ncx->FirstChildElement("navMap");
|
||||
if (!navMap) {
|
||||
Serial.println("Could not find navMap child in ncx");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto navPoint = navMap->FirstChildElement("navPoint");
|
||||
|
||||
// Fills toc map
|
||||
while (navPoint) {
|
||||
std::string navTitle = navPoint->FirstChildElement("navLabel")->FirstChildElement("text")->FirstChild()->Value();
|
||||
auto content = navPoint->FirstChildElement("content");
|
||||
std::string href = contentBasePath + content->Attribute("src");
|
||||
// split the href on the # to get the href and the anchor
|
||||
size_t pos = href.find('#');
|
||||
std::string anchor;
|
||||
|
||||
if (pos != std::string::npos) {
|
||||
anchor = href.substr(pos + 1);
|
||||
href = href.substr(0, pos);
|
||||
}
|
||||
|
||||
toc.emplace_back(navTitle, href, anchor, 0);
|
||||
navPoint = navPoint->NextSiblingElement("navPoint");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// load in the meta data for the epub file
|
||||
bool Epub::load() {
|
||||
ZipFile zip("/sd" + filepath);
|
||||
|
||||
std::string contentOpfFile;
|
||||
if (!findContentOpfFile(zip, contentOpfFile)) {
|
||||
Serial.println("Could not open ePub");
|
||||
return false;
|
||||
}
|
||||
|
||||
contentBasePath = contentOpfFile.substr(0, contentOpfFile.find_last_of('/') + 1);
|
||||
|
||||
if (!parseContentOpf(zip, contentOpfFile)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!parseTocNcxFile(zip)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Epub::clearCache() const { SD.rmdir(cachePath.c_str()); }
|
||||
|
||||
void Epub::setupCacheDir() const {
|
||||
if (SD.exists(cachePath.c_str())) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Loop over each segment of the cache path and create directories as needed
|
||||
for (size_t i = 1; i < cachePath.length(); i++) {
|
||||
if (cachePath[i] == '/') {
|
||||
SD.mkdir(cachePath.substr(0, i).c_str());
|
||||
}
|
||||
}
|
||||
SD.mkdir(cachePath.c_str());
|
||||
}
|
||||
|
||||
const std::string& Epub::getCachePath() const { return cachePath; }
|
||||
|
||||
const std::string& Epub::getPath() const { return filepath; }
|
||||
|
||||
const std::string& Epub::getTitle() const { return title; }
|
||||
|
||||
const std::string& Epub::getCoverImageItem() const { return coverImageItem; }
|
||||
|
||||
std::string normalisePath(const std::string& path) {
|
||||
std::vector<std::string> components;
|
||||
std::string component;
|
||||
|
||||
for (const auto c : path) {
|
||||
if (c == '/') {
|
||||
if (!component.empty()) {
|
||||
if (component == "..") {
|
||||
if (!components.empty()) {
|
||||
components.pop_back();
|
||||
}
|
||||
} else {
|
||||
components.push_back(component);
|
||||
}
|
||||
component.clear();
|
||||
}
|
||||
} else {
|
||||
component += c;
|
||||
}
|
||||
}
|
||||
|
||||
if (!component.empty()) {
|
||||
components.push_back(component);
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (const auto& c : components) {
|
||||
if (!result.empty()) {
|
||||
result += "/";
|
||||
}
|
||||
result += c;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
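// Worked examples for normalisePath (the paths are made up, not from a real book):
//   normalisePath("OEBPS/../images/cover.jpg") -> "images/cover.jpg"
//   normalisePath("/OEBPS//toc.ncx")           -> "OEBPS/toc.ncx"   (empty components are dropped)
//   normalisePath("OEBPS/./ch1.xhtml")         -> "OEBPS/./ch1.xhtml"  (only ".." is collapsed)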
|
||||
|
||||
uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const {
|
||||
const ZipFile zip("/sd" + filepath);
|
||||
const std::string path = normalisePath(itemHref);
|
||||
|
||||
const auto content = zip.readFileToMemory(path.c_str(), size);
|
||||
if (!content) {
|
||||
Serial.printf("Failed to read item %s\n", path.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const {
|
||||
const ZipFile zip("/sd" + filepath);
|
||||
const std::string path = normalisePath(itemHref);
|
||||
|
||||
const auto content = zip.readTextFileToMemory(path.c_str(), size);
|
||||
if (!content) {
|
||||
Serial.printf("Failed to read item %s\n", path.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
int Epub::getSpineItemsCount() const { return spine.size(); }
|
||||
|
||||
std::string& Epub::getSpineItem(const int spineIndex) {
|
||||
if (spineIndex < 0 || spineIndex >= spine.size()) {
|
||||
Serial.printf("getSpineItem index:%d is out of range\n", spineIndex);
|
||||
return spine.at(0).second;
|
||||
}
|
||||
|
||||
return spine.at(spineIndex).second;
|
||||
}
|
||||
|
||||
EpubTocEntry& Epub::getTocItem(const int tocIndex) {
if (tocIndex < 0 || tocIndex >= toc.size()) {
Serial.printf("getTocItem index:%d is out of range\n", tocIndex);
return toc.at(0);
}

return toc.at(tocIndex);
}
|
||||
|
||||
int Epub::getTocItemsCount() const { return toc.size(); }
|
||||
|
||||
// work out the section index for a toc index
|
||||
int Epub::getSpineIndexForTocIndex(const int tocIndex) const {
|
||||
// the toc entry should have an href that matches the spine item
|
||||
// so we can find the spine index by looking for the href
|
||||
for (int i = 0; i < spine.size(); i++) {
|
||||
if (spine[i].second == toc[tocIndex].href) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
Serial.println("Section not found");
|
||||
// not found - default to the start of the book
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Epub::getTocIndexForSpineIndex(const int spineIndex) const {
|
||||
// the toc entry should have an href that matches the spine item
|
||||
// so we can find the toc index by looking for the href
|
||||
Serial.printf("Looking for %s\n", spine[spineIndex].second.c_str());
|
||||
for (int i = 0; i < toc.size(); i++) {
|
||||
Serial.printf("Looking at %s\n", toc[i].href.c_str());
|
||||
if (toc[i].href == spine[spineIndex].second) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
Serial.println("TOC item not found");
|
||||
// not found - default to first item
|
||||
return 0;
|
||||
}

lib/Epub/Epub.h (new file)
@@ -0,0 +1,73 @@
#pragma once
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
class ZipFile;
|
||||
|
||||
class EpubTocEntry {
|
||||
public:
|
||||
std::string title;
|
||||
std::string href;
|
||||
std::string anchor;
|
||||
int level;
|
||||
EpubTocEntry(std::string title, std::string href, std::string anchor, const int level)
|
||||
: title(std::move(title)), href(std::move(href)), anchor(std::move(anchor)), level(level) {}
|
||||
};
|
||||
|
||||
class Epub {
|
||||
// the title read from the EPUB meta data
|
||||
std::string title;
|
||||
// the cover image
|
||||
std::string coverImageItem;
|
||||
// the ncx file
|
||||
std::string tocNcxItem;
|
||||
// where is the EPUB file?
|
||||
std::string filepath;
|
||||
// the spine of the EPUB file
|
||||
std::vector<std::pair<std::string, std::string>> spine;
|
||||
// the toc of the EPUB file
|
||||
std::vector<EpubTocEntry> toc;
|
||||
// the base path for items in the EPUB file
|
||||
std::string contentBasePath;
|
||||
// Unique cache key based on the filepath
|
||||
std::string cachePath;
|
||||
|
||||
// find the path for the content.opf file
|
||||
static bool findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile);
|
||||
bool parseContentOpf(ZipFile& zip, std::string& content_opf_file);
|
||||
bool parseTocNcxFile(ZipFile& zip);
|
||||
|
||||
public:
|
||||
explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {
|
||||
// create a cache key based on the filepath
|
||||
|
||||
cachePath = cacheDir + "/epub_" + std::to_string(std::hash<std::string>{}(this->filepath));
|
||||
}
|
||||
~Epub() = default;
|
||||
std::string& getBasePath() { return contentBasePath; }
|
||||
bool load();
|
||||
|
||||
void clearCache() const;
|
||||
|
||||
void setupCacheDir() const;
|
||||
|
||||
const std::string& getCachePath() const;
|
||||
const std::string& getPath() const;
|
||||
const std::string& getTitle() const;
|
||||
const std::string& getCoverImageItem() const;
|
||||
uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const;
|
||||
char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const;
|
||||
|
||||
std::string& getSpineItem(int spineIndex);
|
||||
int getSpineItemsCount() const;
|
||||
|
||||
EpubTocEntry& getTocItem(int tocIndex);
|
||||
int getTocItemsCount() const;
|
||||
// work out the section index for a toc index
|
||||
int getSpineIndexForTocIndex(int tocIndex) const;
|
||||
|
||||
int getTocIndexForSpineIndex(int spineIndex) const;
|
||||
};
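// A minimal usage sketch (not part of this commit): it assumes the SD card is already mounted at
// "/sd" and the book path below is hypothetical.
//
//   Epub epub("/books/alice.epub", "/cache");
//   if (epub.load()) {
//     Serial.println(epub.getTitle().c_str());
//     for (int i = 0; i < epub.getTocItemsCount(); i++) {
//       EpubTocEntry& entry = epub.getTocItem(i);
//       Serial.printf("%s -> spine %d\n", entry.title.c_str(), epub.getSpineIndexForTocIndex(i));
//     }
//   }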

lib/Epub/Epub/EpubHtmlParser.cpp (new file)
@@ -0,0 +1,181 @@
#include "EpubHtmlParser.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include "Page.h"
|
||||
#include "htmlEntities.h"
|
||||
|
||||
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
||||
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
||||
|
||||
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
|
||||
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
|
||||
|
||||
const char* BOLD_TAGS[] = {"b"};
|
||||
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
|
||||
|
||||
const char* ITALIC_TAGS[] = {"i"};
|
||||
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
|
||||
|
||||
const char* IMAGE_TAGS[] = {"img"};
|
||||
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
|
||||
|
||||
const char* SKIP_TAGS[] = {"head", "table"};
|
||||
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
|
||||
|
||||
// given the start and end of a tag, check to see if it matches a known tag
|
||||
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
|
||||
for (int i = 0; i < possible_tag_count; i++) {
|
||||
if (strcmp(tag_name, possible_tags[i]) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// start a new text block if needed
|
||||
void EpubHtmlParser::startNewTextBlock(const BLOCK_STYLE style) {
|
||||
if (currentTextBlock) {
|
||||
// already have a text block running and it is empty - just reuse it
|
||||
if (currentTextBlock->isEmpty()) {
|
||||
currentTextBlock->set_style(style);
|
||||
return;
|
||||
}
|
||||
|
||||
currentTextBlock->finish();
|
||||
makePages();
|
||||
delete currentTextBlock;
|
||||
}
|
||||
currentTextBlock = new TextBlock(style);
|
||||
}
|
||||
|
||||
bool EpubHtmlParser::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) {
|
||||
const char* tag_name = element.Name();
|
||||
if (matches(tag_name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
|
||||
const char* src = element.Attribute("src");
|
||||
if (src) {
|
||||
// don't leave an empty text block in the list
|
||||
// const BLOCK_STYLE style = currentTextBlock->get_style();
|
||||
if (currentTextBlock && currentTextBlock->isEmpty()) {
|
||||
delete currentTextBlock;
|
||||
currentTextBlock = nullptr;
|
||||
}
|
||||
// TODO: Fix this
|
||||
// blocks.push_back(new ImageBlock(m_base_path + src));
|
||||
// start a new text block - with the same style as before
|
||||
// startNewTextBlock(style);
|
||||
} else {
|
||||
// ESP_LOGE(TAG, "Could not find src attribute");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (matches(tag_name, SKIP_TAGS, NUM_SKIP_TAGS)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Serial.printf("Text: %s\n", element.GetText());
|
||||
|
||||
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
|
||||
insideBoldTag = true;
|
||||
startNewTextBlock(CENTER_ALIGN);
|
||||
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
|
||||
if (strcmp(tag_name, "br") == 0) {
|
||||
startNewTextBlock(currentTextBlock ? currentTextBlock->get_style() : JUSTIFIED);
|
||||
} else {
|
||||
startNewTextBlock(JUSTIFIED);
|
||||
}
|
||||
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
|
||||
insideBoldTag = true;
|
||||
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
|
||||
insideItalicTag = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/// Visit a text node.
|
||||
bool EpubHtmlParser::Visit(const tinyxml2::XMLText& text) {
|
||||
const char* content = text.Value();
// an image tag can leave us without a current block, so make sure one exists
if (!currentTextBlock) {
startNewTextBlock(JUSTIFIED);
}
currentTextBlock->addSpan(replaceHtmlEntities(content), insideBoldTag, insideItalicTag);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EpubHtmlParser::VisitExit(const tinyxml2::XMLElement& element) {
|
||||
const char* tag_name = element.Name();
|
||||
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
|
||||
insideBoldTag = false;
|
||||
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
|
||||
// nothing to do
|
||||
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
|
||||
insideBoldTag = false;
|
||||
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
|
||||
insideItalicTag = false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EpubHtmlParser::parseAndBuildPages() {
|
||||
startNewTextBlock(JUSTIFIED);
|
||||
tinyxml2::XMLDocument doc(false, tinyxml2::COLLAPSE_WHITESPACE);
|
||||
|
||||
const tinyxml2::XMLError result = doc.LoadFile(filepath);
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Failed to load file, Error: %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
doc.Accept(this);
|
||||
if (currentTextBlock) {
|
||||
makePages();
|
||||
completePageFn(currentPage);
|
||||
currentPage = nullptr;
|
||||
delete currentTextBlock;
|
||||
currentTextBlock = nullptr;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void EpubHtmlParser::makePages() {
|
||||
if (!currentTextBlock) {
|
||||
Serial.println("!! No text block to make pages for !!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!currentPage) {
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
const int lineHeight = renderer->getLineHeight();
|
||||
const int pageHeight = renderer->getPageHeight();
|
||||
|
||||
// Long running task, make sure to let other things happen
|
||||
vTaskDelay(1);
|
||||
|
||||
if (currentTextBlock->getType() == TEXT_BLOCK) {
|
||||
const auto lines = currentTextBlock->splitIntoLines(renderer);
|
||||
|
||||
for (const auto line : lines) {
|
||||
if (currentPage->nextY + lineHeight > pageHeight) {
|
||||
completePageFn(currentPage);
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
|
||||
currentPage->nextY += lineHeight;
|
||||
}
|
||||
// TODO: Fix spacing between paras
|
||||
// add some extra line between blocks
|
||||
currentPage->nextY += lineHeight / 2;
|
||||
}
|
||||
// TODO: Image block support
|
||||
// if (block->getType() == BlockType::IMAGE_BLOCK) {
|
||||
// ImageBlock *imageBlock = (ImageBlock *)block;
|
||||
// if (y + imageBlock->height > page_height) {
|
||||
// pages.push_back(new Page());
|
||||
// y = 0;
|
||||
// }
|
||||
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
|
||||
// y += imageBlock->height;
|
||||
// }
|
||||
}

lib/Epub/Epub/EpubHtmlParser.h (new file)
@@ -0,0 +1,34 @@
#pragma once
|
||||
#include <tinyxml2.h>
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
class Page;
|
||||
class EpdRenderer;
|
||||
|
||||
class EpubHtmlParser final : public tinyxml2::XMLVisitor {
|
||||
const char* filepath;
|
||||
EpdRenderer* renderer;
|
||||
std::function<void(Page*)> completePageFn;
|
||||
|
||||
bool insideBoldTag = false;
|
||||
bool insideItalicTag = false;
|
||||
TextBlock* currentTextBlock = nullptr;
|
||||
Page* currentPage = nullptr;
|
||||
|
||||
void startNewTextBlock(BLOCK_STYLE style);
|
||||
void makePages();
|
||||
|
||||
// xml parser callbacks
|
||||
bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;
|
||||
bool Visit(const tinyxml2::XMLText& text) override;
|
||||
bool VisitExit(const tinyxml2::XMLElement& element) override;
|
||||
// xml parser callbacks
|
||||
public:
|
||||
explicit EpubHtmlParser(const char* filepath, EpdRenderer* renderer, const std::function<void(Page*)>& completePageFn)
|
||||
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
|
||||
~EpubHtmlParser() override = default;
|
||||
bool parseAndBuildPages();
|
||||
};
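// A minimal usage sketch (not part of this commit, see Section::persistPageDataToSD for the real
// call site): the file path is hypothetical and must be reachable through the VFS (hence "/sd").
//
//   EpubHtmlParser parser("/sd/cache/epub_123/.tmp_0.html", &renderer,
//                         [&](Page* page) { page->render(&renderer); delete page; });
//   parser.parseAndBuildPages();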

lib/Epub/Epub/Page.cpp (new file)
@@ -0,0 +1,65 @@
#include "Page.h"
|
||||
|
||||
#include <HardwareSerial.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
void PageLine::render(EpdRenderer* renderer) { block->render(renderer, 0, yPos); }
|
||||
|
||||
void PageLine::serialize(std::ostream& os) {
|
||||
serialization::writePod(os, yPos);
|
||||
|
||||
// serialize TextBlock pointed to by PageLine
|
||||
block->serialize(os);
|
||||
}
|
||||
|
||||
PageLine* PageLine::deserialize(std::istream& is) {
|
||||
int32_t yPos;
|
||||
serialization::readPod(is, yPos);
|
||||
|
||||
const auto tb = TextBlock::deserialize(is);
|
||||
return new PageLine(tb, yPos);
|
||||
}
|
||||
|
||||
void Page::render(EpdRenderer* renderer) const {
|
||||
const auto start = millis();
|
||||
for (const auto element : elements) {
|
||||
element->render(renderer);
|
||||
}
|
||||
Serial.printf("Rendered page elements (%u) in %dms\n", elements.size(), millis() - start);
|
||||
}
|
||||
|
||||
void Page::serialize(std::ostream& os) const {
|
||||
serialization::writePod(os, nextY);
|
||||
|
||||
const uint32_t count = elements.size();
|
||||
serialization::writePod(os, count);
|
||||
|
||||
for (auto* el : elements) {
|
||||
// Only PageLine exists currently
|
||||
serialization::writePod(os, static_cast<uint8_t>(TAG_PageLine));
|
||||
static_cast<PageLine*>(el)->serialize(os);
|
||||
}
|
||||
}
|
||||
|
||||
Page* Page::deserialize(std::istream& is) {
|
||||
auto* page = new Page();
|
||||
|
||||
serialization::readPod(is, page->nextY);
|
||||
|
||||
uint32_t count;
|
||||
serialization::readPod(is, count);
|
||||
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
uint8_t tag;
|
||||
serialization::readPod(is, tag);
|
||||
|
||||
if (tag == TAG_PageLine) {
|
||||
auto* pl = PageLine::deserialize(is);
|
||||
page->elements.push_back(pl);
|
||||
} else {
|
||||
throw std::runtime_error("Unknown PageElement tag");
|
||||
}
|
||||
}
|
||||
|
||||
return page;
|
||||
}

lib/Epub/Epub/Page.h (new file)
@@ -0,0 +1,43 @@
#pragma once
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
enum PageElementTag : uint8_t {
|
||||
TAG_PageLine = 1,
|
||||
};
|
||||
|
||||
// represents something that has been added to a page
|
||||
class PageElement {
|
||||
public:
|
||||
int yPos;
|
||||
explicit PageElement(const int yPos) : yPos(yPos) {}
|
||||
virtual ~PageElement() = default;
|
||||
virtual void render(EpdRenderer* renderer) = 0;
|
||||
virtual void serialize(std::ostream& os) = 0;
|
||||
};
|
||||
|
||||
// a line from a block element
|
||||
class PageLine final : public PageElement {
|
||||
const TextBlock* block;
|
||||
|
||||
public:
|
||||
PageLine(const TextBlock* block, const int yPos) : PageElement(yPos), block(block) {}
|
||||
~PageLine() override { delete block; }
|
||||
void render(EpdRenderer* renderer) override;
|
||||
void serialize(std::ostream& os) override;
|
||||
static PageLine* deserialize(std::istream& is);
|
||||
};
|
||||
|
||||
class Page {
|
||||
public:
|
||||
int nextY = 0;
|
||||
// the list of elements (currently text lines) on this page
|
||||
std::vector<PageElement*> elements;
|
||||
void render(EpdRenderer* renderer) const;
|
||||
~Page() {
|
||||
for (const auto element : elements) {
|
||||
delete element;
|
||||
}
|
||||
}
|
||||
void serialize(std::ostream& os) const;
|
||||
static Page* deserialize(std::istream& is);
|
||||
};
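// For reference, the cache format written by Page::serialize (see Page.cpp and TextBlock.cpp in
// this commit) is, in order: nextY, the element count, then for each element a TAG_PageLine byte,
// its yPos and the serialized TextBlock (word strings, x positions, word styles, block style).
// Assuming serialization::writePod writes the raw bytes of its argument, the format is
// platform-specific and only intended for the device's own page cache on the SD card.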

lib/Epub/Epub/Section.cpp (new file)
@@ -0,0 +1,117 @@
#include "Section.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <SD.h>
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "EpubHtmlParser.h"
|
||||
#include "Page.h"
|
||||
|
||||
void Section::onPageComplete(const Page* page) {
|
||||
Serial.printf("Page %d complete\n", pageCount);
|
||||
|
||||
const auto filePath = cachePath + "/page_" + std::to_string(pageCount) + ".bin";
|
||||
// TODO can this be removed?
|
||||
SD.open(filePath.c_str(), FILE_WRITE).close();
|
||||
|
||||
std::ofstream outputFile("/sd" + filePath);
|
||||
page->serialize(outputFile);
|
||||
outputFile.close();
|
||||
|
||||
pageCount++;
|
||||
delete page;
|
||||
}
|
||||
|
||||
bool Section::hasCache() {
|
||||
if (!SD.exists(cachePath.c_str())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto sectionFilePath = cachePath + "/section.bin";
|
||||
if (!SD.exists(sectionFilePath.c_str())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
File sectionFile = SD.open(sectionFilePath.c_str(), FILE_READ);
|
||||
uint8_t pageCountBytes[2] = {0, 0};
|
||||
sectionFile.read(pageCountBytes, 2);
|
||||
sectionFile.close();
|
||||
|
||||
pageCount = pageCountBytes[0] + (pageCountBytes[1] << 8);
|
||||
Serial.printf("Loaded cache: %d pages\n", pageCount);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Section::setupCacheDir() const {
|
||||
epub->setupCacheDir();
|
||||
SD.mkdir(cachePath.c_str());
|
||||
}
|
||||
|
||||
void Section::clearCache() const { SD.rmdir(cachePath.c_str()); }
|
||||
|
||||
bool Section::persistPageDataToSD() {
|
||||
size_t size = 0;
|
||||
auto localPath = epub->getSpineItem(spineIndex);
|
||||
|
||||
const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size);
|
||||
if (!html) {
|
||||
Serial.println("Failed to read item contents");
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: Would love to stream this through an XML visitor
|
||||
const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
|
||||
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE);
|
||||
const auto written = f.write(html, size);
|
||||
f.close();
|
||||
free(html);
|
||||
|
||||
Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str());
|
||||
|
||||
if (size != written) {
|
||||
Serial.println("Failed to inflate section contents to SD");
|
||||
SD.remove(tmpHtmlPath.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
|
||||
auto visitor =
|
||||
EpubHtmlParser(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
|
||||
|
||||
// TODO: Come back and see if mem used here can be lowered?
|
||||
const bool success = visitor.parseAndBuildPages();
|
||||
SD.remove(tmpHtmlPath.c_str());
|
||||
if (!success) {
|
||||
Serial.println("Failed to parse and build pages");
|
||||
return false;
|
||||
}
|
||||
|
||||
File sectionFile = SD.open((cachePath + "/section.bin").c_str(), FILE_WRITE, true);
|
||||
const uint8_t pageCountBytes[2] = {static_cast<uint8_t>(pageCount & 0xFF),
|
||||
static_cast<uint8_t>((pageCount >> 8) & 0xFF)};
|
||||
sectionFile.write(pageCountBytes, 2);
|
||||
sectionFile.close();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Section::renderPage() {
|
||||
if (0 <= currentPage && currentPage < pageCount) {
|
||||
const auto filePath = "/sd" + cachePath + "/page_" + std::to_string(currentPage) + ".bin";
|
||||
std::ifstream inputFile(filePath);
|
||||
const Page* p = Page::deserialize(inputFile);
|
||||
inputFile.close();
|
||||
p->render(renderer);
|
||||
delete p;
|
||||
} else if (pageCount == 0) {
|
||||
Serial.println("No pages to render");
|
||||
const int width = renderer->getTextWidth("Empty chapter", true);
|
||||
renderer->drawText((renderer->getPageWidth() - width) / 2, 300, "Empty chapter", true);
|
||||
} else {
|
||||
Serial.printf("Page out of bounds: %d (max %d)\n", currentPage, pageCount);
|
||||
const int width = renderer->getTextWidth("Out of bounds", true);
|
||||
renderer->drawText((renderer->getPageWidth() - width) / 2, 300, "Out of bounds", true);
|
||||
}
|
||||
}

lib/Epub/Epub/Section.h (new file)
@@ -0,0 +1,29 @@
#pragma once
|
||||
#include "Epub.h"
|
||||
|
||||
class Page;
|
||||
class EpdRenderer;
|
||||
|
||||
class Section {
|
||||
Epub* epub;
|
||||
const int spineIndex;
|
||||
EpdRenderer* renderer;
|
||||
std::string cachePath;
|
||||
|
||||
void onPageComplete(const Page* page);
|
||||
|
||||
public:
|
||||
int pageCount = 0;
|
||||
int currentPage = 0;
|
||||
|
||||
explicit Section(Epub* epub, const int spineIndex, EpdRenderer* renderer)
|
||||
: epub(epub), spineIndex(spineIndex), renderer(renderer) {
|
||||
cachePath = epub->getCachePath() + "/" + std::to_string(spineIndex);
|
||||
}
|
||||
~Section() = default;
|
||||
bool hasCache();
|
||||
void setupCacheDir() const;
|
||||
void clearCache() const;
|
||||
bool persistPageDataToSD();
|
||||
void renderPage();
|
||||
};
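// A minimal usage sketch (not part of this commit); the epub and renderer are assumed to be set
// up elsewhere:
//
//   Section section(&epub, spineIndex, &renderer);
//   if (!section.hasCache()) {
//     section.setupCacheDir();
//     section.persistPageDataToSD();  // parses the section HTML and writes page_N.bin files
//   }
//   section.currentPage = 0;
//   section.renderPage();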

lib/Epub/Epub/blocks/Block.h (new file)
@@ -0,0 +1,15 @@
#pragma once
|
||||
|
||||
class EpdRenderer;
|
||||
|
||||
typedef enum { TEXT_BLOCK, IMAGE_BLOCK } BlockType;
|
||||
|
||||
// a block of content in the html - either a paragraph or an image
|
||||
class Block {
|
||||
public:
|
||||
virtual ~Block() = default;
|
||||
virtual void layout(EpdRenderer* renderer) = 0;
|
||||
virtual BlockType getType() = 0;
|
||||
virtual bool isEmpty() = 0;
|
||||
virtual void finish() {}
|
||||
};

lib/Epub/Epub/blocks/TextBlock.cpp (new file)
@@ -0,0 +1,235 @@
#include "TextBlock.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
static bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
|
||||
|
||||
// move past anything that should be considered part of a word
|
||||
static int skipWord(const std::string& text, int index, const int length) {
|
||||
while (index < length && !isWhitespace(text[index])) {
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
// skip past any white space characters
|
||||
static int skipWhitespace(const std::string& html, int index, const int length) {
|
||||
while (index < length && isWhitespace(html[index])) {
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void TextBlock::addSpan(const std::string& span, const bool is_bold, const bool is_italic) {
|
||||
// adding a span to text block
|
||||
const int length = span.length();
|
||||
// work out where each word is in the span
|
||||
int index = 0;
|
||||
while (index < length) {
|
||||
// skip past any whitespace to the start of a word
|
||||
index = skipWhitespace(span, index, length);
|
||||
const int wordStart = index;
|
||||
// find the end of the word
|
||||
index = skipWord(span, index, length);
|
||||
const int wordLength = index - wordStart;
|
||||
if (wordLength > 0) {
|
||||
words.push_back(span.substr(wordStart, wordLength));
|
||||
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer* renderer) {
|
||||
const int totalWordCount = words.size();
// nothing to lay out for an empty block
if (totalWordCount == 0) {
return {};
}
const int pageWidth = renderer->getPageWidth();
|
||||
const int spaceWidth = renderer->getSpaceWidth();
|
||||
|
||||
words.shrink_to_fit();
|
||||
wordStyles.shrink_to_fit();
|
||||
// resize (not just reserve) - the positions are written via operator[] further down
wordXpos.resize(totalWordCount);
|
||||
|
||||
// measure each word
|
||||
uint16_t wordWidths[totalWordCount];
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
// measure the word
|
||||
const int width = renderer->getTextWidth(words[i].c_str(), wordStyles[i] & BOLD_SPAN, wordStyles[i] & ITALIC_SPAN);
|
||||
wordWidths[i] = width;
|
||||
}
|
||||
|
||||
// now apply the dynamic programming algorithm to find the best line breaks
|
||||
// DP table in which dp[i] represents cost of line starting with word words[i]
|
||||
int dp[totalWordCount];
|
||||
|
||||
// Array in which ans[i] store index of last word in line starting with word
|
||||
// word[i]
|
||||
size_t ans[totalWordCount];
|
||||
|
||||
// If only one word is present then only one line is required. Cost of last
|
||||
// line is zero. Hence cost of this line is zero. Ending point is also n-1 as
|
||||
// single word is present
|
||||
dp[totalWordCount - 1] = 0;
|
||||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||||
|
||||
// Make each word first word of line by iterating over each index in arr.
|
||||
for (int i = totalWordCount - 2; i >= 0; i--) {
|
||||
int currlen = -1;
|
||||
dp[i] = INT_MAX;
|
||||
|
||||
// Variable to store possible minimum cost of line.
|
||||
int cost;
|
||||
|
||||
// Keep on adding words in current line by iterating from starting word upto
|
||||
// last word in arr.
|
||||
for (int j = i; j < totalWordCount; j++) {
|
||||
// Update the width of the words in current line + the space between two
|
||||
// words.
|
||||
currlen += wordWidths[j] + spaceWidth;
|
||||
|
||||
// If we're bigger than the current pagewidth then we can't add more words
|
||||
if (currlen > pageWidth) break;
|
||||
|
||||
// if we've run out of words then this is last line and the cost should be
|
||||
// 0. Otherwise the cost is the square of the left over space + the costs
|
||||
// of all the previous lines
|
||||
if (j == totalWordCount - 1)
|
||||
cost = 0;
|
||||
else
|
||||
cost = (pageWidth - currlen) * (pageWidth - currlen) + dp[j + 1];
|
||||
|
||||
// Check if this arrangement gives minimum cost for line starting with
|
||||
// word words[i].
|
||||
if (cost < dp[i]) {
|
||||
dp[i] = cost;
|
||||
ans[i] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We can now iterate through the answer to find the line break positions
|
||||
std::list<uint16_t> lineBreaks;
|
||||
for (size_t i = 0; i < totalWordCount;) {
|
||||
i = ans[i] + 1;
|
||||
if (i > totalWordCount) {
|
||||
break;
|
||||
}
|
||||
lineBreaks.push_back(i);
|
||||
// Text too big, just exit
|
||||
if (lineBreaks.size() > 1000) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::list<TextBlock*> lines;
|
||||
|
||||
// With the line breaks calculated we can now position the words along the
|
||||
// line
|
||||
int startWord = 0;
|
||||
for (const auto lineBreak : lineBreaks) {
|
||||
const int lineWordCount = lineBreak - startWord;
|
||||
|
||||
int lineWordWidthSum = 0;
|
||||
for (int i = startWord; i < lineBreak; i++) {
|
||||
lineWordWidthSum += wordWidths[i];
|
||||
}
|
||||
|
||||
// Calculate spacing between words
|
||||
const uint16_t spareSpace = pageWidth - lineWordWidthSum;
|
||||
uint16_t spacing = spaceWidth;
|
||||
// evenly space words if using justified style, not the last line, and at
|
||||
// least 2 words
|
||||
if (style == JUSTIFIED && lineBreak != lineBreaks.back() && lineWordCount >= 2) {
|
||||
spacing = spareSpace / (lineWordCount - 1);
|
||||
}
|
||||
|
||||
uint16_t xpos = 0;
|
||||
if (style == RIGHT_ALIGN) {
|
||||
xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
|
||||
} else if (style == CENTER_ALIGN) {
|
||||
xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
|
||||
}
|
||||
|
||||
for (int i = startWord; i < lineBreak; i++) {
|
||||
wordXpos[i] = xpos;
|
||||
xpos += wordWidths[i] + spacing;
|
||||
}
|
||||
|
||||
std::vector<std::string> lineWords;
|
||||
std::vector<uint16_t> lineXPos;
|
||||
std::vector<uint8_t> lineWordStyles;
|
||||
lineWords.reserve(lineWordCount);
|
||||
lineXPos.reserve(lineWordCount);
|
||||
lineWordStyles.reserve(lineWordCount);
|
||||
|
||||
for (int i = startWord; i < lineBreak; i++) {
|
||||
lineWords.push_back(words[i]);
|
||||
lineXPos.push_back(wordXpos[i]);
|
||||
lineWordStyles.push_back(wordStyles[i]);
|
||||
}
|
||||
const auto textLine = new TextBlock(lineWords, lineXPos, lineWordStyles, style);
|
||||
lines.push_back(textLine);
|
||||
startWord = lineBreak;
|
||||
}
|
||||
|
||||
return lines;
|
||||
}
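// A tiny worked example of the cost above (made-up numbers): with pageWidth = 100, spaceWidth = 10
// and word widths {30, 20, 40}, putting the first two words on one line gives
// currlen = -1 + (30 + 10) + (20 + 10) = 69, so that line costs (100 - 69)^2 = 961 plus dp[2] for
// the remaining words; a line that ends the block always costs 0, so a short last line is free.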
|
||||
|
||||
void TextBlock::render(const EpdRenderer* renderer, const int x, const int y) const {
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
// get the style
|
||||
const uint8_t wordStyle = wordStyles[i];
|
||||
// render the word
|
||||
renderer->drawText(x + wordXpos[i], y, words[i].c_str(), wordStyle & BOLD_SPAN, wordStyle & ITALIC_SPAN);
|
||||
}
|
||||
}
|
||||
|
||||
void TextBlock::serialize(std::ostream& os) const {
|
||||
// words
|
||||
const uint32_t wc = words.size();
|
||||
serialization::writePod(os, wc);
|
||||
for (const auto& w : words) serialization::writeString(os, w);
|
||||
|
||||
// wordXpos
|
||||
const uint32_t xc = wordXpos.size();
|
||||
serialization::writePod(os, xc);
|
||||
for (auto x : wordXpos) serialization::writePod(os, x);
|
||||
|
||||
// wordStyles
|
||||
const uint32_t sc = wordStyles.size();
|
||||
serialization::writePod(os, sc);
|
||||
for (auto s : wordStyles) serialization::writePod(os, s);
|
||||
|
||||
// style
|
||||
serialization::writePod(os, style);
|
||||
}
|
||||
|
||||
TextBlock* TextBlock::deserialize(std::istream& is) {
|
||||
uint32_t wc, xc, sc;
|
||||
std::vector<std::string> words;
|
||||
std::vector<uint16_t> wordXpos;
|
||||
std::vector<uint8_t> wordStyles;
|
||||
BLOCK_STYLE style;
|
||||
|
||||
// words
|
||||
serialization::readPod(is, wc);
|
||||
words.resize(wc);
|
||||
for (auto& w : words) serialization::readString(is, w);
|
||||
|
||||
// wordXpos
|
||||
serialization::readPod(is, xc);
|
||||
wordXpos.resize(xc);
|
||||
for (auto& x : wordXpos) serialization::readPod(is, x);
|
||||
|
||||
// wordStyles
|
||||
serialization::readPod(is, sc);
|
||||
wordStyles.resize(sc);
|
||||
for (auto& s : wordStyles) serialization::readPod(is, s);
|
||||
|
||||
// style
|
||||
serialization::readPod(is, style);
|
||||
|
||||
return new TextBlock(words, wordXpos, wordStyles, style);
|
||||
}

lib/Epub/Epub/blocks/TextBlock.h (new file)
@@ -0,0 +1,50 @@
#pragma once
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "Block.h"
|
||||
|
||||
enum SPAN_STYLE : uint8_t {
|
||||
BOLD_SPAN = 1,
|
||||
ITALIC_SPAN = 2,
|
||||
};
|
||||
|
||||
enum BLOCK_STYLE : uint8_t {
|
||||
JUSTIFIED = 0,
|
||||
LEFT_ALIGN = 1,
|
||||
CENTER_ALIGN = 2,
|
||||
RIGHT_ALIGN = 3,
|
||||
};
|
||||
|
||||
// represents a block of words in the html document
|
||||
class TextBlock final : public Block {
|
||||
// the words in this block
|
||||
std::vector<std::string> words;
|
||||
// x position of each word
|
||||
std::vector<uint16_t> wordXpos;
|
||||
// the styles of each word
|
||||
std::vector<uint8_t> wordStyles;
|
||||
|
||||
// the style of the block - left, center, right aligned
|
||||
BLOCK_STYLE style;
|
||||
|
||||
public:
|
||||
void addSpan(const std::string& span, bool is_bold, bool is_italic);
|
||||
explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
|
||||
explicit TextBlock(const std::vector<std::string>& words, const std::vector<uint16_t>& word_xpos,
const std::vector<uint8_t>& word_styles, const BLOCK_STYLE style)
: words(words), wordXpos(word_xpos), wordStyles(word_styles), style(style) {}
|
||||
~TextBlock() override = default;
|
||||
void set_style(const BLOCK_STYLE style) { this->style = style; }
|
||||
BLOCK_STYLE get_style() const { return style; }
|
||||
bool isEmpty() override { return words.empty(); }
|
||||
void layout(EpdRenderer* renderer) override {}
|
||||
// given a renderer works out where to break the words into lines
|
||||
std::list<TextBlock*> splitIntoLines(const EpdRenderer* renderer);
|
||||
void render(const EpdRenderer* renderer, int x, int y) const;
|
||||
BlockType getType() override { return TEXT_BLOCK; }
|
||||
void serialize(std::ostream& os) const;
|
||||
static TextBlock* deserialize(std::istream& is);
|
||||
};

lib/Epub/Epub/htmlEntities.cpp (new file)
@@ -0,0 +1,163 @@
// from
|
||||
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||||
|
||||
#include "htmlEntities.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
|
||||
const int MAX_ENTITY_LENGTH = 10;
|
||||
|
||||
// Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
|
||||
// Note the supported keys are only in lowercase
|
||||
// Store the mappings in a unordered hash map
|
||||
static std::unordered_map<std::string, std::string> entity_lookup(
{{"&quot;", "\""}, {"&frasl;", "⁄"}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
{"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
{"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
{"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
{"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
{"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
{"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
{"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
{"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
{"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
{"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
{"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
{"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
{"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", " "}, {"&iexcl;", "¡"}, {"&cent;", "¢"},
{"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
{"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
{"&shy;", ""}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
{"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", "¶"},
{"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
{"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
{"&forall;", "∀"}, {"&part;", "∂"}, {"&exist;", "∃"}, {"&empty;", "∅"}, {"&nabla;", "∇"},
{"&isin;", "∈"}, {"&notin;", "∉"}, {"&ni;", "∋"}, {"&prod;", "∏"}, {"&sum;", "∑"},
{"&minus;", "−"}, {"&lowast;", "∗"}, {"&radic;", "√"}, {"&prop;", "∝"}, {"&infin;", "∞"},
{"&ang;", "∠"}, {"&and;", "∧"}, {"&or;", "∨"}, {"&cap;", "∩"}, {"&cup;", "∪"},
{"&int;", "∫"}, {"&there4;", "∴"}, {"&sim;", "∼"}, {"&cong;", "≅"}, {"&asymp;", "≈"},
{"&ne;", "≠"}, {"&equiv;", "≡"}, {"&le;", "≤"}, {"&ge;", "≥"}, {"&sub;", "⊂"},
{"&sup;", "⊃"}, {"&nsub;", "⊄"}, {"&sube;", "⊆"}, {"&supe;", "⊇"}, {"&oplus;", "⊕"},
{"&otimes;", "⊗"}, {"&perp;", "⊥"}, {"&sdot;", "⋅"}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
{"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
{"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
{"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
{"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
{"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
{"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
{"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
{"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
{"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
{"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
{"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
{"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, {"&ensp;", ""}, {"&emsp;", ""},
{"&thinsp;", ""}, {"&zwnj;", ""}, {"&zwj;", ""}, {"&lrm;", ""}, {"&rlm;", ""},
{"&ndash;", "–"}, {"&mdash;", "—"}, {"&lsquo;", "‘"}, {"&rsquo;", "’"}, {"&sbquo;", "‚"},
{"&ldquo;", "“"}, {"&rdquo;", "”"}, {"&bdquo;", "„"}, {"&dagger;", "†"}, {"&Dagger;", "‡"},
{"&bull;", "•"}, {"&hellip;", "…"}, {"&permil;", "‰"}, {"&prime;", "′"}, {"&Prime;", "″"},
{"&lsaquo;", "‹"}, {"&rsaquo;", "›"}, {"&oline;", "‾"}, {"&euro;", "€"}, {"&trade;", "™"},
{"&larr;", "←"}, {"&uarr;", "↑"}, {"&rarr;", "→"}, {"&darr;", "↓"}, {"&harr;", "↔"},
{"&crarr;", "↵"}, {"&lceil;", "⌈"}, {"&rceil;", "⌉"}, {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
{"&loz;", "◊"}, {"&spades;", "♠"}, {"&clubs;", "♣"}, {"&hearts;", "♥"}, {"&diams;", "♦"}});
|
||||
|
||||
// converts from a unicode code point to the utf8 equivalent
|
||||
void convert_to_utf8(const int code, std::string& res) {
|
||||
// convert to a utf8 sequence
|
||||
if (code < 0x80) {
|
||||
res += static_cast<char>(code);
|
||||
} else if (code < 0x800) {
|
||||
res += static_cast<char>(0xc0 | (code >> 6));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x10000) {
|
||||
res += static_cast<char>(0xe0 | (code >> 12));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x200000) {
|
||||
res += static_cast<char>(0xf0 | (code >> 18));
|
||||
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x4000000) {
|
||||
res += static_cast<char>(0xf8 | (code >> 24));
|
||||
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x80000000) {
|
||||
res += static_cast<char>(0xfc | (code >> 30));
|
||||
res += static_cast<char>(0x80 | ((code >> 24) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
}
|
||||
}
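// Worked example: code point 0xE9 ("é") is below 0x800, so it becomes 0xC0 | (0xE9 >> 6) = 0xC3
// followed by 0x80 | (0xE9 & 0x3F) = 0xA9, i.e. the UTF-8 bytes C3 A9. Code point 0x2211 ("∑")
// takes the three byte branch and encodes to E2 88 91.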
|
||||
|
||||
// handles numeric entities - e.g. &#1234; or &#x1234;
|
||||
bool process_numeric_entity(const std::string& entity, std::string& res) {
|
||||
int code = 0;
|
||||
// is it hex?
|
||||
if (entity[2] == 'x' || entity[2] == 'X') {
|
||||
// parse the hex code
|
||||
code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16);
|
||||
} else {
|
||||
code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10);
|
||||
}
|
||||
if (code != 0) {
|
||||
// special handling for nbsp
|
||||
if (code == 0xA0) {
|
||||
res += " ";
|
||||
} else {
|
||||
convert_to_utf8(code, res);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// handles named entities - e.g. &amp;
|
||||
bool process_string_entity(const std::string& entity, std::string& res) {
|
||||
// it's a named entity - find it in the lookup table
|
||||
// find it in the map
|
||||
const auto it = entity_lookup.find(entity);
|
||||
if (it != entity_lookup.end()) {
|
||||
res += it->second;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// replace all the entities in the string
|
||||
std::string replaceHtmlEntities(const char* text) {
|
||||
std::string res;
|
||||
res.reserve(strlen(text));
|
||||
for (int i = 0; i < strlen(text); ++i) {
|
||||
bool flag = false;
|
||||
// do we have a potential entity?
|
||||
if (text[i] == '&') {
|
||||
// find the end of the entity
|
||||
int j = i + 1;
|
||||
while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) {
|
||||
j++;
|
||||
}
|
||||
if (j - i > 2) {
|
||||
// copy the entity text including the trailing ';' and make sure it is null terminated
char entity[j - i + 2];
strncpy(entity, text + i, j - i + 1);
entity[j - i + 1] = '\0';
|
||||
// is it a numeric code?
|
||||
if (entity[1] == '#') {
|
||||
flag = process_numeric_entity(entity, res);
|
||||
} else {
|
||||
flag = process_string_entity(entity, res);
|
||||
}
|
||||
// skip past the entity if we successfully decoded it
|
||||
if (flag) {
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!flag) {
|
||||
res += text[i];
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
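// Example (the input string is made up): replaceHtmlEntities("Fish &amp; Chips for &#163;")
// returns "Fish & Chips for £" - named entities go through the lookup table above and numeric
// ones through convert_to_utf8.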

lib/Epub/Epub/htmlEntities.h (new file)
@@ -0,0 +1,7 @@
// from
|
||||
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
|
||||
std::string replaceHtmlEntities(const char* text);