Stream inflated EPUB HTMLs down to disk instead of inflating in memory (#4)

* Downgrade miniz for stability
* Stream HTML from ZIP down to disk instead of loading it all into memory
2025-12-08 00:39:17 +11:00
parent c715c18bf7
commit de453fed1d
9 changed files with 857 additions and 1108 deletions
--- a/lib/Epub/Epub.cpp
+++ b/lib/Epub/Epub.cpp
@@ -9,7 +9,7 @@
 bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
  // open up the meta data to find where the content.opf file lives
  size_t s;
-  const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s);
+  const auto metaInfo = reinterpret_cast<char*>(zip.readFileToMemory("META-INF/container.xml", &s, true));
  if (!metaInfo) {
    Serial.println("Could not find META-INF/container.xml");
    return false;
@@ -57,7 +57,7 @@ bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {

 bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
  // read in the content.opf file and parse it
-  auto contents = zip.readTextFileToMemory(content_opf_file.c_str());
+  auto contents = reinterpret_cast<char*>(zip.readFileToMemory(content_opf_file.c_str(), nullptr, true));

  // parse the contents
  tinyxml2::XMLDocument doc;
@@ -168,7 +168,7 @@ bool Epub::parseTocNcxFile(const ZipFile& zip) {
    return false;
  }

-  const auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str());
+  const auto ncxData = reinterpret_cast<char*>(zip.readFileToMemory(tocNcxItem.c_str(), nullptr, true));
  if (!ncxData) {
    Serial.printf("Could not find %s\n", tocNcxItem.c_str());
    return false;
@@ -308,11 +308,11 @@ std::string normalisePath(const std::string& path) {
  return result;
 }

-uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const {
+uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size, bool trailingNullByte) const {
  const ZipFile zip("/sd" + filepath);
  const std::string path = normalisePath(itemHref);

-  const auto content = zip.readFileToMemory(path.c_str(), size);
+  const auto content = zip.readFileToMemory(path.c_str(), size, trailingNullByte);
  if (!content) {
    Serial.printf("Failed to read item %s\n", path.c_str());
    return nullptr;
@@ -321,17 +321,11 @@ uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const
  return content;
 }

-char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const {
+bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, const size_t chunkSize) const {
  const ZipFile zip("/sd" + filepath);
  const std::string path = normalisePath(itemHref);

-  const auto content = zip.readTextFileToMemory(path.c_str(), size);
-  if (!content) {
-    Serial.printf("Failed to read item %s\n", path.c_str());
-    return nullptr;
-  }
-
-  return content;
+  return zip.readFileToStream(path.c_str(), out, chunkSize);
 }

 int Epub::getSpineItemsCount() const { return spine.size(); }
--- a/lib/Epub/Epub.h
+++ b/lib/Epub/Epub.h
@@ -56,8 +56,9 @@ class Epub {
  const std::string& getPath() const;
  const std::string& getTitle() const;
  const std::string& getCoverImageItem() const;
-  uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const;
-  char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const;
+  uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr,
+                                   bool trailingNullByte = false) const;
+  bool readItemContentsToStream(const std::string& itemHref, Print& out, size_t chunkSize) const;
  std::string& getSpineItem(int spineIndex);
  int getSpineItemsCount() const;
  EpubTocEntry& getTocItem(int tocTndex);
--- a/lib/Epub/Epub/EpubHtmlParserSlim.cpp
+++ b/lib/Epub/Epub/EpubHtmlParserSlim.cpp
@@ -199,6 +199,11 @@ bool EpubHtmlParserSlim::parseAndBuildPages() {
  XML_SetCharacterDataHandler(parser, characterData);

  FILE* file = fopen(filepath, "r");
+  if (!file) {
+    Serial.printf("Couldn't open file %s\n", filepath);
+    XML_ParserFree(parser);
+    return false;
+  }

  do {
    void* const buf = XML_GetBuffer(parser, 1024);
--- a/lib/Epub/Epub/Section.cpp
+++ b/lib/Epub/Epub/Section.cpp
@@ -64,35 +64,28 @@ void Section::setupCacheDir() const {
 void Section::clearCache() const { SD.rmdir(cachePath.c_str()); }

 bool Section::persistPageDataToSD() {
-  size_t size = 0;
-  auto localPath = epub->getSpineItem(spineIndex);
+  const auto localPath = epub->getSpineItem(spineIndex);

-  const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size);
-  if (!html) {
-    Serial.println("Failed to read item contents");
-    return false;
-  }
-
-  // TODO: Would love to stream this through an XML visitor
+  // TODO: Should we get rid of this file altogether?
+  //       It currently saves us a bit of memory by allowing for all the inflation bits to be released
+  //       before loading the XML parser
  const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
-  File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE);
-  const auto written = f.write(html, size);
+  File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE, true);
+  bool success = epub->readItemContentsToStream(localPath, f, 1024);
  f.close();
-  free(html);

-  Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str());
-
-  if (size != written) {
-    Serial.println("Failed to inflate section contents to SD");
-    SD.remove(tmpHtmlPath.c_str());
+  if (!success) {
+    Serial.println("Failed to stream item contents");
    return false;
  }

+  Serial.printf("Streamed HTML to %s\n", tmpHtmlPath.c_str());
+
  const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;

  auto visitor =
      EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
-  const bool success = visitor.parseAndBuildPages();
+  success = visitor.parseAndBuildPages();

  SD.remove(tmpHtmlPath.c_str());
  if (!success) {