# Custom zip parsing (#140)

## Summary

* Use custom zip central directory parsing to lower memory usage when loading zipped EPUB content (see the sketch below)
Author: Dave Allie
Date: 2025-12-29 20:17:29 +10:00 (committed via GitHub)
Parent: d7f4bd54f5
Commit: 071ccb9d1b
7 changed files with 358 additions and 107 deletions
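
For context: the "central directory" is the per-entry index that every zip archive stores at its end, so entry names, sizes, and data offsets can be read with a few seeks instead of scanning or inflating the whole archive. The sketch below illustrates that idea only; it is not the PR's actual `ZipFile` code. The `FileStatSlim` name is borrowed from the new `loadAllFileStatSlims()` call visible in the diff, the field offsets follow PKWARE's APPNOTE.TXT, and ZIP64 support and archive comments are ignored for brevity.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Slim per-entry record, loosely modelled on the PR's FileStatSlim naming.
struct FileStatSlim {
  std::string name;            // path inside the archive
  uint32_t localHeaderOffset;  // where the local header (and data) starts
  uint32_t compressedSize;
  uint32_t uncompressedSize;
};

static uint16_t rd16(const uint8_t* p) { return p[0] | (p[1] << 8); }
static uint32_t rd32(const uint8_t* p) {
  return p[0] | (p[1] << 8) | (p[2] << 16) | (uint32_t(p[3]) << 24);
}

// Enumerate entries by walking the central directory at the end of the file.
bool listEntries(FILE* f, std::vector<FileStatSlim>* out) {
  // The End Of Central Directory record is 22 bytes when the archive has no
  // trailing comment; real code must scan backwards for its signature.
  uint8_t eocd[22];
  if (fseek(f, -22, SEEK_END) != 0 || fread(eocd, 1, 22, f) != 22) return false;
  if (rd32(eocd) != 0x06054b50) return false;  // "PK\5\6" EOCD signature
  const uint16_t entryCount = rd16(eocd + 10);
  if (fseek(f, rd32(eocd + 16), SEEK_SET) != 0) return false;  // CD offset

  for (uint16_t i = 0; i < entryCount; i++) {
    uint8_t h[46];  // fixed part of a central directory file header
    if (fread(h, 1, 46, f) != 46 || rd32(h) != 0x02014b50) return false;
    FileStatSlim s;
    s.compressedSize = rd32(h + 20);
    s.uncompressedSize = rd32(h + 24);
    const uint16_t nameLen = rd16(h + 28);
    const uint16_t extraLen = rd16(h + 30);
    const uint16_t commentLen = rd16(h + 32);
    s.localHeaderOffset = rd32(h + 42);
    s.name.resize(nameLen);
    if (nameLen && fread(&s.name[0], 1, nameLen, f) != nameLen) return false;
    if (fseek(f, extraLen + commentLen, SEEK_CUR) != 0) return false;
    out->push_back(std::move(s));
  }
  return true;
}
```

On a device like the ESP32 this keeps peak memory proportional to one directory record at a time, which is the memory win the summary describes.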

```diff
@@ -60,9 +60,6 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) {
   }
   ContentOpfParser opfParser(getCachePath(), getBasePath(), contentOpfSize, bookMetadataCache.get());
-  Serial.printf("[%lu] [MEM] Free: %d bytes, Total: %d bytes, Min Free: %d bytes\n", millis(), ESP.getFreeHeap(),
-                ESP.getHeapSize(), ESP.getMinFreeHeap());
   if (!opfParser.setup()) {
     Serial.printf("[%lu] [EBP] Could not setup content.opf parser\n", millis());
     return false;
@@ -321,10 +318,9 @@ bool Epub::generateCoverBmp() const {
 }
 
 uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size, const bool trailingNullByte) const {
-  const ZipFile zip("/sd" + filepath);
   const std::string path = FsHelpers::normalisePath(itemHref);
-  const auto content = zip.readFileToMemory(path.c_str(), size, trailingNullByte);
+  const auto content = ZipFile(filepath).readFileToMemory(path.c_str(), size, trailingNullByte);
   if (!content) {
     Serial.printf("[%lu] [EBP] Failed to read item %s\n", millis(), path.c_str());
     return nullptr;
@@ -334,20 +330,13 @@ uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size
 }
 
 bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, const size_t chunkSize) const {
-  const ZipFile zip("/sd" + filepath);
   const std::string path = FsHelpers::normalisePath(itemHref);
-  return zip.readFileToStream(path.c_str(), out, chunkSize);
+  return ZipFile(filepath).readFileToStream(path.c_str(), out, chunkSize);
 }
 
 bool Epub::getItemSize(const std::string& itemHref, size_t* size) const {
-  const ZipFile zip("/sd" + filepath);
-  return getItemSize(zip, itemHref, size);
-}
-
-bool Epub::getItemSize(const ZipFile& zip, const std::string& itemHref, size_t* size) {
   const std::string path = FsHelpers::normalisePath(itemHref);
-  return zip.getInflatedFileSize(path.c_str(), size);
+  return ZipFile(filepath).getInflatedFileSize(path.c_str(), size);
 }
 
 int Epub::getSpineItemsCount() const {
```
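
With the directory index cached inside `ZipFile`, the dropped static `getItemSize(const ZipFile&, ...)` overload loses its purpose: it existed so callers could share one open archive, whereas a temporary `ZipFile(filepath)` per call is now cheap. As a hedged illustration of why `getInflatedFileSize` needs no decompression at all, reusing the hypothetical `FileStatSlim` record from the sketch above rather than the project's real API:

```cpp
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct FileStatSlim {  // same hypothetical record as in the earlier sketch
  std::string name;
  uint32_t uncompressedSize;
};

// The central directory already stores each entry's inflated size, so
// answering getItemSize() is a lookup rather than a decode-and-count pass.
bool getInflatedFileSize(const std::vector<FileStatSlim>& entries,
                         const std::string& path, size_t* size) {
  for (const auto& e : entries) {
    if (e.name == path) {
      *size = e.uncompressedSize;
      return true;
    }
  }
  return false;  // no such entry in the archive
}
```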

```diff
@@ -24,7 +24,6 @@ class Epub {
   bool findContentOpfFile(std::string* contentOpfFile) const;
   bool parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata);
   bool parseTocNcxFile() const;
-  static bool getItemSize(const ZipFile& zip, const std::string& itemHref, size_t* size);
 
  public:
   explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {
@@ -54,5 +53,5 @@ class Epub {
   size_t getCumulativeSpineItemSize(int spineIndex) const;
   size_t getBookSize() const;
-  uint8_t calculateProgress(const int currentSpineIndex, const float currentSpineRead) const;
+  uint8_t calculateProgress(int currentSpineIndex, float currentSpineRead) const;
 };
```

```diff
@@ -122,7 +122,26 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
   // LUTs complete
 
   // Loop through spines from spine file matching up TOC indexes, calculating cumulative size and writing to book.bin
-  const ZipFile zip("/sd" + epubPath);
+  ZipFile zip(epubPath);
+  // Pre-open zip file to speed up size calculations
+  if (!zip.open()) {
+    Serial.printf("[%lu] [BMC] Could not open EPUB zip for size calculations\n", millis());
+    bookFile.close();
+    spineFile.close();
+    tocFile.close();
+    return false;
+  }
+  // TODO: For large ZIPs, loading all the localHeaderOffsets will crash.
+  //       However, not having them loaded is extremely slow. Need a better solution here.
+  //       Perhaps only a cache of spine items, or a better way to speed up lookups?
+  if (!zip.loadAllFileStatSlims()) {
+    Serial.printf("[%lu] [BMC] Could not load zip local header offsets for size calculations\n", millis());
+    bookFile.close();
+    spineFile.close();
+    tocFile.close();
+    zip.close();
+    return false;
+  }
   size_t cumSize = 0;
   spineFile.seek(0);
   for (int i = 0; i < spineCount; i++) {
@@ -157,6 +176,8 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
     // Write out spine data to book.bin
     writeSpineEntry(bookFile, spineEntry);
   }
+  // Close opened zip file
+  zip.close();
 
   // Loop through toc entries from toc file writing to book.bin
   tocFile.seek(0);
@@ -223,6 +244,8 @@ void BookMetadataCache::createTocEntry(const std::string& title, const std::stri
   int spineIndex = -1;
   // find spine index
+  // TODO: This lookup is slow as we need to scan through all items each time. We can't hold it
+  //       all in memory due to size, but perhaps we can load just the hrefs into a vector/list
+  //       to do an index lookup?
   spineFile.seek(0);
   for (int i = 0; i < spineCount; i++) {
     auto spineEntry = readSpineEntry(spineFile);
```
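
Both TODOs above describe the same trade-off: the full table of local header offsets is too large to hold for big archives, yet rescanning `spineFile` for every lookup is slow. One possible middle ground, sketched here as a hypothetical (it is not part of this commit), is to keep only the spine hrefs in RAM, bounding memory by the spine length while making the href-to-index lookup constant time:

```cpp
#include <string>
#include <unordered_map>

// Hypothetical: holds just the spine hrefs so href -> spine index lookups
// avoid re-reading the spine file. Memory cost is one string per spine item
// rather than a per-entry offset table for the whole archive.
class SpineHrefIndex {
 public:
  // Called once per spine entry, in spine order.
  void add(const std::string& href) {
    indexByHref_.emplace(href, nextIndex_++);
  }

  // Returns the spine index for an href, or -1 if it is not a spine item.
  int find(const std::string& href) const {
    const auto it = indexByHref_.find(href);
    return it == indexByHref_.end() ? -1 : it->second;
  }

 private:
  int nextIndex_ = 0;
  std::unordered_map<std::string, int> indexByHref_;
};
```

A `BookMetadataCache` could populate such an index while writing the spine file and consult it inside `createTocEntry` instead of the seek-and-scan loop.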

```diff
@@ -3,7 +3,6 @@
 #include <FsHelpers.h>
 #include <HardwareSerial.h>
 #include <Serialization.h>
-#include <ZipFile.h>
 
 #include "../BookMetadataCache.h"
@@ -183,6 +182,8 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
       if (strcmp(atts[i], "idref") == 0) {
         const std::string idref = atts[i + 1];
         // Resolve the idref to href using items map
+        // TODO: This lookup is slow as we need to scan through all items each time.
+        //       It can take up to 200ms per item when getting to 1500 items.
         self->tempItemStore.seek(0);
         std::string itemId;
         std::string href;
```
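
The same pattern could address this TODO: rather than rewinding `tempItemStore` for every idref, the parser could remember the file offset of each item record as it writes it, keeping only an id-to-offset map in memory. A hypothetical sketch, not part of this commit:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>

// Maps manifest item ids to the byte offset of their record in
// tempItemStore, so an idref resolves with one seek instead of a full scan.
class ItemOffsetIndex {
 public:
  // Record an item's offset while the manifest is first written out.
  void remember(const std::string& id, size_t recordOffset) {
    offsetById_.emplace(id, recordOffset);
  }

  // Look up the record offset for an idref; false if the id is unknown.
  bool find(const std::string& idref, size_t* offset) const {
    const auto it = offsetById_.find(idref);
    if (it == offsetById_.end()) return false;
    *offset = it->second;
    return true;
  }

 private:
  std::unordered_map<std::string, size_t> offsetById_;
};
```

Whether the map of ids fits in RAM at 1500 items is the open question the TODO raises; ids are typically much shorter than full records, so this trades a bounded amount of heap for O(1) lookups.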