# Custom zip parsing (#140)

## Summary

* Use custom zip central directory parsing to lower memory usage when loading zipped EPUB content (see the sketch below)
Author: Dave Allie
Date: 2025-12-29 20:17:29 +10:00 (committed via GitHub)
Parent: d7f4bd54f5
Commit: 071ccb9d1b
7 changed files with 358 additions and 107 deletions
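
For context: the "central directory" is the per-entry index that every zip archive stores at its end, so entry names, sizes, and data offsets can be read with a few seeks instead of scanning or inflating the whole archive. The sketch below illustrates that idea only; it is not the PR's actual `ZipFile` code. The `FileStatSlim` name is borrowed from the new `loadAllFileStatSlims()` call visible in the diff, the field offsets follow PKWARE's APPNOTE.TXT, and ZIP64 support and archive comments are ignored for brevity.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Slim per-entry record, loosely modelled on the PR's FileStatSlim naming.
struct FileStatSlim {
  std::string name;            // path inside the archive
  uint32_t localHeaderOffset;  // where the local header (and data) starts
  uint32_t compressedSize;
  uint32_t uncompressedSize;
};

static uint16_t rd16(const uint8_t* p) { return p[0] | (p[1] << 8); }
static uint32_t rd32(const uint8_t* p) {
  return p[0] | (p[1] << 8) | (p[2] << 16) | (uint32_t(p[3]) << 24);
}

// Enumerate entries by walking the central directory at the end of the file.
bool listEntries(FILE* f, std::vector<FileStatSlim>* out) {
  // The End Of Central Directory record is 22 bytes when the archive has no
  // trailing comment; real code must scan backwards for its signature.
  uint8_t eocd[22];
  if (fseek(f, -22, SEEK_END) != 0 || fread(eocd, 1, 22, f) != 22) return false;
  if (rd32(eocd) != 0x06054b50) return false;  // "PK\5\6" EOCD signature
  const uint16_t entryCount = rd16(eocd + 10);
  if (fseek(f, rd32(eocd + 16), SEEK_SET) != 0) return false;  // CD offset

  for (uint16_t i = 0; i < entryCount; i++) {
    uint8_t h[46];  // fixed part of a central directory file header
    if (fread(h, 1, 46, f) != 46 || rd32(h) != 0x02014b50) return false;
    FileStatSlim s;
    s.compressedSize = rd32(h + 20);
    s.uncompressedSize = rd32(h + 24);
    const uint16_t nameLen = rd16(h + 28);
    const uint16_t extraLen = rd16(h + 30);
    const uint16_t commentLen = rd16(h + 32);
    s.localHeaderOffset = rd32(h + 42);
    s.name.resize(nameLen);
    if (nameLen && fread(&s.name[0], 1, nameLen, f) != nameLen) return false;
    if (fseek(f, extraLen + commentLen, SEEK_CUR) != 0) return false;
    out->push_back(std::move(s));
  }
  return true;
}
```

On a device like the ESP32 this keeps peak memory proportional to one directory record at a time, which is the memory win the summary describes.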

```diff
@@ -60,9 +60,6 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) {
   }
   ContentOpfParser opfParser(getCachePath(), getBasePath(), contentOpfSize, bookMetadataCache.get());
-  Serial.printf("[%lu] [MEM] Free: %d bytes, Total: %d bytes, Min Free: %d bytes\n", millis(), ESP.getFreeHeap(),
-                ESP.getHeapSize(), ESP.getMinFreeHeap());
   if (!opfParser.setup()) {
     Serial.printf("[%lu] [EBP] Could not setup content.opf parser\n", millis());
     return false;
@@ -321,10 +318,9 @@ bool Epub::generateCoverBmp() const {
 }
 
 uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size, const bool trailingNullByte) const {
-  const ZipFile zip("/sd" + filepath);
   const std::string path = FsHelpers::normalisePath(itemHref);
-  const auto content = zip.readFileToMemory(path.c_str(), size, trailingNullByte);
+  const auto content = ZipFile(filepath).readFileToMemory(path.c_str(), size, trailingNullByte);
   if (!content) {
     Serial.printf("[%lu] [EBP] Failed to read item %s\n", millis(), path.c_str());
     return nullptr;
@@ -334,20 +330,13 @@ uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size
 }
 
 bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, const size_t chunkSize) const {
-  const ZipFile zip("/sd" + filepath);
   const std::string path = FsHelpers::normalisePath(itemHref);
-  return zip.readFileToStream(path.c_str(), out, chunkSize);
+  return ZipFile(filepath).readFileToStream(path.c_str(), out, chunkSize);
 }
 
 bool Epub::getItemSize(const std::string& itemHref, size_t* size) const {
-  const ZipFile zip("/sd" + filepath);
-  return getItemSize(zip, itemHref, size);
-}
-
-bool Epub::getItemSize(const ZipFile& zip, const std::string& itemHref, size_t* size) {
   const std::string path = FsHelpers::normalisePath(itemHref);
-  return zip.getInflatedFileSize(path.c_str(), size);
+  return ZipFile(filepath).getInflatedFileSize(path.c_str(), size);
 }
 
 int Epub::getSpineItemsCount() const {
```
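
With the directory index cached inside `ZipFile`, the dropped static `getItemSize(const ZipFile&, ...)` overload loses its purpose: it existed so callers could share one open archive, whereas a temporary `ZipFile(filepath)` per call is now cheap. As a hedged illustration of why `getInflatedFileSize` needs no decompression at all, reusing the hypothetical `FileStatSlim` record from the sketch above rather than the project's real API:

```cpp
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct FileStatSlim {  // same hypothetical record as in the earlier sketch
  std::string name;
  uint32_t uncompressedSize;
};

// The central directory already stores each entry's inflated size, so
// answering getItemSize() is a lookup rather than a decode-and-count pass.
bool getInflatedFileSize(const std::vector<FileStatSlim>& entries,
                         const std::string& path, size_t* size) {
  for (const auto& e : entries) {
    if (e.name == path) {
      *size = e.uncompressedSize;
      return true;
    }
  }
  return false;  // no such entry in the archive
}
```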

```diff
@@ -24,7 +24,6 @@ class Epub {
   bool findContentOpfFile(std::string* contentOpfFile) const;
   bool parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata);
   bool parseTocNcxFile() const;
-  static bool getItemSize(const ZipFile& zip, const std::string& itemHref, size_t* size);
 
  public:
   explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {
@@ -54,5 +53,5 @@ class Epub {
   size_t getCumulativeSpineItemSize(int spineIndex) const;
   size_t getBookSize() const;
-  uint8_t calculateProgress(const int currentSpineIndex, const float currentSpineRead) const;
+  uint8_t calculateProgress(int currentSpineIndex, float currentSpineRead) const;
 };
```

```diff
@@ -122,7 +122,26 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
   // LUTs complete
 
   // Loop through spines from spine file matching up TOC indexes, calculating cumulative size and writing to book.bin
-  const ZipFile zip("/sd" + epubPath);
+  ZipFile zip(epubPath);
+  // Pre-open zip file to speed up size calculations
+  if (!zip.open()) {
+    Serial.printf("[%lu] [BMC] Could not open EPUB zip for size calculations\n", millis());
+    bookFile.close();
+    spineFile.close();
+    tocFile.close();
+    return false;
+  }
+  // TODO: For large ZIPs, loading all the localHeaderOffsets will crash.
+  //       However, not having them loaded is extremely slow. Need a better solution here.
+  //       Perhaps only a cache of spine items, or a better way to speed up lookups?
+  if (!zip.loadAllFileStatSlims()) {
+    Serial.printf("[%lu] [BMC] Could not load zip local header offsets for size calculations\n", millis());
+    bookFile.close();
+    spineFile.close();
+    tocFile.close();
+    zip.close();
+    return false;
+  }
   size_t cumSize = 0;
   spineFile.seek(0);
   for (int i = 0; i < spineCount; i++) {
@@ -157,6 +176,8 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
     // Write out spine data to book.bin
     writeSpineEntry(bookFile, spineEntry);
   }
+  // Close opened zip file
+  zip.close();
 
   // Loop through toc entries from toc file writing to book.bin
   tocFile.seek(0);
@@ -223,6 +244,8 @@ void BookMetadataCache::createTocEntry(const std::string& title, const std::stri
   int spineIndex = -1;
   // find spine index
+  // TODO: This lookup is slow as we need to scan through all items each time. We can't hold it
+  //       all in memory due to size, but perhaps we can load just the hrefs into a vector/list
+  //       to do an index lookup?
   spineFile.seek(0);
   for (int i = 0; i < spineCount; i++) {
     auto spineEntry = readSpineEntry(spineFile);
```
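
Both TODOs above describe the same trade-off: the full table of local header offsets is too large to hold for big archives, yet rescanning `spineFile` for every lookup is slow. One possible middle ground, sketched here as a hypothetical (it is not part of this commit), is to keep only the spine hrefs in RAM, bounding memory by the spine length while making the href-to-index lookup constant time:

```cpp
#include <string>
#include <unordered_map>

// Hypothetical: holds just the spine hrefs so href -> spine index lookups
// avoid re-reading the spine file. Memory cost is one string per spine item
// rather than a per-entry offset table for the whole archive.
class SpineHrefIndex {
 public:
  // Called once per spine entry, in spine order.
  void add(const std::string& href) {
    indexByHref_.emplace(href, nextIndex_++);
  }

  // Returns the spine index for an href, or -1 if it is not a spine item.
  int find(const std::string& href) const {
    const auto it = indexByHref_.find(href);
    return it == indexByHref_.end() ? -1 : it->second;
  }

 private:
  int nextIndex_ = 0;
  std::unordered_map<std::string, int> indexByHref_;
};
```

A `BookMetadataCache` could populate such an index while writing the spine file and consult it inside `createTocEntry` instead of the seek-and-scan loop.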

```diff
@@ -3,7 +3,6 @@
 #include <FsHelpers.h>
 #include <HardwareSerial.h>
 #include <Serialization.h>
-#include <ZipFile.h>
 
 #include "../BookMetadataCache.h"
@@ -183,6 +182,8 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
       if (strcmp(atts[i], "idref") == 0) {
         const std::string idref = atts[i + 1];
         // Resolve the idref to href using items map
+        // TODO: This lookup is slow as we need to scan through all items each time.
+        //       It can take up to 200ms per item when getting to 1500 items.
         self->tempItemStore.seek(0);
         std::string itemId;
         std::string href;
```
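
The same pattern could address this TODO: rather than rewinding `tempItemStore` for every idref, the parser could remember the file offset of each item record as it writes it, keeping only an id-to-offset map in memory. A hypothetical sketch, not part of this commit:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>

// Maps manifest item ids to the byte offset of their record in
// tempItemStore, so an idref resolves with one seek instead of a full scan.
class ItemOffsetIndex {
 public:
  // Record an item's offset while the manifest is first written out.
  void remember(const std::string& id, size_t recordOffset) {
    offsetById_.emplace(id, recordOffset);
  }

  // Look up the record offset for an idref; false if the id is unknown.
  bool find(const std::string& idref, size_t* offset) const {
    const auto it = offsetById_.find(idref);
    if (it == offsetById_.end()) return false;
    *offset = it->second;
    return true;
  }

 private:
  std::unordered_map<std::string, size_t> offsetById_;
};
```

Whether the map of ids fits in RAM at 1500 items is the open question the TODO raises; ids are typically much shorter than full records, so this trades a bounded amount of heap for O(1) lookups.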