Stream inflated EPUB HTMLs down to disk instead of inflating in memory (#4)

* Downgrade miniz for stability

* Stream HTML from the ZIP down to disk instead of loading it all into memory
This commit is contained in:
Dave Allie
2025-12-08 00:39:17 +11:00
committed by GitHub
parent c715c18bf7
commit de453fed1d
9 changed files with 857 additions and 1108 deletions

View File

@@ -9,7 +9,7 @@
bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
// open up the meta data to find where the content.opf file lives
size_t s;
const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s);
const auto metaInfo = reinterpret_cast<char*>(zip.readFileToMemory("META-INF/container.xml", &s, true));
if (!metaInfo) {
Serial.println("Could not find META-INF/container.xml");
return false;
@@ -57,7 +57,7 @@ bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
// read in the content.opf file and parse it
auto contents = zip.readTextFileToMemory(content_opf_file.c_str());
auto contents = reinterpret_cast<char*>(zip.readFileToMemory(content_opf_file.c_str(), nullptr, true));
// parse the contents
tinyxml2::XMLDocument doc;
@@ -168,7 +168,7 @@ bool Epub::parseTocNcxFile(const ZipFile& zip) {
return false;
}
const auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str());
const auto ncxData = reinterpret_cast<char*>(zip.readFileToMemory(tocNcxItem.c_str(), nullptr, true));
if (!ncxData) {
Serial.printf("Could not find %s\n", tocNcxItem.c_str());
return false;
@@ -308,11 +308,11 @@ std::string normalisePath(const std::string& path) {
return result;
}
uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const {
uint8_t* Epub::readItemContentsToBytes(const std::string& itemHref, size_t* size, bool trailingNullByte) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
const auto content = zip.readFileToMemory(path.c_str(), size);
const auto content = zip.readFileToMemory(path.c_str(), size, trailingNullByte);
if (!content) {
Serial.printf("Failed to read item %s\n", path.c_str());
return nullptr;
@@ -321,17 +321,11 @@ uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const
return content;
}
char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const {
bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, const size_t chunkSize) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
const auto content = zip.readTextFileToMemory(path.c_str(), size);
if (!content) {
Serial.printf("Failed to read item %s\n", path.c_str());
return nullptr;
}
return content;
return zip.readFileToStream(path.c_str(), out, chunkSize);
}
// Returns the number of entries currently held in `spine`.
int Epub::getSpineItemsCount() const {
  // The container reports an unsigned size; the declared return type is int,
  // so spell the conversion out instead of relying on the implicit one.
  return static_cast<int>(spine.size());
}

View File

@@ -56,8 +56,9 @@ class Epub {
const std::string& getPath() const;
const std::string& getTitle() const;
const std::string& getCoverImageItem() const;
uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const;
char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const;
uint8_t* readItemContentsToBytes(const std::string& itemHref, size_t* size = nullptr,
bool trailingNullByte = false) const;
bool readItemContentsToStream(const std::string& itemHref, Print& out, size_t chunkSize) const;
std::string& getSpineItem(int spineIndex);
int getSpineItemsCount() const;
EpubTocEntry& getTocItem(int tocTndex);

View File

@@ -199,6 +199,11 @@ bool EpubHtmlParserSlim::parseAndBuildPages() {
XML_SetCharacterDataHandler(parser, characterData);
FILE* file = fopen(filepath, "r");
if (!file) {
Serial.printf("Couldn't open file %s\n", filepath);
XML_ParserFree(parser);
return false;
}
do {
void* const buf = XML_GetBuffer(parser, 1024);

View File

@@ -64,35 +64,28 @@ void Section::setupCacheDir() const {
// Asks the SD filesystem to remove the directory at `cachePath`.
// The result of SD.rmdir is not inspected here.
void Section::clearCache() const {
  const char* const cacheDir = cachePath.c_str();
  SD.rmdir(cacheDir);
}
bool Section::persistPageDataToSD() {
size_t size = 0;
auto localPath = epub->getSpineItem(spineIndex);
const auto localPath = epub->getSpineItem(spineIndex);
const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size);
if (!html) {
Serial.println("Failed to read item contents");
return false;
}
// TODO: Would love to stream this through an XML visitor
// TODO: Should we get rid of this file altogether?
// It currently saves us a bit of memory by allowing for all the inflation bits to be released
// before loading the XML parser
const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE);
const auto written = f.write(html, size);
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE, true);
bool success = epub->readItemContentsToStream(localPath, f, 1024);
f.close();
free(html);
Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str());
if (size != written) {
Serial.println("Failed to inflate section contents to SD");
SD.remove(tmpHtmlPath.c_str());
if (!success) {
Serial.println("Failed to stream item contents");
return false;
}
Serial.printf("Streamed HTML to %s\n", tmpHtmlPath.c_str());
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
auto visitor =
EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
const bool success = visitor.parseAndBuildPages();
success = visitor.parseAndBuildPages();
SD.remove(tmpHtmlPath.c_str());
if (!success) {