Remove tinyxml2 dependency replace with expat parsers (#9)

This commit is contained in:
Dave Allie
2025-12-13 19:36:01 +11:00
committed by GitHub
parent d450f362d1
commit c7a32fe41f
17 changed files with 696 additions and 253 deletions

View File

@@ -7,247 +7,148 @@
#include <map>
#include "Epub/FsHelpers.h"
#include "Epub/parsers/ContainerParser.h"
#include "Epub/parsers/ContentOpfParser.h"
#include "Epub/parsers/TocNcxParser.h"
bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
// open up the meta data to find where the content.opf file lives
size_t s;
const auto metaInfo = reinterpret_cast<char*>(zip.readFileToMemory("META-INF/container.xml", &s, true));
if (!metaInfo) {
Serial.printf("[%lu] [EBP] Could not find META-INF/container.xml\n", millis());
bool Epub::findContentOpfFile(std::string* contentOpfFile) const {
const auto containerPath = "META-INF/container.xml";
size_t containerSize;
// Get file size without loading it all into heap
if (!getItemSize(containerPath, &containerSize)) {
Serial.printf("[%lu] [EBP] Could not find or size META-INF/container.xml\n", millis());
return false;
}
// parse the meta data
tinyxml2::XMLDocument metaDataDoc;
const auto result = metaDataDoc.Parse(metaInfo);
free(metaInfo);
ContainerParser containerParser(containerSize);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("[%lu] [EBP] Could not parse META-INF/container.xml. Error: %d\n", millis(), result);
if (!containerParser.setup()) {
return false;
}
const auto container = metaDataDoc.FirstChildElement("container");
if (!container) {
Serial.printf("[%lu] [EBP] Could not find container element in META-INF/container.xml\n", millis());
// Stream read (reusing your existing stream logic)
if (!readItemContentsToStream(containerPath, containerParser, 512)) {
Serial.printf("[%lu] [EBP] Could not read META-INF/container.xml\n", millis());
containerParser.teardown();
return false;
}
const auto rootfiles = container->FirstChildElement("rootfiles");
if (!rootfiles) {
Serial.printf("[%lu] [EBP] Could not find rootfiles element in META-INF/container.xml\n", millis());
// Extract the result
if (containerParser.fullPath.empty()) {
Serial.printf("[%lu] [EBP] Could not find valid rootfile in container.xml\n", millis());
containerParser.teardown();
return false;
}
// find the root file that has the media-type="application/oebps-package+xml"
auto rootfile = rootfiles->FirstChildElement("rootfile");
while (rootfile) {
const char* mediaType = rootfile->Attribute("media-type");
if (mediaType && strcmp(mediaType, "application/oebps-package+xml") == 0) {
const char* full_path = rootfile->Attribute("full-path");
if (full_path) {
contentOpfFile = full_path;
return true;
}
}
rootfile = rootfile->NextSiblingElement("rootfile");
}
*contentOpfFile = std::move(containerParser.fullPath);
Serial.printf("[%lu] [EBP] Could not get path to content.opf file\n", millis());
return false;
}
bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
// read in the content.opf file and parse it
auto contents = reinterpret_cast<char*>(zip.readFileToMemory(content_opf_file.c_str(), nullptr, true));
// parse the contents
tinyxml2::XMLDocument doc;
auto result = doc.Parse(contents);
free(contents);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("[%lu] [EBP] Error parsing content.opf - %s\n", millis(),
tinyxml2::XMLDocument::ErrorIDToName(result));
return false;
}
auto package = doc.FirstChildElement("package");
if (!package) package = doc.FirstChildElement("opf:package");
if (!package) {
Serial.printf("[%lu] [EBP] Could not find package element in content.opf\n", millis());
return false;
}
// get the metadata - title and cover image
auto metadata = package->FirstChildElement("metadata");
if (!metadata) metadata = package->FirstChildElement("opf:metadata");
if (!metadata) {
Serial.printf("[%lu] [EBP] Missing metadata\n", millis());
return false;
}
auto titleEl = metadata->FirstChildElement("dc:title");
if (!titleEl) {
Serial.printf("[%lu] [EBP] Missing title\n", millis());
return false;
}
this->title = titleEl->GetText();
auto cover = metadata->FirstChildElement("meta");
if (!cover) cover = metadata->FirstChildElement("opf:meta");
while (cover && cover->Attribute("name") && strcmp(cover->Attribute("name"), "cover") != 0) {
cover = cover->NextSiblingElement("meta");
}
if (!cover) {
Serial.printf("[%lu] [EBP] Missing cover\n", millis());
}
auto coverItem = cover ? cover->Attribute("content") : nullptr;
// read the manifest and spine
// the manifest gives us the names of the files
// the spine gives us the order of the files
// we can then read the files in the order they are in the spine
auto manifest = package->FirstChildElement("manifest");
if (!manifest) manifest = package->FirstChildElement("opf:manifest");
if (!manifest) {
Serial.printf("[%lu] [EBP] Missing manifest\n", millis());
return false;
}
// create a mapping from id to file name
auto item = manifest->FirstChildElement("item");
if (!item) item = manifest->FirstChildElement("opf:item");
std::map<std::string, std::string> items;
while (item) {
std::string itemId = item->Attribute("id");
std::string href = contentBasePath + item->Attribute("href");
// grab the cover image
if (coverItem && itemId == coverItem) {
coverImageItem = href;
}
// grab the ncx file
if (itemId == "ncx" || itemId == "ncxtoc") {
tocNcxItem = href;
}
items[itemId] = href;
auto nextItem = item->NextSiblingElement("item");
if (!nextItem) nextItem = item->NextSiblingElement("opf:item");
item = nextItem;
}
// find the spine
auto spineEl = package->FirstChildElement("spine");
if (!spineEl) spineEl = package->FirstChildElement("opf:spine");
if (!spineEl) {
Serial.printf("[%lu] [EBP] Missing spine\n", millis());
return false;
}
// read the spine
auto itemref = spineEl->FirstChildElement("itemref");
if (!itemref) itemref = spineEl->FirstChildElement("opf:itemref");
while (itemref) {
auto id = itemref->Attribute("idref");
if (items.find(id) != items.end()) {
spine.emplace_back(id, items[id]);
}
auto nextItemRef = itemref->NextSiblingElement("itemref");
if (!nextItemRef) nextItemRef = itemref->NextSiblingElement("opf:itemref");
itemref = nextItemRef;
}
containerParser.teardown();
return true;
}
bool Epub::parseTocNcxFile(const ZipFile& zip) {
bool Epub::parseContentOpf(const std::string& contentOpfFilePath) {
size_t contentOpfSize;
if (!getItemSize(contentOpfFilePath, &contentOpfSize)) {
Serial.printf("[%lu] [EBP] Could not get size of content.opf\n", millis());
return false;
}
ContentOpfParser opfParser(getBasePath(), contentOpfSize);
if (!opfParser.setup()) {
Serial.printf("[%lu] [EBP] Could not setup content.opf parser\n", millis());
return false;
}
if (!readItemContentsToStream(contentOpfFilePath, opfParser, 1024)) {
Serial.printf("[%lu] [EBP] Could not read content.opf\n", millis());
opfParser.teardown();
return false;
}
// Grab data from opfParser into epub
title = opfParser.title;
if (opfParser.items.count("ncx")) {
tocNcxItem = opfParser.items.at("ncx");
} else if (opfParser.items.count("ncxtoc")) {
tocNcxItem = opfParser.items.at("ncxtoc");
}
for (auto& spineRef : opfParser.spineRefs) {
if (opfParser.items.count(spineRef)) {
spine.emplace_back(spineRef, opfParser.items.at(spineRef));
}
}
Serial.printf("[%lu] [EBP] Successfully parsed content.opf\n", millis());
opfParser.teardown();
return true;
}
bool Epub::parseTocNcxFile() {
// the ncx file should have been specified in the content.opf file
if (tocNcxItem.empty()) {
Serial.printf("[%lu] [EBP] No ncx file specified\n", millis());
return false;
}
const auto ncxData = reinterpret_cast<char*>(zip.readFileToMemory(tocNcxItem.c_str(), nullptr, true));
if (!ncxData) {
Serial.printf("[%lu] [EBP] Could not find %s\n", millis(), tocNcxItem.c_str());
size_t tocSize;
if (!getItemSize(tocNcxItem, &tocSize)) {
Serial.printf("[%lu] [EBP] Could not get size of toc ncx\n", millis());
return false;
}
// Parse the Toc contents
tinyxml2::XMLDocument doc;
const auto result = doc.Parse(ncxData);
free(ncxData);
TocNcxParser ncxParser(contentBasePath, tocSize);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("[%lu] [EBP] Error parsing toc %s\n", millis(), tinyxml2::XMLDocument::ErrorIDToName(result));
if (!ncxParser.setup()) {
Serial.printf("[%lu] [EBP] Could not setup toc ncx parser\n", millis());
return false;
}
const auto ncx = doc.FirstChildElement("ncx");
if (!ncx) {
Serial.printf("[%lu] [EBP] Could not find first child ncx in toc\n", millis());
if (!readItemContentsToStream(tocNcxItem, ncxParser, 1024)) {
Serial.printf("[%lu] [EBP] Could not read toc ncx stream\n", millis());
ncxParser.teardown();
return false;
}
const auto navMap = ncx->FirstChildElement("navMap");
if (!navMap) {
Serial.printf("[%lu] [EBP] Could not find navMap child in ncx\n", millis());
return false;
}
this->toc = std::move(ncxParser.toc);
recursivelyParseNavMap(navMap->FirstChildElement("navPoint"));
Serial.printf("[%lu] [EBP] Parsed %d TOC items\n", millis(), this->toc.size());
ncxParser.teardown();
return true;
}
void Epub::recursivelyParseNavMap(tinyxml2::XMLElement* element) {
// Fills toc map
while (element) {
std::string navTitle = element->FirstChildElement("navLabel")->FirstChildElement("text")->FirstChild()->Value();
const auto content = element->FirstChildElement("content");
std::string href = contentBasePath + content->Attribute("src");
// split the href on the # to get the href and the anchor
const size_t pos = href.find('#');
std::string anchor;
if (pos != std::string::npos) {
anchor = href.substr(pos + 1);
href = href.substr(0, pos);
}
toc.emplace_back(navTitle, href, anchor, 0);
tinyxml2::XMLElement* nestedNavPoint = element->FirstChildElement("navPoint");
if (nestedNavPoint) {
recursivelyParseNavMap(nestedNavPoint);
}
element = element->NextSiblingElement("navPoint");
}
}
// load in the meta data for the epub file
bool Epub::load() {
Serial.printf("[%lu] [EBP] Loading ePub: %s\n", millis(), filepath.c_str());
ZipFile zip("/sd" + filepath);
std::string contentOpfFile;
if (!findContentOpfFile(zip, contentOpfFile)) {
Serial.printf("[%lu] [EBP] Could not open ePub\n", millis());
std::string contentOpfFilePath;
if (!findContentOpfFile(&contentOpfFilePath)) {
Serial.printf("[%lu] [EBP] Could not find content.opf in zip\n", millis());
return false;
}
contentBasePath = contentOpfFile.substr(0, contentOpfFile.find_last_of('/') + 1);
Serial.printf("[%lu] [EBP] Found content.opf at: %s\n", millis(), contentOpfFilePath.c_str());
if (!parseContentOpf(zip, contentOpfFile)) {
contentBasePath = contentOpfFilePath.substr(0, contentOpfFilePath.find_last_of('/') + 1);
if (!parseContentOpf(contentOpfFilePath)) {
Serial.printf("[%lu] [EBP] Could not parse content.opf\n", millis());
return false;
}
if (!parseTocNcxFile(zip)) {
if (!parseTocNcxFile()) {
Serial.printf("[%lu] [EBP] Could not parse toc\n", millis());
return false;
}
Serial.printf("[%lu] [EBP] Loaded ePub: %s\n", millis(), filepath.c_str());
return true;
}
@@ -344,6 +245,13 @@ bool Epub::readItemContentsToStream(const std::string& itemHref, Print& out, con
return zip.readFileToStream(path.c_str(), out, chunkSize);
}
bool Epub::getItemSize(const std::string& itemHref, size_t* size) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
return zip.getInflatedFileSize(path.c_str(), size);
}
int Epub::getSpineItemsCount() const { return spine.size(); }
std::string& Epub::getSpineItem(const int spineIndex) {