Public release

This commit is contained in:
Dave Allie
2025-12-03 22:00:29 +11:00
commit 2ccdbeecc8
54 changed files with 33356 additions and 0 deletions

383
lib/Epub/Epub.cpp Normal file
View File

@@ -0,0 +1,383 @@
#include "Epub.h"
#include <HardwareSerial.h>
#include <SD.h>
#include <ZipFile.h>
#include <tinyxml2.h>
#include <map>
bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
// open up the meta data to find where the content.opf file lives
size_t s;
const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s);
if (!metaInfo) {
Serial.println("Could not find META-INF/container.xml");
return false;
}
// parse the meta data
tinyxml2::XMLDocument metaDataDoc;
const auto result = metaDataDoc.Parse(metaInfo);
free(metaInfo);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("Could not parse META-INF/container.xml. Error: %d\n", result);
return false;
}
const auto container = metaDataDoc.FirstChildElement("container");
if (!container) {
Serial.println("Could not find container element in META-INF/container.xml");
return false;
}
const auto rootfiles = container->FirstChildElement("rootfiles");
if (!rootfiles) {
Serial.println("Could not find rootfiles element in META-INF/container.xml");
return false;
}
// find the root file that has the media-type="application/oebps-package+xml"
auto rootfile = rootfiles->FirstChildElement("rootfile");
while (rootfile) {
const char* mediaType = rootfile->Attribute("media-type");
if (mediaType && strcmp(mediaType, "application/oebps-package+xml") == 0) {
const char* full_path = rootfile->Attribute("full-path");
if (full_path) {
contentOpfFile = full_path;
return true;
}
}
rootfile = rootfile->NextSiblingElement("rootfile");
}
Serial.println("Could not get path to content.opf file");
return false;
}
bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
// read in the content.opf file and parse it
auto contents = zip.readTextFileToMemory(content_opf_file.c_str());
// parse the contents
tinyxml2::XMLDocument doc;
auto result = doc.Parse(contents);
free(contents);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("Error parsing content.opf - %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
return false;
}
auto package = doc.FirstChildElement("package");
if (!package) package = doc.FirstChildElement("opf:package");
if (!package) {
Serial.println("Could not find package element in content.opf");
return false;
}
// get the metadata - title and cover image
auto metadata = package->FirstChildElement("metadata");
if (!metadata) metadata = package->FirstChildElement("opf:metadata");
if (!metadata) {
Serial.println("Missing metadata");
return false;
}
auto titleEl = metadata->FirstChildElement("dc:title");
if (!titleEl) {
Serial.println("Missing title");
return false;
}
this->title = titleEl->GetText();
auto cover = metadata->FirstChildElement("meta");
if (!cover) cover = metadata->FirstChildElement("opf:meta");
while (cover && cover->Attribute("name") && strcmp(cover->Attribute("name"), "cover") != 0) {
cover = cover->NextSiblingElement("meta");
}
if (!cover) {
Serial.println("Missing cover");
}
auto coverItem = cover ? cover->Attribute("content") : nullptr;
// read the manifest and spine
// the manifest gives us the names of the files
// the spine gives us the order of the files
// we can then read the files in the order they are in the spine
auto manifest = package->FirstChildElement("manifest");
if (!manifest) manifest = package->FirstChildElement("opf:manifest");
if (!manifest) {
Serial.println("Missing manifest");
return false;
}
// create a mapping from id to file name
auto item = manifest->FirstChildElement("item");
if (!item) item = manifest->FirstChildElement("opf:item");
std::map<std::string, std::string> items;
while (item) {
std::string itemId = item->Attribute("id");
std::string href = contentBasePath + item->Attribute("href");
// grab the cover image
if (coverItem && itemId == coverItem) {
coverImageItem = href;
}
// grab the ncx file
if (itemId == "ncx" || itemId == "ncxtoc") {
tocNcxItem = href;
}
items[itemId] = href;
auto nextItem = item->NextSiblingElement("item");
if (!nextItem) nextItem = item->NextSiblingElement("opf:item");
item = nextItem;
}
// find the spine
auto spineEl = package->FirstChildElement("spine");
if (!spineEl) spineEl = package->FirstChildElement("opf:spine");
if (!spineEl) {
Serial.println("Missing spine");
return false;
}
// read the spine
auto itemref = spineEl->FirstChildElement("itemref");
if (!itemref) itemref = spineEl->FirstChildElement("opf:itemref");
while (itemref) {
auto id = itemref->Attribute("idref");
if (items.find(id) != items.end()) {
spine.emplace_back(id, items[id]);
}
auto nextItemRef = itemref->NextSiblingElement("itemref");
if (!nextItemRef) nextItemRef = itemref->NextSiblingElement("opf:itemref");
itemref = nextItemRef;
}
return true;
}
bool Epub::parseTocNcxFile(ZipFile& zip) {
// the ncx file should have been specified in the content.opf file
if (tocNcxItem.empty()) {
Serial.println("No ncx file specified");
return false;
}
auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str());
if (!ncxData) {
Serial.printf("Could not find %s\n", tocNcxItem.c_str());
return false;
}
// Parse the Toc contents
tinyxml2::XMLDocument doc;
auto result = doc.Parse(ncxData);
free(ncxData);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("Error parsing toc %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
return false;
}
auto ncx = doc.FirstChildElement("ncx");
if (!ncx) {
Serial.println("Could not find first child ncx in toc");
return false;
}
auto navMap = ncx->FirstChildElement("navMap");
if (!navMap) {
Serial.println("Could not find navMap child in ncx");
return false;
}
auto navPoint = navMap->FirstChildElement("navPoint");
// Fills toc map
while (navPoint) {
std::string navTitle = navPoint->FirstChildElement("navLabel")->FirstChildElement("text")->FirstChild()->Value();
auto content = navPoint->FirstChildElement("content");
std::string href = contentBasePath + content->Attribute("src");
// split the href on the # to get the href and the anchor
size_t pos = href.find('#');
std::string anchor;
if (pos != std::string::npos) {
anchor = href.substr(pos + 1);
href = href.substr(0, pos);
}
toc.emplace_back(navTitle, href, anchor, 0);
navPoint = navPoint->NextSiblingElement("navPoint");
}
return true;
}
// load in the meta data for the epub file
bool Epub::load() {
ZipFile zip("/sd" + filepath);
std::string contentOpfFile;
if (!findContentOpfFile(zip, contentOpfFile)) {
Serial.println("Could not open ePub");
return false;
}
contentBasePath = contentOpfFile.substr(0, contentOpfFile.find_last_of('/') + 1);
if (!parseContentOpf(zip, contentOpfFile)) {
return false;
}
if (!parseTocNcxFile(zip)) {
return false;
}
return true;
}
void Epub::clearCache() const { SD.rmdir(cachePath.c_str()); }
void Epub::setupCacheDir() const {
if (SD.exists(cachePath.c_str())) {
return;
}
// Loop over each segment of the cache path and create directories as needed
for (size_t i = 1; i < cachePath.length(); i++) {
if (cachePath[i] == '/') {
SD.mkdir(cachePath.substr(0, i).c_str());
}
}
SD.mkdir(cachePath.c_str());
}
const std::string& Epub::getCachePath() const { return cachePath; }
const std::string& Epub::getPath() const { return filepath; }
const std::string& Epub::getTitle() const { return title; }
const std::string& Epub::getCoverImageItem() const { return coverImageItem; }
std::string normalisePath(const std::string& path) {
std::vector<std::string> components;
std::string component;
for (const auto c : path) {
if (c == '/') {
if (!component.empty()) {
if (component == "..") {
if (!components.empty()) {
components.pop_back();
}
} else {
components.push_back(component);
}
component.clear();
}
} else {
component += c;
}
}
if (!component.empty()) {
components.push_back(component);
}
std::string result;
for (const auto& c : components) {
if (!result.empty()) {
result += "/";
}
result += c;
}
return result;
}
uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
const auto content = zip.readFileToMemory(path.c_str(), size);
if (!content) {
Serial.printf("Failed to read item %s\n", path.c_str());
return nullptr;
}
return content;
}
char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const {
const ZipFile zip("/sd" + filepath);
const std::string path = normalisePath(itemHref);
const auto content = zip.readTextFileToMemory(path.c_str(), size);
if (!content) {
Serial.printf("Failed to read item %s\n", path.c_str());
return nullptr;
}
return content;
}
int Epub::getSpineItemsCount() const { return spine.size(); }
std::string& Epub::getSpineItem(const int spineIndex) {
if (spineIndex < 0 || spineIndex >= spine.size()) {
Serial.printf("getSpineItem index:%d is out of range\n", spineIndex);
return spine.at(0).second;
}
return spine.at(spineIndex).second;
}
EpubTocEntry& Epub::getTocItem(const int tocTndex) {
if (tocTndex < 0 || tocTndex >= toc.size()) {
Serial.printf("getTocItem index:%d is out of range\n", tocTndex);
return toc.at(0);
}
return toc.at(tocTndex);
}
int Epub::getTocItemsCount() const { return toc.size(); }
// work out the section index for a toc index
int Epub::getSpineIndexForTocIndex(const int tocIndex) const {
// the toc entry should have an href that matches the spine item
// so we can find the spine index by looking for the href
for (int i = 0; i < spine.size(); i++) {
if (spine[i].second == toc[tocIndex].href) {
return i;
}
}
Serial.println("Section not found");
// not found - default to the start of the book
return 0;
}
int Epub::getTocIndexForSpineIndex(const int spineIndex) const {
// the toc entry should have an href that matches the spine item
// so we can find the toc index by looking for the href
Serial.printf("Looking for %s\n", spine[spineIndex].second.c_str());
for (int i = 0; i < toc.size(); i++) {
Serial.printf("Looking at %s\n", toc[i].href.c_str());
if (toc[i].href == spine[spineIndex].second) {
return i;
}
}
Serial.println("TOC item not found");
// not found - default to first item
return 0;
}

73
lib/Epub/Epub.h Normal file
View File

@@ -0,0 +1,73 @@
#pragma once
#include <HardwareSerial.h>
#include <string>
#include <unordered_map>
#include <vector>
class ZipFile;
class EpubTocEntry {
public:
std::string title;
std::string href;
std::string anchor;
int level;
EpubTocEntry(std::string title, std::string href, std::string anchor, const int level)
: title(std::move(title)), href(std::move(href)), anchor(std::move(anchor)), level(level) {}
};
class Epub {
// the title read from the EPUB meta data
std::string title;
// the cover image
std::string coverImageItem;
// the ncx file
std::string tocNcxItem;
// where is the EPUBfile?
std::string filepath;
// the spine of the EPUB file
std::vector<std::pair<std::string, std::string>> spine;
// the toc of the EPUB file
std::vector<EpubTocEntry> toc;
// the base path for items in the EPUB file
std::string contentBasePath;
// Uniq cache key based on filepath
std::string cachePath;
// find the path for the content.opf file
static bool findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile);
bool parseContentOpf(ZipFile& zip, std::string& content_opf_file);
bool parseTocNcxFile(ZipFile& zip);
public:
explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {
// create a cache key based on the filepath
cachePath = cacheDir + "/epub_" + std::to_string(std::hash<std::string>{}(this->filepath));
}
~Epub() = default;
std::string& getBasePath() { return contentBasePath; }
bool load();
void clearCache() const;
void setupCacheDir() const;
const std::string& getCachePath() const;
const std::string& getPath() const;
const std::string& getTitle() const;
const std::string& getCoverImageItem() const;
uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const;
char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const;
std::string& getSpineItem(int spineIndex);
int getSpineItemsCount() const;
EpubTocEntry& getTocItem(int tocTndex);
int getTocItemsCount() const;
// work out the section index for a toc index
int getSpineIndexForTocIndex(int tocIndex) const;
int getTocIndexForSpineIndex(int spineIndex) const;
};

View File

@@ -0,0 +1,181 @@
#include "EpubHtmlParser.h"
#include <EpdRenderer.h>
#include <HardwareSerial.h>
#include "Page.h"
#include "htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head", "table"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
// start a new text block if needed
void EpubHtmlParser::startNewTextBlock(const BLOCK_STYLE style) {
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
currentTextBlock->set_style(style);
return;
}
currentTextBlock->finish();
makePages();
delete currentTextBlock;
}
currentTextBlock = new TextBlock(style);
}
bool EpubHtmlParser::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) {
const char* tag_name = element.Name();
if (matches(tag_name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
const char* src = element.Attribute("src");
if (src) {
// don't leave an empty text block in the list
// const BLOCK_STYLE style = currentTextBlock->get_style();
if (currentTextBlock->isEmpty()) {
delete currentTextBlock;
currentTextBlock = nullptr;
}
// TODO: Fix this
// blocks.push_back(new ImageBlock(m_base_path + src));
// start a new text block - with the same style as before
// startNewTextBlock(style);
} else {
// ESP_LOGE(TAG, "Could not find src attribute");
}
return false;
}
if (matches(tag_name, SKIP_TAGS, NUM_SKIP_TAGS)) {
return false;
}
// Serial.printf("Text: %s\n", element.GetText());
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
insideBoldTag = true;
startNewTextBlock(CENTER_ALIGN);
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(tag_name, "br") == 0) {
startNewTextBlock(currentTextBlock->get_style());
} else {
startNewTextBlock(JUSTIFIED);
}
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
insideBoldTag = true;
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
insideItalicTag = true;
}
return true;
}
/// Visit a text node.
bool EpubHtmlParser::Visit(const tinyxml2::XMLText& text) {
const char* content = text.Value();
currentTextBlock->addSpan(replaceHtmlEntities(content), insideBoldTag, insideItalicTag);
return true;
}
bool EpubHtmlParser::VisitExit(const tinyxml2::XMLElement& element) {
const char* tag_name = element.Name();
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
insideBoldTag = false;
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
// nothing to do
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
insideBoldTag = false;
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
insideItalicTag = false;
}
return true;
}
bool EpubHtmlParser::parseAndBuildPages() {
startNewTextBlock(JUSTIFIED);
tinyxml2::XMLDocument doc(false, tinyxml2::COLLAPSE_WHITESPACE);
const tinyxml2::XMLError result = doc.LoadFile(filepath);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("Failed to load file, Error: %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
return false;
}
doc.Accept(this);
if (currentTextBlock) {
makePages();
completePageFn(currentPage);
currentPage = nullptr;
delete currentTextBlock;
currentTextBlock = nullptr;
}
return true;
}
void EpubHtmlParser::makePages() {
if (!currentTextBlock) {
Serial.println("!! No text block to make pages for !!");
return;
}
if (!currentPage) {
currentPage = new Page();
}
const int lineHeight = renderer->getLineHeight();
const int pageHeight = renderer->getPageHeight();
// Long running task, make sure to let other things happen
vTaskDelay(1);
if (currentTextBlock->getType() == TEXT_BLOCK) {
const auto lines = currentTextBlock->splitIntoLines(renderer);
for (const auto line : lines) {
if (currentPage->nextY + lineHeight > pageHeight) {
completePageFn(currentPage);
currentPage = new Page();
}
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
currentPage->nextY += lineHeight;
}
// TODO: Fix spacing between paras
// add some extra line between blocks
currentPage->nextY += lineHeight / 2;
}
// TODO: Image block support
// if (block->getType() == BlockType::IMAGE_BLOCK) {
// ImageBlock *imageBlock = (ImageBlock *)block;
// if (y + imageBlock->height > page_height) {
// pages.push_back(new Page());
// y = 0;
// }
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
// y += imageBlock->height;
// }
}

View File

@@ -0,0 +1,34 @@
#pragma once
#include <tinyxml2.h>
#include <functional>
#include "blocks/TextBlock.h"
class Page;
class EpdRenderer;
class EpubHtmlParser final : public tinyxml2::XMLVisitor {
const char* filepath;
EpdRenderer* renderer;
std::function<void(Page*)> completePageFn;
bool insideBoldTag = false;
bool insideItalicTag = false;
TextBlock* currentTextBlock = nullptr;
Page* currentPage = nullptr;
void startNewTextBlock(BLOCK_STYLE style);
void makePages();
// xml parser callbacks
bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;
bool Visit(const tinyxml2::XMLText& text) override;
bool VisitExit(const tinyxml2::XMLElement& element) override;
// xml parser callbacks
public:
explicit EpubHtmlParser(const char* filepath, EpdRenderer* renderer, const std::function<void(Page*)>& completePageFn)
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
~EpubHtmlParser() override = default;
bool parseAndBuildPages();
};

65
lib/Epub/Epub/Page.cpp Normal file
View File

@@ -0,0 +1,65 @@
#include "Page.h"
#include <HardwareSerial.h>
#include <Serialization.h>
void PageLine::render(EpdRenderer* renderer) { block->render(renderer, 0, yPos); }
void PageLine::serialize(std::ostream& os) {
serialization::writePod(os, yPos);
// serialize TextBlock pointed to by PageLine
block->serialize(os);
}
PageLine* PageLine::deserialize(std::istream& is) {
int32_t yPos;
serialization::readPod(is, yPos);
const auto tb = TextBlock::deserialize(is);
return new PageLine(tb, yPos);
}
void Page::render(EpdRenderer* renderer) const {
const auto start = millis();
for (const auto element : elements) {
element->render(renderer);
}
Serial.printf("Rendered page elements (%u) in %dms\n", elements.size(), millis() - start);
}
void Page::serialize(std::ostream& os) const {
serialization::writePod(os, nextY);
const uint32_t count = elements.size();
serialization::writePod(os, count);
for (auto* el : elements) {
// Only PageLine exists currently
serialization::writePod(os, static_cast<uint8_t>(TAG_PageLine));
static_cast<PageLine*>(el)->serialize(os);
}
}
Page* Page::deserialize(std::istream& is) {
auto* page = new Page();
serialization::readPod(is, page->nextY);
uint32_t count;
serialization::readPod(is, count);
for (uint32_t i = 0; i < count; i++) {
uint8_t tag;
serialization::readPod(is, tag);
if (tag == TAG_PageLine) {
auto* pl = PageLine::deserialize(is);
page->elements.push_back(pl);
} else {
throw std::runtime_error("Unknown PageElement tag");
}
}
return page;
}

43
lib/Epub/Epub/Page.h Normal file
View File

@@ -0,0 +1,43 @@
#pragma once
#include "blocks/TextBlock.h"
enum PageElementTag : uint8_t {
TAG_PageLine = 1,
};
// represents something that has been added to a page
class PageElement {
public:
int yPos;
explicit PageElement(const int yPos) : yPos(yPos) {}
virtual ~PageElement() = default;
virtual void render(EpdRenderer* renderer) = 0;
virtual void serialize(std::ostream& os) = 0;
};
// a line from a block element
class PageLine final : public PageElement {
const TextBlock* block;
public:
PageLine(const TextBlock* block, const int yPos) : PageElement(yPos), block(block) {}
~PageLine() override { delete block; }
void render(EpdRenderer* renderer) override;
void serialize(std::ostream& os) override;
static PageLine* deserialize(std::istream& is);
};
class Page {
public:
int nextY = 0;
// the list of block index and line numbers on this page
std::vector<PageElement*> elements;
void render(EpdRenderer* renderer) const;
~Page() {
for (const auto element : elements) {
delete element;
}
}
void serialize(std::ostream& os) const;
static Page* deserialize(std::istream& is);
};

117
lib/Epub/Epub/Section.cpp Normal file
View File

@@ -0,0 +1,117 @@
#include "Section.h"
#include <EpdRenderer.h>
#include <SD.h>
#include <fstream>
#include "EpubHtmlParser.h"
#include "Page.h"
void Section::onPageComplete(const Page* page) {
Serial.printf("Page %d complete\n", pageCount);
const auto filePath = cachePath + "/page_" + std::to_string(pageCount) + ".bin";
// TODO can this be removed?
SD.open(filePath.c_str(), FILE_WRITE).close();
std::ofstream outputFile("/sd" + filePath);
page->serialize(outputFile);
outputFile.close();
pageCount++;
delete page;
}
bool Section::hasCache() {
if (!SD.exists(cachePath.c_str())) {
return false;
}
const auto sectionFilePath = cachePath + "/section.bin";
if (!SD.exists(sectionFilePath.c_str())) {
return false;
}
File sectionFile = SD.open(sectionFilePath.c_str(), FILE_READ);
uint8_t pageCountBytes[2] = {0, 0};
sectionFile.read(pageCountBytes, 2);
sectionFile.close();
pageCount = pageCountBytes[0] + (pageCountBytes[1] << 8);
Serial.printf("Loaded cache: %d pages\n", pageCount);
return true;
}
void Section::setupCacheDir() const {
epub->setupCacheDir();
SD.mkdir(cachePath.c_str());
}
void Section::clearCache() const { SD.rmdir(cachePath.c_str()); }
bool Section::persistPageDataToSD() {
size_t size = 0;
auto localPath = epub->getSpineItem(spineIndex);
const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size);
if (!html) {
Serial.println("Failed to read item contents");
return false;
}
// TODO: Would love to stream this through an XML visitor
const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE);
const auto written = f.write(html, size);
f.close();
free(html);
Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str());
if (size != written) {
Serial.println("Failed to inflate section contents to SD");
SD.remove(tmpHtmlPath.c_str());
return false;
}
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
auto visitor =
EpubHtmlParser(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
// TODO: Come back and see if mem used here can be lowered?
const bool success = visitor.parseAndBuildPages();
SD.remove(tmpHtmlPath.c_str());
if (!success) {
Serial.println("Failed to parse and build pages");
return false;
}
File sectionFile = SD.open((cachePath + "/section.bin").c_str(), FILE_WRITE, true);
const uint8_t pageCountBytes[2] = {static_cast<uint8_t>(pageCount & 0xFF),
static_cast<uint8_t>((pageCount >> 8) & 0xFF)};
sectionFile.write(pageCountBytes, 2);
sectionFile.close();
return true;
}
void Section::renderPage() {
if (0 <= currentPage && currentPage < pageCount) {
const auto filePath = "/sd" + cachePath + "/page_" + std::to_string(currentPage) + ".bin";
std::ifstream inputFile(filePath);
const Page* p = Page::deserialize(inputFile);
inputFile.close();
p->render(renderer);
delete p;
} else if (pageCount == 0) {
Serial.println("No pages to render");
const int width = renderer->getTextWidth("Empty chapter", true);
renderer->drawText((renderer->getPageWidth() - width) / 2, 300, "Empty chapter", true);
} else {
Serial.printf("Page out of bounds: %d (max %d)\n", currentPage, pageCount);
const int width = renderer->getTextWidth("Out of bounds", true);
renderer->drawText((renderer->getPageWidth() - width) / 2, 300, "Out of bounds", true);
}
}

29
lib/Epub/Epub/Section.h Normal file
View File

@@ -0,0 +1,29 @@
#pragma once
#include "Epub.h"
class Page;
class EpdRenderer;
class Section {
Epub* epub;
const int spineIndex;
EpdRenderer* renderer;
std::string cachePath;
void onPageComplete(const Page* page);
public:
int pageCount = 0;
int currentPage = 0;
explicit Section(Epub* epub, const int spineIndex, EpdRenderer* renderer)
: epub(epub), spineIndex(spineIndex), renderer(renderer) {
cachePath = epub->getCachePath() + "/" + std::to_string(spineIndex);
}
~Section() = default;
bool hasCache();
void setupCacheDir() const;
void clearCache() const;
bool persistPageDataToSD();
void renderPage();
};

View File

@@ -0,0 +1,15 @@
#pragma once
class EpdRenderer;
typedef enum { TEXT_BLOCK, IMAGE_BLOCK } BlockType;
// a block of content in the html - either a paragraph or an image
class Block {
public:
virtual ~Block() = default;
virtual void layout(EpdRenderer* renderer) = 0;
virtual BlockType getType() = 0;
virtual bool isEmpty() = 0;
virtual void finish() {}
};

View File

@@ -0,0 +1,235 @@
#include "TextBlock.h"
#include <EpdRenderer.h>
#include <Serialization.h>
static bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
// move past anything that should be considered part of a work
static int skipWord(const std::string& text, int index, const int length) {
while (index < length && !isWhitespace(text[index])) {
index++;
}
return index;
}
// skip past any white space characters
static int skipWhitespace(const std::string& html, int index, const int length) {
while (index < length && isWhitespace(html[index])) {
index++;
}
return index;
}
void TextBlock::addSpan(const std::string& span, const bool is_bold, const bool is_italic) {
// adding a span to text block
// make a copy of the text as we'll modify it
const int length = span.length();
// const auto text = new char[length + 1];
// strcpy(text, span);
// work out where each word is in the span
int index = 0;
while (index < length) {
// skip past any whitespace to the start of a word
index = skipWhitespace(span, index, length);
const int wordStart = index;
// find the end of the word
index = skipWord(span, index, length);
const int wordLength = index - wordStart;
if (wordLength > 0) {
words.push_back(span.substr(wordStart, wordLength));
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
}
}
}
std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer* renderer) {
const int totalWordCount = words.size();
const int pageWidth = renderer->getPageWidth();
const int spaceWidth = renderer->getSpaceWidth();
words.shrink_to_fit();
wordStyles.shrink_to_fit();
wordXpos.reserve(totalWordCount);
// measure each word
uint16_t wordWidths[totalWordCount];
for (int i = 0; i < words.size(); i++) {
// measure the word
const int width = renderer->getTextWidth(words[i].c_str(), wordStyles[i] & BOLD_SPAN, wordStyles[i] & ITALIC_SPAN);
wordWidths[i] = width;
}
// now apply the dynamic programming algorithm to find the best line breaks
// DP table in which dp[i] represents cost of line starting with word words[i]
int dp[totalWordCount];
// Array in which ans[i] store index of last word in line starting with word
// word[i]
size_t ans[totalWordCount];
// If only one word is present then only one line is required. Cost of last
// line is zero. Hence cost of this line is zero. Ending point is also n-1 as
// single word is present
dp[totalWordCount - 1] = 0;
ans[totalWordCount - 1] = totalWordCount - 1;
// Make each word first word of line by iterating over each index in arr.
for (int i = totalWordCount - 2; i >= 0; i--) {
int currlen = -1;
dp[i] = INT_MAX;
// Variable to store possible minimum cost of line.
int cost;
// Keep on adding words in current line by iterating from starting word upto
// last word in arr.
for (int j = i; j < totalWordCount; j++) {
// Update the width of the words in current line + the space between two
// words.
currlen += wordWidths[j] + spaceWidth;
// If we're bigger than the current pagewidth then we can't add more words
if (currlen > pageWidth) break;
// if we've run out of words then this is last line and the cost should be
// 0 Otherwise the cost is the sqaure of the left over space + the costs
// of all the previous lines
if (j == totalWordCount - 1)
cost = 0;
else
cost = (pageWidth - currlen) * (pageWidth - currlen) + dp[j + 1];
// Check if this arrangement gives minimum cost for line starting with
// word words[i].
if (cost < dp[i]) {
dp[i] = cost;
ans[i] = j;
}
}
}
// We can now iterate through the answer to find the line break positions
std::list<uint16_t> lineBreaks;
for (size_t i = 0; i < totalWordCount;) {
i = ans[i] + 1;
if (i > totalWordCount) {
break;
}
lineBreaks.push_back(i);
// Text too big, just exit
if (lineBreaks.size() > 1000) {
break;
}
}
std::list<TextBlock*> lines;
// With the line breaks calculated we can now position the words along the
// line
int startWord = 0;
for (const auto lineBreak : lineBreaks) {
const int lineWordCount = lineBreak - startWord;
int lineWordWidthSum = 0;
for (int i = startWord; i < lineBreak; i++) {
lineWordWidthSum += wordWidths[i];
}
// Calculate spacing between words
const uint16_t spareSpace = pageWidth - lineWordWidthSum;
uint16_t spacing = spaceWidth;
// evenly space words if using justified style, not the last line, and at
// least 2 words
if (style == JUSTIFIED && lineBreak != lineBreaks.back() && lineWordCount >= 2) {
spacing = spareSpace / (lineWordCount - 1);
}
uint16_t xpos = 0;
if (style == RIGHT_ALIGN) {
xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
} else if (style == CENTER_ALIGN) {
xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
}
for (int i = startWord; i < lineBreak; i++) {
wordXpos[i] = xpos;
xpos += wordWidths[i] + spacing;
}
std::vector<std::string> lineWords;
std::vector<uint16_t> lineXPos;
std::vector<uint8_t> lineWordStyles;
lineWords.reserve(lineWordCount);
lineXPos.reserve(lineWordCount);
lineWordStyles.reserve(lineWordCount);
for (int i = startWord; i < lineBreak; i++) {
lineWords.push_back(words[i]);
lineXPos.push_back(wordXpos[i]);
lineWordStyles.push_back(wordStyles[i]);
}
const auto textLine = new TextBlock(lineWords, lineXPos, lineWordStyles, style);
lines.push_back(textLine);
startWord = lineBreak;
}
return lines;
}
void TextBlock::render(const EpdRenderer* renderer, const int x, const int y) const {
for (int i = 0; i < words.size(); i++) {
// get the style
const uint8_t wordStyle = wordStyles[i];
// render the word
renderer->drawText(x + wordXpos[i], y, words[i].c_str(), wordStyle & BOLD_SPAN, wordStyle & ITALIC_SPAN);
}
}
void TextBlock::serialize(std::ostream& os) const {
// words
const uint32_t wc = words.size();
serialization::writePod(os, wc);
for (const auto& w : words) serialization::writeString(os, w);
// wordXpos
const uint32_t xc = wordXpos.size();
serialization::writePod(os, xc);
for (auto x : wordXpos) serialization::writePod(os, x);
// wordStyles
const uint32_t sc = wordStyles.size();
serialization::writePod(os, sc);
for (auto s : wordStyles) serialization::writePod(os, s);
// style
serialization::writePod(os, style);
}
TextBlock* TextBlock::deserialize(std::istream& is) {
uint32_t wc, xc, sc;
std::vector<std::string> words;
std::vector<uint16_t> wordXpos;
std::vector<uint8_t> wordStyles;
BLOCK_STYLE style;
// words
serialization::readPod(is, wc);
words.resize(wc);
for (auto& w : words) serialization::readString(is, w);
// wordXpos
serialization::readPod(is, xc);
wordXpos.resize(xc);
for (auto& x : wordXpos) serialization::readPod(is, x);
// wordStyles
serialization::readPod(is, sc);
wordStyles.resize(sc);
for (auto& s : wordStyles) serialization::readPod(is, s);
// style
serialization::readPod(is, style);
return new TextBlock(words, wordXpos, wordStyles, style);
}

View File

@@ -0,0 +1,50 @@
#pragma once
#include <list>
#include <string>
#include <vector>
#include "Block.h"
enum SPAN_STYLE : uint8_t {
BOLD_SPAN = 1,
ITALIC_SPAN = 2,
};
enum BLOCK_STYLE : uint8_t {
JUSTIFIED = 0,
LEFT_ALIGN = 1,
CENTER_ALIGN = 2,
RIGHT_ALIGN = 3,
};
// represents a block of words in the html document
class TextBlock final : public Block {
// pointer to each word
std::vector<std::string> words;
// x position of each word
std::vector<uint16_t> wordXpos;
// the styles of each word
std::vector<uint8_t> wordStyles;
// the style of the block - left, center, right aligned
BLOCK_STYLE style;
public:
void addSpan(const std::string& span, bool is_bold, bool is_italic);
explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
explicit TextBlock(const std::vector<std::string>& words, const std::vector<uint16_t>& word_xpos,
// the styles of each word
const std::vector<uint8_t>& word_styles, const BLOCK_STYLE style)
: words(words), wordXpos(word_xpos), wordStyles(word_styles), style(style) {}
~TextBlock() override = default;
void set_style(const BLOCK_STYLE style) { this->style = style; }
BLOCK_STYLE get_style() const { return style; }
bool isEmpty() override { return words.empty(); }
void layout(EpdRenderer* renderer) override {};
// given a renderer works out where to break the words into lines
std::list<TextBlock*> splitIntoLines(const EpdRenderer* renderer);
void render(const EpdRenderer* renderer, int x, int y) const;
BlockType getType() override { return TEXT_BLOCK; }
void serialize(std::ostream& os) const;
static TextBlock* deserialize(std::istream& is);
};

View File

@@ -0,0 +1,163 @@
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#include "htmlEntities.h"
#include <cstring>
#include <unordered_map>
const int MAX_ENTITY_LENGTH = 10;
// Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
// Note the supported keys are only in lowercase
// Store the mappings in a unordered hash map
static std::unordered_map<std::string, std::string> entity_lookup(
{{"&quot;", "\""}, {"&frasl;", ""}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
{"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
{"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
{"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
{"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
{"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
{"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
{"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
{"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
{"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
{"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
{"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
{"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
{"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", " "}, {"&iexcl;", "¡"}, {"&cent;", "¢"},
{"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
{"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
{"&shy;", "­"}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
{"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", ""},
{"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
{"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
{"&forall;", ""}, {"&part;", ""}, {"&exist;", ""}, {"&empty;", ""}, {"&nabla;", ""},
{"&isin;", ""}, {"&notin;", ""}, {"&ni;", ""}, {"&prod;", ""}, {"&sum;", ""},
{"&minus;", ""}, {"&lowast;", ""}, {"&radic;", ""}, {"&prop;", ""}, {"&infin;", ""},
{"&ang;", ""}, {"&and;", ""}, {"&or;", ""}, {"&cap;", ""}, {"&cup;", ""},
{"&int;", ""}, {"&there4;", ""}, {"&sim;", ""}, {"&cong;", ""}, {"&asymp;", ""},
{"&ne;", ""}, {"&equiv;", ""}, {"&le;", ""}, {"&ge;", ""}, {"&sub;", ""},
{"&sup;", ""}, {"&nsub;", ""}, {"&sube;", ""}, {"&supe;", ""}, {"&oplus;", ""},
{"&otimes;", ""}, {"&perp;", ""}, {"&sdot;", ""}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
{"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
{"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
{"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
{"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
{"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
{"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
{"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
{"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
{"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
{"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
{"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
{"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, {"&ensp;", ""}, {"&emsp;", ""},
{"&thinsp;", ""}, {"&zwnj;", ""}, {"&zwj;", ""}, {"&lrm;", ""}, {"&rlm;", ""},
{"&ndash;", ""}, {"&mdash;", ""}, {"&lsquo;", ""}, {"&rsquo;", ""}, {"&sbquo;", ""},
{"&ldquo;", ""}, {"&rdquo;", ""}, {"&bdquo;", ""}, {"&dagger;", ""}, {"&Dagger;", ""},
{"&bull;", ""}, {"&hellip;", ""}, {"&permil;", ""}, {"&prime;", ""}, {"&Prime;", ""},
{"&lsaquo;", ""}, {"&rsaquo;", ""}, {"&oline;", ""}, {"&euro;", ""}, {"&trade;", ""},
{"&larr;", ""}, {"&uarr;", ""}, {"&rarr;", ""}, {"&darr;", ""}, {"&harr;", ""},
{"&crarr;", ""}, {"&lceil;", ""}, {"&rceil;", ""}, {"&lfloor;", ""}, {"&rfloor;", ""},
{"&loz;", ""}, {"&spades;", ""}, {"&clubs;", ""}, {"&hearts;", ""}, {"&diams;", ""}});
// converts from a unicode code point to the utf8 equivalent
void convert_to_utf8(const int code, std::string& res) {
// convert to a utf8 sequence
if (code < 0x80) {
res += static_cast<char>(code);
} else if (code < 0x800) {
res += static_cast<char>(0xc0 | (code >> 6));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x10000) {
res += static_cast<char>(0xe0 | (code >> 12));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x200000) {
res += static_cast<char>(0xf0 | (code >> 18));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x4000000) {
res += static_cast<char>(0xf8 | (code >> 24));
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x80000000) {
res += static_cast<char>(0xfc | (code >> 30));
res += static_cast<char>(0x80 | ((code >> 24) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
}
}
// handles numeric entities - e.g. &#1234; or &#x1234;
bool process_numeric_entity(const std::string& entity, std::string& res) {
int code = 0;
// is it hex?
if (entity[2] == 'x' || entity[2] == 'X') {
// parse the hex code
code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16);
} else {
code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10);
}
if (code != 0) {
// special handling for nbsp
if (code == 0xA0) {
res += " ";
} else {
convert_to_utf8(code, res);
}
return true;
}
return false;
}
// handles named entities - e.g. &amp;
bool process_string_entity(const std::string& entity, std::string& res) {
// it's a named entity - find it in the lookup table
// find it in the map
const auto it = entity_lookup.find(entity);
if (it != entity_lookup.end()) {
res += it->second;
return true;
}
return false;
}
// replace all the entities in the string
std::string replaceHtmlEntities(const char* text) {
std::string res;
res.reserve(strlen(text));
for (int i = 0; i < strlen(text); ++i) {
bool flag = false;
// do we have a potential entity?
if (text[i] == '&') {
// find the end of the entity
int j = i + 1;
while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) {
j++;
}
if (j - i > 2) {
char entity[j - i + 1];
strncpy(entity, text + i, j - i);
// is it a numeric code?
if (entity[1] == '#') {
flag = process_numeric_entity(entity, res);
} else {
flag = process_string_entity(entity, res);
}
// skip past the entity if we successfully decoded it
if (flag) {
i = j;
}
}
}
if (!flag) {
res += text[i];
}
}
return res;
}

View File

@@ -0,0 +1,7 @@
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#pragma once
#include <string>
std::string replaceHtmlEntities(const char* text);