Public release

lib/Epub/Epub.cpp (new file)
@@ -0,0 +1,383 @@
#include "Epub.h"
|
||||
|
||||
#include <HardwareSerial.h>
|
||||
#include <SD.h>
|
||||
#include <ZipFile.h>
|
||||
#include <tinyxml2.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
bool Epub::findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile) {
|
||||
// open up the meta data to find where the content.opf file lives
|
||||
size_t s;
|
||||
const auto metaInfo = zip.readTextFileToMemory("META-INF/container.xml", &s);
|
||||
if (!metaInfo) {
|
||||
Serial.println("Could not find META-INF/container.xml");
|
||||
return false;
|
||||
}
|
||||
|
||||
// parse the meta data
|
||||
tinyxml2::XMLDocument metaDataDoc;
|
||||
const auto result = metaDataDoc.Parse(metaInfo);
|
||||
free(metaInfo);
|
||||
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Could not parse META-INF/container.xml. Error: %d\n", result);
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto container = metaDataDoc.FirstChildElement("container");
|
||||
if (!container) {
|
||||
Serial.println("Could not find container element in META-INF/container.xml");
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto rootfiles = container->FirstChildElement("rootfiles");
|
||||
if (!rootfiles) {
|
||||
Serial.println("Could not find rootfiles element in META-INF/container.xml");
|
||||
return false;
|
||||
}
|
||||
|
||||
// find the root file that has the media-type="application/oebps-package+xml"
|
||||
auto rootfile = rootfiles->FirstChildElement("rootfile");
|
||||
while (rootfile) {
|
||||
const char* mediaType = rootfile->Attribute("media-type");
|
||||
if (mediaType && strcmp(mediaType, "application/oebps-package+xml") == 0) {
|
||||
const char* full_path = rootfile->Attribute("full-path");
|
||||
if (full_path) {
|
||||
contentOpfFile = full_path;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
rootfile = rootfile->NextSiblingElement("rootfile");
|
||||
}
|
||||
|
||||
Serial.println("Could not get path to content.opf file");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Epub::parseContentOpf(ZipFile& zip, std::string& content_opf_file) {
|
||||
// read in the content.opf file and parse it
|
||||
auto contents = zip.readTextFileToMemory(content_opf_file.c_str());
|
||||
|
||||
// parse the contents
|
||||
tinyxml2::XMLDocument doc;
|
||||
auto result = doc.Parse(contents);
|
||||
free(contents);
|
||||
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Error parsing content.opf - %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto package = doc.FirstChildElement("package");
|
||||
if (!package) package = doc.FirstChildElement("opf:package");
|
||||
|
||||
if (!package) {
|
||||
Serial.println("Could not find package element in content.opf");
|
||||
return false;
|
||||
}
|
||||
|
||||
// get the metadata - title and cover image
|
||||
auto metadata = package->FirstChildElement("metadata");
|
||||
if (!metadata) metadata = package->FirstChildElement("opf:metadata");
|
||||
if (!metadata) {
|
||||
Serial.println("Missing metadata");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto titleEl = metadata->FirstChildElement("dc:title");
|
||||
if (!titleEl) {
|
||||
Serial.println("Missing title");
|
||||
return false;
|
||||
}
|
||||
const auto titleText = titleEl->GetText();
this->title = titleText ? titleText : "";
|
||||
|
||||
auto cover = metadata->FirstChildElement("meta");
|
||||
if (!cover) cover = metadata->FirstChildElement("opf:meta");
|
||||
while (cover && cover->Attribute("name") && strcmp(cover->Attribute("name"), "cover") != 0) {
|
||||
cover = cover->NextSiblingElement("meta");
|
||||
}
|
||||
if (!cover) {
|
||||
Serial.println("Missing cover");
|
||||
}
|
||||
auto coverItem = cover ? cover->Attribute("content") : nullptr;
|
||||
|
||||
// read the manifest and spine
|
||||
// the manifest gives us the names of the files
|
||||
// the spine gives us the order of the files
|
||||
// we can then read the files in the order they are in the spine
|
||||
auto manifest = package->FirstChildElement("manifest");
|
||||
if (!manifest) manifest = package->FirstChildElement("opf:manifest");
|
||||
if (!manifest) {
|
||||
Serial.println("Missing manifest");
|
||||
return false;
|
||||
}
|
||||
|
||||
// create a mapping from id to file name
|
||||
auto item = manifest->FirstChildElement("item");
|
||||
if (!item) item = manifest->FirstChildElement("opf:item");
|
||||
std::map<std::string, std::string> items;
|
||||
|
||||
while (item) {
|
||||
std::string itemId = item->Attribute("id");
|
||||
std::string href = contentBasePath + item->Attribute("href");
|
||||
|
||||
// grab the cover image
|
||||
if (coverItem && itemId == coverItem) {
|
||||
coverImageItem = href;
|
||||
}
|
||||
|
||||
// grab the ncx file
|
||||
if (itemId == "ncx" || itemId == "ncxtoc") {
|
||||
tocNcxItem = href;
|
||||
}
|
||||
|
||||
items[itemId] = href;
|
||||
auto nextItem = item->NextSiblingElement("item");
|
||||
if (!nextItem) nextItem = item->NextSiblingElement("opf:item");
|
||||
item = nextItem;
|
||||
}
|
||||
|
||||
// find the spine
|
||||
auto spineEl = package->FirstChildElement("spine");
|
||||
if (!spineEl) spineEl = package->FirstChildElement("opf:spine");
|
||||
if (!spineEl) {
|
||||
Serial.println("Missing spine");
|
||||
return false;
|
||||
}
|
||||
|
||||
// read the spine
|
||||
auto itemref = spineEl->FirstChildElement("itemref");
|
||||
if (!itemref) itemref = spineEl->FirstChildElement("opf:itemref");
|
||||
while (itemref) {
|
||||
auto id = itemref->Attribute("idref");
|
||||
if (items.find(id) != items.end()) {
|
||||
spine.emplace_back(id, items[id]);
|
||||
}
|
||||
auto nextItemRef = itemref->NextSiblingElement("itemref");
|
||||
if (!nextItemRef) nextItemRef = itemref->NextSiblingElement("opf:itemref");
|
||||
itemref = nextItemRef;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Epub::parseTocNcxFile(ZipFile& zip) {
|
||||
// the ncx file should have been specified in the content.opf file
|
||||
if (tocNcxItem.empty()) {
|
||||
Serial.println("No ncx file specified");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto ncxData = zip.readTextFileToMemory(tocNcxItem.c_str());
|
||||
if (!ncxData) {
|
||||
Serial.printf("Could not find %s\n", tocNcxItem.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse the Toc contents
|
||||
tinyxml2::XMLDocument doc;
|
||||
auto result = doc.Parse(ncxData);
|
||||
free(ncxData);
|
||||
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Error parsing toc %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto ncx = doc.FirstChildElement("ncx");
|
||||
if (!ncx) {
|
||||
Serial.println("Could not find first child ncx in toc");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto navMap = ncx->FirstChildElement("navMap");
|
||||
if (!navMap) {
|
||||
Serial.println("Could not find navMap child in ncx");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto navPoint = navMap->FirstChildElement("navPoint");
|
||||
|
||||
// Fills toc map
|
||||
while (navPoint) {
|
||||
std::string navTitle = navPoint->FirstChildElement("navLabel")->FirstChildElement("text")->FirstChild()->Value();
|
||||
auto content = navPoint->FirstChildElement("content");
|
||||
std::string href = contentBasePath + content->Attribute("src");
|
||||
// split the href on the # to get the href and the anchor
|
||||
size_t pos = href.find('#');
|
||||
std::string anchor;
|
||||
|
||||
if (pos != std::string::npos) {
|
||||
anchor = href.substr(pos + 1);
|
||||
href = href.substr(0, pos);
|
||||
}
|
||||
|
||||
toc.emplace_back(navTitle, href, anchor, 0);
|
||||
navPoint = navPoint->NextSiblingElement("navPoint");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// load in the meta data for the epub file
|
||||
bool Epub::load() {
|
||||
ZipFile zip("/sd" + filepath);
|
||||
|
||||
std::string contentOpfFile;
|
||||
if (!findContentOpfFile(zip, contentOpfFile)) {
|
||||
Serial.println("Could not open ePub");
|
||||
return false;
|
||||
}
|
||||
|
||||
contentBasePath = contentOpfFile.substr(0, contentOpfFile.find_last_of('/') + 1);
|
||||
|
||||
if (!parseContentOpf(zip, contentOpfFile)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!parseTocNcxFile(zip)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Epub::clearCache() const { SD.rmdir(cachePath.c_str()); }
|
||||
|
||||
void Epub::setupCacheDir() const {
|
||||
if (SD.exists(cachePath.c_str())) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Loop over each segment of the cache path and create directories as needed
|
||||
for (size_t i = 1; i < cachePath.length(); i++) {
|
||||
if (cachePath[i] == '/') {
|
||||
SD.mkdir(cachePath.substr(0, i).c_str());
|
||||
}
|
||||
}
|
||||
SD.mkdir(cachePath.c_str());
|
||||
}
|
||||
|
||||
const std::string& Epub::getCachePath() const { return cachePath; }
|
||||
|
||||
const std::string& Epub::getPath() const { return filepath; }
|
||||
|
||||
const std::string& Epub::getTitle() const { return title; }
|
||||
|
||||
const std::string& Epub::getCoverImageItem() const { return coverImageItem; }
|
||||
|
||||
std::string normalisePath(const std::string& path) {
|
||||
std::vector<std::string> components;
|
||||
std::string component;
|
||||
|
||||
for (const auto c : path) {
|
||||
if (c == '/') {
|
||||
if (!component.empty()) {
|
||||
if (component == "..") {
|
||||
if (!components.empty()) {
|
||||
components.pop_back();
|
||||
}
|
||||
} else {
|
||||
components.push_back(component);
|
||||
}
|
||||
component.clear();
|
||||
}
|
||||
} else {
|
||||
component += c;
|
||||
}
|
||||
}
|
||||
|
||||
if (!component.empty()) {
|
||||
components.push_back(component);
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (const auto& c : components) {
|
||||
if (!result.empty()) {
|
||||
result += "/";
|
||||
}
|
||||
result += c;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
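// Worked examples for normalisePath (the paths are made up, not from a real book):
//   normalisePath("OEBPS/../images/cover.jpg") -> "images/cover.jpg"
//   normalisePath("/OEBPS//toc.ncx")           -> "OEBPS/toc.ncx"   (empty components are dropped)
//   normalisePath("OEBPS/./ch1.xhtml")         -> "OEBPS/./ch1.xhtml"  (only ".." is collapsed)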
|
||||
|
||||
uint8_t* Epub::getItemContents(const std::string& itemHref, size_t* size) const {
|
||||
const ZipFile zip("/sd" + filepath);
|
||||
const std::string path = normalisePath(itemHref);
|
||||
|
||||
const auto content = zip.readFileToMemory(path.c_str(), size);
|
||||
if (!content) {
|
||||
Serial.printf("Failed to read item %s\n", path.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
char* Epub::getTextItemContents(const std::string& itemHref, size_t* size) const {
|
||||
const ZipFile zip("/sd" + filepath);
|
||||
const std::string path = normalisePath(itemHref);
|
||||
|
||||
const auto content = zip.readTextFileToMemory(path.c_str(), size);
|
||||
if (!content) {
|
||||
Serial.printf("Failed to read item %s\n", path.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
int Epub::getSpineItemsCount() const { return spine.size(); }
|
||||
|
||||
std::string& Epub::getSpineItem(const int spineIndex) {
|
||||
if (spineIndex < 0 || spineIndex >= spine.size()) {
|
||||
Serial.printf("getSpineItem index:%d is out of range\n", spineIndex);
|
||||
return spine.at(0).second;
|
||||
}
|
||||
|
||||
return spine.at(spineIndex).second;
|
||||
}
|
||||
|
||||
EpubTocEntry& Epub::getTocItem(const int tocIndex) {
if (tocIndex < 0 || tocIndex >= toc.size()) {
Serial.printf("getTocItem index:%d is out of range\n", tocIndex);
return toc.at(0);
}

return toc.at(tocIndex);
}
|
||||
|
||||
int Epub::getTocItemsCount() const { return toc.size(); }
|
||||
|
||||
// work out the section index for a toc index
|
||||
int Epub::getSpineIndexForTocIndex(const int tocIndex) const {
|
||||
// the toc entry should have an href that matches the spine item
|
||||
// so we can find the spine index by looking for the href
|
||||
for (int i = 0; i < spine.size(); i++) {
|
||||
if (spine[i].second == toc[tocIndex].href) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
Serial.println("Section not found");
|
||||
// not found - default to the start of the book
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Epub::getTocIndexForSpineIndex(const int spineIndex) const {
|
||||
// the toc entry should have an href that matches the spine item
|
||||
// so we can find the toc index by looking for the href
|
||||
Serial.printf("Looking for %s\n", spine[spineIndex].second.c_str());
|
||||
for (int i = 0; i < toc.size(); i++) {
|
||||
Serial.printf("Looking at %s\n", toc[i].href.c_str());
|
||||
if (toc[i].href == spine[spineIndex].second) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
Serial.println("TOC item not found");
|
||||
// not found - default to first item
|
||||
return 0;
|
||||
}

lib/Epub/Epub.h (new file)
@@ -0,0 +1,73 @@
#pragma once
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
class ZipFile;
|
||||
|
||||
class EpubTocEntry {
|
||||
public:
|
||||
std::string title;
|
||||
std::string href;
|
||||
std::string anchor;
|
||||
int level;
|
||||
EpubTocEntry(std::string title, std::string href, std::string anchor, const int level)
|
||||
: title(std::move(title)), href(std::move(href)), anchor(std::move(anchor)), level(level) {}
|
||||
};
|
||||
|
||||
class Epub {
|
||||
// the title read from the EPUB meta data
|
||||
std::string title;
|
||||
// the cover image
|
||||
std::string coverImageItem;
|
||||
// the ncx file
|
||||
std::string tocNcxItem;
|
||||
// where is the EPUB file?
|
||||
std::string filepath;
|
||||
// the spine of the EPUB file
|
||||
std::vector<std::pair<std::string, std::string>> spine;
|
||||
// the toc of the EPUB file
|
||||
std::vector<EpubTocEntry> toc;
|
||||
// the base path for items in the EPUB file
|
||||
std::string contentBasePath;
|
||||
// Unique cache key based on the filepath
|
||||
std::string cachePath;
|
||||
|
||||
// find the path for the content.opf file
|
||||
static bool findContentOpfFile(const ZipFile& zip, std::string& contentOpfFile);
|
||||
bool parseContentOpf(ZipFile& zip, std::string& content_opf_file);
|
||||
bool parseTocNcxFile(ZipFile& zip);
|
||||
|
||||
public:
|
||||
explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {
|
||||
// create a cache key based on the filepath
|
||||
|
||||
cachePath = cacheDir + "/epub_" + std::to_string(std::hash<std::string>{}(this->filepath));
|
||||
}
|
||||
~Epub() = default;
|
||||
std::string& getBasePath() { return contentBasePath; }
|
||||
bool load();
|
||||
|
||||
void clearCache() const;
|
||||
|
||||
void setupCacheDir() const;
|
||||
|
||||
const std::string& getCachePath() const;
|
||||
const std::string& getPath() const;
|
||||
const std::string& getTitle() const;
|
||||
const std::string& getCoverImageItem() const;
|
||||
uint8_t* getItemContents(const std::string& itemHref, size_t* size = nullptr) const;
|
||||
char* getTextItemContents(const std::string& itemHref, size_t* size = nullptr) const;
|
||||
|
||||
std::string& getSpineItem(int spineIndex);
|
||||
int getSpineItemsCount() const;
|
||||
|
||||
EpubTocEntry& getTocItem(int tocIndex);
|
||||
int getTocItemsCount() const;
|
||||
// work out the section index for a toc index
|
||||
int getSpineIndexForTocIndex(int tocIndex) const;
|
||||
|
||||
int getTocIndexForSpineIndex(int spineIndex) const;
|
||||
};
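// A minimal usage sketch (not part of this commit): it assumes the SD card is already mounted at
// "/sd" and the book path below is hypothetical.
//
//   Epub epub("/books/alice.epub", "/cache");
//   if (epub.load()) {
//     Serial.println(epub.getTitle().c_str());
//     for (int i = 0; i < epub.getTocItemsCount(); i++) {
//       EpubTocEntry& entry = epub.getTocItem(i);
//       Serial.printf("%s -> spine %d\n", entry.title.c_str(), epub.getSpineIndexForTocIndex(i));
//     }
//   }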

lib/Epub/Epub/EpubHtmlParser.cpp (new file)
@@ -0,0 +1,181 @@
#include "EpubHtmlParser.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include "Page.h"
|
||||
#include "htmlEntities.h"
|
||||
|
||||
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
||||
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
||||
|
||||
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
|
||||
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
|
||||
|
||||
const char* BOLD_TAGS[] = {"b"};
|
||||
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
|
||||
|
||||
const char* ITALIC_TAGS[] = {"i"};
|
||||
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
|
||||
|
||||
const char* IMAGE_TAGS[] = {"img"};
|
||||
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
|
||||
|
||||
const char* SKIP_TAGS[] = {"head", "table"};
|
||||
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
|
||||
|
||||
// given the start and end of a tag, check to see if it matches a known tag
|
||||
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
|
||||
for (int i = 0; i < possible_tag_count; i++) {
|
||||
if (strcmp(tag_name, possible_tags[i]) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// start a new text block if needed
|
||||
void EpubHtmlParser::startNewTextBlock(const BLOCK_STYLE style) {
|
||||
if (currentTextBlock) {
|
||||
// already have a text block running and it is empty - just reuse it
|
||||
if (currentTextBlock->isEmpty()) {
|
||||
currentTextBlock->set_style(style);
|
||||
return;
|
||||
}
|
||||
|
||||
currentTextBlock->finish();
|
||||
makePages();
|
||||
delete currentTextBlock;
|
||||
}
|
||||
currentTextBlock = new TextBlock(style);
|
||||
}
|
||||
|
||||
bool EpubHtmlParser::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) {
|
||||
const char* tag_name = element.Name();
|
||||
if (matches(tag_name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
|
||||
const char* src = element.Attribute("src");
|
||||
if (src) {
|
||||
// don't leave an empty text block in the list
|
||||
// const BLOCK_STYLE style = currentTextBlock->get_style();
|
||||
if (currentTextBlock && currentTextBlock->isEmpty()) {
|
||||
delete currentTextBlock;
|
||||
currentTextBlock = nullptr;
|
||||
}
|
||||
// TODO: Fix this
|
||||
// blocks.push_back(new ImageBlock(m_base_path + src));
|
||||
// start a new text block - with the same style as before
|
||||
// startNewTextBlock(style);
|
||||
} else {
|
||||
// ESP_LOGE(TAG, "Could not find src attribute");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (matches(tag_name, SKIP_TAGS, NUM_SKIP_TAGS)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Serial.printf("Text: %s\n", element.GetText());
|
||||
|
||||
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
|
||||
insideBoldTag = true;
|
||||
startNewTextBlock(CENTER_ALIGN);
|
||||
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
|
||||
if (strcmp(tag_name, "br") == 0) {
|
||||
startNewTextBlock(currentTextBlock ? currentTextBlock->get_style() : JUSTIFIED);
|
||||
} else {
|
||||
startNewTextBlock(JUSTIFIED);
|
||||
}
|
||||
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
|
||||
insideBoldTag = true;
|
||||
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
|
||||
insideItalicTag = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/// Visit a text node.
|
||||
bool EpubHtmlParser::Visit(const tinyxml2::XMLText& text) {
|
||||
const char* content = text.Value();
// an image tag can leave us without a current block, so make sure one exists
if (!currentTextBlock) {
startNewTextBlock(JUSTIFIED);
}
currentTextBlock->addSpan(replaceHtmlEntities(content), insideBoldTag, insideItalicTag);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EpubHtmlParser::VisitExit(const tinyxml2::XMLElement& element) {
|
||||
const char* tag_name = element.Name();
|
||||
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
|
||||
insideBoldTag = false;
|
||||
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
|
||||
// nothing to do
|
||||
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
|
||||
insideBoldTag = false;
|
||||
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
|
||||
insideItalicTag = false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EpubHtmlParser::parseAndBuildPages() {
|
||||
startNewTextBlock(JUSTIFIED);
|
||||
tinyxml2::XMLDocument doc(false, tinyxml2::COLLAPSE_WHITESPACE);
|
||||
|
||||
const tinyxml2::XMLError result = doc.LoadFile(filepath);
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Failed to load file, Error: %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
doc.Accept(this);
|
||||
if (currentTextBlock) {
|
||||
makePages();
|
||||
completePageFn(currentPage);
|
||||
currentPage = nullptr;
|
||||
delete currentTextBlock;
|
||||
currentTextBlock = nullptr;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void EpubHtmlParser::makePages() {
|
||||
if (!currentTextBlock) {
|
||||
Serial.println("!! No text block to make pages for !!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!currentPage) {
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
const int lineHeight = renderer->getLineHeight();
|
||||
const int pageHeight = renderer->getPageHeight();
|
||||
|
||||
// Long running task, make sure to let other things happen
|
||||
vTaskDelay(1);
|
||||
|
||||
if (currentTextBlock->getType() == TEXT_BLOCK) {
|
||||
const auto lines = currentTextBlock->splitIntoLines(renderer);
|
||||
|
||||
for (const auto line : lines) {
|
||||
if (currentPage->nextY + lineHeight > pageHeight) {
|
||||
completePageFn(currentPage);
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
|
||||
currentPage->nextY += lineHeight;
|
||||
}
|
||||
// TODO: Fix spacing between paras
|
||||
// add some extra line between blocks
|
||||
currentPage->nextY += lineHeight / 2;
|
||||
}
|
||||
// TODO: Image block support
|
||||
// if (block->getType() == BlockType::IMAGE_BLOCK) {
|
||||
// ImageBlock *imageBlock = (ImageBlock *)block;
|
||||
// if (y + imageBlock->height > page_height) {
|
||||
// pages.push_back(new Page());
|
||||
// y = 0;
|
||||
// }
|
||||
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
|
||||
// y += imageBlock->height;
|
||||
// }
|
||||
}

lib/Epub/Epub/EpubHtmlParser.h (new file)
@@ -0,0 +1,34 @@
#pragma once
|
||||
#include <tinyxml2.h>
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
class Page;
|
||||
class EpdRenderer;
|
||||
|
||||
class EpubHtmlParser final : public tinyxml2::XMLVisitor {
|
||||
const char* filepath;
|
||||
EpdRenderer* renderer;
|
||||
std::function<void(Page*)> completePageFn;
|
||||
|
||||
bool insideBoldTag = false;
|
||||
bool insideItalicTag = false;
|
||||
TextBlock* currentTextBlock = nullptr;
|
||||
Page* currentPage = nullptr;
|
||||
|
||||
void startNewTextBlock(BLOCK_STYLE style);
|
||||
void makePages();
|
||||
|
||||
// xml parser callbacks
|
||||
bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;
|
||||
bool Visit(const tinyxml2::XMLText& text) override;
|
||||
bool VisitExit(const tinyxml2::XMLElement& element) override;
|
||||
// xml parser callbacks
|
||||
public:
|
||||
explicit EpubHtmlParser(const char* filepath, EpdRenderer* renderer, const std::function<void(Page*)>& completePageFn)
|
||||
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
|
||||
~EpubHtmlParser() override = default;
|
||||
bool parseAndBuildPages();
|
||||
};
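// A minimal usage sketch (not part of this commit, see Section::persistPageDataToSD for the real
// call site): the file path is hypothetical and must be reachable through the VFS (hence "/sd").
//
//   EpubHtmlParser parser("/sd/cache/epub_123/.tmp_0.html", &renderer,
//                         [&](Page* page) { page->render(&renderer); delete page; });
//   parser.parseAndBuildPages();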

lib/Epub/Epub/Page.cpp (new file)
@@ -0,0 +1,65 @@
#include "Page.h"
|
||||
|
||||
#include <HardwareSerial.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
void PageLine::render(EpdRenderer* renderer) { block->render(renderer, 0, yPos); }
|
||||
|
||||
void PageLine::serialize(std::ostream& os) {
|
||||
serialization::writePod(os, yPos);
|
||||
|
||||
// serialize TextBlock pointed to by PageLine
|
||||
block->serialize(os);
|
||||
}
|
||||
|
||||
PageLine* PageLine::deserialize(std::istream& is) {
|
||||
int32_t yPos;
|
||||
serialization::readPod(is, yPos);
|
||||
|
||||
const auto tb = TextBlock::deserialize(is);
|
||||
return new PageLine(tb, yPos);
|
||||
}
|
||||
|
||||
void Page::render(EpdRenderer* renderer) const {
|
||||
const auto start = millis();
|
||||
for (const auto element : elements) {
|
||||
element->render(renderer);
|
||||
}
|
||||
Serial.printf("Rendered page elements (%u) in %dms\n", elements.size(), millis() - start);
|
||||
}
|
||||
|
||||
void Page::serialize(std::ostream& os) const {
|
||||
serialization::writePod(os, nextY);
|
||||
|
||||
const uint32_t count = elements.size();
|
||||
serialization::writePod(os, count);
|
||||
|
||||
for (auto* el : elements) {
|
||||
// Only PageLine exists currently
|
||||
serialization::writePod(os, static_cast<uint8_t>(TAG_PageLine));
|
||||
static_cast<PageLine*>(el)->serialize(os);
|
||||
}
|
||||
}
|
||||
|
||||
Page* Page::deserialize(std::istream& is) {
|
||||
auto* page = new Page();
|
||||
|
||||
serialization::readPod(is, page->nextY);
|
||||
|
||||
uint32_t count;
|
||||
serialization::readPod(is, count);
|
||||
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
uint8_t tag;
|
||||
serialization::readPod(is, tag);
|
||||
|
||||
if (tag == TAG_PageLine) {
|
||||
auto* pl = PageLine::deserialize(is);
|
||||
page->elements.push_back(pl);
|
||||
} else {
|
||||
throw std::runtime_error("Unknown PageElement tag");
|
||||
}
|
||||
}
|
||||
|
||||
return page;
|
||||
}

lib/Epub/Epub/Page.h (new file)
@@ -0,0 +1,43 @@
#pragma once
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
enum PageElementTag : uint8_t {
|
||||
TAG_PageLine = 1,
|
||||
};
|
||||
|
||||
// represents something that has been added to a page
|
||||
class PageElement {
|
||||
public:
|
||||
int yPos;
|
||||
explicit PageElement(const int yPos) : yPos(yPos) {}
|
||||
virtual ~PageElement() = default;
|
||||
virtual void render(EpdRenderer* renderer) = 0;
|
||||
virtual void serialize(std::ostream& os) = 0;
|
||||
};
|
||||
|
||||
// a line from a block element
|
||||
class PageLine final : public PageElement {
|
||||
const TextBlock* block;
|
||||
|
||||
public:
|
||||
PageLine(const TextBlock* block, const int yPos) : PageElement(yPos), block(block) {}
|
||||
~PageLine() override { delete block; }
|
||||
void render(EpdRenderer* renderer) override;
|
||||
void serialize(std::ostream& os) override;
|
||||
static PageLine* deserialize(std::istream& is);
|
||||
};
|
||||
|
||||
class Page {
|
||||
public:
|
||||
int nextY = 0;
|
||||
// the list of elements (currently text lines) on this page
|
||||
std::vector<PageElement*> elements;
|
||||
void render(EpdRenderer* renderer) const;
|
||||
~Page() {
|
||||
for (const auto element : elements) {
|
||||
delete element;
|
||||
}
|
||||
}
|
||||
void serialize(std::ostream& os) const;
|
||||
static Page* deserialize(std::istream& is);
|
||||
};
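// For reference, the cache format written by Page::serialize (see Page.cpp and TextBlock.cpp in
// this commit) is, in order: nextY, the element count, then for each element a TAG_PageLine byte,
// its yPos and the serialized TextBlock (word strings, x positions, word styles, block style).
// Assuming serialization::writePod writes the raw bytes of its argument, the format is
// platform-specific and only intended for the device's own page cache on the SD card.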

lib/Epub/Epub/Section.cpp (new file)
@@ -0,0 +1,117 @@
#include "Section.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <SD.h>
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "EpubHtmlParser.h"
|
||||
#include "Page.h"
|
||||
|
||||
void Section::onPageComplete(const Page* page) {
|
||||
Serial.printf("Page %d complete\n", pageCount);
|
||||
|
||||
const auto filePath = cachePath + "/page_" + std::to_string(pageCount) + ".bin";
|
||||
// TODO can this be removed?
|
||||
SD.open(filePath.c_str(), FILE_WRITE).close();
|
||||
|
||||
std::ofstream outputFile("/sd" + filePath);
|
||||
page->serialize(outputFile);
|
||||
outputFile.close();
|
||||
|
||||
pageCount++;
|
||||
delete page;
|
||||
}
|
||||
|
||||
bool Section::hasCache() {
|
||||
if (!SD.exists(cachePath.c_str())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto sectionFilePath = cachePath + "/section.bin";
|
||||
if (!SD.exists(sectionFilePath.c_str())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
File sectionFile = SD.open(sectionFilePath.c_str(), FILE_READ);
|
||||
uint8_t pageCountBytes[2] = {0, 0};
|
||||
sectionFile.read(pageCountBytes, 2);
|
||||
sectionFile.close();
|
||||
|
||||
pageCount = pageCountBytes[0] + (pageCountBytes[1] << 8);
|
||||
Serial.printf("Loaded cache: %d pages\n", pageCount);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Section::setupCacheDir() const {
|
||||
epub->setupCacheDir();
|
||||
SD.mkdir(cachePath.c_str());
|
||||
}
|
||||
|
||||
void Section::clearCache() const { SD.rmdir(cachePath.c_str()); }
|
||||
|
||||
bool Section::persistPageDataToSD() {
|
||||
size_t size = 0;
|
||||
auto localPath = epub->getSpineItem(spineIndex);
|
||||
|
||||
const auto html = epub->getItemContents(epub->getSpineItem(spineIndex), &size);
|
||||
if (!html) {
|
||||
Serial.println("Failed to read item contents");
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: Would love to stream this through an XML visitor
|
||||
const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
|
||||
File f = SD.open(tmpHtmlPath.c_str(), FILE_WRITE);
|
||||
const auto written = f.write(html, size);
|
||||
f.close();
|
||||
free(html);
|
||||
|
||||
Serial.printf("Wrote %d bytes to %s\n", written, tmpHtmlPath.c_str());
|
||||
|
||||
if (size != written) {
|
||||
Serial.println("Failed to inflate section contents to SD");
|
||||
SD.remove(tmpHtmlPath.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
|
||||
auto visitor =
|
||||
EpubHtmlParser(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
|
||||
|
||||
// TODO: Come back and see if mem used here can be lowered?
|
||||
const bool success = visitor.parseAndBuildPages();
|
||||
SD.remove(tmpHtmlPath.c_str());
|
||||
if (!success) {
|
||||
Serial.println("Failed to parse and build pages");
|
||||
return false;
|
||||
}
|
||||
|
||||
File sectionFile = SD.open((cachePath + "/section.bin").c_str(), FILE_WRITE, true);
|
||||
const uint8_t pageCountBytes[2] = {static_cast<uint8_t>(pageCount & 0xFF),
|
||||
static_cast<uint8_t>((pageCount >> 8) & 0xFF)};
|
||||
sectionFile.write(pageCountBytes, 2);
|
||||
sectionFile.close();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Section::renderPage() {
|
||||
if (0 <= currentPage && currentPage < pageCount) {
|
||||
const auto filePath = "/sd" + cachePath + "/page_" + std::to_string(currentPage) + ".bin";
|
||||
std::ifstream inputFile(filePath);
|
||||
const Page* p = Page::deserialize(inputFile);
|
||||
inputFile.close();
|
||||
p->render(renderer);
|
||||
delete p;
|
||||
} else if (pageCount == 0) {
|
||||
Serial.println("No pages to render");
|
||||
const int width = renderer->getTextWidth("Empty chapter", true);
|
||||
renderer->drawText((renderer->getPageWidth() - width) / 2, 300, "Empty chapter", true);
|
||||
} else {
|
||||
Serial.printf("Page out of bounds: %d (max %d)\n", currentPage, pageCount);
|
||||
const int width = renderer->getTextWidth("Out of bounds", true);
|
||||
renderer->drawText((renderer->getPageWidth() - width) / 2, 300, "Out of bounds", true);
|
||||
}
|
||||
}

lib/Epub/Epub/Section.h (new file)
@@ -0,0 +1,29 @@
#pragma once
|
||||
#include "Epub.h"
|
||||
|
||||
class Page;
|
||||
class EpdRenderer;
|
||||
|
||||
class Section {
|
||||
Epub* epub;
|
||||
const int spineIndex;
|
||||
EpdRenderer* renderer;
|
||||
std::string cachePath;
|
||||
|
||||
void onPageComplete(const Page* page);
|
||||
|
||||
public:
|
||||
int pageCount = 0;
|
||||
int currentPage = 0;
|
||||
|
||||
explicit Section(Epub* epub, const int spineIndex, EpdRenderer* renderer)
|
||||
: epub(epub), spineIndex(spineIndex), renderer(renderer) {
|
||||
cachePath = epub->getCachePath() + "/" + std::to_string(spineIndex);
|
||||
}
|
||||
~Section() = default;
|
||||
bool hasCache();
|
||||
void setupCacheDir() const;
|
||||
void clearCache() const;
|
||||
bool persistPageDataToSD();
|
||||
void renderPage();
|
||||
};
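// A minimal usage sketch (not part of this commit); the epub and renderer are assumed to be set
// up elsewhere:
//
//   Section section(&epub, spineIndex, &renderer);
//   if (!section.hasCache()) {
//     section.setupCacheDir();
//     section.persistPageDataToSD();  // parses the section HTML and writes page_N.bin files
//   }
//   section.currentPage = 0;
//   section.renderPage();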

lib/Epub/Epub/blocks/Block.h (new file)
@@ -0,0 +1,15 @@
#pragma once
|
||||
|
||||
class EpdRenderer;
|
||||
|
||||
typedef enum { TEXT_BLOCK, IMAGE_BLOCK } BlockType;
|
||||
|
||||
// a block of content in the html - either a paragraph or an image
|
||||
class Block {
|
||||
public:
|
||||
virtual ~Block() = default;
|
||||
virtual void layout(EpdRenderer* renderer) = 0;
|
||||
virtual BlockType getType() = 0;
|
||||
virtual bool isEmpty() = 0;
|
||||
virtual void finish() {}
|
||||
};

lib/Epub/Epub/blocks/TextBlock.cpp (new file)
@@ -0,0 +1,235 @@
#include "TextBlock.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
static bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
|
||||
|
||||
// move past anything that should be considered part of a word
|
||||
static int skipWord(const std::string& text, int index, const int length) {
|
||||
while (index < length && !isWhitespace(text[index])) {
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
// skip past any white space characters
|
||||
static int skipWhitespace(const std::string& html, int index, const int length) {
|
||||
while (index < length && isWhitespace(html[index])) {
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void TextBlock::addSpan(const std::string& span, const bool is_bold, const bool is_italic) {
|
||||
// adding a span to text block
|
||||
const int length = span.length();
|
||||
// work out where each word is in the span
|
||||
int index = 0;
|
||||
while (index < length) {
|
||||
// skip past any whitespace to the start of a word
|
||||
index = skipWhitespace(span, index, length);
|
||||
const int wordStart = index;
|
||||
// find the end of the word
|
||||
index = skipWord(span, index, length);
|
||||
const int wordLength = index - wordStart;
|
||||
if (wordLength > 0) {
|
||||
words.push_back(span.substr(wordStart, wordLength));
|
||||
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer* renderer) {
|
||||
const int totalWordCount = words.size();
// nothing to lay out for an empty block
if (totalWordCount == 0) {
return {};
}
const int pageWidth = renderer->getPageWidth();
|
||||
const int spaceWidth = renderer->getSpaceWidth();
|
||||
|
||||
words.shrink_to_fit();
|
||||
wordStyles.shrink_to_fit();
|
||||
// resize (not just reserve) - the positions are written via operator[] further down
wordXpos.resize(totalWordCount);
|
||||
|
||||
// measure each word
|
||||
uint16_t wordWidths[totalWordCount];
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
// measure the word
|
||||
const int width = renderer->getTextWidth(words[i].c_str(), wordStyles[i] & BOLD_SPAN, wordStyles[i] & ITALIC_SPAN);
|
||||
wordWidths[i] = width;
|
||||
}
|
||||
|
||||
// now apply the dynamic programming algorithm to find the best line breaks
|
||||
// DP table in which dp[i] represents cost of line starting with word words[i]
|
||||
int dp[totalWordCount];
|
||||
|
||||
// Array in which ans[i] store index of last word in line starting with word
|
||||
// word[i]
|
||||
size_t ans[totalWordCount];
|
||||
|
||||
// If only one word is present then only one line is required. Cost of last
|
||||
// line is zero. Hence cost of this line is zero. Ending point is also n-1 as
|
||||
// single word is present
|
||||
dp[totalWordCount - 1] = 0;
|
||||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||||
|
||||
// Make each word first word of line by iterating over each index in arr.
|
||||
for (int i = totalWordCount - 2; i >= 0; i--) {
|
||||
int currlen = -1;
|
||||
dp[i] = INT_MAX;
|
||||
|
||||
// Variable to store possible minimum cost of line.
|
||||
int cost;
|
||||
|
||||
// Keep on adding words in current line by iterating from starting word upto
|
||||
// last word in arr.
|
||||
for (int j = i; j < totalWordCount; j++) {
|
||||
// Update the width of the words in current line + the space between two
|
||||
// words.
|
||||
currlen += wordWidths[j] + spaceWidth;
|
||||
|
||||
// If we're bigger than the current pagewidth then we can't add more words
|
||||
if (currlen > pageWidth) break;
|
||||
|
||||
// if we've run out of words then this is last line and the cost should be
|
||||
// 0. Otherwise the cost is the square of the left over space + the costs
|
||||
// of all the previous lines
|
||||
if (j == totalWordCount - 1)
|
||||
cost = 0;
|
||||
else
|
||||
cost = (pageWidth - currlen) * (pageWidth - currlen) + dp[j + 1];
|
||||
|
||||
// Check if this arrangement gives minimum cost for line starting with
|
||||
// word words[i].
|
||||
if (cost < dp[i]) {
|
||||
dp[i] = cost;
|
||||
ans[i] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We can now iterate through the answer to find the line break positions
|
||||
std::list<uint16_t> lineBreaks;
|
||||
for (size_t i = 0; i < totalWordCount;) {
|
||||
i = ans[i] + 1;
|
||||
if (i > totalWordCount) {
|
||||
break;
|
||||
}
|
||||
lineBreaks.push_back(i);
|
||||
// Text too big, just exit
|
||||
if (lineBreaks.size() > 1000) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::list<TextBlock*> lines;
|
||||
|
||||
// With the line breaks calculated we can now position the words along the
|
||||
// line
|
||||
int startWord = 0;
|
||||
for (const auto lineBreak : lineBreaks) {
|
||||
const int lineWordCount = lineBreak - startWord;
|
||||
|
||||
int lineWordWidthSum = 0;
|
||||
for (int i = startWord; i < lineBreak; i++) {
|
||||
lineWordWidthSum += wordWidths[i];
|
||||
}
|
||||
|
||||
// Calculate spacing between words
|
||||
const uint16_t spareSpace = pageWidth - lineWordWidthSum;
|
||||
uint16_t spacing = spaceWidth;
|
||||
// evenly space words if using justified style, not the last line, and at
|
||||
// least 2 words
|
||||
if (style == JUSTIFIED && lineBreak != lineBreaks.back() && lineWordCount >= 2) {
|
||||
spacing = spareSpace / (lineWordCount - 1);
|
||||
}
|
||||
|
||||
uint16_t xpos = 0;
|
||||
if (style == RIGHT_ALIGN) {
|
||||
xpos = spareSpace - (lineWordCount - 1) * spaceWidth;
|
||||
} else if (style == CENTER_ALIGN) {
|
||||
xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2;
|
||||
}
|
||||
|
||||
for (int i = startWord; i < lineBreak; i++) {
|
||||
wordXpos[i] = xpos;
|
||||
xpos += wordWidths[i] + spacing;
|
||||
}
|
||||
|
||||
std::vector<std::string> lineWords;
|
||||
std::vector<uint16_t> lineXPos;
|
||||
std::vector<uint8_t> lineWordStyles;
|
||||
lineWords.reserve(lineWordCount);
|
||||
lineXPos.reserve(lineWordCount);
|
||||
lineWordStyles.reserve(lineWordCount);
|
||||
|
||||
for (int i = startWord; i < lineBreak; i++) {
|
||||
lineWords.push_back(words[i]);
|
||||
lineXPos.push_back(wordXpos[i]);
|
||||
lineWordStyles.push_back(wordStyles[i]);
|
||||
}
|
||||
const auto textLine = new TextBlock(lineWords, lineXPos, lineWordStyles, style);
|
||||
lines.push_back(textLine);
|
||||
startWord = lineBreak;
|
||||
}
|
||||
|
||||
return lines;
|
||||
}
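// A tiny worked example of the cost above (made-up numbers): with pageWidth = 100, spaceWidth = 10
// and word widths {30, 20, 40}, putting the first two words on one line gives
// currlen = -1 + (30 + 10) + (20 + 10) = 69, so that line costs (100 - 69)^2 = 961 plus dp[2] for
// the remaining words; a line that ends the block always costs 0, so a short last line is free.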
|
||||
|
||||
void TextBlock::render(const EpdRenderer* renderer, const int x, const int y) const {
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
// get the style
|
||||
const uint8_t wordStyle = wordStyles[i];
|
||||
// render the word
|
||||
renderer->drawText(x + wordXpos[i], y, words[i].c_str(), wordStyle & BOLD_SPAN, wordStyle & ITALIC_SPAN);
|
||||
}
|
||||
}
|
||||
|
||||
void TextBlock::serialize(std::ostream& os) const {
|
||||
// words
|
||||
const uint32_t wc = words.size();
|
||||
serialization::writePod(os, wc);
|
||||
for (const auto& w : words) serialization::writeString(os, w);
|
||||
|
||||
// wordXpos
|
||||
const uint32_t xc = wordXpos.size();
|
||||
serialization::writePod(os, xc);
|
||||
for (auto x : wordXpos) serialization::writePod(os, x);
|
||||
|
||||
// wordStyles
|
||||
const uint32_t sc = wordStyles.size();
|
||||
serialization::writePod(os, sc);
|
||||
for (auto s : wordStyles) serialization::writePod(os, s);
|
||||
|
||||
// style
|
||||
serialization::writePod(os, style);
|
||||
}
|
||||
|
||||
TextBlock* TextBlock::deserialize(std::istream& is) {
|
||||
uint32_t wc, xc, sc;
|
||||
std::vector<std::string> words;
|
||||
std::vector<uint16_t> wordXpos;
|
||||
std::vector<uint8_t> wordStyles;
|
||||
BLOCK_STYLE style;
|
||||
|
||||
// words
|
||||
serialization::readPod(is, wc);
|
||||
words.resize(wc);
|
||||
for (auto& w : words) serialization::readString(is, w);
|
||||
|
||||
// wordXpos
|
||||
serialization::readPod(is, xc);
|
||||
wordXpos.resize(xc);
|
||||
for (auto& x : wordXpos) serialization::readPod(is, x);
|
||||
|
||||
// wordStyles
|
||||
serialization::readPod(is, sc);
|
||||
wordStyles.resize(sc);
|
||||
for (auto& s : wordStyles) serialization::readPod(is, s);
|
||||
|
||||
// style
|
||||
serialization::readPod(is, style);
|
||||
|
||||
return new TextBlock(words, wordXpos, wordStyles, style);
|
||||
}

lib/Epub/Epub/blocks/TextBlock.h (new file)
@@ -0,0 +1,50 @@
#pragma once
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "Block.h"
|
||||
|
||||
enum SPAN_STYLE : uint8_t {
|
||||
BOLD_SPAN = 1,
|
||||
ITALIC_SPAN = 2,
|
||||
};
|
||||
|
||||
enum BLOCK_STYLE : uint8_t {
|
||||
JUSTIFIED = 0,
|
||||
LEFT_ALIGN = 1,
|
||||
CENTER_ALIGN = 2,
|
||||
RIGHT_ALIGN = 3,
|
||||
};
|
||||
|
||||
// represents a block of words in the html document
|
||||
class TextBlock final : public Block {
|
||||
// the words in this block
|
||||
std::vector<std::string> words;
|
||||
// x position of each word
|
||||
std::vector<uint16_t> wordXpos;
|
||||
// the styles of each word
|
||||
std::vector<uint8_t> wordStyles;
|
||||
|
||||
// the style of the block - left, center, right aligned
|
||||
BLOCK_STYLE style;
|
||||
|
||||
public:
|
||||
void addSpan(const std::string& span, bool is_bold, bool is_italic);
|
||||
explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
|
||||
explicit TextBlock(const std::vector<std::string>& words, const std::vector<uint16_t>& word_xpos,
const std::vector<uint8_t>& word_styles, const BLOCK_STYLE style)
: words(words), wordXpos(word_xpos), wordStyles(word_styles), style(style) {}
|
||||
~TextBlock() override = default;
|
||||
void set_style(const BLOCK_STYLE style) { this->style = style; }
|
||||
BLOCK_STYLE get_style() const { return style; }
|
||||
bool isEmpty() override { return words.empty(); }
|
||||
void layout(EpdRenderer* renderer) override {}
|
||||
// given a renderer works out where to break the words into lines
|
||||
std::list<TextBlock*> splitIntoLines(const EpdRenderer* renderer);
|
||||
void render(const EpdRenderer* renderer, int x, int y) const;
|
||||
BlockType getType() override { return TEXT_BLOCK; }
|
||||
void serialize(std::ostream& os) const;
|
||||
static TextBlock* deserialize(std::istream& is);
|
||||
};

lib/Epub/Epub/htmlEntities.cpp (new file)
@@ -0,0 +1,163 @@
// from
|
||||
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||||
|
||||
#include "htmlEntities.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
|
||||
const int MAX_ENTITY_LENGTH = 10;
|
||||
|
||||
// Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
|
||||
// Note the supported keys are only in lowercase
|
||||
// Store the mappings in a unordered hash map
|
||||
static std::unordered_map<std::string, std::string> entity_lookup(
{{"&quot;", "\""}, {"&frasl;", "⁄"}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
{"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
{"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
{"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
{"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
{"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
{"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
{"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
{"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
{"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
{"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
{"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
{"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
{"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", " "}, {"&iexcl;", "¡"}, {"&cent;", "¢"},
{"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
{"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
{"&shy;", ""}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
{"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", "¶"},
{"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
{"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
{"&forall;", "∀"}, {"&part;", "∂"}, {"&exist;", "∃"}, {"&empty;", "∅"}, {"&nabla;", "∇"},
{"&isin;", "∈"}, {"&notin;", "∉"}, {"&ni;", "∋"}, {"&prod;", "∏"}, {"&sum;", "∑"},
{"&minus;", "−"}, {"&lowast;", "∗"}, {"&radic;", "√"}, {"&prop;", "∝"}, {"&infin;", "∞"},
{"&ang;", "∠"}, {"&and;", "∧"}, {"&or;", "∨"}, {"&cap;", "∩"}, {"&cup;", "∪"},
{"&int;", "∫"}, {"&there4;", "∴"}, {"&sim;", "∼"}, {"&cong;", "≅"}, {"&asymp;", "≈"},
{"&ne;", "≠"}, {"&equiv;", "≡"}, {"&le;", "≤"}, {"&ge;", "≥"}, {"&sub;", "⊂"},
{"&sup;", "⊃"}, {"&nsub;", "⊄"}, {"&sube;", "⊆"}, {"&supe;", "⊇"}, {"&oplus;", "⊕"},
{"&otimes;", "⊗"}, {"&perp;", "⊥"}, {"&sdot;", "⋅"}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
{"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
{"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
{"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
{"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
{"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
{"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
{"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
{"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
{"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
{"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
{"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
{"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, {"&ensp;", ""}, {"&emsp;", ""},
{"&thinsp;", ""}, {"&zwnj;", ""}, {"&zwj;", ""}, {"&lrm;", ""}, {"&rlm;", ""},
{"&ndash;", "–"}, {"&mdash;", "—"}, {"&lsquo;", "‘"}, {"&rsquo;", "’"}, {"&sbquo;", "‚"},
{"&ldquo;", "“"}, {"&rdquo;", "”"}, {"&bdquo;", "„"}, {"&dagger;", "†"}, {"&Dagger;", "‡"},
{"&bull;", "•"}, {"&hellip;", "…"}, {"&permil;", "‰"}, {"&prime;", "′"}, {"&Prime;", "″"},
{"&lsaquo;", "‹"}, {"&rsaquo;", "›"}, {"&oline;", "‾"}, {"&euro;", "€"}, {"&trade;", "™"},
{"&larr;", "←"}, {"&uarr;", "↑"}, {"&rarr;", "→"}, {"&darr;", "↓"}, {"&harr;", "↔"},
{"&crarr;", "↵"}, {"&lceil;", "⌈"}, {"&rceil;", "⌉"}, {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
{"&loz;", "◊"}, {"&spades;", "♠"}, {"&clubs;", "♣"}, {"&hearts;", "♥"}, {"&diams;", "♦"}});
|
||||
|
||||
// converts from a unicode code point to the utf8 equivalent
|
||||
void convert_to_utf8(const int code, std::string& res) {
|
||||
// convert to a utf8 sequence
|
||||
if (code < 0x80) {
|
||||
res += static_cast<char>(code);
|
||||
} else if (code < 0x800) {
|
||||
res += static_cast<char>(0xc0 | (code >> 6));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x10000) {
|
||||
res += static_cast<char>(0xe0 | (code >> 12));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x200000) {
|
||||
res += static_cast<char>(0xf0 | (code >> 18));
|
||||
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x4000000) {
|
||||
res += static_cast<char>(0xf8 | (code >> 24));
|
||||
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
res += static_cast<char>(0x80 | (code & 0x3f));
|
||||
} else if (code < 0x80000000) {
|
||||
res += static_cast<char>(0xfc | (code >> 30));
|
||||
res += static_cast<char>(0x80 | ((code >> 24) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
|
||||
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
|
||||
}
|
||||
}
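// Worked example: code point 0xE9 ("é") is below 0x800, so it becomes 0xC0 | (0xE9 >> 6) = 0xC3
// followed by 0x80 | (0xE9 & 0x3F) = 0xA9, i.e. the UTF-8 bytes C3 A9. Code point 0x2211 ("∑")
// takes the three byte branch and encodes to E2 88 91.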
|
||||
|
||||
// handles numeric entities - e.g. &#1234; or &#x1234;
|
||||
bool process_numeric_entity(const std::string& entity, std::string& res) {
|
||||
int code = 0;
|
||||
// is it hex?
|
||||
if (entity[2] == 'x' || entity[2] == 'X') {
|
||||
// parse the hex code
|
||||
code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16);
|
||||
} else {
|
||||
code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10);
|
||||
}
|
||||
if (code != 0) {
|
||||
// special handling for nbsp
|
||||
if (code == 0xA0) {
|
||||
res += " ";
|
||||
} else {
|
||||
convert_to_utf8(code, res);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// handles named entities - e.g. &amp;
|
||||
bool process_string_entity(const std::string& entity, std::string& res) {
|
||||
// it's a named entity - find it in the lookup table
|
||||
// find it in the map
|
||||
const auto it = entity_lookup.find(entity);
|
||||
if (it != entity_lookup.end()) {
|
||||
res += it->second;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// replace all the entities in the string
|
||||
std::string replaceHtmlEntities(const char* text) {
|
||||
std::string res;
|
||||
res.reserve(strlen(text));
|
||||
for (int i = 0; i < strlen(text); ++i) {
|
||||
bool flag = false;
|
||||
// do we have a potential entity?
|
||||
if (text[i] == '&') {
|
||||
// find the end of the entity
|
||||
int j = i + 1;
|
||||
while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) {
|
||||
j++;
|
||||
}
|
||||
if (j - i > 2) {
|
||||
// copy the entity text including the trailing ';' and make sure it is null terminated
char entity[j - i + 2];
strncpy(entity, text + i, j - i + 1);
entity[j - i + 1] = '\0';
|
||||
// is it a numeric code?
|
||||
if (entity[1] == '#') {
|
||||
flag = process_numeric_entity(entity, res);
|
||||
} else {
|
||||
flag = process_string_entity(entity, res);
|
||||
}
|
||||
// skip past the entity if we successfully decoded it
|
||||
if (flag) {
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!flag) {
|
||||
res += text[i];
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
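// Example (the input string is made up): replaceHtmlEntities("Fish &amp; Chips for &#163;")
// returns "Fish & Chips for £" - named entities go through the lookup table above and numeric
// ones through convert_to_utf8.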

lib/Epub/Epub/htmlEntities.h (new file)
@@ -0,0 +1,7 @@
// from
|
||||
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
|
||||
std::string replaceHtmlEntities(const char* text);