Add expat and swap out EPUB HTML parser (#2)

* Add expat and swap out ERB HTML parser

* Increase EpubHtmlParserSlim file buffer to 1024 bytes

* Cleanup TextBlock functions

* Do not break words when leaving spans
This commit is contained in:
Dave Allie
2025-12-06 20:57:24 +11:00
committed by GitHub
parent ad8cee12ab
commit dd6e649d74
32 changed files with 15969 additions and 269 deletions

View File

@@ -1,181 +0,0 @@
#include "EpubHtmlParser.h"
#include <EpdRenderer.h>
#include <HardwareSerial.h>
#include "Page.h"
#include "htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head", "table"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
// start a new text block if needed
void EpubHtmlParser::startNewTextBlock(const BLOCK_STYLE style) {
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
currentTextBlock->set_style(style);
return;
}
currentTextBlock->finish();
makePages();
delete currentTextBlock;
}
currentTextBlock = new TextBlock(style);
}
bool EpubHtmlParser::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) {
const char* tag_name = element.Name();
if (matches(tag_name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
const char* src = element.Attribute("src");
if (src) {
// don't leave an empty text block in the list
// const BLOCK_STYLE style = currentTextBlock->get_style();
if (currentTextBlock->isEmpty()) {
delete currentTextBlock;
currentTextBlock = nullptr;
}
// TODO: Fix this
// blocks.push_back(new ImageBlock(m_base_path + src));
// start a new text block - with the same style as before
// startNewTextBlock(style);
} else {
// ESP_LOGE(TAG, "Could not find src attribute");
}
return false;
}
if (matches(tag_name, SKIP_TAGS, NUM_SKIP_TAGS)) {
return false;
}
// Serial.printf("Text: %s\n", element.GetText());
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
insideBoldTag = true;
startNewTextBlock(CENTER_ALIGN);
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(tag_name, "br") == 0) {
startNewTextBlock(currentTextBlock->get_style());
} else {
startNewTextBlock(JUSTIFIED);
}
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
insideBoldTag = true;
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
insideItalicTag = true;
}
return true;
}
/// Visit a text node.
bool EpubHtmlParser::Visit(const tinyxml2::XMLText& text) {
const char* content = text.Value();
currentTextBlock->addSpan(replaceHtmlEntities(content), insideBoldTag, insideItalicTag);
return true;
}
bool EpubHtmlParser::VisitExit(const tinyxml2::XMLElement& element) {
const char* tag_name = element.Name();
if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
insideBoldTag = false;
} else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
// nothing to do
} else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
insideBoldTag = false;
} else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
insideItalicTag = false;
}
return true;
}
bool EpubHtmlParser::parseAndBuildPages() {
startNewTextBlock(JUSTIFIED);
tinyxml2::XMLDocument doc(false, tinyxml2::COLLAPSE_WHITESPACE);
const tinyxml2::XMLError result = doc.LoadFile(filepath);
if (result != tinyxml2::XML_SUCCESS) {
Serial.printf("Failed to load file, Error: %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
return false;
}
doc.Accept(this);
if (currentTextBlock) {
makePages();
completePageFn(currentPage);
currentPage = nullptr;
delete currentTextBlock;
currentTextBlock = nullptr;
}
return true;
}
void EpubHtmlParser::makePages() {
if (!currentTextBlock) {
Serial.println("!! No text block to make pages for !!");
return;
}
if (!currentPage) {
currentPage = new Page();
}
const int lineHeight = renderer.getLineHeight();
const int pageHeight = renderer.getPageHeight();
// Long running task, make sure to let other things happen
vTaskDelay(1);
if (currentTextBlock->getType() == TEXT_BLOCK) {
const auto lines = currentTextBlock->splitIntoLines(renderer);
for (const auto line : lines) {
if (currentPage->nextY + lineHeight > pageHeight) {
completePageFn(currentPage);
currentPage = new Page();
}
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
currentPage->nextY += lineHeight;
}
// TODO: Fix spacing between paras
// add some extra line between blocks
currentPage->nextY += lineHeight / 2;
}
// TODO: Image block support
// if (block->getType() == BlockType::IMAGE_BLOCK) {
// ImageBlock *imageBlock = (ImageBlock *)block;
// if (y + imageBlock->height > page_height) {
// pages.push_back(new Page());
// y = 0;
// }
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
// y += imageBlock->height;
// }
}

View File

@@ -1,34 +0,0 @@
#pragma once
#include <tinyxml2.h>
#include <functional>
#include "blocks/TextBlock.h"
class Page;
class EpdRenderer;
class EpubHtmlParser final : public tinyxml2::XMLVisitor {
const char* filepath;
EpdRenderer& renderer;
std::function<void(Page*)> completePageFn;
bool insideBoldTag = false;
bool insideItalicTag = false;
TextBlock* currentTextBlock = nullptr;
Page* currentPage = nullptr;
void startNewTextBlock(BLOCK_STYLE style);
void makePages();
// xml parser callbacks
bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;
bool Visit(const tinyxml2::XMLText& text) override;
bool VisitExit(const tinyxml2::XMLElement& element) override;
// xml parser callbacks
public:
explicit EpubHtmlParser(const char* filepath, EpdRenderer& renderer, const std::function<void(Page*)>& completePageFn)
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
~EpubHtmlParser() override = default;
bool parseAndBuildPages();
};

View File

@@ -0,0 +1,291 @@
#include "EpubHtmlParserSlim.h"
#include <EpdRenderer.h>
#include <HardwareSerial.h>
#include "Page.h"
#include "htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head", "table"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
// start a new text block if needed
void EpubHtmlParserSlim::startNewTextBlock(const BLOCK_STYLE style) {
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
currentTextBlock->setStyle(style);
return;
}
currentTextBlock->finish();
makePages();
delete currentTextBlock;
}
currentTextBlock = new TextBlock(style);
}
#include <expat.h>
void XMLCALL EpubHtmlParserSlim::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<EpubHtmlParserSlim*>(userData);
(void)atts;
// Middle of skip
if (self->skipUntilDepth < self->depth) {
self->depth += 1;
return;
}
if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
// const char* src = element.Attribute("src");
// if (src) {
// // don't leave an empty text block in the list
// // const BLOCK_STYLE style = currentTextBlock->get_style();
// if (currentTextBlock->isEmpty()) {
// delete currentTextBlock;
// currentTextBlock = nullptr;
// }
// // TODO: Fix this
// // blocks.push_back(new ImageBlock(m_base_path + src));
// // start a new text block - with the same style as before
// // startNewTextBlock(style);
// } else {
// // ESP_LOGE(TAG, "Could not find src attribute");
// }
// start skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
// start skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
self->startNewTextBlock(CENTER_ALIGN);
self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
} else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(name, "br") == 0) {
self->startNewTextBlock(self->currentTextBlock->getStyle());
} else {
self->startNewTextBlock(JUSTIFIED);
}
} else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
} else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
}
self->depth += 1;
}
void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<EpubHtmlParserSlim*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
return;
}
for (int i = 0; i < len; i++) {
if (isWhitespace(s[i])) {
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), self->boldUntilDepth < self->depth,
self->italicUntilDepth < self->depth);
self->partWordBufferIndex = 0;
}
// Skip the whitespace char
continue;
}
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= PART_WORD_BUFFER_SIZE - 2) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), self->boldUntilDepth < self->depth,
self->italicUntilDepth < self->depth);
self->partWordBufferIndex = 0;
}
self->partWordBuffer[self->partWordBufferIndex++] = s[i];
}
}
void XMLCALL EpubHtmlParserSlim::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<EpubHtmlParserSlim*>(userData);
(void)name;
if (self->partWordBufferIndex > 0) {
// Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
// We don't want to flush out content when closing inline tags like <span>.
// Currently this also flushes out on closing <b> and <i> tags, but they are line tags so that shouldn't happen,
// text styling needs to be overhauled to fix it.
const bool shouldBreakText = matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) ||
matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
self->depth == 1;
if (shouldBreakText) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), self->boldUntilDepth < self->depth,
self->italicUntilDepth < self->depth);
self->partWordBufferIndex = 0;
}
}
self->depth -= 1;
// Leaving skip
if (self->skipUntilDepth == self->depth) {
self->skipUntilDepth = INT_MAX;
}
// Leaving bold
if (self->boldUntilDepth == self->depth) {
self->boldUntilDepth = INT_MAX;
}
// Leaving italic
if (self->italicUntilDepth == self->depth) {
self->italicUntilDepth = INT_MAX;
}
}
bool EpubHtmlParserSlim::parseAndBuildPages() {
startNewTextBlock(JUSTIFIED);
const XML_Parser parser = XML_ParserCreate(nullptr);
int done;
if (!parser) {
Serial.println("Couldn't allocate memory for parser");
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
FILE* file = fopen(filepath, "r");
do {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.println("Couldn't allocate memory for buffer");
XML_ParserFree(parser);
fclose(file);
return false;
}
const size_t len = fread(buf, 1, 1024, file);
if (ferror(file)) {
Serial.println("Read error");
XML_ParserFree(parser);
fclose(file);
return false;
}
done = feof(file);
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
Serial.printf("Parse error at line %lu:\n%s\n", XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_ParserFree(parser);
fclose(file);
return false;
}
} while (!done);
XML_ParserFree(parser);
fclose(file);
// Process last page if there is still text
if (currentTextBlock) {
makePages();
completePageFn(currentPage);
currentPage = nullptr;
delete currentTextBlock;
currentTextBlock = nullptr;
}
return true;
}
void EpubHtmlParserSlim::makePages() {
if (!currentTextBlock) {
Serial.println("!! No text block to make pages for !!");
return;
}
if (!currentPage) {
currentPage = new Page();
}
const int lineHeight = renderer.getLineHeight();
const int pageHeight = renderer.getPageHeight();
// Long running task, make sure to let other things happen
vTaskDelay(1);
if (currentTextBlock->getType() == TEXT_BLOCK) {
const auto lines = currentTextBlock->splitIntoLines(renderer);
for (const auto line : lines) {
if (currentPage->nextY + lineHeight > pageHeight) {
completePageFn(currentPage);
currentPage = new Page();
}
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
currentPage->nextY += lineHeight;
}
// add some extra line between blocks
currentPage->nextY += lineHeight / 2;
}
// TODO: Image block support
// if (block->getType() == BlockType::IMAGE_BLOCK) {
// ImageBlock *imageBlock = (ImageBlock *)block;
// if (y + imageBlock->height > page_height) {
// pages.push_back(new Page());
// y = 0;
// }
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
// y += imageBlock->height;
// }
}

View File

@@ -0,0 +1,42 @@
#pragma once
#include <expat.h>
#include <limits.h>
#include <functional>
#include "blocks/TextBlock.h"
class Page;
class EpdRenderer;
#define PART_WORD_BUFFER_SIZE 200
class EpubHtmlParserSlim {
const char* filepath;
EpdRenderer& renderer;
std::function<void(Page*)> completePageFn;
int depth = 0;
int skipUntilDepth = INT_MAX;
int boldUntilDepth = INT_MAX;
int italicUntilDepth = INT_MAX;
// If we encounter words longer than this, but this is pretty large
char partWordBuffer[PART_WORD_BUFFER_SIZE] = {};
int partWordBufferIndex = 0;
TextBlock* currentTextBlock = nullptr;
Page* currentPage = nullptr;
void startNewTextBlock(BLOCK_STYLE style);
void makePages();
// XML callbacks
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
static void XMLCALL endElement(void* userData, const XML_Char* name);
public:
explicit EpubHtmlParserSlim(const char* filepath, EpdRenderer& renderer,
const std::function<void(Page*)>& completePageFn)
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
~EpubHtmlParserSlim() = default;
bool parseAndBuildPages();
};

View File

@@ -5,11 +5,11 @@
#include <fstream>
#include "EpubHtmlParser.h"
#include "EpubHtmlParserSlim.h"
#include "Page.h"
void Section::onPageComplete(const Page* page) {
Serial.printf("Page %d complete\n", pageCount);
Serial.printf("Page %d complete - free mem: %lu\n", pageCount, ESP.getFreeHeap());
const auto filePath = cachePath + "/page_" + std::to_string(pageCount) + ".bin";
@@ -75,11 +75,11 @@ bool Section::persistPageDataToSD() {
}
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
auto visitor =
EpubHtmlParser(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
// TODO: Come back and see if mem used here can be lowered?
auto visitor =
EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
const bool success = visitor.parseAndBuildPages();
SD.remove(tmpHtmlPath.c_str());
if (!success) {
Serial.println("Failed to parse and build pages");

View File

@@ -3,44 +3,11 @@
#include <EpdRenderer.h>
#include <Serialization.h>
static bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
void TextBlock::addWord(const std::string& word, const bool is_bold, const bool is_italic) {
if (word.length() == 0) return;
// move past anything that should be considered part of a work
static int skipWord(const std::string& text, int index, const int length) {
while (index < length && !isWhitespace(text[index])) {
index++;
}
return index;
}
// skip past any white space characters
static int skipWhitespace(const std::string& html, int index, const int length) {
while (index < length && isWhitespace(html[index])) {
index++;
}
return index;
}
void TextBlock::addSpan(const std::string& span, const bool is_bold, const bool is_italic) {
// adding a span to text block
// make a copy of the text as we'll modify it
const int length = span.length();
// const auto text = new char[length + 1];
// strcpy(text, span);
// work out where each word is in the span
int index = 0;
while (index < length) {
// skip past any whitespace to the start of a word
index = skipWhitespace(span, index, length);
const int wordStart = index;
// find the end of the word
index = skipWord(span, index, length);
const int wordLength = index - wordStart;
if (wordLength > 0) {
words.push_back(span.substr(wordStart, wordLength));
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
}
}
words.push_back(word);
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
}
std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer& renderer) {
@@ -189,17 +156,12 @@ std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer& renderer) {
void TextBlock::render(const EpdRenderer& renderer, const int x, const int y) const {
for (int i = 0; i < words.size(); i++) {
// get the style
const uint8_t wordStyle = wordStyles[i];
// render the word
EpdFontStyle fontStyle = REGULAR;
if (wordStyles[i] & BOLD_SPAN) {
if (wordStyles[i] & ITALIC_SPAN) {
fontStyle = BOLD_ITALIC;
} else {
fontStyle = BOLD;
}
if (wordStyles[i] & BOLD_SPAN && wordStyles[i] & ITALIC_SPAN) {
fontStyle = BOLD_ITALIC;
} else if (wordStyles[i] & BOLD_SPAN) {
fontStyle = BOLD;
} else if (wordStyles[i] & ITALIC_SPAN) {
fontStyle = ITALIC;
}

View File

@@ -30,15 +30,15 @@ class TextBlock final : public Block {
BLOCK_STYLE style;
public:
void addSpan(const std::string& span, bool is_bold, bool is_italic);
explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
explicit TextBlock(const std::vector<std::string>& words, const std::vector<uint16_t>& word_xpos,
// the styles of each word
const std::vector<uint8_t>& word_styles, const BLOCK_STYLE style)
: words(words), wordXpos(word_xpos), wordStyles(word_styles), style(style) {}
~TextBlock() override = default;
void set_style(const BLOCK_STYLE style) { this->style = style; }
BLOCK_STYLE get_style() const { return style; }
void addWord(const std::string& word, bool is_bold, bool is_italic);
void setStyle(const BLOCK_STYLE style) { this->style = style; }
BLOCK_STYLE getStyle() const { return style; }
bool isEmpty() override { return words.empty(); }
void layout(EpdRenderer& renderer) override {};
// given a renderer works out where to break the words into lines