Add expat and swap out EPUB HTML parser (#2)
* Add expat and swap out EPUB HTML parser * Increase EpubHtmlParserSlim file buffer to 1024 bytes * Cleanup TextBlock functions * Do not break words when leaving spans
This commit is contained in:
@@ -1,181 +0,0 @@
|
||||
#include "EpubHtmlParser.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include "Page.h"
|
||||
#include "htmlEntities.h"
|
||||
|
||||
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
||||
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
||||
|
||||
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
|
||||
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
|
||||
|
||||
const char* BOLD_TAGS[] = {"b"};
|
||||
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
|
||||
|
||||
const char* ITALIC_TAGS[] = {"i"};
|
||||
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
|
||||
|
||||
const char* IMAGE_TAGS[] = {"img"};
|
||||
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
|
||||
|
||||
const char* SKIP_TAGS[] = {"head", "table"};
|
||||
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
|
||||
|
||||
// Reports whether tag_name is exactly equal (strcmp) to one of the first
// possible_tag_count entries of possible_tags.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  int idx = 0;
  while (idx < possible_tag_count) {
    const char* candidate = possible_tags[idx];
    if (strcmp(tag_name, candidate) == 0) {
      return true;
    }
    ++idx;
  }
  return false;
}
|
||||
|
||||
// start a new text block if needed
|
||||
void EpubHtmlParser::startNewTextBlock(const BLOCK_STYLE style) {
|
||||
if (currentTextBlock) {
|
||||
// already have a text block running and it is empty - just reuse it
|
||||
if (currentTextBlock->isEmpty()) {
|
||||
currentTextBlock->set_style(style);
|
||||
return;
|
||||
}
|
||||
|
||||
currentTextBlock->finish();
|
||||
makePages();
|
||||
delete currentTextBlock;
|
||||
}
|
||||
currentTextBlock = new TextBlock(style);
|
||||
}
|
||||
|
||||
// Called when the visitor enters an element.
// - img and SKIP_TAGS elements return false (per tinyxml2's XMLVisitor contract the children of
//   such an element are then not visited).
// - Header and block tags start a new text block; header, bold and italic tags set the
//   corresponding style flags that Visit() applies to text.
bool EpubHtmlParser::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) {
  (void)firstAttribute;
  const char* tag_name = element.Name();

  if (matches(tag_name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
    // TODO: Image block support. Once image blocks can be emitted: read element.Attribute("src"),
    // push an ImageBlock for it and restart a text block with the style that was active before.
    //
    // The current text block is intentionally left untouched. Deleting it while it is empty (as
    // was done before) left currentTextBlock null with nothing re-creating it, so the next text
    // node, <br> or <img> dereferenced a null pointer and the final page was never completed.
    // An empty block is harmless: startNewTextBlock() reuses it.
    return false;
  }

  if (matches(tag_name, SKIP_TAGS, NUM_SKIP_TAGS)) {
    return false;
  }

  if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
    insideBoldTag = true;
    startNewTextBlock(CENTER_ALIGN);
  } else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    if (strcmp(tag_name, "br") == 0) {
      // A line break keeps the style of the block it interrupts; fall back to the default
      // style used by parseAndBuildPages() if there is no block yet.
      startNewTextBlock(currentTextBlock ? currentTextBlock->get_style() : JUSTIFIED);
    } else {
      startNewTextBlock(JUSTIFIED);
    }
  } else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    insideBoldTag = true;
  } else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    insideItalicTag = true;
  }

  return true;
}
|
||||
/// Visit a text node.
|
||||
bool EpubHtmlParser::Visit(const tinyxml2::XMLText& text) {
|
||||
const char* content = text.Value();
|
||||
currentTextBlock->addSpan(replaceHtmlEntities(content), insideBoldTag, insideItalicTag);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Called when the visitor leaves an element: clears the style flag that the matching
// VisitEnter set. Always returns true.
bool EpubHtmlParser::VisitExit(const tinyxml2::XMLElement& element) {
  const char* closingTag = element.Name();

  // Headers are rendered bold, so leaving one ends bold text.
  if (matches(closingTag, HEADER_TAGS, NUM_HEADER_TAGS)) {
    insideBoldTag = false;
    return true;
  }

  // Block tags carry no style state to reset.
  if (matches(closingTag, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    return true;
  }

  if (matches(closingTag, BOLD_TAGS, NUM_BOLD_TAGS)) {
    insideBoldTag = false;
    return true;
  }

  if (matches(closingTag, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    insideItalicTag = false;
  }
  return true;
}
|
||||
|
||||
bool EpubHtmlParser::parseAndBuildPages() {
|
||||
startNewTextBlock(JUSTIFIED);
|
||||
tinyxml2::XMLDocument doc(false, tinyxml2::COLLAPSE_WHITESPACE);
|
||||
|
||||
const tinyxml2::XMLError result = doc.LoadFile(filepath);
|
||||
if (result != tinyxml2::XML_SUCCESS) {
|
||||
Serial.printf("Failed to load file, Error: %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
|
||||
return false;
|
||||
}
|
||||
|
||||
doc.Accept(this);
|
||||
if (currentTextBlock) {
|
||||
makePages();
|
||||
completePageFn(currentPage);
|
||||
currentPage = nullptr;
|
||||
delete currentTextBlock;
|
||||
currentTextBlock = nullptr;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void EpubHtmlParser::makePages() {
|
||||
if (!currentTextBlock) {
|
||||
Serial.println("!! No text block to make pages for !!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!currentPage) {
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
const int lineHeight = renderer.getLineHeight();
|
||||
const int pageHeight = renderer.getPageHeight();
|
||||
|
||||
// Long running task, make sure to let other things happen
|
||||
vTaskDelay(1);
|
||||
|
||||
if (currentTextBlock->getType() == TEXT_BLOCK) {
|
||||
const auto lines = currentTextBlock->splitIntoLines(renderer);
|
||||
|
||||
for (const auto line : lines) {
|
||||
if (currentPage->nextY + lineHeight > pageHeight) {
|
||||
completePageFn(currentPage);
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
|
||||
currentPage->nextY += lineHeight;
|
||||
}
|
||||
// TODO: Fix spacing between paras
|
||||
// add some extra line between blocks
|
||||
currentPage->nextY += lineHeight / 2;
|
||||
}
|
||||
// TODO: Image block support
|
||||
// if (block->getType() == BlockType::IMAGE_BLOCK) {
|
||||
// ImageBlock *imageBlock = (ImageBlock *)block;
|
||||
// if (y + imageBlock->height > page_height) {
|
||||
// pages.push_back(new Page());
|
||||
// y = 0;
|
||||
// }
|
||||
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
|
||||
// y += imageBlock->height;
|
||||
// }
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
#pragma once
|
||||
#include <tinyxml2.h>
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
class Page;
|
||||
class EpdRenderer;
|
||||
|
||||
class EpubHtmlParser final : public tinyxml2::XMLVisitor {
|
||||
const char* filepath;
|
||||
EpdRenderer& renderer;
|
||||
std::function<void(Page*)> completePageFn;
|
||||
|
||||
bool insideBoldTag = false;
|
||||
bool insideItalicTag = false;
|
||||
TextBlock* currentTextBlock = nullptr;
|
||||
Page* currentPage = nullptr;
|
||||
|
||||
void startNewTextBlock(BLOCK_STYLE style);
|
||||
void makePages();
|
||||
|
||||
// xml parser callbacks
|
||||
bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;
|
||||
bool Visit(const tinyxml2::XMLText& text) override;
|
||||
bool VisitExit(const tinyxml2::XMLElement& element) override;
|
||||
// xml parser callbacks
|
||||
public:
|
||||
explicit EpubHtmlParser(const char* filepath, EpdRenderer& renderer, const std::function<void(Page*)>& completePageFn)
|
||||
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
|
||||
~EpubHtmlParser() override = default;
|
||||
bool parseAndBuildPages();
|
||||
};
|
||||
291
lib/Epub/Epub/EpubHtmlParserSlim.cpp
Normal file
291
lib/Epub/Epub/EpubHtmlParserSlim.cpp
Normal file
@@ -0,0 +1,291 @@
|
||||
#include "EpubHtmlParserSlim.h"
|
||||
|
||||
#include <EpdRenderer.h>
|
||||
#include <HardwareSerial.h>
|
||||
|
||||
#include "Page.h"
|
||||
#include "htmlEntities.h"
|
||||
|
||||
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
||||
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
||||
|
||||
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
|
||||
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
|
||||
|
||||
const char* BOLD_TAGS[] = {"b"};
|
||||
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
|
||||
|
||||
const char* ITALIC_TAGS[] = {"i"};
|
||||
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
|
||||
|
||||
const char* IMAGE_TAGS[] = {"img"};
|
||||
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
|
||||
|
||||
const char* SKIP_TAGS[] = {"head", "table"};
|
||||
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
|
||||
|
||||
// True for the characters that separate words in character data. These are the four XML
// whitespace characters: space, tab, carriage return and line feed. Tab must be included,
// otherwise tab-indented markup ends up with tab characters glued into words.
bool isWhitespace(const char c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n'; }
|
||||
|
||||
// Checks a tag name against a list of known tags: returns true when tag_name compares equal
// (strcmp) to any of the first possible_tag_count entries of possible_tags.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  const char** const listEnd = possible_tags + possible_tag_count;
  for (const char** entry = possible_tags; entry < listEnd; ++entry) {
    const bool sameTag = strcmp(tag_name, *entry) == 0;
    if (sameTag) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// start a new text block if needed
|
||||
void EpubHtmlParserSlim::startNewTextBlock(const BLOCK_STYLE style) {
|
||||
if (currentTextBlock) {
|
||||
// already have a text block running and it is empty - just reuse it
|
||||
if (currentTextBlock->isEmpty()) {
|
||||
currentTextBlock->setStyle(style);
|
||||
return;
|
||||
}
|
||||
|
||||
currentTextBlock->finish();
|
||||
makePages();
|
||||
delete currentTextBlock;
|
||||
}
|
||||
currentTextBlock = new TextBlock(style);
|
||||
}
|
||||
|
||||
#include <expat.h>
|
||||
|
||||
// Expat start-tag handler.
//   userData - the EpubHtmlParserSlim instance registered with XML_SetUserData
//   name     - element name
//   atts     - attribute list (currently unused)
//
// Tracks the element depth and records, as "until depth" markers, where a skipped subtree,
// bold text or italic text began. Header and block tags start a new text block.
void XMLCALL EpubHtmlParserSlim::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
  auto* self = static_cast<EpubHtmlParserSlim*>(userData);
  (void)atts;

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    self->depth += 1;
    return;
  }

  // Images are not supported yet, and SKIP_TAGS subtrees are ignored entirely.
  // TODO: Image block support - read the "src" attribute, emit an image block and restart a
  // text block with the style that was active before the image.
  if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
    // start skip
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  const bool isHeader = matches(name, HEADER_TAGS, NUM_HEADER_TAGS);
  const bool isBlock = !isHeader && matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS);

  // A header or block tag ends the text that came before it. Flush a pending partial word into
  // the block it belongs to *before* starting the new block; otherwise "Hello<br/>World" puts
  // "Hello" after the break and "<div>Intro<p>Para" merges into the single word "IntroPara".
  // depth has not been incremented yet, so the style flags are those of the enclosing element.
  if ((isHeader || isBlock) && self->partWordBufferIndex > 0) {
    self->partWordBuffer[self->partWordBufferIndex] = '\0';
    self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), self->boldUntilDepth < self->depth,
                                    self->italicUntilDepth < self->depth);
    self->partWordBufferIndex = 0;
  }

  if (isHeader) {
    self->startNewTextBlock(CENTER_ALIGN);
    // Headers are rendered bold.
    self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
  } else if (isBlock) {
    if (strcmp(name, "br") == 0) {
      // A line break keeps the style of the block it interrupts.
      self->startNewTextBlock(self->currentTextBlock->getStyle());
    } else {
      self->startNewTextBlock(JUSTIFIED);
    }
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
  }

  self->depth += 1;
}
|
||||
|
||||
// Expat character-data handler.
//   userData - the EpubHtmlParserSlim instance registered with XML_SetUserData
//   s, len   - text chunk; it is not NUL-terminated and only the first len chars are read
//
// Accumulates non-whitespace characters in partWordBuffer and emits a word to the current text
// block whenever whitespace is reached or the buffer is nearly full. Text inside a skipped
// subtree is ignored.
void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s, const int len) {
  auto* self = static_cast<EpubHtmlParserSlim*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    return;
  }

  // Terminates the buffered characters, adds them as a word with the current style and resets
  // the buffer.
  const auto emitBufferedWord = [self]() {
    self->partWordBuffer[self->partWordBufferIndex] = '\0';
    self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), self->boldUntilDepth < self->depth,
                                    self->italicUntilDepth < self->depth);
    self->partWordBufferIndex = 0;
  };

  const XML_Char* const chunkEnd = s + len;
  for (const XML_Char* cursor = s; cursor < chunkEnd; ++cursor) {
    const XML_Char ch = *cursor;

    if (!isWhitespace(ch)) {
      // If we're about to run out of space, then cut the word off and start a new one
      if (self->partWordBufferIndex >= PART_WORD_BUFFER_SIZE - 2) {
        emitBufferedWord();
      }
      self->partWordBuffer[self->partWordBufferIndex] = ch;
      self->partWordBufferIndex += 1;
      continue;
    }

    // Whitespace ends the word in progress (if any); the whitespace itself is dropped.
    if (self->partWordBufferIndex > 0) {
      emitBufferedWord();
    }
  }
}
|
||||
|
||||
// Expat end-tag handler.
//   userData - the EpubHtmlParserSlim instance registered with XML_SetUserData
//   name     - name of the element being closed
//
// Flushes a pending partial word when the closing tag ends a run of text, then decrements the
// depth and clears any skip/bold/italic marker that was set at the depth being returned to.
void XMLCALL EpubHtmlParserSlim::endElement(void* userData, const XML_Char* name) {
  auto* parserState = static_cast<EpubHtmlParserSlim*>(userData);

  if (parserState->partWordBufferIndex > 0) {
    // Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
    // We don't want to flush out content when closing inline tags like <span>.
    // Currently this also flushes out on closing <b> and <i> tags, but they are line tags so that shouldn't happen,
    // text styling needs to be overhauled to fix it.
    bool endsTextRun = parserState->depth == 1;
    if (!endsTextRun) {
      endsTextRun = matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
                    matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS);
    }

    if (endsTextRun) {
      const bool wordIsBold = parserState->boldUntilDepth < parserState->depth;
      const bool wordIsItalic = parserState->italicUntilDepth < parserState->depth;
      parserState->partWordBuffer[parserState->partWordBufferIndex] = '\0';
      parserState->currentTextBlock->addWord(replaceHtmlEntities(parserState->partWordBuffer), wordIsBold,
                                             wordIsItalic);
      parserState->partWordBufferIndex = 0;
    }
  }

  parserState->depth -= 1;
  const int reachedDepth = parserState->depth;

  // A marker equal to the depth we are back at belongs to the element that just closed.
  if (parserState->skipUntilDepth == reachedDepth) {
    parserState->skipUntilDepth = INT_MAX;  // Leaving skip
  }
  if (parserState->boldUntilDepth == reachedDepth) {
    parserState->boldUntilDepth = INT_MAX;  // Leaving bold
  }
  if (parserState->italicUntilDepth == reachedDepth) {
    parserState->italicUntilDepth = INT_MAX;  // Leaving italic
  }
}
|
||||
|
||||
bool EpubHtmlParserSlim::parseAndBuildPages() {
|
||||
startNewTextBlock(JUSTIFIED);
|
||||
|
||||
const XML_Parser parser = XML_ParserCreate(nullptr);
|
||||
int done;
|
||||
|
||||
if (!parser) {
|
||||
Serial.println("Couldn't allocate memory for parser");
|
||||
return false;
|
||||
}
|
||||
|
||||
XML_SetUserData(parser, this);
|
||||
XML_SetElementHandler(parser, startElement, endElement);
|
||||
XML_SetCharacterDataHandler(parser, characterData);
|
||||
|
||||
FILE* file = fopen(filepath, "r");
|
||||
|
||||
do {
|
||||
void* const buf = XML_GetBuffer(parser, 1024);
|
||||
if (!buf) {
|
||||
Serial.println("Couldn't allocate memory for buffer");
|
||||
XML_ParserFree(parser);
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t len = fread(buf, 1, 1024, file);
|
||||
|
||||
if (ferror(file)) {
|
||||
Serial.println("Read error");
|
||||
XML_ParserFree(parser);
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
|
||||
done = feof(file);
|
||||
|
||||
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
|
||||
Serial.printf("Parse error at line %lu:\n%s\n", XML_GetCurrentLineNumber(parser),
|
||||
XML_ErrorString(XML_GetErrorCode(parser)));
|
||||
XML_ParserFree(parser);
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
} while (!done);
|
||||
|
||||
XML_ParserFree(parser);
|
||||
fclose(file);
|
||||
|
||||
// Process last page if there is still text
|
||||
if (currentTextBlock) {
|
||||
makePages();
|
||||
completePageFn(currentPage);
|
||||
currentPage = nullptr;
|
||||
delete currentTextBlock;
|
||||
currentTextBlock = nullptr;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void EpubHtmlParserSlim::makePages() {
|
||||
if (!currentTextBlock) {
|
||||
Serial.println("!! No text block to make pages for !!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!currentPage) {
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
const int lineHeight = renderer.getLineHeight();
|
||||
const int pageHeight = renderer.getPageHeight();
|
||||
|
||||
// Long running task, make sure to let other things happen
|
||||
vTaskDelay(1);
|
||||
|
||||
if (currentTextBlock->getType() == TEXT_BLOCK) {
|
||||
const auto lines = currentTextBlock->splitIntoLines(renderer);
|
||||
|
||||
for (const auto line : lines) {
|
||||
if (currentPage->nextY + lineHeight > pageHeight) {
|
||||
completePageFn(currentPage);
|
||||
currentPage = new Page();
|
||||
}
|
||||
|
||||
currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
|
||||
currentPage->nextY += lineHeight;
|
||||
}
|
||||
// add some extra line between blocks
|
||||
currentPage->nextY += lineHeight / 2;
|
||||
}
|
||||
// TODO: Image block support
|
||||
// if (block->getType() == BlockType::IMAGE_BLOCK) {
|
||||
// ImageBlock *imageBlock = (ImageBlock *)block;
|
||||
// if (y + imageBlock->height > page_height) {
|
||||
// pages.push_back(new Page());
|
||||
// y = 0;
|
||||
// }
|
||||
// pages.back()->elements.push_back(new PageImage(imageBlock, y));
|
||||
// y += imageBlock->height;
|
||||
// }
|
||||
}
|
||||
42
lib/Epub/Epub/EpubHtmlParserSlim.h
Normal file
42
lib/Epub/Epub/EpubHtmlParserSlim.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
|
||||
#include <expat.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "blocks/TextBlock.h"
|
||||
|
||||
class Page;
|
||||
class EpdRenderer;
|
||||
|
||||
#define PART_WORD_BUFFER_SIZE 200
|
||||
|
||||
class EpubHtmlParserSlim {
|
||||
const char* filepath;
|
||||
EpdRenderer& renderer;
|
||||
std::function<void(Page*)> completePageFn;
|
||||
int depth = 0;
|
||||
int skipUntilDepth = INT_MAX;
|
||||
int boldUntilDepth = INT_MAX;
|
||||
int italicUntilDepth = INT_MAX;
|
||||
// If we encounter words longer than this, but this is pretty large
|
||||
char partWordBuffer[PART_WORD_BUFFER_SIZE] = {};
|
||||
int partWordBufferIndex = 0;
|
||||
TextBlock* currentTextBlock = nullptr;
|
||||
Page* currentPage = nullptr;
|
||||
|
||||
void startNewTextBlock(BLOCK_STYLE style);
|
||||
void makePages();
|
||||
// XML callbacks
|
||||
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
|
||||
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
|
||||
static void XMLCALL endElement(void* userData, const XML_Char* name);
|
||||
|
||||
public:
|
||||
explicit EpubHtmlParserSlim(const char* filepath, EpdRenderer& renderer,
|
||||
const std::function<void(Page*)>& completePageFn)
|
||||
: filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
|
||||
~EpubHtmlParserSlim() = default;
|
||||
bool parseAndBuildPages();
|
||||
};
|
||||
@@ -5,11 +5,11 @@
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "EpubHtmlParser.h"
|
||||
#include "EpubHtmlParserSlim.h"
|
||||
#include "Page.h"
|
||||
|
||||
void Section::onPageComplete(const Page* page) {
|
||||
Serial.printf("Page %d complete\n", pageCount);
|
||||
Serial.printf("Page %d complete - free mem: %lu\n", pageCount, ESP.getFreeHeap());
|
||||
|
||||
const auto filePath = cachePath + "/page_" + std::to_string(pageCount) + ".bin";
|
||||
|
||||
@@ -75,11 +75,11 @@ bool Section::persistPageDataToSD() {
|
||||
}
|
||||
|
||||
const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
|
||||
auto visitor =
|
||||
EpubHtmlParser(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
|
||||
|
||||
// TODO: Come back and see if mem used here can be lowered?
|
||||
auto visitor =
|
||||
EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
|
||||
const bool success = visitor.parseAndBuildPages();
|
||||
|
||||
SD.remove(tmpHtmlPath.c_str());
|
||||
if (!success) {
|
||||
Serial.println("Failed to parse and build pages");
|
||||
|
||||
@@ -3,44 +3,11 @@
|
||||
#include <EpdRenderer.h>
|
||||
#include <Serialization.h>
|
||||
|
||||
// True for the characters that separate words: space, tab, carriage return and line feed.
// Tab is included so tab-separated or tab-indented text is not glued into a single word.
static bool isWhitespace(const char c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n'; }
|
||||
void TextBlock::addWord(const std::string& word, const bool is_bold, const bool is_italic) {
|
||||
if (word.length() == 0) return;
|
||||
|
||||
// move past anything that should be considered part of a work
|
||||
static int skipWord(const std::string& text, int index, const int length) {
|
||||
while (index < length && !isWhitespace(text[index])) {
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
// skip past any white space characters
|
||||
static int skipWhitespace(const std::string& html, int index, const int length) {
|
||||
while (index < length && isWhitespace(html[index])) {
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
void TextBlock::addSpan(const std::string& span, const bool is_bold, const bool is_italic) {
|
||||
// adding a span to text block
|
||||
// make a copy of the text as we'll modify it
|
||||
const int length = span.length();
|
||||
// const auto text = new char[length + 1];
|
||||
// strcpy(text, span);
|
||||
// work out where each word is in the span
|
||||
int index = 0;
|
||||
while (index < length) {
|
||||
// skip past any whitespace to the start of a word
|
||||
index = skipWhitespace(span, index, length);
|
||||
const int wordStart = index;
|
||||
// find the end of the word
|
||||
index = skipWord(span, index, length);
|
||||
const int wordLength = index - wordStart;
|
||||
if (wordLength > 0) {
|
||||
words.push_back(span.substr(wordStart, wordLength));
|
||||
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
|
||||
}
|
||||
}
|
||||
words.push_back(word);
|
||||
wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
|
||||
}
|
||||
|
||||
std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer& renderer) {
|
||||
@@ -189,17 +156,12 @@ std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer& renderer) {
|
||||
|
||||
void TextBlock::render(const EpdRenderer& renderer, const int x, const int y) const {
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
// get the style
|
||||
const uint8_t wordStyle = wordStyles[i];
|
||||
// render the word
|
||||
EpdFontStyle fontStyle = REGULAR;
|
||||
if (wordStyles[i] & BOLD_SPAN) {
|
||||
if (wordStyles[i] & ITALIC_SPAN) {
|
||||
fontStyle = BOLD_ITALIC;
|
||||
} else {
|
||||
fontStyle = BOLD;
|
||||
}
|
||||
|
||||
if (wordStyles[i] & BOLD_SPAN && wordStyles[i] & ITALIC_SPAN) {
|
||||
fontStyle = BOLD_ITALIC;
|
||||
} else if (wordStyles[i] & BOLD_SPAN) {
|
||||
fontStyle = BOLD;
|
||||
} else if (wordStyles[i] & ITALIC_SPAN) {
|
||||
fontStyle = ITALIC;
|
||||
}
|
||||
|
||||
@@ -30,15 +30,15 @@ class TextBlock final : public Block {
|
||||
BLOCK_STYLE style;
|
||||
|
||||
public:
|
||||
void addSpan(const std::string& span, bool is_bold, bool is_italic);
|
||||
explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
|
||||
explicit TextBlock(const std::vector<std::string>& words, const std::vector<uint16_t>& word_xpos,
|
||||
// the styles of each word
|
||||
const std::vector<uint8_t>& word_styles, const BLOCK_STYLE style)
|
||||
: words(words), wordXpos(word_xpos), wordStyles(word_styles), style(style) {}
|
||||
~TextBlock() override = default;
|
||||
void set_style(const BLOCK_STYLE style) { this->style = style; }
|
||||
BLOCK_STYLE get_style() const { return style; }
|
||||
void addWord(const std::string& word, bool is_bold, bool is_italic);
|
||||
void setStyle(const BLOCK_STYLE style) { this->style = style; }
|
||||
BLOCK_STYLE getStyle() const { return style; }
|
||||
bool isEmpty() override { return words.empty(); }
|
||||
void layout(EpdRenderer& renderer) override {};
|
||||
// given a renderer works out where to break the words into lines
|
||||
|
||||
Reference in New Issue
Block a user