Add expat and swap out EPUB HTML parser (#2)

* Add expat and swap out EPUB HTML parser * Increase EpubHtmlParserSlim file buffer to 1024 bytes * Clean up TextBlock functions * Do not break words when leaving spans
2025-12-06 20:57:24 +11:00
parent ad8cee12ab
commit dd6e649d74
32 changed files with 15969 additions and 269 deletions
--- a/lib/Epub/Epub/EpubHtmlParser.cpp
+++ b/lib/Epub/Epub/EpubHtmlParser.cpp
@@ -1,181 +0,0 @@
-#include "EpubHtmlParser.h"
-
-#include <EpdRenderer.h>
-#include <HardwareSerial.h>
-
-#include "Page.h"
-#include "htmlEntities.h"
-
-const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
-constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
-
-const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
-constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
-
-const char* BOLD_TAGS[] = {"b"};
-constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
-
-const char* ITALIC_TAGS[] = {"i"};
-constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
-
-const char* IMAGE_TAGS[] = {"img"};
-constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
-
-const char* SKIP_TAGS[] = {"head", "table"};
-constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
-
-// given the start and end of a tag, check to see if it matches a known tag
-bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
-  for (int i = 0; i < possible_tag_count; i++) {
-    if (strcmp(tag_name, possible_tags[i]) == 0) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// start a new text block if needed
-void EpubHtmlParser::startNewTextBlock(const BLOCK_STYLE style) {
-  if (currentTextBlock) {
-    // already have a text block running and it is empty - just reuse it
-    if (currentTextBlock->isEmpty()) {
-      currentTextBlock->set_style(style);
-      return;
-    }
-
-    currentTextBlock->finish();
-    makePages();
-    delete currentTextBlock;
-  }
-  currentTextBlock = new TextBlock(style);
-}
-
-bool EpubHtmlParser::VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) {
-  const char* tag_name = element.Name();
-  if (matches(tag_name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
-    const char* src = element.Attribute("src");
-    if (src) {
-      // don't leave an empty text block in the list
-      // const BLOCK_STYLE style = currentTextBlock->get_style();
-      if (currentTextBlock->isEmpty()) {
-        delete currentTextBlock;
-        currentTextBlock = nullptr;
-      }
-      // TODO: Fix this
-      // blocks.push_back(new ImageBlock(m_base_path + src));
-      // start a new text block - with the same style as before
-      // startNewTextBlock(style);
-    } else {
-      // ESP_LOGE(TAG, "Could not find src attribute");
-    }
-    return false;
-  }
-
-  if (matches(tag_name, SKIP_TAGS, NUM_SKIP_TAGS)) {
-    return false;
-  }
-
-  // Serial.printf("Text: %s\n", element.GetText());
-
-  if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
-    insideBoldTag = true;
-    startNewTextBlock(CENTER_ALIGN);
-  } else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
-    if (strcmp(tag_name, "br") == 0) {
-      startNewTextBlock(currentTextBlock->get_style());
-    } else {
-      startNewTextBlock(JUSTIFIED);
-    }
-  } else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
-    insideBoldTag = true;
-  } else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
-    insideItalicTag = true;
-  }
-  return true;
-}
-/// Visit a text node.
-bool EpubHtmlParser::Visit(const tinyxml2::XMLText& text) {
-  const char* content = text.Value();
-  currentTextBlock->addSpan(replaceHtmlEntities(content), insideBoldTag, insideItalicTag);
-  return true;
-}
-
-bool EpubHtmlParser::VisitExit(const tinyxml2::XMLElement& element) {
-  const char* tag_name = element.Name();
-  if (matches(tag_name, HEADER_TAGS, NUM_HEADER_TAGS)) {
-    insideBoldTag = false;
-  } else if (matches(tag_name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
-    // nothing to do
-  } else if (matches(tag_name, BOLD_TAGS, NUM_BOLD_TAGS)) {
-    insideBoldTag = false;
-  } else if (matches(tag_name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
-    insideItalicTag = false;
-  }
-  return true;
-}
-
-bool EpubHtmlParser::parseAndBuildPages() {
-  startNewTextBlock(JUSTIFIED);
-  tinyxml2::XMLDocument doc(false, tinyxml2::COLLAPSE_WHITESPACE);
-
-  const tinyxml2::XMLError result = doc.LoadFile(filepath);
-  if (result != tinyxml2::XML_SUCCESS) {
-    Serial.printf("Failed to load file, Error: %s\n", tinyxml2::XMLDocument::ErrorIDToName(result));
-    return false;
-  }
-
-  doc.Accept(this);
-  if (currentTextBlock) {
-    makePages();
-    completePageFn(currentPage);
-    currentPage = nullptr;
-    delete currentTextBlock;
-    currentTextBlock = nullptr;
-  }
-
-  return true;
-}
-
-void EpubHtmlParser::makePages() {
-  if (!currentTextBlock) {
-    Serial.println("!! No text block to make pages for !!");
-    return;
-  }
-
-  if (!currentPage) {
-    currentPage = new Page();
-  }
-
-  const int lineHeight = renderer.getLineHeight();
-  const int pageHeight = renderer.getPageHeight();
-
-  // Long running task, make sure to let other things happen
-  vTaskDelay(1);
-
-  if (currentTextBlock->getType() == TEXT_BLOCK) {
-    const auto lines = currentTextBlock->splitIntoLines(renderer);
-
-    for (const auto line : lines) {
-      if (currentPage->nextY + lineHeight > pageHeight) {
-        completePageFn(currentPage);
-        currentPage = new Page();
-      }
-
-      currentPage->elements.push_back(new PageLine(line, currentPage->nextY));
-      currentPage->nextY += lineHeight;
-    }
-    // TODO: Fix spacing between paras
-    // add some extra line between blocks
-    currentPage->nextY += lineHeight / 2;
-  }
-  // TODO: Image block support
-  // if (block->getType() == BlockType::IMAGE_BLOCK) {
-  //   ImageBlock *imageBlock = (ImageBlock *)block;
-  //   if (y + imageBlock->height > page_height) {
-  //     pages.push_back(new Page());
-  //     y = 0;
-  //   }
-  //   pages.back()->elements.push_back(new PageImage(imageBlock, y));
-  //   y += imageBlock->height;
-  // }
-}
--- a/lib/Epub/Epub/EpubHtmlParser.h
+++ b/lib/Epub/Epub/EpubHtmlParser.h
@@ -1,34 +0,0 @@
-#pragma once
-#include <tinyxml2.h>
-
-#include <functional>
-
-#include "blocks/TextBlock.h"
-
-class Page;
-class EpdRenderer;
-
-class EpubHtmlParser final : public tinyxml2::XMLVisitor {
-  const char* filepath;
-  EpdRenderer& renderer;
-  std::function<void(Page*)> completePageFn;
-
-  bool insideBoldTag = false;
-  bool insideItalicTag = false;
-  TextBlock* currentTextBlock = nullptr;
-  Page* currentPage = nullptr;
-
-  void startNewTextBlock(BLOCK_STYLE style);
-  void makePages();
-
-  // xml parser callbacks
-  bool VisitEnter(const tinyxml2::XMLElement& element, const tinyxml2::XMLAttribute* firstAttribute) override;
-  bool Visit(const tinyxml2::XMLText& text) override;
-  bool VisitExit(const tinyxml2::XMLElement& element) override;
-  // xml parser callbacks
- public:
-  explicit EpubHtmlParser(const char* filepath, EpdRenderer& renderer, const std::function<void(Page*)>& completePageFn)
-      : filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
-  ~EpubHtmlParser() override = default;
-  bool parseAndBuildPages();
-};
--- a/lib/Epub/Epub/EpubHtmlParserSlim.cpp
+++ b/lib/Epub/Epub/EpubHtmlParserSlim.cpp
@@ -0,0 +1,291 @@
+#include "EpubHtmlParserSlim.h"
+
+#include <EpdRenderer.h>
+#include <HardwareSerial.h>
+
+#include "Page.h"
+#include "htmlEntities.h"
+
+const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
+constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
+
+const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
+constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
+
+const char* BOLD_TAGS[] = {"b"};
+constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
+
+const char* ITALIC_TAGS[] = {"i"};
+constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
+
+const char* IMAGE_TAGS[] = {"img"};
+constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
+
+const char* SKIP_TAGS[] = {"head", "table"};
+constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
+
+// Returns true for the characters that separate words in character data: space, tab, carriage
+// return and line feed. Tab is included so that tab-indented markup does not end up with tab
+// characters glued onto the neighbouring words.
+bool isWhitespace(const char c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n'; }
+
+// Reports whether tag_name is an exact, case-sensitive match for any of the first
+// possible_tag_count entries of possible_tags.
+bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
+  int idx = 0;
+  // Walk forward until a candidate compares equal or the list is exhausted
+  while (idx < possible_tag_count && strcmp(tag_name, possible_tags[idx]) != 0) {
+    ++idx;
+  }
+  return idx < possible_tag_count;
+}
+
+// Makes sure text is collected into a block with the requested style. An empty current block is
+// restyled and reused; a non-empty one is finished and paginated before being replaced.
+void EpubHtmlParserSlim::startNewTextBlock(const BLOCK_STYLE style) {
+  TextBlock* const previous = currentTextBlock;
+
+  if (previous && previous->isEmpty()) {
+    // Nothing has been added yet, so only the style needs to change
+    previous->setStyle(style);
+    return;
+  }
+
+  if (previous) {
+    // Close the block off and turn it into page lines before discarding it
+    previous->finish();
+    makePages();
+    delete previous;
+  }
+
+  currentTextBlock = new TextBlock(style);
+}
+
+#include <expat.h>
+
+// expat start-tag callback. Tracks the element nesting depth, begins a skip region for image and
+// SKIP_TAGS elements, opens a new text block for header/block tags and records the depth at which
+// bold or italic styling was switched on.
+//
+// userData is the EpubHtmlParserSlim registered through XML_SetUserData, name is the element name.
+// Attributes are currently unused.
+void XMLCALL EpubHtmlParserSlim::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
+  auto* self = static_cast<EpubHtmlParserSlim*>(userData);
+  (void)atts;
+
+  // Middle of skip
+  if (self->skipUntilDepth < self->depth) {
+    self->depth += 1;
+    return;
+  }
+
+  if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
+    // const char* src = element.Attribute("src");
+    // if (src) {
+    //   // don't leave an empty text block in the list
+    //   // const BLOCK_STYLE style = currentTextBlock->get_style();
+    //   if (currentTextBlock->isEmpty()) {
+    //     delete currentTextBlock;
+    //     currentTextBlock = nullptr;
+    //   }
+    //   // TODO: Fix this
+    //   // blocks.push_back(new ImageBlock(m_base_path + src));
+    //   // start a new text block - with the same style as before
+    //   // startNewTextBlock(style);
+    // } else {
+    //   // ESP_LOGE(TAG, "Could not find src attribute");
+    // }
+
+    // start skip
+    self->skipUntilDepth = self->depth;
+    self->depth += 1;
+    return;
+  }
+
+  if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
+    // start skip
+    self->skipUntilDepth = self->depth;
+    self->depth += 1;
+    return;
+  }
+
+  const bool isHeaderTag = matches(name, HEADER_TAGS, NUM_HEADER_TAGS);
+  const bool isBlockTag = !isHeaderTag && matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS);
+
+  // A header/block tag ends the text that came before it. Hand any half-collected word to the
+  // block it belongs to *before* that block is finished; otherwise the word would land in the
+  // block opened below, or be glued onto that block's first word.
+  if ((isHeaderTag || isBlockTag) && self->partWordBufferIndex > 0) {
+    self->partWordBuffer[self->partWordBufferIndex] = '\0';
+    self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), self->boldUntilDepth < self->depth,
+                                    self->italicUntilDepth < self->depth);
+    self->partWordBufferIndex = 0;
+  }
+
+  if (isHeaderTag) {
+    // Headers are centred and rendered bold for everything nested inside them
+    self->startNewTextBlock(CENTER_ALIGN);
+    self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
+  } else if (isBlockTag) {
+    if (strcmp(name, "br") == 0) {
+      // A line break keeps the style of the block it interrupts
+      self->startNewTextBlock(self->currentTextBlock->getStyle());
+    } else {
+      self->startNewTextBlock(JUSTIFIED);
+    }
+  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
+    // min() keeps the outermost marker when styling tags are nested
+    self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
+  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
+    self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
+  }
+
+  self->depth += 1;
+}
+
+// expat character-data callback. Splits the incoming text on whitespace and feeds complete words
+// to the current text block; a trailing partial word stays in partWordBuffer until a later
+// callback completes it.
+void XMLCALL EpubHtmlParserSlim::characterData(void* userData, const XML_Char* s, const int len) {
+  auto* self = static_cast<EpubHtmlParserSlim*>(userData);
+
+  // Middle of skip
+  if (self->skipUntilDepth < self->depth) {
+    return;
+  }
+
+  // Terminates the buffered characters, hands them to the text block and empties the buffer
+  const auto emitBufferedWord = [self]() {
+    self->partWordBuffer[self->partWordBufferIndex] = '\0';
+    const bool bold = self->boldUntilDepth < self->depth;
+    const bool italic = self->italicUntilDepth < self->depth;
+    self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), bold, italic);
+    self->partWordBufferIndex = 0;
+  };
+
+  for (const XML_Char *cur = s, *end = s + len; cur < end; ++cur) {
+    const XML_Char ch = *cur;
+
+    if (!isWhitespace(ch)) {
+      // If we're about to run out of space, then cut the word off and start a new one
+      if (self->partWordBufferIndex >= PART_WORD_BUFFER_SIZE - 2) {
+        emitBufferedWord();
+      }
+      self->partWordBuffer[self->partWordBufferIndex++] = ch;
+      continue;
+    }
+
+    // Whitespace ends whatever word was being collected; the whitespace itself is dropped
+    if (self->partWordBufferIndex > 0) {
+      emitBufferedWord();
+    }
+  }
+}
+
+// expat end-tag callback. Optionally emits the pending partial word, then steps one level back
+// out of the document and clears any skip/bold/italic marker recorded at the level returned to.
+void XMLCALL EpubHtmlParserSlim::endElement(void* userData, const XML_Char* name) {
+  auto* self = static_cast<EpubHtmlParserSlim*>(userData);
+
+  if (self->partWordBufferIndex > 0) {
+    // Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
+    // We don't want to flush out content when closing inline tags like <span>.
+    // Currently this also flushes out on closing <b> and <i> tags, but they are inline tags so that
+    // shouldn't happen, text styling needs to be overhauled to fix it.
+    const bool closesBreakingTag =
+        matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
+        matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS);
+
+    if (closesBreakingTag || self->depth == 1) {
+      self->partWordBuffer[self->partWordBufferIndex] = '\0';
+      const bool bold = self->boldUntilDepth < self->depth;
+      const bool italic = self->italicUntilDepth < self->depth;
+      self->currentTextBlock->addWord(replaceHtmlEntities(self->partWordBuffer), bold, italic);
+      self->partWordBufferIndex = 0;
+    }
+  }
+
+  const int level = --self->depth;
+
+  // A marker equal to the level we just returned to belongs to the element being closed
+  const auto clearIfLeaving = [level](int& marker) {
+    if (marker == level) {
+      marker = INT_MAX;
+    }
+  };
+
+  clearIfLeaving(self->skipUntilDepth);    // Leaving skip
+  clearIfLeaving(self->boldUntilDepth);    // Leaving bold
+  clearIfLeaving(self->italicUntilDepth);  // Leaving italic
+}
+
+// Parses the HTML file at `filepath` with expat, streaming it in fixed-size chunks, and hands
+// every completed page to completePageFn.
+//
+// Returns false when the parser or its read buffer cannot be allocated, when the file cannot be
+// opened or read, or when expat reports a parse error; returns true otherwise.
+bool EpubHtmlParserSlim::parseAndBuildPages() {
+  constexpr int kChunkSize = 1024;
+
+  startNewTextBlock(JUSTIFIED);
+
+  const XML_Parser parser = XML_ParserCreate(nullptr);
+  if (!parser) {
+    Serial.println("Couldn't allocate memory for parser");
+    return false;
+  }
+
+  FILE* file = fopen(filepath, "r");
+  if (!file) {
+    // Passing a null FILE* to fread/ferror/fclose is undefined behaviour, so stop here
+    Serial.printf("Couldn't open file %s\n", filepath);
+    XML_ParserFree(parser);
+    return false;
+  }
+
+  // Releases the parser and the file; called on every exit path below
+  const auto cleanup = [parser, file]() {
+    XML_ParserFree(parser);
+    fclose(file);
+  };
+
+  XML_SetUserData(parser, this);
+  XML_SetElementHandler(parser, startElement, endElement);
+  XML_SetCharacterDataHandler(parser, characterData);
+
+  int done;
+  do {
+    // Read straight into expat's own buffer to avoid an extra copy
+    void* const buf = XML_GetBuffer(parser, kChunkSize);
+    if (!buf) {
+      Serial.println("Couldn't allocate memory for buffer");
+      cleanup();
+      return false;
+    }
+
+    const size_t len = fread(buf, 1, kChunkSize, file);
+
+    if (ferror(file)) {
+      Serial.println("Read error");
+      cleanup();
+      return false;
+    }
+
+    done = feof(file);
+
+    if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
+      Serial.printf("Parse error at line %lu:\n%s\n", XML_GetCurrentLineNumber(parser),
+                    XML_ErrorString(XML_GetErrorCode(parser)));
+      cleanup();
+      return false;
+    }
+  } while (!done);
+
+  cleanup();
+
+  // Process last page if there is still text
+  if (currentTextBlock) {
+    makePages();
+    completePageFn(currentPage);
+    currentPage = nullptr;
+    delete currentTextBlock;
+    currentTextBlock = nullptr;
+  }
+
+  return true;
+}
+
+// Splits the current text block into lines and appends them to the current page, calling
+// completePageFn and starting a fresh page whenever the next line would not fit.
+void EpubHtmlParserSlim::makePages() {
+  if (currentTextBlock == nullptr) {
+    Serial.println("!! No text block to make pages for !!");
+    return;
+  }
+
+  if (currentPage == nullptr) {
+    currentPage = new Page();
+  }
+
+  const int lineHeight = renderer.getLineHeight();
+  const int pageHeight = renderer.getPageHeight();
+
+  // Long running task, make sure to let other things happen
+  vTaskDelay(1);
+
+  // TODO: Image block support
+  // if (block->getType() == BlockType::IMAGE_BLOCK) {
+  //   ImageBlock *imageBlock = (ImageBlock *)block;
+  //   if (y + imageBlock->height > page_height) {
+  //     pages.push_back(new Page());
+  //     y = 0;
+  //   }
+  //   pages.back()->elements.push_back(new PageImage(imageBlock, y));
+  //   y += imageBlock->height;
+  // }
+  if (currentTextBlock->getType() != TEXT_BLOCK) {
+    return;
+  }
+
+  const auto lines = currentTextBlock->splitIntoLines(renderer);
+
+  for (auto it = lines.begin(); it != lines.end(); ++it) {
+    const bool pageIsFull = currentPage->nextY + lineHeight > pageHeight;
+    if (pageIsFull) {
+      completePageFn(currentPage);
+      currentPage = new Page();
+    }
+
+    currentPage->elements.push_back(new PageLine(*it, currentPage->nextY));
+    currentPage->nextY += lineHeight;
+  }
+
+  // add some extra line between blocks
+  currentPage->nextY += lineHeight / 2;
+}
--- a/lib/Epub/Epub/EpubHtmlParserSlim.h
+++ b/lib/Epub/Epub/EpubHtmlParserSlim.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <expat.h>
+#include <limits.h>
+
+#include <functional>
+
+#include "blocks/TextBlock.h"
+
+class Page;
+class EpdRenderer;
+
+#define PART_WORD_BUFFER_SIZE 200
+
+// Streaming HTML parser built on expat. Reads the file at `filepath`, collects its text into
+// TextBlocks and reports every finished Page through `completePageFn`.
+class EpubHtmlParserSlim {
+  // Path passed to fopen() by parseAndBuildPages(). Only the pointer is stored, not a copy of the string.
+  const char* filepath;
+  // Supplies line height, page height and line splitting when pages are built
+  EpdRenderer& renderer;
+  // Called with each completed page
+  std::function<void(Page*)> completePageFn;
+  // Current element nesting level: incremented by startElement, decremented by endElement
+  int depth = 0;
+  // Depth at which a skip region / bold run / italic run was started; INT_MAX means "not active".
+  // The state applies while the marker is less than `depth`.
+  int skipUntilDepth = INT_MAX;
+  int boldUntilDepth = INT_MAX;
+  int italicUntilDepth = INT_MAX;
+  // Holds the word currently being collected. Words longer than this are cut into several words,
+  // but this is pretty large
+  char partWordBuffer[PART_WORD_BUFFER_SIZE] = {};
+  // Number of characters currently held in partWordBuffer
+  int partWordBufferIndex = 0;
+  // Block that words are currently being added to; allocated and deleted by this class
+  TextBlock* currentTextBlock = nullptr;
+  // Page currently being filled; passed to completePageFn when full or when parsing ends
+  Page* currentPage = nullptr;
+
+  // Reuses the current block if it is empty, otherwise paginates it and allocates a new one
+  void startNewTextBlock(BLOCK_STYLE style);
+  // Splits the current text block into lines and places them on pages
+  void makePages();
+  // XML callbacks
+  static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
+  static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
+  static void XMLCALL endElement(void* userData, const XML_Char* name);
+
+ public:
+  // filepath and renderer are held by pointer/reference; completePageFn is copied
+  explicit EpubHtmlParserSlim(const char* filepath, EpdRenderer& renderer,
+                              const std::function<void(Page*)>& completePageFn)
+      : filepath(filepath), renderer(renderer), completePageFn(completePageFn) {}
+  ~EpubHtmlParserSlim() = default;
+  // Parses the whole file and emits its pages; returns false on any allocation, read or parse failure
+  bool parseAndBuildPages();
+};
--- a/lib/Epub/Epub/Section.cpp
+++ b/lib/Epub/Epub/Section.cpp
@@ -5,11 +5,11 @@

 #include <fstream>

-#include "EpubHtmlParser.h"
+#include "EpubHtmlParserSlim.h"
 #include "Page.h"

 void Section::onPageComplete(const Page* page) {
-  Serial.printf("Page %d complete\n", pageCount);
+  Serial.printf("Page %d complete - free mem: %lu\n", pageCount, ESP.getFreeHeap());

  const auto filePath = cachePath + "/page_" + std::to_string(pageCount) + ".bin";

@@ -75,11 +75,11 @@ bool Section::persistPageDataToSD() {
  }

  const auto sdTmpHtmlPath = "/sd" + tmpHtmlPath;
-  auto visitor =
-      EpubHtmlParser(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });

-  // TODO: Come back and see if mem used here can be lowered?
+  auto visitor =
+      EpubHtmlParserSlim(sdTmpHtmlPath.c_str(), renderer, [this](const Page* page) { this->onPageComplete(page); });
  const bool success = visitor.parseAndBuildPages();
+
  SD.remove(tmpHtmlPath.c_str());
  if (!success) {
    Serial.println("Failed to parse and build pages");
--- a/lib/Epub/Epub/blocks/TextBlock.cpp
+++ b/lib/Epub/Epub/blocks/TextBlock.cpp
@@ -3,44 +3,11 @@
 #include <EpdRenderer.h>
 #include <Serialization.h>

-static bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
+void TextBlock::addWord(const std::string& word, const bool is_bold, const bool is_italic) {
+  if (word.length() == 0) return;

-// move past anything that should be considered part of a work
-static int skipWord(const std::string& text, int index, const int length) {
-  while (index < length && !isWhitespace(text[index])) {
-    index++;
-  }
-  return index;
-}
-
-// skip past any white space characters
-static int skipWhitespace(const std::string& html, int index, const int length) {
-  while (index < length && isWhitespace(html[index])) {
-    index++;
-  }
-  return index;
-}
-
-void TextBlock::addSpan(const std::string& span, const bool is_bold, const bool is_italic) {
-  // adding a span to text block
-  // make a copy of the text as we'll modify it
-  const int length = span.length();
-  // const auto text = new char[length + 1];
-  // strcpy(text, span);
-  // work out where each word is in the span
-  int index = 0;
-  while (index < length) {
-    // skip past any whitespace to the start of a word
-    index = skipWhitespace(span, index, length);
-    const int wordStart = index;
-    // find the end of the word
-    index = skipWord(span, index, length);
-    const int wordLength = index - wordStart;
-    if (wordLength > 0) {
-      words.push_back(span.substr(wordStart, wordLength));
-      wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
-    }
-  }
+  words.push_back(word);
+  wordStyles.push_back((is_bold ? BOLD_SPAN : 0) | (is_italic ? ITALIC_SPAN : 0));
 }

 std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer& renderer) {
@@ -189,17 +156,12 @@ std::list<TextBlock*> TextBlock::splitIntoLines(const EpdRenderer& renderer) {

 void TextBlock::render(const EpdRenderer& renderer, const int x, const int y) const {
  for (int i = 0; i < words.size(); i++) {
-    // get the style
-    const uint8_t wordStyle = wordStyles[i];
    // render the word
    EpdFontStyle fontStyle = REGULAR;
-    if (wordStyles[i] & BOLD_SPAN) {
-      if (wordStyles[i] & ITALIC_SPAN) {
-        fontStyle = BOLD_ITALIC;
-      } else {
-        fontStyle = BOLD;
-      }
-
+    if (wordStyles[i] & BOLD_SPAN && wordStyles[i] & ITALIC_SPAN) {
+      fontStyle = BOLD_ITALIC;
+    } else if (wordStyles[i] & BOLD_SPAN) {
+      fontStyle = BOLD;
    } else if (wordStyles[i] & ITALIC_SPAN) {
      fontStyle = ITALIC;
    }
--- a/lib/Epub/Epub/blocks/TextBlock.h
+++ b/lib/Epub/Epub/blocks/TextBlock.h
@@ -30,15 +30,15 @@ class TextBlock final : public Block {
  BLOCK_STYLE style;

 public:
-  void addSpan(const std::string& span, bool is_bold, bool is_italic);
  explicit TextBlock(const BLOCK_STYLE style) : style(style) {}
  explicit TextBlock(const std::vector<std::string>& words, const std::vector<uint16_t>& word_xpos,
                     // the styles of each word
                     const std::vector<uint8_t>& word_styles, const BLOCK_STYLE style)
      : words(words), wordXpos(word_xpos), wordStyles(word_styles), style(style) {}
  ~TextBlock() override = default;
-  void set_style(const BLOCK_STYLE style) { this->style = style; }
-  BLOCK_STYLE get_style() const { return style; }
+  void addWord(const std::string& word, bool is_bold, bool is_italic);
+  void setStyle(const BLOCK_STYLE style) { this->style = style; }
+  BLOCK_STYLE getStyle() const { return style; }
  bool isEmpty() override { return words.empty(); }
  void layout(EpdRenderer& renderer) override {};
  // given a renderer works out where to break the words into lines