Skip BOM character (sometimes used in front of em-dashes) (#340)

## Summary

Skip the BOM character (U+FEFF, sometimes used in front of em-dashes) - it is not
part of the glyph set and would otherwise render as `?`.

---

### AI Usage

Did you use AI tools to help write this code? _**YES**_
This commit is contained in:
Jonas Diemer
2026-01-14 12:38:30 +01:00
committed by GitHub
parent 49f97b69ca
commit 1c027ce2cd

View File

@@ -151,6 +151,20 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
} }
} }
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);
if (s[i] == FEFF_BYTE_1) {
// Check if the next two bytes complete the 3-byte sequence
if ((i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
// Sequence 0xEF 0xBB 0xBF found!
i += 2; // Skip the next two bytes
continue; // Move to the next iteration
}
}
// If we're about to run out of space, then cut the word off and start a new one // If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) { if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0'; self->partWordBuffer[self->partWordBufferIndex] = '\0';