feat: dict based Hyphenation (#305)
## Summary

* Adds optional dictionary-based hyphenation for English, French, German, and Russian.

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to flash usage (German alone takes 200 KB).
* The trie-encoded dictionaries are adopted from the hypher project (https://github.com/typst/hypher).
* Soft hyphens (and other explicit hyphens) take precedence over dictionary-based hyphenation.

Overall, the hyphenation rules are quite aggressive, as I believe that makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
committed by
GitHub
parent
5fef99c641
commit
8824c87490
82
scripts/generate_hyphenation_trie.py
Executable file
82
scripts/generate_hyphenation_trie.py
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Embed hypher-generated `.bin` tries into constexpr headers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import pathlib
|
||||
|
||||
|
||||
def _format_bytes(blob: bytes, per_line: int = 16) -> str:
|
||||
# Render the blob as a comma separated list of hex literals with consistent wrapping.
|
||||
lines = []
|
||||
for i in range(0, len(blob), per_line):
|
||||
chunk = ', '.join(f"0x{b:02X}" for b in blob[i : i + per_line])
|
||||
lines.append(f" {chunk},")
|
||||
if not lines:
|
||||
lines.append(" 0x00,")
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def _symbol_from_output(path: pathlib.Path) -> str:
|
||||
# Derive a stable C identifier from the destination header name (e.g., hyph-en.trie.h -> en).
|
||||
name = path.name
|
||||
if name.endswith('.trie.h'):
|
||||
name = name[:-7]
|
||||
if name.startswith('hyph-'):
|
||||
name = name[5:]
|
||||
name = name.replace('-', '_')
|
||||
if name.endswith('.trie'):
|
||||
name = name[:-5]
|
||||
return name
|
||||
|
||||
|
||||
def write_header(path: pathlib.Path, blob: bytes, symbol: str) -> None:
    """Write a constexpr header embedding *blob* and its pattern descriptor.

    The header declares ``<symbol>_trie_data`` (the raw bytes) and
    ``<symbol>_patterns`` (a ``SerializedHyphenationPatterns`` initialized
    with the array and its ``sizeof``).

    Args:
        path: Destination header file; missing parent directories are created.
        blob: Raw trie bytes to embed.
        symbol: Stem used to build the two generated identifiers.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    array_name = f"{symbol}_trie_data"
    descriptor_name = f"{symbol}_patterns"
    payload = _format_bytes(blob)

    # One entry per output line; the trailing empty entry yields the final newline.
    header_lines = [
        "#pragma once",
        "",
        "#include <cstddef>",
        "#include <cstdint>",
        "",
        '#include "../SerializedHyphenationTrie.h"',
        "",
        "// Auto-generated by generate_hyphenation_trie.py. Do not edit manually.",
        f"alignas(4) constexpr uint8_t {array_name}[] = {{",
        payload,
        "};",
        "",
        f"constexpr SerializedHyphenationPatterns {descriptor_name} = {{",
        f"{array_name},",
        f"sizeof({array_name}),",
        "};",
        "",
    ]
    path.write_text("\n".join(header_lines))
|
||||
|
||||
|
||||
def main() -> None:
    """Convert each ``--input`` trie blob into its paired ``--output`` header.

    Both options may be repeated; the n-th input is written to the n-th
    output.

    Raises:
        SystemExit: If the number of inputs differs from the number of
            outputs (argparse also exits on missing or invalid arguments).
    """
    cli = argparse.ArgumentParser()
    for flag, dest, help_text in (
        ('--input', 'inputs', 'Path to a hypher-generated .bin trie'),
        ('--output', 'outputs', 'Destination header path (hyph-*.trie.h)'),
    ):
        cli.add_argument(flag, dest=dest, action='append', required=True, help=help_text)
    options = cli.parse_args()

    sources, targets = options.inputs, options.outputs
    if len(sources) != len(targets):
        raise SystemExit('input/output counts must match')

    # Each pair is handled on its own, so several languages can be refreshed in one run.
    for source, target in zip(sources, targets):
        payload = pathlib.Path(source).read_bytes()
        header_path = pathlib.Path(target)
        write_header(header_path, payload, _symbol_from_output(header_path))
        print(f'wrote {target} ({len(payload)} bytes payload)')
|
||||
|
||||
|
||||
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user