feat: dict based Hyphenation (#305)

## Summary

* Adds optional dictionary-based hyphenation for English, French, German,
and Russian

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to flash
usage (German alone takes 200 KB)
* Trie-encoded dictionaries are adopted from the hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
Arthur Tazhitdinov
2026-01-19 17:56:26 +05:00
committed by GitHub
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,232 @@
"""
Generate hyphenation test data from a plain-text or EPUB file.
This script extracts unique words from a book and generates ground-truth
hyphenations using the pyphen library, which can be used to test and validate
the hyphenation implementations (e.g., German, English, Russian).
Usage:
python generate_hyphenation_test_data.py <input_file> <output_file>
[--language de_DE] [--min-length 6] [--max-words 5000] [--min-prefix 2] [--min-suffix 2]
Requirements:
pip install pyphen
"""
import argparse
import re
from collections import Counter
import pyphen
from pathlib import Path
import zipfile
def extract_text_from_epub(epub_path):
    """Extract the readable text of an .epub archive.

    Every ``.xhtml``/``.html``/``.htm`` member of the ZIP container is decoded
    as UTF-8 (undecodable bytes are dropped), reduced to plain text, and the
    per-member texts are joined with newlines in archive order.

    Reducing to plain text means:
      * ``<script>``/``<style>`` elements are removed together with their
        bodies, so CSS/JS identifiers do not end up in the word list;
      * all remaining tags are replaced by a single space;
      * HTML character references (``&uuml;``, ``&#228;``, ``&amp;`` ...) are
        decoded, so ``f&uuml;r`` becomes ``für`` instead of being split into
        the pseudo-words ``f``, ``uuml`` and ``r`` further down the pipeline.

    Args:
        epub_path: Path of the .epub archive (anything ``zipfile.ZipFile``
            accepts, including a binary file-like object).

    Returns:
        The concatenated text of all HTML members; an empty string when the
        archive contains none.

    Raises:
        zipfile.BadZipFile: If ``epub_path`` is not a valid ZIP archive.
    """
    # Function-scope import: ``html`` is not part of the module's import header.
    import html

    html_suffixes = (".xhtml", ".html", ".htm")
    texts = []
    with zipfile.ZipFile(epub_path, "r") as archive:
        for name in archive.namelist():
            if not name.lower().endswith(html_suffixes):
                continue
            try:
                data = archive.read(name).decode("utf-8", errors="ignore")
            except Exception:
                # Deliberate best effort: one unreadable member (corrupt,
                # encrypted, unsupported compression) must not abort the run.
                continue
            # Drop script/style elements including their content. The
            # look-behind skips self-closing tags (``<script ... />``) so they
            # cannot swallow text up to a later, unrelated closing tag.
            data = re.sub(
                r"(?is)<(script|style)\b[^>]*(?<!/)>.*?</\1\s*>", " ", data
            )
            # Remove tags
            text = re.sub(r"<[^>]+>", " ", data)
            # Decode entities only after tag removal so an escaped "&lt;" is
            # never mistaken for the start of a tag.
            texts.append(html.unescape(text))
    return "\n".join(texts)
def extract_words(text):
    """Extract all words from text, preserving original case.

    A word is a maximal run of Unicode letters of any script; digits,
    underscores, punctuation and whitespace act as separators.

    Before matching, the text is cleaned up in two ways so that invisible
    or decomposed characters do not split a word into fragments:
      * it is normalized to NFC, so a letter written as base character plus
        combining mark (e.g. ``e`` + U+0301) becomes one precomposed letter;
      * soft hyphens (U+00AD) are removed, since they are optional break
        hints inside a word rather than separators.

    Args:
        text: Input string.

    Returns:
        List of words in order of appearance, duplicates included. Words are
        returned in NFC form.
    """
    # Function-scope import: ``unicodedata`` is not in the module's import header.
    import unicodedata

    normalized = unicodedata.normalize("NFC", text).replace("\u00ad", "")
    # Match runs of Unicode letters (any script) while excluding digits/underscores
    return re.findall(r"[^\W\d_]+", normalized, flags=re.UNICODE)
def clean_word(word):
    """Return *word* with leading and trailing whitespace removed.

    Letter case is left untouched. Only surrounding whitespace is trimmed;
    punctuation, digits and any interior characters are kept as they are.
    """
    trimmed = word.strip()
    return trimmed
def generate_hyphenation_data(
    input_file,
    output_file,
    language="de_DE",
    min_length=6,
    max_words=5000,
    min_prefix=2,
    min_suffix=2,
):
    """
    Generate hyphenation test data from a text or .epub file.

    The input is split into words, the words are ranked by frequency (ties
    broken alphabetically, case-insensitively), and the top ``max_words``
    words of at least ``min_length`` characters are written to ``output_file``
    as ``word|hyphenated_form|count`` lines below a ``#``-prefixed header.
    Words for which pyphen finds no break point are written too; their
    hyphenated form simply contains no ``=``.

    Progress messages and summary statistics are printed to stdout.

    Args:
        input_file: Path to the input file. A name ending in ``.epub``
            (case-insensitive) is read as an EPUB archive; anything else is
            read as UTF-8 text.
        output_file: Path to output file with hyphenation data
        language: Language code for pyphen (e.g., 'de_DE', 'en_US')
        min_length: Minimum word length to include
        max_words: Maximum number of words to include (default: 5000).
            A falsy value (``0``/``None``) disables the cap.
        min_prefix: Minimum characters allowed before the first hyphen (default: 2)
        min_suffix: Minimum characters allowed after the last hyphen (default: 2)

    Returns:
        None. If pyphen raises ``KeyError`` for ``language``, an error is
        printed and the function returns without writing ``output_file``.
    """
    print(f"Reading from: {input_file}")
    # Read the input file
    if str(input_file).lower().endswith(".epub"):
        print("Detected .epub input; extracting HTML content")
        text = extract_text_from_epub(input_file)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

    # Extract words
    print("Extracting words...")
    words = extract_words(text)
    print(f"Found {len(words)} total words")

    # Count word frequencies
    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words")

    # Initialize pyphen hyphenator
    print(
        f"Initializing hyphenator for language: {language} "
        f"(min_prefix={min_prefix}, min_suffix={min_suffix})"
    )
    try:
        hyphenator = pyphen.Pyphen(lang=language, left=min_prefix, right=min_suffix)
    except KeyError:
        print(f"Error: Language '{language}' not found in pyphen.")
        print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
        return

    # Generate hyphenations
    print("Generating hyphenations...")
    hyphenation_data = []
    # Sort by frequency (most common first) then alphabetically
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0].lower()))
    for word, count in sorted_words:
        # Filter by minimum length
        if len(word) < min_length:
            continue
        # Get hyphenation (may produce no '=' characters)
        hyphenated = hyphenator.inserted(word, hyphen="=")
        # Words without break points are kept on purpose so the output really
        # is the top-N most common words.
        hyphenation_data.append(
            {"word": word, "hyphenated": hyphenated, "count": count}
        )
        # Stop if we've reached max_words
        if max_words and len(hyphenation_data) >= max_words:
            break
    print(f"Generated {len(hyphenation_data)} hyphenated words")

    # Write output file
    print(f"Writing to: {output_file}")
    header_lines = [
        "# Hyphenation Test Data\n",
        f"# Source: {Path(input_file).name}\n",
        f"# Language: {language}\n",
        f"# Min prefix: {min_prefix}\n",
        f"# Min suffix: {min_suffix}\n",
        f"# Total words: {len(hyphenation_data)}\n",
        "# Format: word | hyphenated_form | frequency_in_source\n",
        "#\n",
        "# Hyphenation points are marked with '='\n",
        "# Example: Silbentrennung -> Sil=ben=tren=nung\n",
        "#\n\n",
    ]
    with open(output_file, "w", encoding="utf-8") as f:
        # Write header with metadata, then one data line per word
        f.writelines(header_lines)
        f.writelines(
            f"{item['word']}|{item['hyphenated']}|{item['count']}\n"
            for item in hyphenation_data
        )
    print("Done!")

    # Print some statistics
    total_points = sum(item["hyphenated"].count("=") for item in hyphenation_data)
    # Count only words that actually received a break point; the written list
    # also contains words without any.
    words_with_points = sum(
        1 for item in hyphenation_data if "=" in item["hyphenated"]
    )
    # Guard the average: the list is empty when no word passes the filters.
    average_points = total_points / len(hyphenation_data) if hyphenation_data else 0.0
    print("\n=== Statistics ===")
    print(f"Total unique words extracted: {len(word_counts)}")
    print(f"Words with hyphenation points: {words_with_points}")
    print(f"Average hyphenation points per word: {average_points:.2f}")

    # Print some examples
    print("\n=== Examples (first 10) ===")
    for item in hyphenation_data[:10]:
        print(
            f" {item['word']:20} -> {item['hyphenated']:30} (appears {item['count']}x)"
        )
def main():
    """Command-line entry point.

    Builds the argument parser (two positional paths, ``--language`` and four
    integer options), parses the process arguments and forwards them to
    ``generate_hyphenation_data``.
    """
    usage_examples = """
Examples:
# Generate test data from a German book
python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt
# Limit to 500 most common words
python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt --max-words 500
# Use English hyphenation (when available)
python generate_hyphenation_test_data.py book.txt test_en.txt --language en_US
"""
    cli = argparse.ArgumentParser(
        description="Generate hyphenation test data from a text file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    cli.add_argument("input_file", help="Input text file to extract words from")
    cli.add_argument("output_file", help="Output file for hyphenation test data")
    cli.add_argument(
        "--language", default="de_DE", help="Language code (default: de_DE)"
    )

    # Integer-valued options, registered in the order they appear in --help.
    int_options = (
        ("--min-length", 6, "Minimum word length (default: 6)"),
        ("--max-words", 5000, "Maximum number of words to include (default: 5000)"),
        (
            "--min-prefix",
            2,
            "Minimum characters permitted before the first hyphen (default: 2)",
        ),
        (
            "--min-suffix",
            2,
            "Minimum characters permitted after the last hyphen (default: 2)",
        ),
    )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    # Every parsed attribute name matches a parameter of
    # generate_hyphenation_data, so the namespace can be forwarded directly.
    options = vars(cli.parse_args())
    generate_hyphenation_data(
        options.pop("input_file"),
        options.pop("output_file"),
        **options,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff