crosspoint-reader/test/hyphenation_eval/resources/generate_hyphenation_test_data.py
"""
Generate hyphenation test data from a text file or .epub book.

This script extracts unique words from a book and generates ground-truth
hyphenations using the pyphen library, which can be used to test and validate
the hyphenation implementations (e.g., German, English, Russian).

Usage:
    python generate_hyphenation_test_data.py <input_file> <output_file>
        [--language de_DE] [--min-length 6] [--max-words 5000]
        [--min-prefix 2] [--min-suffix 2]

Requirements:
    pip install pyphen
"""
import argparse
import re
import zipfile
from collections import Counter
from pathlib import Path

import pyphen


def extract_text_from_epub(epub_path):
    """Extract textual content from an .epub archive by concatenating HTML/XHTML files."""
    texts = []
    with zipfile.ZipFile(epub_path, "r") as z:
        for name in z.namelist():
            lower = name.lower()
            if lower.endswith((".xhtml", ".html", ".htm")):
                try:
                    data = z.read(name).decode("utf-8", errors="ignore")
                except Exception:
                    continue
                # Remove markup tags; plain text is enough for word extraction
                text = re.sub(r"<[^>]+>", " ", data)
                texts.append(text)
    return "\n".join(texts)
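
# Note on extract_text_from_epub(): the tag-stripping regex is a simple
# heuristic. HTML entities such as &amp; are not decoded, so their names can
# end up in the extracted word list.
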
def extract_words(text):
    """Extract all words from text, preserving original case."""
    # Match runs of Unicode letters (any script) while excluding digits/underscores
    return re.findall(r"[^\W\d_]+", text, flags=re.UNICODE)
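
# Illustrative example of the regex above (not executed by the script):
#   extract_words("Hallo, schöne Welt! 123") -> ["Hallo", "schöne", "Welt"]
# Digits, punctuation, and underscores never appear inside extracted words.
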
def clean_word(word):
    """Normalize a word for hyphenation testing."""
    # extract_words() already yields letter-only tokens, so only surrounding
    # whitespace is stripped here; the original case is preserved.
    return word.strip()
def generate_hyphenation_data(
    input_file,
    output_file,
    language="de_DE",
    min_length=6,
    max_words=5000,
    min_prefix=2,
    min_suffix=2,
):
    """
    Generate hyphenation test data from a text or .epub file.

    Args:
        input_file: Path to the input text or .epub file
        output_file: Path to the output file with hyphenation data
        language: Language code for pyphen (e.g., 'de_DE', 'en_US')
        min_length: Minimum word length to include (default: 6)
        max_words: Maximum number of words to include (default: 5000)
        min_prefix: Minimum characters allowed before the first hyphen (default: 2)
        min_suffix: Minimum characters allowed after the last hyphen (default: 2)
    """
    print(f"Reading from: {input_file}")

    # Read the input file (.epub archives are unpacked, plain text is read directly)
    if str(input_file).lower().endswith(".epub"):
        print("Detected .epub input; extracting HTML content")
        text = extract_text_from_epub(input_file)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

    # Extract words
    print("Extracting words...")
    words = extract_words(text)
    print(f"Found {len(words)} total words")

    # Count word frequencies
    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words")

    # Initialize the pyphen hyphenator; 'left' and 'right' set the minimum
    # number of characters kept before the first and after the last hyphen.
    print(
        f"Initializing hyphenator for language: {language} "
        f"(min_prefix={min_prefix}, min_suffix={min_suffix})"
    )
    try:
        hyphenator = pyphen.Pyphen(lang=language, left=min_prefix, right=min_suffix)
    except KeyError:
        print(f"Error: Language '{language}' not found in pyphen.")
        print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
        return
    # Generate hyphenations
    print("Generating hyphenations...")
    hyphenation_data = []

    # Sort by frequency (most common first), then alphabetically
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0].lower()))

    for word, count in sorted_words:
        # Filter by minimum length
        if len(word) < min_length:
            continue

        # Get the hyphenation (the result may contain no '=' characters at all)
        hyphenated = hyphenator.inserted(word, hyphen="=")

        # Include every word, so the top N most common words are kept even if
        # they have no hyphenation points. (An earlier version dropped words
        # without '='.)
        hyphenation_data.append(
            {"word": word, "hyphenated": hyphenated, "count": count}
        )

        # Stop once max_words entries have been collected
        if max_words and len(hyphenation_data) >= max_words:
            break

    print(f"Generated {len(hyphenation_data)} test entries")
    # Write the output file
    print(f"Writing to: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        # Write a header with metadata
        f.write("# Hyphenation Test Data\n")
        f.write(f"# Source: {Path(input_file).name}\n")
        f.write(f"# Language: {language}\n")
        f.write(f"# Min prefix: {min_prefix}\n")
        f.write(f"# Min suffix: {min_suffix}\n")
        f.write(f"# Total words: {len(hyphenation_data)}\n")
        f.write("# Format: word | hyphenated_form | frequency_in_source\n")
        f.write("#\n")
        f.write("# Hyphenation points are marked with '='\n")
        f.write("# Example: Silbentrennung -> Sil=ben=tren=nung\n")
        f.write("#\n\n")

        # Write the data rows
        for item in hyphenation_data:
            f.write(f"{item['word']}|{item['hyphenated']}|{item['count']}\n")

    print("Done!")
    # Print some statistics
    print("\n=== Statistics ===")
    print(f"Total unique words extracted: {len(word_counts)}")
    print(f"Words in the test data: {len(hyphenation_data)}")
    if hyphenation_data:
        print(
            "Average hyphenation points per word: "
            f"{sum(h['hyphenated'].count('=') for h in hyphenation_data) / len(hyphenation_data):.2f}"
        )

    # Print some examples
    print("\n=== Examples (first 10) ===")
    for item in hyphenation_data[:10]:
        print(
            f" {item['word']:20} -> {item['hyphenated']:30} (appears {item['count']}x)"
        )
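
# Illustrative output lines (the frequency value here is made up; the
# Silbentrennung example matches the one written into the file header):
#   Silbentrennung|Sil=ben=tren=nung|17
#
# A minimal sketch of how a consumer test could read the generated file back.
# This helper is not used by the script itself, and the name
# load_hyphenation_test_data is purely illustrative.
def load_hyphenation_test_data(path):
    """Yield (word, expected_parts, frequency) tuples from a generated data file."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            word, hyphenated, count = line.split("|")
            # 'Sil=ben=tren=nung' -> ['Sil', 'ben', 'tren', 'nung']
            yield word, hyphenated.split("="), int(count)
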
def main():
    parser = argparse.ArgumentParser(
        description="Generate hyphenation test data from a text file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate test data from a German book
  python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt

  # Limit the output to the 500 most common words
  python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt --max-words 500

  # Use English hyphenation (when available)
  python generate_hyphenation_test_data.py book.txt test_en.txt --language en_US
""",
    )
    parser.add_argument(
        "input_file", help="Input text or .epub file to extract words from"
    )
    parser.add_argument("output_file", help="Output file for hyphenation test data")
    parser.add_argument(
        "--language", default="de_DE", help="Language code (default: de_DE)"
    )
    parser.add_argument(
        "--min-length", type=int, default=6, help="Minimum word length (default: 6)"
    )
    parser.add_argument(
        "--max-words",
        type=int,
        default=5000,
        help="Maximum number of words to include (default: 5000)",
    )
    parser.add_argument(
        "--min-prefix",
        type=int,
        default=2,
        help="Minimum characters permitted before the first hyphen (default: 2)",
    )
    parser.add_argument(
        "--min-suffix",
        type=int,
        default=2,
        help="Minimum characters permitted after the last hyphen (default: 2)",
    )

    args = parser.parse_args()
    generate_hyphenation_data(
        args.input_file,
        args.output_file,
        language=args.language,
        min_length=args.min_length,
        max_words=args.max_words,
        min_prefix=args.min_prefix,
        min_suffix=args.min_suffix,
    )
if __name__ == "__main__":
    main()
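
# Example invocation with an .epub source (the book file name below is
# illustrative; the ../data/books/ path follows the examples in the epilog):
#   python generate_hyphenation_test_data.py ../data/books/some_book.epub hyphenation_test_data.txt --language de_DE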