feat: dict based Hyphenation (#305)
## Summary

* Adds (optional) hyphenation for English, French, German, and Russian.

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to the flash usage (German alone takes 200 KB).
* The trie-encoded dictionaries are adopted from the hypher project (https://github.com/typst/hypher).
* Soft hyphens (and other explicit hyphens) take precedence over dictionary-based hyphenation.

Overall, the hyphenation rules are quite aggressive, as I believe that makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
committed by
GitHub
parent
5fef99c641
commit
8824c87490
388
test/hyphenation_eval/HyphenationEvaluationTest.cpp
Normal file
388
test/hyphenation_eval/HyphenationEvaluationTest.cpp
Normal file
@@ -0,0 +1,388 @@
|
||||
#include <Utf8.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "lib/Epub/Epub/hyphenation/HyphenationCommon.h"
|
||||
#include "lib/Epub/Epub/hyphenation/LanguageHyphenator.h"
|
||||
#include "lib/Epub/Epub/hyphenation/LanguageRegistry.h"
|
||||
|
||||
// One row of a hyphenation corpus file ("word|hyphenated|frequency").
struct TestCase {
  // The plain word, exactly as it appears in the first field.
  std::string word;
  // The same word with '=' inserted at every expected break (second field).
  std::string hyphenated;
  // Codepoint indexes of the expected breaks, derived from `hyphenated`.
  std::vector<size_t> expectedPositions;
  // Occurrence count parsed from the third field. Initialised so that a
  // default-constructed TestCase never carries an indeterminate value.
  int frequency = 0;
};
|
||||
|
||||
// Per-word scoring outcome: raw break counts plus the metrics derived from them.
// Every field starts at zero.
struct EvaluationResult {
  // Break-position counts.
  int truePositives{0};
  int falsePositives{0};
  int falseNegatives{0};

  // Derived metrics, each in the range used by evaluateWord (0.0 .. 1.0).
  double precision{0.0};
  double recall{0.0};
  double f1Score{0.0};
  double weightedScore{0.0};
};
|
||||
|
||||
// Describes one language the evaluator can be run against.
struct LanguageConfig {
  // Name accepted on the command line (compared against argv[1]).
  std::string cliName;
  // Path of the corpus file handed to loadTestData.
  std::string testDataFile;
  // Primary language tag used to look up the hyphenator. Defaults to nullptr
  // so a default-constructed config never holds an indeterminate pointer.
  const char* primaryTag = nullptr;
};
|
||||
|
||||
const std::vector<LanguageConfig> kSupportedLanguages = {
|
||||
{"english", "test/hyphenation_eval/resources/english_hyphenation_tests.txt", "en"},
|
||||
{"french", "test/hyphenation_eval/resources/french_hyphenation_tests.txt", "fr"},
|
||||
{"german", "test/hyphenation_eval/resources/german_hyphenation_tests.txt", "de"},
|
||||
{"russian", "test/hyphenation_eval/resources/russian_hyphenation_tests.txt", "ru"},
|
||||
};
|
||||
|
||||
std::vector<size_t> expectedPositionsFromAnnotatedWord(const std::string& annotated) {
|
||||
std::vector<size_t> positions;
|
||||
const unsigned char* ptr = reinterpret_cast<const unsigned char*>(annotated.c_str());
|
||||
size_t codepointIndex = 0;
|
||||
|
||||
while (*ptr != 0) {
|
||||
if (*ptr == '=') {
|
||||
positions.push_back(codepointIndex);
|
||||
++ptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
utf8NextCodepoint(&ptr);
|
||||
++codepointIndex;
|
||||
}
|
||||
|
||||
return positions;
|
||||
}
|
||||
|
||||
std::vector<TestCase> loadTestData(const std::string& filename) {
|
||||
std::vector<TestCase> testCases;
|
||||
std::ifstream file(filename);
|
||||
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Error: Could not open file " << filename << std::endl;
|
||||
return testCases;
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (std::getline(file, line)) {
|
||||
if (line.empty() || line[0] == '#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::istringstream iss(line);
|
||||
std::string word, hyphenated, freqStr;
|
||||
|
||||
if (std::getline(iss, word, '|') && std::getline(iss, hyphenated, '|') && std::getline(iss, freqStr, '|')) {
|
||||
TestCase testCase;
|
||||
testCase.word = word;
|
||||
testCase.hyphenated = hyphenated;
|
||||
testCase.frequency = std::stoi(freqStr);
|
||||
|
||||
testCase.expectedPositions = expectedPositionsFromAnnotatedWord(hyphenated);
|
||||
|
||||
testCases.push_back(testCase);
|
||||
}
|
||||
}
|
||||
|
||||
file.close();
|
||||
return testCases;
|
||||
}
|
||||
|
||||
std::string positionsToHyphenated(const std::string& word, const std::vector<size_t>& positions) {
|
||||
std::string result;
|
||||
std::vector<size_t> sortedPositions = positions;
|
||||
std::sort(sortedPositions.begin(), sortedPositions.end());
|
||||
|
||||
const unsigned char* ptr = reinterpret_cast<const unsigned char*>(word.c_str());
|
||||
size_t codepointIndex = 0;
|
||||
size_t posIdx = 0;
|
||||
|
||||
while (*ptr != 0) {
|
||||
while (posIdx < sortedPositions.size() && sortedPositions[posIdx] == codepointIndex) {
|
||||
result.push_back('=');
|
||||
++posIdx;
|
||||
}
|
||||
|
||||
const unsigned char* current = ptr;
|
||||
utf8NextCodepoint(&ptr);
|
||||
result.append(reinterpret_cast<const char*>(current), reinterpret_cast<const char*>(ptr));
|
||||
++codepointIndex;
|
||||
}
|
||||
|
||||
while (posIdx < sortedPositions.size() && sortedPositions[posIdx] == codepointIndex) {
|
||||
result.push_back('=');
|
||||
++posIdx;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<size_t> hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) {
|
||||
auto cps = collectCodepoints(word);
|
||||
trimSurroundingPunctuationAndFootnote(cps);
|
||||
|
||||
return hyphenator.breakIndexes(cps);
|
||||
}
|
||||
|
||||
std::vector<LanguageConfig> resolveLanguages(const std::string& selection) {
|
||||
if (selection == "all") {
|
||||
return kSupportedLanguages;
|
||||
}
|
||||
|
||||
for (const auto& config : kSupportedLanguages) {
|
||||
if (config.cliName == selection) {
|
||||
return {config};
|
||||
}
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
// Scores the hyphenator output for one test case.
//
// testCase       supplies the word and the expected break indexes.
// hyphenateFunc  is called once with testCase.word and must return the break
//                indexes produced by the implementation under test.
//
// Returns the TP/FP/FN counts together with precision, recall, F1 and a
// weighted score in which a false positive costs twice as much as a false
// negative. The weighted score is clamped at 0.
EvaluationResult evaluateWord(const TestCase& testCase,
                              std::function<std::vector<size_t>(const std::string&)> hyphenateFunc) {
  EvaluationResult result;

  std::vector<size_t> expected = testCase.expectedPositions;
  std::vector<size_t> actual = hyphenateFunc(testCase.word);
  std::sort(expected.begin(), expected.end());
  std::sort(actual.begin(), actual.end());

  // Both vectors are sorted, so membership checks can use binary search.
  for (const size_t pos : actual) {
    if (std::binary_search(expected.begin(), expected.end(), pos)) {
      result.truePositives++;
    } else {
      result.falsePositives++;
    }
  }
  for (const size_t pos : expected) {
    if (!std::binary_search(actual.begin(), actual.end(), pos)) {
      result.falseNegatives++;
    }
  }

  if (result.truePositives + result.falsePositives > 0) {
    result.precision = static_cast<double>(result.truePositives) / (result.truePositives + result.falsePositives);
  }
  if (result.truePositives + result.falseNegatives > 0) {
    result.recall = static_cast<double>(result.truePositives) / (result.truePositives + result.falseNegatives);
  }
  if (result.precision + result.recall > 0) {
    result.f1Score = 2 * result.precision * result.recall / (result.precision + result.recall);
  }

  // Treat words that contain no hyphenation marks in both the expected data and the
  // algorithmic output as perfect matches so they don't drag down the per-word averages.
  if (expected.empty() && actual.empty()) {
    result.precision = 1.0;
    result.recall = 1.0;
    result.f1Score = 1.0;
  }

  constexpr double kFalsePositivePenalty = 2.0;
  constexpr double kFalseNegativePenalty = 1.0;

  // Kept in floating point: storing these in an int would silently truncate
  // the totals if either penalty were ever made fractional.
  const double totalErrors =
      result.falsePositives * kFalsePositivePenalty + result.falseNegatives * kFalseNegativePenalty;
  const double totalPossible = static_cast<double>(expected.size()) * kFalsePositivePenalty;

  if (totalPossible > 0.0) {
    result.weightedScore = std::max(0.0, 1.0 - totalErrors / totalPossible);
  } else if (result.falsePositives == 0) {
    // Nothing expected and nothing wrongly produced.
    result.weightedScore = 1.0;
  }

  return result;
}
|
||||
|
||||
void printResults(const std::string& language, const std::vector<TestCase>& testCases,
|
||||
const std::vector<std::pair<TestCase, EvaluationResult>>& worstCases, int perfectMatches,
|
||||
int partialMatches, int completeMisses, double totalPrecision, double totalRecall, double totalF1,
|
||||
double totalWeighted, int totalTP, int totalFP, int totalFN,
|
||||
std::function<std::vector<size_t>(const std::string&)> hyphenateFunc) {
|
||||
std::string lang_upper = language;
|
||||
if (!lang_upper.empty()) {
|
||||
lang_upper[0] = std::toupper(lang_upper[0]);
|
||||
}
|
||||
|
||||
std::cout << "================================================================================" << std::endl;
|
||||
std::cout << lang_upper << " HYPHENATION EVALUATION RESULTS" << std::endl;
|
||||
std::cout << "================================================================================" << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "Total test cases: " << testCases.size() << std::endl;
|
||||
std::cout << "Perfect matches: " << perfectMatches << " (" << (perfectMatches * 100.0 / testCases.size()) << "%)"
|
||||
<< std::endl;
|
||||
std::cout << "Partial matches: " << partialMatches << std::endl;
|
||||
std::cout << "Complete misses: " << completeMisses << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "--- Overall Metrics (averaged per word) ---" << std::endl;
|
||||
std::cout << "Average Precision: " << (totalPrecision / testCases.size() * 100.0) << "%" << std::endl;
|
||||
std::cout << "Average Recall: " << (totalRecall / testCases.size() * 100.0) << "%" << std::endl;
|
||||
std::cout << "Average F1 Score: " << (totalF1 / testCases.size() * 100.0) << "%" << std::endl;
|
||||
std::cout << "Average Weighted Score: " << (totalWeighted / testCases.size() * 100.0) << "% (FP penalty: 2x)"
|
||||
<< std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
std::cout << "--- Overall Metrics (total counts) ---" << std::endl;
|
||||
std::cout << "True Positives: " << totalTP << std::endl;
|
||||
std::cout << "False Positives: " << totalFP << " (incorrect hyphenation points)" << std::endl;
|
||||
std::cout << "False Negatives: " << totalFN << " (missed hyphenation points)" << std::endl;
|
||||
|
||||
double overallPrecision = totalTP + totalFP > 0 ? static_cast<double>(totalTP) / (totalTP + totalFP) : 0.0;
|
||||
double overallRecall = totalTP + totalFN > 0 ? static_cast<double>(totalTP) / (totalTP + totalFN) : 0.0;
|
||||
double overallF1 = overallPrecision + overallRecall > 0
|
||||
? 2 * overallPrecision * overallRecall / (overallPrecision + overallRecall)
|
||||
: 0.0;
|
||||
|
||||
std::cout << "Overall Precision: " << (overallPrecision * 100.0) << "%" << std::endl;
|
||||
std::cout << "Overall Recall: " << (overallRecall * 100.0) << "%" << std::endl;
|
||||
std::cout << "Overall F1 Score: " << (overallF1 * 100.0) << "%" << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
// Filter out perfect matches from the “worst cases” section so that only actionable failures appear.
|
||||
auto hasImperfection = [](const EvaluationResult& r) { return r.weightedScore < 0.999999; };
|
||||
std::vector<std::pair<TestCase, EvaluationResult>> imperfectCases;
|
||||
imperfectCases.reserve(worstCases.size());
|
||||
for (const auto& entry : worstCases) {
|
||||
if (hasImperfection(entry.second)) {
|
||||
imperfectCases.push_back(entry);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "--- Worst Cases (lowest weighted scores) ---" << std::endl;
|
||||
int showCount = std::min(10, static_cast<int>(imperfectCases.size()));
|
||||
for (int i = 0; i < showCount; i++) {
|
||||
const auto& testCase = imperfectCases[i].first;
|
||||
const auto& result = imperfectCases[i].second;
|
||||
|
||||
std::vector<size_t> actualPositions = hyphenateFunc(testCase.word);
|
||||
std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions);
|
||||
|
||||
std::cout << "Word: " << testCase.word << " (freq: " << testCase.frequency << ")" << std::endl;
|
||||
std::cout << " Expected: " << testCase.hyphenated << std::endl;
|
||||
std::cout << " Got: " << actualHyphenated << std::endl;
|
||||
std::cout << " Precision: " << (result.precision * 100.0) << "%"
|
||||
<< " Recall: " << (result.recall * 100.0) << "%"
|
||||
<< " F1: " << (result.f1Score * 100.0) << "%"
|
||||
<< " Weighted: " << (result.weightedScore * 100.0) << "%" << std::endl;
|
||||
std::cout << " TP: " << result.truePositives << " FP: " << result.falsePositives
|
||||
<< " FN: " << result.falseNegatives << std::endl;
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Additional compact list of the worst ~100 words to aid iteration
|
||||
int compactCount = std::min(100, static_cast<int>(imperfectCases.size()));
|
||||
if (compactCount > 0) {
|
||||
std::cout << "--- Compact Worst Cases (" << compactCount << ") ---" << std::endl;
|
||||
for (int i = 0; i < compactCount; i++) {
|
||||
const auto& testCase = imperfectCases[i].first;
|
||||
std::vector<size_t> actualPositions = hyphenateFunc(testCase.word);
|
||||
std::string actualHyphenated = positionsToHyphenated(testCase.word, actualPositions);
|
||||
std::cout << testCase.word << " | exp:" << testCase.hyphenated << " | got:" << actualHyphenated << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
const bool summaryMode = argc <= 1;
|
||||
const std::string languageSelection = summaryMode ? "all" : argv[1];
|
||||
|
||||
std::vector<LanguageConfig> languages = resolveLanguages(languageSelection);
|
||||
if (languages.empty()) {
|
||||
std::cerr << "Unknown language: " << languageSelection << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (const auto& lang : languages) {
|
||||
const auto* hyphenator = getLanguageHyphenatorForPrimaryTag(lang.primaryTag);
|
||||
if (!hyphenator) {
|
||||
std::cerr << "No hyphenator registered for tag: " << lang.primaryTag << std::endl;
|
||||
continue;
|
||||
}
|
||||
const auto hyphenateFunc = [hyphenator](const std::string& word) {
|
||||
return hyphenateWordWithHyphenator(word, *hyphenator);
|
||||
};
|
||||
|
||||
if (!summaryMode) {
|
||||
std::cout << "Loading test data from: " << lang.testDataFile << std::endl;
|
||||
}
|
||||
std::vector<TestCase> testCases = loadTestData(lang.testDataFile);
|
||||
|
||||
if (testCases.empty()) {
|
||||
std::cerr << "No test cases loaded for " << lang.cliName << ". Skipping." << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!summaryMode) {
|
||||
std::cout << "Loaded " << testCases.size() << " test cases for " << lang.cliName << std::endl;
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
int perfectMatches = 0;
|
||||
int partialMatches = 0;
|
||||
int completeMisses = 0;
|
||||
|
||||
double totalPrecision = 0.0;
|
||||
double totalRecall = 0.0;
|
||||
double totalF1 = 0.0;
|
||||
double totalWeighted = 0.0;
|
||||
|
||||
int totalTP = 0, totalFP = 0, totalFN = 0;
|
||||
|
||||
std::vector<std::pair<TestCase, EvaluationResult>> worstCases;
|
||||
|
||||
for (const auto& testCase : testCases) {
|
||||
EvaluationResult result = evaluateWord(testCase, hyphenateFunc);
|
||||
|
||||
totalTP += result.truePositives;
|
||||
totalFP += result.falsePositives;
|
||||
totalFN += result.falseNegatives;
|
||||
|
||||
totalPrecision += result.precision;
|
||||
totalRecall += result.recall;
|
||||
totalF1 += result.f1Score;
|
||||
totalWeighted += result.weightedScore;
|
||||
|
||||
if (result.f1Score == 1.0) {
|
||||
perfectMatches++;
|
||||
} else if (result.f1Score > 0.0) {
|
||||
partialMatches++;
|
||||
} else {
|
||||
completeMisses++;
|
||||
}
|
||||
|
||||
worstCases.push_back({testCase, result});
|
||||
}
|
||||
|
||||
if (summaryMode) {
|
||||
const double averageF1Percent = testCases.empty() ? 0.0 : (totalF1 / testCases.size() * 100.0);
|
||||
std::cout << lang.cliName << ": " << averageF1Percent << "%" << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::sort(worstCases.begin(), worstCases.end(),
|
||||
[](const auto& a, const auto& b) { return a.second.weightedScore < b.second.weightedScore; });
|
||||
|
||||
printResults(lang.cliName, testCases, worstCases, perfectMatches, partialMatches, completeMisses, totalPrecision,
|
||||
totalRecall, totalF1, totalWeighted, totalTP, totalFP, totalFN, hyphenateFunc);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
5012
test/hyphenation_eval/resources/english_hyphenation_tests.txt
Normal file
5012
test/hyphenation_eval/resources/english_hyphenation_tests.txt
Normal file
File diff suppressed because it is too large
Load Diff
5012
test/hyphenation_eval/resources/french_hyphenation_tests.txt
Normal file
5012
test/hyphenation_eval/resources/french_hyphenation_tests.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Generate hyphenation test data from a text file.
|
||||
|
||||
This script extracts unique words from a book and generates ground truth
|
||||
hyphenations using the pyphen library, which can be used to test and validate
|
||||
the hyphenation implementations (e.g., German, English, Russian).
|
||||
|
||||
Usage:
|
||||
python generate_hyphenation_test_data.py <input_file> <output_file>
|
||||
[--language de_DE] [--min-length 6] [--max-words 5000] [--min-prefix 2] [--min-suffix 2]
|
||||
|
||||
Requirements:
|
||||
pip install pyphen
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from collections import Counter
|
||||
import pyphen
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
|
||||
def extract_text_from_epub(epub_path):
    """Extract the readable text of an .epub archive.

    Every archive member whose name ends in .xhtml, .html or .htm is decoded
    as UTF-8 (undecodable bytes are dropped), its tags are replaced by spaces
    and its character references are decoded. The results are joined with
    newlines in archive order.

    Args:
        epub_path: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The concatenated text of all HTML/XHTML members.
    """
    # Local import keeps this block self-contained; only used for entity decoding.
    import html

    html_suffixes = (".xhtml", ".html", ".htm")
    texts = []
    with zipfile.ZipFile(epub_path, "r") as z:
        for name in z.namelist():
            if not name.lower().endswith(html_suffixes):
                continue
            try:
                data = z.read(name).decode("utf-8", errors="ignore")
            except Exception:
                # Best effort: an unreadable member is skipped, not fatal.
                continue
            # Remove tags first so that escaped markup such as "&lt;b&gt;" is
            # kept as literal text rather than being stripped as a tag.
            text = re.sub(r"<[^>]+>", " ", data)
            # Decode character references; otherwise "caf&eacute;" would later
            # be split into the bogus words "caf" and "eacute".
            texts.append(html.unescape(text))
    return "\n".join(texts)
|
||||
|
||||
|
||||
def extract_words(text):
    """Return every maximal run of letters in ``text``, in order, case untouched."""
    # [^\W\d_] is "word character, but neither digit nor underscore", which
    # leaves letters of any script.
    letter_run = re.compile(r"[^\W\d_]+", flags=re.UNICODE)
    return letter_run.findall(text)
|
||||
|
||||
|
||||
def clean_word(word):
    """Return ``word`` without leading or trailing whitespace; case is kept."""
    # Only surrounding whitespace is removed; inner characters are untouched.
    trimmed = word.strip()
    return trimmed
|
||||
|
||||
|
||||
def generate_hyphenation_data(
    input_file,
    output_file,
    language="de_DE",
    min_length=6,
    max_words=5000,
    min_prefix=2,
    min_suffix=2,
):
    """
    Generate hyphenation test data from a text or .epub file.

    Words are taken most-frequent first (ties broken alphabetically, ignoring
    case), hyphenated with pyphen and written as ``word|hyphenated|count``
    rows below a commented header. Words without any break point are included.

    Args:
        input_file: Path to the input text file, or an .epub archive
        output_file: Path to output file with hyphenation data
        language: Language code for pyphen (e.g., 'de_DE', 'en_US')
        min_length: Minimum word length to include
        max_words: Maximum number of words to include (default: 5000);
            a falsy value disables the limit
        min_prefix: Minimum characters allowed before the first hyphen (default: 2)
        min_suffix: Minimum characters allowed after the last hyphen (default: 2)

    Returns:
        None. If pyphen does not know ``language`` a message is printed and
        nothing is written.
    """
    print(f"Reading from: {input_file}")

    # Read the input file
    if str(input_file).lower().endswith(".epub"):
        print("Detected .epub input; extracting HTML content")
        text = extract_text_from_epub(input_file)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

    # Extract words
    print("Extracting words...")
    words = extract_words(text)
    print(f"Found {len(words)} total words")

    # Count word frequencies
    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words")

    # Initialize pyphen hyphenator
    print(
        f"Initializing hyphenator for language: {language} (min_prefix={min_prefix}, min_suffix={min_suffix})"
    )
    try:
        hyphenator = pyphen.Pyphen(lang=language, left=min_prefix, right=min_suffix)
    except KeyError:
        print(f"Error: Language '{language}' not found in pyphen.")
        print("Available languages include: de_DE, en_US, en_GB, fr_FR, etc.")
        return

    # Generate hyphenations
    print("Generating hyphenations...")
    hyphenation_data = []

    # Sort by frequency (most common first) then alphabetically
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0].lower()))

    for word, count in sorted_words:
        # Filter by minimum length
        if len(word) < min_length:
            continue

        # Get hyphenation (may produce no '=' characters)
        hyphenated = hyphenator.inserted(word, hyphen="=")

        # Every word is kept, even without break points, so that the output
        # really is the top-N most common words.
        hyphenation_data.append(
            {"word": word, "hyphenated": hyphenated, "count": count}
        )

        # Stop if we've reached max_words
        if max_words and len(hyphenation_data) >= max_words:
            break

    print(f"Generated {len(hyphenation_data)} hyphenated words")

    # Write output file
    print(f"Writing to: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        # Write header with metadata
        f.write("# Hyphenation Test Data\n")
        f.write(f"# Source: {Path(input_file).name}\n")
        f.write(f"# Language: {language}\n")
        f.write(f"# Min prefix: {min_prefix}\n")
        f.write(f"# Min suffix: {min_suffix}\n")
        f.write(f"# Total words: {len(hyphenation_data)}\n")
        f.write("# Format: word | hyphenated_form | frequency_in_source\n")
        f.write("#\n")
        f.write("# Hyphenation points are marked with '='\n")
        f.write("# Example: Silbentrennung -> Sil=ben=tren=nung\n")
        f.write("#\n\n")

        # Write data
        for item in hyphenation_data:
            f.write(f"{item['word']}|{item['hyphenated']}|{item['count']}\n")

    print("Done!")

    # Print some statistics
    # Count only the words that actually received a break point; the list
    # itself also contains words without any.
    words_with_points = sum(1 for item in hyphenation_data if "=" in item["hyphenated"])
    total_points = sum(item["hyphenated"].count("=") for item in hyphenation_data)
    # Guard against an empty result (e.g. no word reaches min_length).
    average_points = total_points / len(hyphenation_data) if hyphenation_data else 0.0

    print("\n=== Statistics ===")
    print(f"Total unique words extracted: {len(word_counts)}")
    print(f"Words with hyphenation points: {words_with_points}")
    print(f"Average hyphenation points per word: {average_points:.2f}")

    # Print some examples
    print("\n=== Examples (first 10) ===")
    for item in hyphenation_data[:10]:
        print(
            f" {item['word']:20} -> {item['hyphenated']:30} (appears {item['count']}x)"
        )
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse the arguments and generate the test data."""
    usage_examples = """
Examples:
# Generate test data from a German book
python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt

# Limit to 500 most common words
python generate_hyphenation_test_data.py ../data/books/bobiverse_1.txt hyphenation_test_data.txt --max-words 500

# Use English hyphenation (when available)
python generate_hyphenation_test_data.py book.txt test_en.txt --language en_US
"""

    parser = argparse.ArgumentParser(
        description="Generate hyphenation test data from a text file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional arguments.
    parser.add_argument("input_file", help="Input text file to extract words from")
    parser.add_argument("output_file", help="Output file for hyphenation test data")

    # Optional arguments, registered in the order they appear in --help.
    parser.add_argument(
        "--language", default="de_DE", help="Language code (default: de_DE)"
    )
    integer_options = (
        ("--min-length", 6, "Minimum word length (default: 6)"),
        ("--max-words", 5000, "Maximum number of words to include (default: 5000)"),
        (
            "--min-prefix",
            2,
            "Minimum characters permitted before the first hyphen (default: 2)",
        ),
        (
            "--min-suffix",
            2,
            "Minimum characters permitted after the last hyphen (default: 2)",
        ),
    )
    for flag, default_value, help_text in integer_options:
        parser.add_argument(flag, type=int, default=default_value, help=help_text)

    args = parser.parse_args()

    options = {
        "language": args.language,
        "min_length": args.min_length,
        "max_words": args.max_words,
        "min_prefix": args.min_prefix,
        "min_suffix": args.min_suffix,
    }
    generate_hyphenation_data(args.input_file, args.output_file, **options)


if __name__ == "__main__":
    main()
|
||||
5012
test/hyphenation_eval/resources/german_hyphenation_tests.txt
Normal file
5012
test/hyphenation_eval/resources/german_hyphenation_tests.txt
Normal file
File diff suppressed because it is too large
Load Diff
5012
test/hyphenation_eval/resources/russian_hyphenation_tests.txt
Normal file
5012
test/hyphenation_eval/resources/russian_hyphenation_tests.txt
Normal file
File diff suppressed because it is too large
Load Diff
32
test/run_hyphenation_eval.sh
Executable file
32
test/run_hyphenation_eval.sh
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env bash
# Builds the host-side hyphenation evaluator and runs it.
#
# Usage: test/run_hyphenation_eval.sh [all|english|french|german|russian]
#   With no argument the evaluator prints a one-line summary per language.
#
# Environment:
#   CXX  C++ compiler to use (default: c++)
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
BUILD_DIR="$ROOT_DIR/build/hyphenation_eval"
BINARY="$BUILD_DIR/HyphenationEvaluationTest"

mkdir -p "$BUILD_DIR"

SOURCES=(
  "$ROOT_DIR/test/hyphenation_eval/HyphenationEvaluationTest.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/Hyphenator.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/LanguageRegistry.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/LiangHyphenation.cpp"
  "$ROOT_DIR/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp"
  "$ROOT_DIR/lib/Utf8/Utf8.cpp"
)

CXXFLAGS=(
  -std=c++20
  -O2
  -Wall
  -Wextra
  -pedantic
  -I"$ROOT_DIR"
  -I"$ROOT_DIR/lib"
  -I"$ROOT_DIR/lib/Utf8"
)

"${CXX:-c++}" "${CXXFLAGS[@]}" "${SOURCES[@]}" -o "$BINARY"

# The evaluator opens its corpora through paths relative to the repository
# root (test/hyphenation_eval/resources/...), so run it from there; otherwise
# the script only works when launched from the repository root.
cd "$ROOT_DIR"

"$BINARY" "$@"
|
||||
Reference in New Issue
Block a user