#!/usr/bin/env python3
"""Embed hypher-generated `.bin` tries into constexpr headers."""

from __future__ import annotations

import argparse
import pathlib

# Indentation used for every line of the generated byte-array initializer.
_INDENT = '    '


def _format_bytes(blob: bytes, per_line: int = 16) -> str:
    """Render ``blob`` as wrapped, comma-separated hex literals.

    Args:
        blob: Raw bytes to render.
        per_line: Number of literals per output line; must be at least 1.

    Returns:
        One indented line per ``per_line`` bytes, each ending with a trailing
        comma, joined with newlines. An empty ``blob`` yields a single
        ``0x00`` entry.

    Raises:
        ValueError: If ``per_line`` is smaller than 1.
    """
    if per_line < 1:
        # A negative step would leave range() empty and silently swap real
        # data for the placeholder below; zero would fail with an opaque
        # range() error.
        raise ValueError(f'per_line must be at least 1, got {per_line}')

    lines = []
    for start in range(0, len(blob), per_line):
        chunk = ', '.join(f'0x{b:02X}' for b in blob[start : start + per_line])
        lines.append(f'{_INDENT}{chunk},')
    if not lines:
        # Keep the initializer non-empty so the generated `[]` array always
        # has at least one element (its sizeof is then 1, not 0).
        lines.append(f'{_INDENT}0x00,')
    return '\n'.join(lines)


def _symbol_from_output(path: pathlib.Path) -> str:
    """Derive the symbol stem from the destination header name.

    Strips a trailing ``.trie.h``, a leading ``hyph-``, replaces ``-`` with
    ``_`` and finally strips a trailing ``.trie``
    (e.g. ``hyph-en-us.trie.h`` -> ``en_us``).

    Args:
        path: Destination header path; only its final component is used.

    Returns:
        The derived stem. It is not validated as a C identifier.
    """
    header_suffix = '.trie.h'
    prefix = 'hyph-'
    trie_suffix = '.trie'

    name = path.name
    if name.endswith(header_suffix):
        name = name[: -len(header_suffix)]
    if name.startswith(prefix):
        name = name[len(prefix):]
    name = name.replace('-', '_')
    if name.endswith(trie_suffix):
        name = name[: -len(trie_suffix)]
    return name


def write_header(path: pathlib.Path, blob: bytes, symbol: str) -> None:
    """Write a constexpr header embedding ``blob``.

    The header declares ``<symbol>_trie_data`` (the raw bytes) and
    ``<symbol>_patterns``, a ``SerializedHyphenationPatterns`` initialized
    with that array and its ``sizeof``.

    Args:
        path: Destination header; missing parent directories are created.
        blob: Trie payload to embed.
        symbol: Stem used to build both generated identifiers.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    data_symbol = f"{symbol}_trie_data"
    patterns_symbol = f"{symbol}_patterns"
    bytes_literal = _format_bytes(blob)
    # Preprocessor directives must each sit on their own line; <cstdint>
    # provides uint8_t and <cstddef> provides size_t for the sizeof result.
    content = f"""#pragma once

#include <cstddef>
#include <cstdint>

#include "../SerializedHyphenationTrie.h"

// Auto-generated by generate_hyphenation_trie.py. Do not edit manually.
alignas(4) constexpr uint8_t {data_symbol}[] = {{
{bytes_literal}
}};

constexpr SerializedHyphenationPatterns {patterns_symbol} = {{
{_INDENT}{data_symbol},
{_INDENT}sizeof({data_symbol}),
}};
"""
    # Explicit encoding keeps the output independent of the host locale.
    path.write_text(content, encoding='utf-8')


def main(argv: list[str] | None = None) -> None:
    """Convert each ``--input`` trie into its paired ``--output`` header.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: If the numbers of ``--input`` and ``--output`` options
            differ, or on argparse usage errors.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inputs', action='append', required=True,
                        help='Path to a hypher-generated .bin trie')
    parser.add_argument('--output', dest='outputs', action='append', required=True,
                        help='Destination header path (hyph-*.trie.h)')
    args = parser.parse_args(argv)

    if len(args.inputs) != len(args.outputs):
        raise SystemExit('input/output counts must match')

    # Process each input/output pair independently so mixed-language
    # refreshes work in one invocation.
    for src, dst in zip(args.inputs, args.outputs):
        blob = pathlib.Path(src).read_bytes()
        out_path = pathlib.Path(dst)
        write_header(out_path, blob, _symbol_from_output(out_path))
        print(f'wrote {dst} ({len(blob)} bytes payload)')


if __name__ == '__main__':
    main()