Source code for build_tools.name_combiner.cli

"""
Command-line interface for the name combiner.

This module provides the CLI for generating name candidates from an
annotated syllable corpus. It follows the project's CLI documentation
standards with sphinx-argparse compatible argument parser.

Usage
-----
Generate 2-syllable candidates::

    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 2 \\
        --count 10000 \\
        --seed 42

Generate 3-syllable candidates with uniform sampling::

    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 3 \\
        --count 5000 \\
        --frequency-weight 0.0
"""

from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path


def create_argument_parser() -> argparse.ArgumentParser:
    """
    Build the ``argparse`` parser used by the name combiner CLI.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing ``--run-dir``, ``--syllables``, ``--count``,
        ``--seed`` and ``--frequency-weight``.

    Notes
    -----
    The parser is constructed in its own function, per the project's CLI
    documentation standards, so sphinx-argparse can auto-generate the
    command-line documentation from it.
    """
    summary = (
        "Generate N-syllable name candidates from an annotated syllable corpus. "
        "Combines syllables structurally and aggregates features to the name level. "
        "This is a build-time tool for the Selection Policy Layer."
    )

    # Shown verbatim below the option list (RawDescriptionHelpFormatter keeps
    # the layout); the doubled backslashes render as single line continuations.
    usage_examples = """
Examples::

    # Generate 2-syllable candidates with default settings
    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 2

    # Generate 10000 3-syllable candidates with fixed seed
    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 3 \\
        --count 10000 \\
        --seed 42

    # Generate with uniform sampling (no frequency weighting)
    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 2 \\
        --frequency-weight 0.0

Output:
    Creates ``candidates/{prefix}_candidates_{N}syl.json`` in the run directory.
    The prefix (pyphen\\_ or nltk\\_) is auto-detected from the run directory name.
"""

    # One (flag, add_argument keyword arguments) entry per option, listed in
    # the order they should appear in --help.
    option_table = (
        (
            "--run-dir",
            {
                "type": Path,
                "required": True,
                "help": (
                    "Path to extraction run directory containing annotated JSON. "
                    "Example: _working/output/20260110_115453_pyphen/"
                ),
            },
        ),
        (
            "--syllables",
            {
                "type": int,
                "required": True,
                "choices": [2, 3, 4],
                "help": "Number of syllables per candidate name. Choices: 2, 3, 4.",
            },
        ),
        (
            "--count",
            {
                "type": int,
                "default": 10000,
                "help": "Number of candidates to generate. Default: 10000.",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": None,
                "help": (
                    "RNG seed for deterministic output. "
                    "If not specified, uses system entropy. "
                    "Same seed always produces identical candidates."
                ),
            },
        ),
        (
            "--frequency-weight",
            {
                "type": float,
                "default": 1.0,
                "help": (
                    "Weight for frequency-biased sampling. "
                    "0.0 = uniform sampling, 1.0 = fully frequency-weighted. "
                    "Values between 0 and 1 interpolate. Default: 1.0."
                ),
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description=summary,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli
def parse_arguments(args: list[str] | None = None) -> argparse.Namespace:
    """
    Run the name combiner argument parser over an argument list.

    Parameters
    ----------
    args : list[str] | None, optional
        Argument strings to parse. When None, argparse falls back to
        ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        Namespace holding the parsed option values.
    """
    return create_argument_parser().parse_args(args)
def discover_annotated_json(run_dir: Path) -> tuple[Path, str]:
    """
    Locate the annotated syllable JSON inside an extraction run directory.

    The file is looked up under ``<run_dir>/data``. The pyphen file takes
    precedence over the nltk file when both are present.

    Parameters
    ----------
    run_dir : Path
        Path to extraction run directory.

    Returns
    -------
    tuple[Path, str]
        ``(path_to_annotated_json, prefix)`` where prefix is ``'pyphen'``
        or ``'nltk'``.

    Raises
    ------
    FileNotFoundError
        If ``data/`` is missing or holds neither annotated JSON file.
    """
    data_dir = run_dir / "data"
    if not data_dir.exists():
        raise FileNotFoundError(f"No data/ directory in {run_dir}")

    # Probe the known extractor prefixes in priority order.
    for prefix in ("pyphen", "nltk"):
        candidate = data_dir / f"{prefix}_syllables_annotated.json"
        if candidate.exists():
            return candidate, prefix

    raise FileNotFoundError(
        f"No annotated JSON found in {data_dir}. "
        "Expected pyphen_syllables_annotated.json or nltk_syllables_annotated.json"
    )
def _write_json(path: Path, payload: dict) -> None:
    """Write *payload* to *path* as UTF-8 encoded JSON indented by two spaces."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the name combiner CLI.

    Parses the arguments, loads the annotated syllable JSON from the run
    directory, generates candidates with ``combine_syllables`` and writes
    two files into ``<run_dir>/candidates``: the candidates JSON and a
    ``{prefix}_combiner_meta.json`` describing the run.

    Parameters
    ----------
    args : list[str] | None, optional
        Command-line arguments. If None, uses sys.argv.

    Returns
    -------
    int
        Exit code: 0 for success, 1 when the run directory or annotated
        JSON is missing, the input cannot be read or parsed, or an output
        file cannot be written.
    """
    parsed = parse_arguments(args)

    # Imported lazily, and only after argument parsing, to avoid circular
    # imports and so that --help and usage errors exit without loading
    # the combiner.
    from build_tools.name_combiner.combiner import combine_syllables

    # Validate run directory
    run_dir = parsed.run_dir.resolve()
    if not run_dir.exists():
        print(f"Error: Run directory not found: {run_dir}", file=sys.stderr)
        return 1

    # Discover annotated JSON
    try:
        annotated_path, prefix = discover_annotated_json(run_dir)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Loading annotated data from: {annotated_path}")

    # Load annotated data. The encoding is pinned to UTF-8 so the result does
    # not depend on the platform's locale encoding.
    try:
        with open(annotated_path, encoding="utf-8") as f:
            annotated_data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in {annotated_path}: {e}", file=sys.stderr)
        return 1
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error: Could not read {annotated_path}: {e}", file=sys.stderr)
        return 1

    print(f"Loaded {len(annotated_data):,} syllables")

    # Generate candidates
    print(
        f"Generating {parsed.count:,} {parsed.syllables}-syllable candidates "
        f"(seed={parsed.seed}, frequency_weight={parsed.frequency_weight})"
    )

    candidates = combine_syllables(
        annotated_data=annotated_data,
        syllable_count=parsed.syllables,
        count=parsed.count,
        seed=parsed.seed,
        frequency_weight=parsed.frequency_weight,
    )
    total_candidates = len(candidates)

    print(f"Generated {total_candidates:,} candidates")

    # Prepare output
    candidates_dir = run_dir / "candidates"
    output_path = candidates_dir / f"{prefix}_candidates_{parsed.syllables}syl.json"

    # Build output structure
    output = {
        "metadata": {
            "source_run": run_dir.name,
            "source_annotated": annotated_path.name,
            "syllable_count": parsed.syllables,
            "total_candidates": total_candidates,
            "seed": parsed.seed,
            "frequency_weight": parsed.frequency_weight,
            "aggregation_rule": "majority",
            "generated_at": datetime.now(timezone.utc).isoformat(),
        },
        "candidates": candidates,
    }

    # Write output
    try:
        candidates_dir.mkdir(parents=True, exist_ok=True)
        _write_json(output_path, output)
    except OSError as e:
        print(f"Error: Could not write {output_path}: {e}", file=sys.stderr)
        return 1

    print(f"Wrote candidates to: {output_path}")

    # Summary stats. Guard the division: an empty candidate list (for example
    # --count 0) must report 0% instead of raising ZeroDivisionError.
    unique_names = len({c["name"] for c in candidates})
    unique_percentage = (
        unique_names / total_candidates * 100 if total_candidates else 0.0
    )
    print(f"Unique names: {unique_names:,} ({unique_percentage:.1f}%)")

    # Write meta file
    meta_output = {
        "tool": "name_combiner",
        "version": "1.0.0",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "arguments": {
            "run_dir": str(run_dir),
            "syllables": parsed.syllables,
            "count": parsed.count,
            "seed": parsed.seed,
            "frequency_weight": parsed.frequency_weight,
        },
        "input": {
            "annotated_file": str(annotated_path),
            "syllables_loaded": len(annotated_data),
        },
        "output": {
            "candidates_file": str(output_path),
            "candidates_generated": total_candidates,
            "unique_names": unique_names,
            "unique_percentage": round(unique_percentage, 2),
        },
        "statistics": {
            "source_run": run_dir.name,
            "source_prefix": prefix,
            "syllable_count": parsed.syllables,
            "frequency_weight": parsed.frequency_weight,
            "aggregation_rule": "majority",
        },
    }

    meta_path = candidates_dir / f"{prefix}_combiner_meta.json"
    try:
        _write_json(meta_path, meta_output)
    except OSError as e:
        print(f"Error: Could not write {meta_path}: {e}", file=sys.stderr)
        return 1

    print(f"Wrote meta to: {meta_path}")

    return 0
if __name__ == "__main__":  # pragma: no cover
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())