# Source code for build_tools.name_combiner.cli

"""
Command-line interface for the name combiner.

This module provides the CLI for generating name candidates from an
annotated syllable corpus. It follows the project's CLI documentation
standards with a sphinx-argparse compatible argument parser.

Usage
-----
Generate 2-syllable candidates::

    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 2 \\
        --count 10000 \\
        --seed 42

Generate 3-syllable candidates with uniform sampling::

    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 3 \\
        --count 5000 \\
        --frequency-weight 0.0
"""

from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path



[docs]
def create_argument_parser() -> argparse.ArgumentParser:
    """
    Build the ``argparse`` parser used by the name combiner CLI.

    The parser lives in its own factory function, in line with the
    project's CLI documentation standards, so that sphinx-argparse can
    auto-generate the option reference from it.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing ``--run-dir`` and ``--syllables`` (both required)
        plus the optional ``--count``, ``--seed`` and ``--frequency-weight``.
    """
    summary = (
        "Generate N-syllable name candidates from an annotated syllable corpus. "
        "Combines syllables structurally and aggregates features to the name level. "
        "This is a build-time tool for the Selection Policy Layer."
    )

    # Shown verbatim below the option list; RawDescriptionHelpFormatter keeps
    # the manual line breaks and indentation intact.
    epilog_text = """
Examples::

    # Generate 2-syllable candidates with default settings
    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 2

    # Generate 10000 3-syllable candidates with fixed seed
    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 3 \\
        --count 10000 \\
        --seed 42

    # Generate with uniform sampling (no frequency weighting)
    python -m build_tools.name_combiner \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --syllables 2 \\
        --frequency-weight 0.0

Output:
    Creates ``candidates/{prefix}_candidates_{N}syl.json`` in the run directory.
    The prefix (pyphen\\_ or nltk\\_) is auto-detected from the run directory name.
        """

    cli = argparse.ArgumentParser(
        description=summary,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )

    # One (flag, keyword-arguments) entry per option. Registration order is
    # the order in which the options appear in ``--help``.
    option_table = (
        (
            "--run-dir",
            {
                "type": Path,
                "required": True,
                "help": (
                    "Path to extraction run directory containing annotated JSON. "
                    "Example: _working/output/20260110_115453_pyphen/"
                ),
            },
        ),
        (
            "--syllables",
            {
                "type": int,
                "required": True,
                "choices": [2, 3, 4],
                "help": "Number of syllables per candidate name. Choices: 2, 3, 4.",
            },
        ),
        (
            "--count",
            {
                "type": int,
                "default": 10000,
                "help": "Number of candidates to generate. Default: 10000.",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": None,
                "help": (
                    "RNG seed for deterministic output. If not specified, uses system entropy. "
                    "Same seed always produces identical candidates."
                ),
            },
        ),
        (
            "--frequency-weight",
            {
                "type": float,
                "default": 1.0,
                "help": (
                    "Weight for frequency-biased sampling. "
                    "0.0 = uniform sampling, 1.0 = fully frequency-weighted. "
                    "Values between 0 and 1 interpolate. Default: 1.0."
                ),
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli




[docs]
def parse_arguments(args: list[str] | None = None) -> argparse.Namespace:
    """
    Run the name combiner's argument parser over an argument list.

    Parameters
    ----------
    args : list[str] | None, optional
        Argument strings to parse. ``None`` is passed through to
        ``ArgumentParser.parse_args``, which then reads ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        Namespace holding the parsed option values.
    """
    # A fresh parser is built on every call; nothing is cached.
    return create_argument_parser().parse_args(args)




[docs]
def discover_annotated_json(run_dir: Path) -> tuple[Path, str]:
    """
    Locate the annotated syllable JSON inside an extraction run directory.

    The file is looked up under ``run_dir / "data"``. The pyphen file
    (``pyphen_syllables_annotated.json``) is checked first and the nltk file
    (``nltk_syllables_annotated.json``) second; the first one that exists
    is returned.

    Parameters
    ----------
    run_dir : Path
        Path to extraction run directory.

    Returns
    -------
    tuple[Path, str]
        ``(path_to_annotated_json, prefix)`` where ``prefix`` is
        ``'pyphen'`` or ``'nltk'``.

    Raises
    ------
    FileNotFoundError
        If ``run_dir`` has no ``data/`` entry, or if neither annotated
        file exists inside it.
    """
    data_root = run_dir / "data"
    if not data_root.exists():
        raise FileNotFoundError(f"No data/ directory in {run_dir}")

    # Probe in priority order: pyphen wins when both files are present.
    for extractor in ("pyphen", "nltk"):
        candidate = data_root / f"{extractor}_syllables_annotated.json"
        if candidate.exists():
            return candidate, extractor

    raise FileNotFoundError(
        f"No annotated JSON found in {data_root}. "
        "Expected pyphen_syllables_annotated.json or nltk_syllables_annotated.json"
    )




[docs]
def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the name combiner CLI.

    Parses the arguments, locates and loads the annotated syllable JSON in
    the run directory, calls ``combine_syllables`` to build the candidates,
    and writes two files into ``<run_dir>/candidates/``:

    * ``{prefix}_candidates_{N}syl.json`` -- metadata plus the candidates.
    * ``{prefix}_combiner_meta.json`` -- a record of the arguments, input,
      output and summary statistics of this invocation.

    Progress messages go to stdout; error messages go to stderr.

    Parameters
    ----------
    args : list[str] | None, optional
        Command-line arguments. If None, uses sys.argv.

    Returns
    -------
    int
        Exit code: 0 on success, 1 if the run directory or annotated JSON
        is missing, or if the annotated JSON cannot be read or parsed.
    """
    # Import here to avoid circular imports and speed up --help
    from build_tools.name_combiner.combiner import combine_syllables

    parsed = parse_arguments(args)

    # Validate run directory
    run_dir = parsed.run_dir.resolve()
    if not run_dir.exists():
        print(f"Error: Run directory not found: {run_dir}", file=sys.stderr)
        return 1

    # Discover annotated JSON
    try:
        annotated_path, prefix = discover_annotated_json(run_dir)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Loading annotated data from: {annotated_path}")

    # Load annotated data. The encoding is given explicitly so the result
    # does not depend on the platform's default locale encoding.
    try:
        with open(annotated_path, encoding="utf-8") as f:
            annotated_data = json.load(f)
    except OSError as e:
        # Unreadable file (permissions, path is a directory, I/O failure).
        print(f"Error: Cannot read {annotated_path}: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; either one means the file is not usable JSON.
        print(f"Error: Invalid JSON in {annotated_path}: {e}", file=sys.stderr)
        return 1

    print(f"Loaded {len(annotated_data):,} syllables")

    # Generate candidates
    print(
        f"Generating {parsed.count:,} {parsed.syllables}-syllable candidates "
        f"(seed={parsed.seed}, frequency_weight={parsed.frequency_weight})"
    )

    candidates = combine_syllables(
        annotated_data=annotated_data,
        syllable_count=parsed.syllables,
        count=parsed.count,
        seed=parsed.seed,
        frequency_weight=parsed.frequency_weight,
    )

    print(f"Generated {len(candidates):,} candidates")

    # Prepare output
    candidates_dir = run_dir / "candidates"
    candidates_dir.mkdir(parents=True, exist_ok=True)

    output_filename = f"{prefix}_candidates_{parsed.syllables}syl.json"
    output_path = candidates_dir / output_filename

    # Build output structure
    output = {
        "metadata": {
            "source_run": run_dir.name,
            "source_annotated": annotated_path.name,
            "syllable_count": parsed.syllables,
            "total_candidates": len(candidates),
            "seed": parsed.seed,
            "frequency_weight": parsed.frequency_weight,
            "aggregation_rule": "majority",
            "generated_at": datetime.now(timezone.utc).isoformat(),
        },
        "candidates": candidates,
    }

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"Wrote candidates to: {output_path}")

    # Summary stats. Guard the division: an empty candidate list (e.g.
    # --count 0) must not crash the run after the candidates file has
    # already been written.
    unique_names = len({c["name"] for c in candidates})
    unique_percentage = unique_names / len(candidates) * 100 if candidates else 0.0
    print(f"Unique names: {unique_names:,} ({unique_percentage:.1f}%)")

    # Write meta file
    meta_output = {
        "tool": "name_combiner",
        "version": "1.0.0",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "arguments": {
            "run_dir": str(run_dir),
            "syllables": parsed.syllables,
            "count": parsed.count,
            "seed": parsed.seed,
            "frequency_weight": parsed.frequency_weight,
        },
        "input": {
            "annotated_file": str(annotated_path),
            "syllables_loaded": len(annotated_data),
        },
        "output": {
            "candidates_file": str(output_path),
            "candidates_generated": len(candidates),
            "unique_names": unique_names,
            "unique_percentage": round(unique_percentage, 2),
        },
        "statistics": {
            "source_run": run_dir.name,
            "source_prefix": prefix,
            "syllable_count": parsed.syllables,
            "frequency_weight": parsed.frequency_weight,
            "aggregation_rule": "majority",
        },
    }

    meta_filename = f"{prefix}_combiner_meta.json"
    meta_path = candidates_dir / meta_filename
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta_output, f, indent=2)

    print(f"Wrote meta to: {meta_path}")

    return 0



if __name__ == "__main__":  # pragma: no cover
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())