# Source code for build_tools.name_selector.cli

"""
Command-line interface for the name selector.

This module provides the CLI for filtering and ranking name candidates
against a name class policy. It follows the project's CLI documentation
standards with a sphinx-argparse-compatible argument parser.

Usage
-----
Select first names from 2-syllable candidates::

    python -m build_tools.name_selector \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --candidates candidates/pyphen_candidates_2syl.json \\
        --name-class first_name \\
        --count 100

Use soft mode (penalties instead of hard rejection)::

    python -m build_tools.name_selector \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --candidates candidates/pyphen_candidates_2syl.json \\
        --name-class first_name \\
        --mode soft
"""

from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path


def create_argument_parser() -> argparse.ArgumentParser:
    """
    Build the argument parser used by the name selector CLI.

    Returns
    -------
    argparse.ArgumentParser
        Parser configured with every option the tool accepts.

    Notes
    -----
    The parser is built in a standalone factory (rather than inline in
    ``main``) so that sphinx-argparse can import it and auto-generate the
    CLI documentation, per the project's CLI documentation standards.
    """
    summary = (
        "Filter and rank name candidates against a name class policy. "
        "Evaluates candidates using the 12-feature policy matrix and produces "
        "ranked, admissible name lists. This is a build-time tool for the "
        "Selection Policy Layer."
    )

    # Shown verbatim below the option list (RawDescriptionHelpFormatter).
    usage_examples = """
Examples::

    # Select first names from 2-syllable candidates
    python -m build_tools.name_selector \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --candidates candidates/pyphen_candidates_2syl.json \\
        --name-class first_name \\
        --count 100

    # Select place names with soft mode (penalties instead of rejection)
    python -m build_tools.name_selector \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --candidates candidates/pyphen_candidates_3syl.json \\
        --name-class place_name \\
        --mode soft

    # Use a custom policy file
    python -m build_tools.name_selector \\
        --run-dir _working/output/20260110_115453_pyphen/ \\
        --candidates candidates/pyphen_candidates_2syl.json \\
        --name-class first_name \\
        --policy-file custom_policies.yml

Output:
    Creates ``selections/{prefix}_{name_class}_{N}syl.json`` in the run directory.
    The prefix and syllable count are extracted from the candidates filename.
"""

    cli = argparse.ArgumentParser(
        description=summary,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flag, add_argument keyword arguments) in the order they appear in --help.
    option_table = [
        (
            "--run-dir",
            {
                "type": Path,
                "required": True,
                "help": (
                    "Path to extraction run directory. "
                    "Example: _working/output/20260110_115453_pyphen/"
                ),
            },
        ),
        (
            "--candidates",
            {
                "type": Path,
                "required": True,
                "help": (
                    "Path to candidates JSON file, relative to run-dir. "
                    "If the wrong prefix is specified (e.g., nltk_ for a pyphen run), "
                    "the correct file will be auto-detected. "
                    "Example: candidates/pyphen_candidates_2syl.json"
                ),
            },
        ),
        (
            "--name-class",
            {
                "type": str,
                "required": True,
                "help": (
                    "Name class identifier from name_classes.yml. "
                    "Examples: first_name, last_name, place_name"
                ),
            },
        ),
        (
            "--policy-file",
            {
                "type": Path,
                "default": None,
                "help": (
                    "Path to name_classes.yml. If not specified, uses data/name_classes.yml "
                    "from project root. Default: data/name_classes.yml"
                ),
            },
        ),
        (
            "--count",
            {
                "type": int,
                "default": 100,
                "help": "Maximum number of names to output. Default: 100.",
            },
        ),
        (
            "--mode",
            {
                "type": str,
                "choices": ["hard", "soft"],
                "default": "hard",
                "help": (
                    "Evaluation mode. 'hard' rejects candidates with discouraged features. "
                    "'soft' applies -10 penalty instead. Default: hard."
                ),
            },
        ),
    ]
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    return cli
def parse_arguments(args: list[str] | None = None) -> argparse.Namespace:
    """
    Parse name selector command-line arguments.

    Parameters
    ----------
    args : list[str] | None, optional
        Argument strings to parse. When None, argparse reads ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        Namespace holding the parsed option values.
    """
    return create_argument_parser().parse_args(args)
[docs] def extract_extractor_type(run_dir: Path) -> str | None: """ Extract extractor type from run directory name. Parameters ---------- run_dir : Path Run directory like "_working/output/20260118_201318_pyphen" Returns ------- str | None Extractor type (e.g., "pyphen", "nltk") or None if not found. """ # Pattern: YYYYMMDD_HHMMSS_{extractor} parts = run_dir.name.split("_") if len(parts) >= 3: return "_".join(parts[2:]) # Handle multi-word extractors return None
def resolve_candidates_path(run_dir: Path, candidates: Path) -> Path:
    """
    Resolve candidates path, auto-detecting prefix if needed.

    If the specified path doesn't exist, tries to find a matching file
    using the extractor type from the run directory name. Two fallbacks are
    attempted, in order:

    1. If the filename looks like ``{prefix}_candidates_...`` with a prefix
       that differs from the run's extractor type, the same filename with
       the extractor type as prefix is tried.
    2. Otherwise the candidates directory is searched for a
       ``{extractor}_candidates_*.json`` file (excluding ``_meta`` files)
       that carries the same syllable token (e.g., ``2syl``).

    A note is printed to stdout whenever a fallback is used.

    Parameters
    ----------
    run_dir : Path
        Run directory path
    candidates : Path
        Candidates path (relative to run_dir)

    Returns
    -------
    Path
        Resolved candidates path (may be different from input if
        auto-detected). When nothing matches, the original
        ``run_dir / candidates`` is returned even though it does not exist,
        so the caller can report the missing file.
    """
    candidates_path = run_dir / candidates
    if candidates_path.exists():
        return candidates_path

    # Try to auto-detect the correct prefix
    extractor_type = extract_extractor_type(run_dir)
    if not extractor_type:
        return candidates_path  # Return original, will fail with proper error

    stem = candidates.stem  # e.g., "nltk_candidates_2syl"
    parts = stem.split("_")
    candidates_dir = run_dir / candidates.parent

    # Fallback 1: the user specified the wrong prefix - try the correct one.
    if len(parts) >= 3 and parts[1] == "candidates" and parts[0] != extractor_type:
        wrong_prefix = parts[0]
        correct_path = candidates_dir / f"{extractor_type}_{'_'.join(parts[1:])}.json"
        if correct_path.exists():
            print(f"Note: Auto-corrected prefix from '{wrong_prefix}' to '{extractor_type}'")
            return correct_path

    # Fallback 2: look for any candidates file with the same syllable token.
    if "syl" in stem and candidates_dir.exists():
        syl_part = parts[-1]  # e.g., "2syl"
        # glob() yields entries in arbitrary order; sort so the same file is
        # chosen on every run when several match.
        for json_file in sorted(candidates_dir.glob(f"{extractor_type}_candidates_*.json")):
            if "_meta" in json_file.name:
                continue
            # Compare whole underscore-separated tokens: a substring test
            # would let "2syl" match "pyphen_candidates_12syl.json".
            if syl_part in json_file.stem.split("_"):
                print(f"Note: Found matching candidates file: {json_file.name}")
                return json_file

    return candidates_path  # Return original, will fail with proper error
def extract_prefix_and_syllables(candidates_filename: str) -> tuple[str, int]:
    """
    Extract prefix and syllable count from candidates filename.

    The prefix is everything before the ``candidates`` token, so multi-word
    extractor prefixes (e.g., ``nltk_custom_candidates_2syl.json``) are
    supported, matching how ``extract_extractor_type`` treats run
    directory names.

    Parameters
    ----------
    candidates_filename : str
        Filename like "pyphen_candidates_2syl.json"

    Returns
    -------
    tuple[str, int]
        (prefix, syllable_count) e.g., ("pyphen", 2)

    Raises
    ------
    ValueError
        If the filename has no ``candidates`` token preceded by a prefix and
        followed by at least one more token, or if the last token is not of
        the form ``{N}syl`` with an integer N.
    """
    # Expected: {prefix}_candidates_{N}syl.json
    stem = Path(candidates_filename).stem  # pyphen_candidates_2syl
    parts = stem.split("_")

    # Start the search at index 1 so that at least one token precedes the
    # marker; the marker must also not be the last token.
    marker = parts.index("candidates", 1) if "candidates" in parts[1:] else None
    if marker is None or marker == len(parts) - 1:
        raise ValueError(f"Unexpected candidates filename format: {candidates_filename}")

    prefix = "_".join(parts[:marker])

    # Extract syllable count from last part (e.g., "2syl" -> 2)
    syl_part = parts[-1]
    if not syl_part.endswith("syl"):
        raise ValueError(f"Cannot extract syllable count from: {candidates_filename}")

    try:
        syllables = int(syl_part[:-3])  # Remove "syl" suffix
    except ValueError as err:
        raise ValueError(f"Cannot parse syllable count from: {syl_part}") from err

    return prefix, syllables
def _admitted_percentage(admitted: int, total: int) -> float:
    """Return ``admitted / total`` as a percentage, or 0.0 when ``total`` is zero."""
    return admitted / total * 100 if total else 0.0


def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the name selector CLI.

    Loads the candidates file and the name class policies, evaluates and
    ranks the candidates, then writes two files into
    ``{run_dir}/selections/``: the selections JSON
    (``{prefix}_{name_class}_{N}syl.json``) and a run summary
    (``{prefix}_selector_meta.json``). Progress is printed to stdout and
    errors to stderr.

    Parameters
    ----------
    args : list[str] | None, optional
        Command-line arguments. If None, uses sys.argv.

    Returns
    -------
    int
        Exit code (0 for success, non-zero for error). 1 is returned when
        the run directory, candidates file or policy file is missing, when
        the candidates file cannot be read or is not a JSON object, when
        the policies fail to load, or when the name class is unknown.
    """
    # Parse first so --help and argument errors exit before the heavier
    # project imports below are paid for.
    parsed = parse_arguments(args)

    # Import here to avoid circular imports and speed up --help
    from build_tools.name_selector.name_class import get_default_policy_path, load_name_classes
    from build_tools.name_selector.selector import compute_selection_statistics, select_names

    # Validate run directory
    run_dir = parsed.run_dir.resolve()
    if not run_dir.exists():
        print(f"Error: Run directory not found: {run_dir}", file=sys.stderr)
        return 1

    # Resolve candidates path (with auto-detection)
    candidates_path = resolve_candidates_path(run_dir, parsed.candidates)
    if not candidates_path.exists():
        # Provide helpful error message
        extractor_type = extract_extractor_type(run_dir)
        if extractor_type:
            expected = f"candidates/{extractor_type}_candidates_Nsyl.json"
            print(
                f"Error: Candidates file not found: {run_dir / parsed.candidates}\n"
                f" Hint: This is a '{extractor_type}' run. Expected format: {expected}",
                file=sys.stderr,
            )
        else:
            print(
                f"Error: Candidates file not found: {run_dir / parsed.candidates}",
                file=sys.stderr,
            )
        return 1

    # Load candidates
    print(f"Loading candidates from: {candidates_path}")
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(candidates_path, encoding="utf-8") as f:
            candidates_data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Error: Invalid JSON in {candidates_path}: {e}", file=sys.stderr)
        return 1
    except OSError as e:
        # e.g. the path is a directory or is not readable
        print(f"Error: Cannot read {candidates_path}: {e}", file=sys.stderr)
        return 1

    # The lookups below use .get(), so anything but a JSON object would crash.
    if not isinstance(candidates_data, dict):
        print(
            f"Error: Expected a JSON object in {candidates_path}, "
            f"got {type(candidates_data).__name__}",
            file=sys.stderr,
        )
        return 1

    candidates = candidates_data.get("candidates", [])
    print(f"Loaded {len(candidates):,} candidates")

    # Resolve policy file
    policy_path = parsed.policy_file or get_default_policy_path()
    if not policy_path.exists():
        print(f"Error: Policy file not found: {policy_path}", file=sys.stderr)
        return 1

    # Load policies
    print(f"Loading policies from: {policy_path}")
    try:
        policies = load_name_classes(policy_path)
    except (ValueError, FileNotFoundError) as e:
        print(f"Error loading policies: {e}", file=sys.stderr)
        return 1

    # Get target policy
    if parsed.name_class not in policies:
        available = ", ".join(sorted(policies.keys()))
        print(
            f"Error: Unknown name class '{parsed.name_class}'. "
            f"Available: {available}",
            file=sys.stderr,
        )
        return 1

    policy = policies[parsed.name_class]
    print(f"Using policy: {parsed.name_class} - {policy.description}")

    # Compute statistics
    print(f"Evaluating candidates (mode={parsed.mode})...")
    stats = compute_selection_statistics(candidates, policy, mode=parsed.mode)  # type: ignore[arg-type]

    # An empty candidates list gives total_evaluated == 0; the helper avoids
    # dividing by zero in that case.
    admitted_pct = _admitted_percentage(stats["admitted"], stats["total_evaluated"])

    print(f" Evaluated: {stats['total_evaluated']:,}")
    print(f" Admitted: {stats['admitted']:,} ({admitted_pct:.1f}%)")
    print(f" Rejected: {stats['rejected']:,}")

    if stats["rejection_reasons"]:
        print(" Rejection reasons:")
        # Most frequent reason first
        for reason, count in sorted(stats["rejection_reasons"].items(), key=lambda x: -x[1]):
            print(f" {reason}: {count:,}")

    # Select top names
    selected = select_names(candidates, policy, count=parsed.count, mode=parsed.mode)  # type: ignore[arg-type]
    print(f"Selected top {len(selected):,} names")

    # Prepare output - use resolved candidates_path (may have auto-corrected prefix)
    try:
        prefix, syllables = extract_prefix_and_syllables(candidates_path.name)
    except ValueError as e:
        print(f"Warning: {e}. Using defaults.", file=sys.stderr)
        prefix = "unknown"
        syllables = candidates_data.get("metadata", {}).get("syllable_count", 0)

    selections_dir = run_dir / "selections"
    selections_dir.mkdir(parents=True, exist_ok=True)

    output_filename = f"{prefix}_{parsed.name_class}_{syllables}syl.json"
    output_path = selections_dir / output_filename

    # Build output structure
    output = {
        "metadata": {
            # Name of the file actually loaded, which differs from the
            # user-supplied name when the prefix was auto-corrected.
            "source_candidates": candidates_path.name,
            "name_class": parsed.name_class,
            "policy_description": policy.description,
            "policy_file": str(policy_path),
            "mode": parsed.mode,
            "total_evaluated": stats["total_evaluated"],
            "admitted": stats["admitted"],
            "rejected": stats["rejected"],
            "rejection_reasons": stats["rejection_reasons"],
            "score_distribution": stats["score_distribution"],
            "output_count": len(selected),
            "generated_at": datetime.now(timezone.utc).isoformat(),
        },
        "selections": selected,
    }

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"Wrote selections to: {output_path}")

    # Show top 5 samples
    if selected:
        print("\nTop 5 selections:")
        for s in selected[:5]:
            # Number of features with a truthy value for this name
            features_summary = sum(1 for value in s["features"].values() if value)
            print(
                f" {s['rank']:3d}. {s['name']:15s} score={s['score']:2d} "
                f"({features_summary} features)"
            )

    # Write meta file
    meta_output = {
        "tool": "name_selector",
        "version": "1.0.0",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "arguments": {
            "run_dir": str(run_dir),
            "candidates": str(parsed.candidates),
            "name_class": parsed.name_class,
            "policy_file": str(policy_path),
            "count": parsed.count,
            "mode": parsed.mode,
        },
        "input": {
            "candidates_file": str(candidates_path),
            "candidates_loaded": len(candidates),
            "policy_file": str(policy_path),
            "policy_name": parsed.name_class,
            "policy_description": policy.description,
        },
        "output": {
            "selections_file": str(output_path),
            "selections_count": len(selected),
        },
        "statistics": {
            "total_evaluated": stats["total_evaluated"],
            "admitted": stats["admitted"],
            "admitted_percentage": round(admitted_pct, 2),
            "rejected": stats["rejected"],
            "rejection_reasons": stats["rejection_reasons"],
            "score_distribution": stats["score_distribution"],
            "mode": parsed.mode,
            "source_prefix": prefix,
            "syllable_count": syllables,
        },
    }

    meta_filename = f"{prefix}_selector_meta.json"
    meta_path = selections_dir / meta_filename
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta_output, f, indent=2)

    print(f"Wrote meta to: {meta_path}")

    return 0
if __name__ == "__main__":  # pragma: no cover
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())