Source code for build_tools.name_selector.selector

"""
Main selector orchestration logic.

This module provides the high-level selection function that coordinates
loading candidates, evaluating them against a policy, and producing
ranked output.

The selector is the central orchestrator of the Selection Policy Layer.
It ties together:
- Candidate loading (from name_combiner output)
- Policy evaluation (from policy.py)
- Result ranking and filtering

Usage
-----
>>> import json
>>> from build_tools.name_selector import select_names, load_name_classes
>>>
>>> # Load policies and candidates
>>> policies = load_name_classes("data/name_classes.yml")
>>> with open("candidates/pyphen_candidates_2syl.json") as f:
...     candidates_data = json.load(f)
>>>
>>> # Select names
>>> selected = select_names(
...     candidates=candidates_data["candidates"],
...     policy=policies["first_name"],
...     count=100,
...     mode="hard",
... )
>>>
>>> for name in selected[:5]:
...     print(f"{name['name']}: score={name['score']}, rank={name['rank']}")
"""

from __future__ import annotations

import random
from typing import TYPE_CHECKING, Literal

from build_tools.name_selector.policy import check_syllable_count, evaluate_candidate

if TYPE_CHECKING:
    from collections.abc import Sequence

    from build_tools.name_selector.name_class import NameClassPolicy


def select_names(
    candidates: Sequence[dict],
    policy: NameClassPolicy,
    count: int = 100,
    mode: Literal["hard", "soft"] = "hard",
    order: Literal["alphabetical", "random"] = "alphabetical",
    seed: int | None = None,
) -> list[dict]:
    """
    Select and rank name candidates against a policy.

    Evaluates all candidates, filters out rejected ones, ranks by score,
    and returns the top N.

    Parameters
    ----------
    candidates : Sequence[dict]
        Candidate dictionaries (e.g. from name_combiner output). Each must
        have a "name" key. "syllables" and "features" are copied into the
        result when present and default to ``[]`` and ``{}`` otherwise.
    policy : NameClassPolicy
        The policy to evaluate against.
    count : int, optional
        Maximum number of names to return. Values below zero are treated
        as zero, so the result is never longer than requested.
        Default: 100.
    mode : {"hard", "soft"}, optional
        Evaluation mode, passed through to ``evaluate_candidate``.
        Default: "hard".
    order : {"alphabetical", "random"}, optional
        Ordering for names with equal scores. "alphabetical" sorts by
        name for deterministic output. "random" shuffles within score
        groups using the provided seed. Any other value behaves like
        "alphabetical". Default: "alphabetical".
    seed : int, optional
        RNG seed for random ordering. Only used when order="random".
        Required for deterministic random ordering. Default: None.

    Returns
    -------
    list[dict]
        Newly built dictionaries for the selected candidates, sorted by
        score (descending). Each contains "name", "syllables",
        "features", "score", "evaluation", and "rank". The input
        candidate dictionaries are not modified.

    Examples
    --------
    >>> selected = select_names(candidates, policy, count=50)
    >>> selected[0]["rank"]
    1
    >>> selected[0]["score"]  # Highest score
    4
    >>> len(selected)
    50

    Notes
    -----
    The returned candidates are augmented with:

    - score: int - The policy score
    - rank: int - 1-based rank (1 = best)
    - evaluation: dict - Detailed evaluation breakdown

    Rejection counts are not collected here; use
    ``compute_selection_statistics`` for aggregate rejection data.
    """
    admitted: list[dict] = []

    for candidate in candidates:
        # Syllable-count gate runs first; failures never reach evaluation.
        if not check_syllable_count(candidate, policy):
            continue

        is_admitted, score, details = evaluate_candidate(candidate, policy, mode=mode)
        if not is_admitted:
            continue

        # Build a fresh dict so the caller's candidate is left untouched.
        admitted.append(
            {
                "name": candidate["name"],
                "syllables": candidate.get("syllables", []),
                "features": candidate.get("features", {}),
                "score": score,
                "evaluation": details,
            }
        )

    # Score descending, then name. For "random" this fixed starting order is
    # what makes the seeded shuffle reproducible.
    admitted.sort(key=lambda x: (-x["score"], x["name"]))

    if order == "random":
        rng = random.Random(seed)  # nosec B311 - deterministic name ordering, not security
        admitted = _shuffle_within_score_groups(admitted, rng)

    # Clamp so a negative count cannot turn the slice into "all but the last N".
    result = admitted[: max(count, 0)]
    for rank, entry in enumerate(result, start=1):
        entry["rank"] = rank

    return result
def _shuffle_within_score_groups(admitted: list[dict], rng: random.Random) -> list[dict]: """ Shuffle candidates within each score group while preserving score order. Parameters ---------- admitted : list[dict] List of candidates sorted by score descending. rng : random.Random Random number generator for shuffling. Returns ------- list[dict] Candidates with shuffled order within score groups. """ if not admitted: return admitted result: list[dict] = [] current_score = admitted[0]["score"] current_group: list[dict] = [] for candidate in admitted: if candidate["score"] == current_score: current_group.append(candidate) else: # Shuffle and add the previous group rng.shuffle(current_group) result.extend(current_group) # Start new group current_score = candidate["score"] current_group = [candidate] # Don't forget the last group rng.shuffle(current_group) result.extend(current_group) return result
def compute_selection_statistics(
    candidates: Sequence[dict],
    policy: NameClassPolicy,
    mode: Literal["hard", "soft"] = "hard",
) -> dict:
    """
    Summarize how a candidate set fares against a policy.

    Runs the same two checks as selection (syllable count, then policy
    evaluation) on every candidate, but keeps only counters rather than
    the admitted candidates themselves.

    Parameters
    ----------
    candidates : Sequence[dict]
        List of candidate dictionaries.
    policy : NameClassPolicy
        The policy to evaluate against.
    mode : {"hard", "soft"}, optional
        Evaluation mode, passed through to ``evaluate_candidate``.
        Default: "hard".

    Returns
    -------
    dict
        Statistics dictionary containing:

        - total_evaluated: int - number of candidates given
        - admitted: int - candidates that passed both checks
        - rejected: int - total_evaluated minus admitted
        - rejection_reasons: dict[str, int] - reason -> count
        - score_distribution: dict[int, int] - score -> count,
          ordered from highest score to lowest

    Examples
    --------
    >>> stats = compute_selection_statistics(candidates, policy)
    >>> stats["admitted"]
    2341
    >>> stats["rejection_reasons"]["ends_with_stop"]
    1234
    """
    n_total = len(candidates)
    n_admitted = 0
    reasons_tally: dict[str, int] = {}
    scores_tally: dict[int, int] = {}

    for entry in candidates:
        if check_syllable_count(entry, policy):
            passed, score, details = evaluate_candidate(entry, policy, mode=mode)
            if passed:
                n_admitted += 1
                scores_tally[score] = scores_tally.get(score, 0) + 1
                continue
            # Policy rejection: use the evaluator's reason when it gives one.
            why = details.get("rejection_reason", "unknown")
        else:
            why = "syllable_count_out_of_range"

        # Both rejection paths are counted here.
        reasons_tally[why] = reasons_tally.get(why, 0) + 1

    highest_first = sorted(scores_tally.items(), reverse=True)

    return {
        "total_evaluated": n_total,
        "admitted": n_admitted,
        "rejected": n_total - n_admitted,
        "rejection_reasons": reasons_tally,
        "score_distribution": dict(highest_first),
    }