"""
Main selector orchestration logic.
This module provides the high-level selection function that coordinates
loading candidates, evaluating them against a policy, and producing
ranked output.
The selector is the central orchestrator of the Selection Policy Layer.
It ties together:
- Candidate loading (from name_combiner output)
- Policy evaluation (from policy.py)
- Result ranking and filtering
Usage
-----
>>> import json
>>> from build_tools.name_selector import select_names, load_name_classes
>>>
>>> # Load policies and candidates
>>> policies = load_name_classes("data/name_classes.yml")
>>> with open("candidates/pyphen_candidates_2syl.json") as f:
...     candidates_data = json.load(f)
>>>
>>> # Select names
>>> selected = select_names(
...     candidates=candidates_data["candidates"],
...     policy=policies["first_name"],
...     count=100,
...     mode="hard",
... )
>>>
>>> for name in selected[:5]:
...     print(f"{name['name']}: score={name['score']}, rank={name['rank']}")
"""
from __future__ import annotations

import random
from itertools import groupby
from typing import TYPE_CHECKING, Literal

from build_tools.name_selector.policy import check_syllable_count, evaluate_candidate

if TYPE_CHECKING:
    from collections.abc import Sequence

    from build_tools.name_selector.name_class import NameClassPolicy
def select_names(
    candidates: Sequence[dict],
    policy: NameClassPolicy,
    count: int = 100,
    mode: Literal["hard", "soft"] = "hard",
    order: Literal["alphabetical", "random"] = "alphabetical",
    seed: int | None = None,
) -> list[dict]:
    """
    Select and rank name candidates against a policy.

    Evaluates all candidates, filters out rejected ones, ranks by score,
    and returns the top N.

    Parameters
    ----------
    candidates : Sequence[dict]
        Candidate dictionaries from name_combiner output. Each must have a
        "name" key. "syllables" and "features" are copied into the output
        and default to ``[]`` and ``{}`` there when absent (the policy
        checks may still require them; see ``policy.py``).
    policy : NameClassPolicy
        The policy to evaluate against.
    count : int, optional
        Maximum number of names to return. Values below zero are treated
        as zero. Default: 100.
    mode : {"hard", "soft"}, optional
        Evaluation mode, passed through to ``evaluate_candidate``.
        "hard" rejects on discouraged features. "soft" applies penalties.
        Default: "hard".
    order : {"alphabetical", "random"}, optional
        Ordering for names with equal scores. "alphabetical" sorts by name
        for deterministic output. "random" shuffles within score groups
        using the provided seed. Any other value behaves like
        "alphabetical". Default: "alphabetical".
    seed : int, optional
        RNG seed for random ordering. Only used when order="random".
        Required for deterministic random ordering. Default: None.

    Returns
    -------
    list[dict]
        Newly built candidate dictionaries, sorted by score (descending),
        at most ``count`` long. Each has the keys "name", "syllables",
        "features", "score", "evaluation" and "rank".

    Raises
    ------
    KeyError
        If an admitted candidate has no "name" key.

    Examples
    --------
    >>> selected = select_names(candidates, policy, count=50)
    >>> selected[0]["rank"]
    1
    >>> selected[0]["score"]  # Highest score
    4
    >>> len(selected)
    50

    Notes
    -----
    The returned candidates are augmented with:

    - score: int - The policy score
    - rank: int - 1-based rank (1 = best)
    - evaluation: dict - Detailed evaluation breakdown

    Rejection reasons are not tallied here; use
    ``compute_selection_statistics`` for that.
    """
    admitted: list[dict] = []

    for candidate in candidates:
        # Syllable-count gate runs before the full policy evaluation.
        if not check_syllable_count(candidate, policy):
            continue

        is_admitted, score, details = evaluate_candidate(candidate, policy, mode=mode)
        if not is_admitted:
            continue

        # Build a fresh dict so the caller's candidate is never mutated
        # when "score"/"evaluation"/"rank" are attached.
        admitted.append(
            {
                "name": candidate["name"],
                "syllables": candidate.get("syllables", []),
                "features": candidate.get("features", {}),
                "score": score,
                "evaluation": details,
            }
        )

    # Score descending, then name ascending. Both orderings start from this
    # canonical order so that a seeded shuffle is reproducible regardless of
    # the input order.
    admitted.sort(key=lambda entry: (-entry["score"], entry["name"]))

    if order == "random":
        rng = random.Random(seed)  # nosec B311 - deterministic name ordering, not security
        admitted = _shuffle_within_score_groups(admitted, rng)

    # Clamp so a negative count yields an empty result instead of slicing
    # "everything except the last N" off the ranked list.
    result = admitted[: max(count, 0)]
    for rank, entry in enumerate(result, start=1):
        entry["rank"] = rank

    return result
def _shuffle_within_score_groups(admitted: list[dict], rng: random.Random) -> list[dict]:
"""
Shuffle candidates within each score group while preserving score order.
Parameters
----------
admitted : list[dict]
List of candidates sorted by score descending.
rng : random.Random
Random number generator for shuffling.
Returns
-------
list[dict]
Candidates with shuffled order within score groups.
"""
if not admitted:
return admitted
result: list[dict] = []
current_score = admitted[0]["score"]
current_group: list[dict] = []
for candidate in admitted:
if candidate["score"] == current_score:
current_group.append(candidate)
else:
# Shuffle and add the previous group
rng.shuffle(current_group)
result.extend(current_group)
# Start new group
current_score = candidate["score"]
current_group = [candidate]
# Don't forget the last group
rng.shuffle(current_group)
result.extend(current_group)
return result
def compute_selection_statistics(
    candidates: Sequence[dict],
    policy: NameClassPolicy,
    mode: Literal["hard", "soft"] = "hard",
) -> dict:
    """
    Compute statistics about a selection operation.

    Evaluates all candidates and returns aggregate statistics without
    building the full result list.

    Parameters
    ----------
    candidates : Sequence[dict]
        Candidate dictionaries. The input is iterated exactly once and its
        length is never requested, so a generator or other one-shot
        iterable is accepted as well as a sequence.
    policy : NameClassPolicy
        The policy to evaluate against.
    mode : {"hard", "soft"}, optional
        Evaluation mode, passed through to ``evaluate_candidate``.
        Default: "hard".

    Returns
    -------
    dict
        Statistics dictionary containing:

        - total_evaluated: int - number of candidates seen
        - admitted: int
        - rejected: int - ``total_evaluated - admitted``
        - rejection_reasons: dict[str, int] - reason -> count, where the
          syllable-count gate is recorded as "syllable_count_out_of_range"
          and an evaluation without a "rejection_reason" detail is
          recorded as "unknown"
        - score_distribution: dict[int, int] (score -> count), ordered
          from highest score to lowest

    Examples
    --------
    >>> stats = compute_selection_statistics(candidates, policy)
    >>> stats["admitted"]
    2341
    >>> stats["rejection_reasons"]["ends_with_stop"]
    1234
    """
    total_evaluated = 0
    admitted_count = 0
    rejection_reasons: dict[str, int] = {}
    score_distribution: dict[int, int] = {}

    for candidate in candidates:
        # Count while iterating instead of calling len(), so one-shot
        # iterables work and the input is traversed only once.
        total_evaluated += 1

        # Syllable-count gate runs before the full policy evaluation.
        if not check_syllable_count(candidate, policy):
            reason = "syllable_count_out_of_range"
            rejection_reasons[reason] = rejection_reasons.get(reason, 0) + 1
            continue

        is_admitted, score, details = evaluate_candidate(candidate, policy, mode=mode)
        if not is_admitted:
            reason = details.get("rejection_reason", "unknown")
            rejection_reasons[reason] = rejection_reasons.get(reason, 0) + 1
            continue

        admitted_count += 1
        score_distribution[score] = score_distribution.get(score, 0) + 1

    return {
        "total_evaluated": total_evaluated,
        "admitted": admitted_count,
        "rejected": total_evaluated - admitted_count,
        "rejection_reasons": rejection_reasons,
        # Re-key in descending score order so the best scores come first.
        "score_distribution": dict(sorted(score_distribution.items(), reverse=True)),
    }