# Source code for build_tools.name_combiner.combiner

"""
Core combination logic for name candidate generation.

This module provides the main combination functionality that takes an
annotated syllable corpus and produces N-syllable name candidates with
aggregated feature vectors.

The combiner is intentionally simple - it performs structural combination
without any policy evaluation. Policy-based filtering is the responsibility
of the name_selector module.

Combination Strategy
--------------------
The default combination strategy uses frequency-weighted random sampling:

1. Load annotated syllables with their frequencies
2. Build a weighted probability distribution (higher frequency = more likely)
3. Sample N syllables using the isolated RNG instance
4. Concatenate syllables to form a name
5. Aggregate features using the rules in aggregator.py

This produces candidates that reflect the natural distribution of the corpus
while maintaining full determinism through seed control.

Determinism
-----------
**Critical**: All combination uses `random.Random(seed)` to create isolated
RNG instances. This ensures:

- Same seed always produces identical candidates
- No global state contamination
- Reproducible builds across sessions

Usage
-----
>>> from build_tools.name_combiner.combiner import combine_syllables
>>> candidates = combine_syllables(
...     annotated_data=corpus,
...     syllable_count=2,
...     count=100,
...     seed=42,
... )
>>> for c in candidates[:3]:
...     print(f"{c['name']}: score-ready features")
"""

from __future__ import annotations

import random
from typing import TYPE_CHECKING

from build_tools.name_combiner.aggregator import aggregate_features

if TYPE_CHECKING:
    from collections.abc import Sequence



# [docs] (Sphinx viewcode link marker left over from the HTML rendering)
def combine_syllables(
    annotated_data: Sequence[dict],
    syllable_count: int,
    count: int,
    seed: int | None = None,
    frequency_weight: float = 1.0,
) -> list[dict]:
    """
    Generate name candidates by combining syllables from an annotated corpus.

    Takes an annotated syllable corpus and produces N-syllable name candidates
    with aggregated feature vectors suitable for policy evaluation.

    Parameters
    ----------
    annotated_data : Sequence[dict]
        List of annotated syllable dictionaries, each containing:
        - "syllable": str - The syllable text
        - "frequency": int - Occurrence count in source corpus
        - "features": dict[str, bool] - The 12 boolean features

    syllable_count : int
        Number of syllables per generated name (typically 2, 3, or 4).

    count : int
        Number of candidates to generate.

    seed : int | None, optional
        RNG seed for deterministic output. If None, uses system entropy.
        Default: None.

    frequency_weight : float, optional
        Weight for frequency-biased sampling. 0.0 = uniform sampling,
        1.0 = fully frequency-weighted. Values between 0 and 1 interpolate.
        Default: 1.0.

    Returns
    -------
    list[dict]
        List of candidate dictionaries, each containing:
        - "name": str - The combined name (concatenated syllables)
        - "syllables": list[str] - The constituent syllables
        - "features": dict[str, bool] - Aggregated name-level features

    Raises
    ------
    ValueError
        If annotated_data is empty or syllable_count < 1.

    Examples
    --------
    >>> corpus = [
    ...     {"syllable": "ka", "frequency": 100, "features": {...}},
    ...     {"syllable": "li", "frequency": 50, "features": {...}},
    ...     {"syllable": "ra", "frequency": 75, "features": {...}},
    ... ]
    >>> candidates = combine_syllables(corpus, syllable_count=2, count=5, seed=42)
    >>> len(candidates)
    5
    >>> candidates[0]["name"]  # Deterministic with seed=42
    'kali'  # Example output
    >>> candidates[0]["syllables"]
    ['ka', 'li']

    Notes
    -----
    **Determinism**: Uses `random.Random(seed)` for isolated RNG. Same seed
    always produces identical output.

    **Frequency Weighting**: Higher frequency syllables are more likely to
    be sampled. This reflects the natural distribution of the source corpus
    and tends to produce more "natural-sounding" combinations.

    **No Policy Evaluation**: This function performs structural combination
    only. Policy-based filtering is done by the name_selector module.
    """
    if not annotated_data:
        raise ValueError("Cannot combine from empty annotated data")
    if syllable_count < 1:
        raise ValueError(f"syllable_count must be >= 1, got {syllable_count}")
    if count < 1:
        raise ValueError(f"count must be >= 1, got {count}")

    # Create isolated RNG instance (critical for determinism)
    rng = random.Random(seed)  # nosec B311 - intentional seeded RNG for reproducibility

    # Build weighted probability distribution
    syllables_list = list(annotated_data)
    weights = _compute_weights(syllables_list, frequency_weight)

    candidates: list[dict] = []

    for _ in range(count):
        # Sample N syllables with replacement
        selected = rng.choices(syllables_list, weights=weights, k=syllable_count)

        # Build candidate
        name = "".join(s["syllable"] for s in selected)
        syllable_texts = [s["syllable"] for s in selected]
        features = aggregate_features(selected)

        candidates.append(
            {
                "name": name,
                "syllables": syllable_texts,
                "features": features,
            }
        )

    return candidates



def _compute_weights(
    annotated_data: Sequence[dict],
    frequency_weight: float,
) -> list[float]:
    """
    Compute sampling weights from syllable frequencies.

    Parameters
    ----------
    annotated_data : Sequence[dict]
        Annotated syllables with "frequency" key.

    frequency_weight : float
        Interpolation factor: 0.0 = uniform, 1.0 = fully weighted.

    Returns
    -------
    list[float]
        Sampling weights (not normalized, random.choices handles that).
    """
    if frequency_weight <= 0.0:
        # Uniform weights
        return [1.0] * len(annotated_data)

    frequencies = [s.get("frequency", 1) for s in annotated_data]

    if frequency_weight >= 1.0:
        # Fully frequency-weighted
        return [float(f) for f in frequencies]

    # Interpolate between uniform and frequency-weighted
    uniform = 1.0
    return [uniform * (1.0 - frequency_weight) + float(f) * frequency_weight for f in frequencies]