Source code for build_tools.syllable_walk_tui.services.metrics

"""
Corpus shape metrics computation.

This module provides dataclasses and pure functions for computing raw,
objective metrics about corpus shape. These metrics characterize the
statistical structure of a syllable corpus without interpretation.

Design Philosophy:
    - Raw numbers only, no interpretation or judgment
    - Pure functions (no side effects, no I/O)
    - All metrics are observable facts about the corpus
    - Users draw their own conclusions from the data

Metric Categories:
    - Inventory: What exists (counts, lengths)
    - Frequency: Weight distribution (how syllables are distributed)
    - Feature Saturation: Phonetic feature coverage (per-feature counts)
    - Terrain: Phonaesthetic axis scores derived from feature saturation

Usage:
    >>> from build_tools.syllable_walk_tui.services.metrics import (
    ...     compute_corpus_shape_metrics
    ... )
    >>> metrics = compute_corpus_shape_metrics(syllables, frequencies, annotated_data)
    >>> print(f"Total syllables: {metrics.inventory.total_count}")
    >>> print(f"Hapax count: {metrics.frequency.hapax_count}")
"""

from __future__ import annotations

import random
import statistics
from dataclasses import dataclass, field
from typing import TYPE_CHECKING

import numpy as np

from build_tools.syllable_walk_tui.services.terrain_weights import (
    DEFAULT_TERRAIN_WEIGHTS,
    AxisWeights,
    TerrainWeights,
)

if TYPE_CHECKING:
    from collections.abc import Sequence

# =============================================================================
# Inventory Metrics
# =============================================================================



[docs]
@dataclass(frozen=True)
class InventoryMetrics:
    """
    Immutable summary of the syllable inventory.

    Holds plain counts and descriptive statistics over syllable lengths.
    Nothing in this record is scored or interpreted.

    Attributes:
        total_count: Number of syllables measured
        length_min: Length of the shortest syllable, in characters
        length_max: Length of the longest syllable, in characters
        length_mean: Arithmetic mean of syllable lengths
        length_median: Median of syllable lengths
        length_std: Standard deviation of syllable lengths
        length_distribution: Histogram of lengths as {length: count};
            defaults to a fresh empty dict per instance
    """

    # Size of the inventory.
    total_count: int

    # Extremes of syllable length (characters).
    length_min: int
    length_max: int

    # Central tendency and spread of syllable length.
    length_mean: float
    length_median: float
    length_std: float

    # Mutable default, so it must come from a factory.
    length_distribution: dict[int, int] = field(default_factory=dict)




[docs]
def compute_inventory_metrics(syllables: Sequence[str]) -> InventoryMetrics:
    """
    Summarise the length statistics of a syllable inventory.

    Args:
        syllables: Non-empty sequence of syllables to measure

    Returns:
        InventoryMetrics holding the count, min/max/mean/median/std of the
        syllable lengths, and a length histogram ordered by ascending length

    Raises:
        ValueError: If syllables is empty
    """
    if not syllables:
        raise ValueError("Cannot compute metrics for empty syllable list")

    char_counts = [len(syllable) for syllable in syllables]

    # Walk the lengths in ascending order so the histogram's keys end up
    # sorted without a separate re-sorting pass.
    histogram: dict[int, int] = {}
    for n_chars in sorted(char_counts):
        histogram[n_chars] = histogram.get(n_chars, 0) + 1

    # statistics.stdev needs at least two data points; a lone syllable has
    # no spread, so report 0.0 instead.
    spread = statistics.stdev(char_counts) if len(char_counts) >= 2 else 0.0

    return InventoryMetrics(
        total_count=len(syllables),
        length_min=min(char_counts),
        length_max=max(char_counts),
        length_mean=statistics.mean(char_counts),
        length_median=statistics.median(char_counts),
        length_std=spread,
        length_distribution=histogram,
    )



# =============================================================================
# Frequency Metrics
# =============================================================================



[docs]
@dataclass(frozen=True)
class FrequencyMetrics:
    """
    Immutable summary of how occurrence counts are spread over syllables.

    Attributes:
        total_occurrences: Sum of every frequency count
        freq_min: Smallest frequency value
        freq_max: Largest frequency value
        freq_mean: Mean frequency
        freq_median: Median frequency
        freq_std: Standard deviation of the frequencies
        percentile_10: 10th percentile frequency
        percentile_25: 25th percentile frequency (Q1)
        percentile_50: 50th percentile frequency (median)
        percentile_75: 75th percentile frequency (Q3)
        percentile_90: 90th percentile frequency
        percentile_99: 99th percentile frequency
        unique_freq_count: How many distinct frequency values occur
        hapax_count: How many syllables occur exactly once
        top_10: Up to ten most frequent syllables as (syllable, freq) pairs
        bottom_10: Up to ten least frequent syllables as (syllable, freq) pairs
    """

    # Totals and extremes.
    total_occurrences: int
    freq_min: int
    freq_max: int

    # Central tendency and spread.
    freq_mean: float
    freq_median: float
    freq_std: float

    # Percentile cut points, stored as integers.
    percentile_10: int
    percentile_25: int
    percentile_50: int
    percentile_75: int
    percentile_90: int
    percentile_99: int

    # Shape of the tail.
    unique_freq_count: int
    hapax_count: int

    # Ranked extracts; an empty tuple is immutable, so it is a safe default.
    top_10: tuple[tuple[str, int], ...] = ()
    bottom_10: tuple[tuple[str, int], ...] = ()




[docs]
def compute_frequency_metrics(frequencies: dict[str, int]) -> FrequencyMetrics:
    """
    Summarise the distribution of syllable frequency counts.

    Args:
        frequencies: Mapping of syllable -> frequency count (non-empty)

    Returns:
        FrequencyMetrics with totals, spread statistics, percentiles
        (truncated to int), hapax count and the top/bottom ten syllables

    Raises:
        ValueError: If frequencies is empty
    """
    if not frequencies:
        raise ValueError("Cannot compute metrics for empty frequencies dict")

    counts = list(frequencies.values())

    # One numpy call yields all six percentile cut points in order.
    p10, p25, p50, p75, p90, p99 = np.percentile(
        np.array(counts, dtype=np.int64), [10, 25, 50, 75, 90, 99]
    )

    # Rank syllables from most to least frequent. The head of the ranking is
    # the top ten; the tail, reversed, lists the rarest syllables first.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    most_frequent = tuple(ranked[:10])
    least_frequent = tuple(reversed(ranked[-10:]))

    # statistics.stdev needs at least two data points.
    spread = statistics.stdev(counts) if len(counts) >= 2 else 0.0

    return FrequencyMetrics(
        total_occurrences=sum(counts),
        freq_min=min(counts),
        freq_max=max(counts),
        freq_mean=statistics.mean(counts),
        freq_median=statistics.median(counts),
        freq_std=spread,
        percentile_10=int(p10),
        percentile_25=int(p25),
        percentile_50=int(p50),
        percentile_75=int(p75),
        percentile_90=int(p90),
        percentile_99=int(p99),
        unique_freq_count=len(set(counts)),
        # Hapax legomena: syllables whose frequency is exactly 1.
        hapax_count=counts.count(1),
        top_10=most_frequent,
        bottom_10=least_frequent,
    )



# =============================================================================
# Feature Saturation Metrics
# =============================================================================

# Canonical feature order (matches annotator output).
#
# The order is significant inside this module: per-feature saturation results
# are produced by iterating this tuple, so the resulting feature tuple comes
# out in exactly this order. A feature missing from an entry's "features"
# dict is counted as False.
#
# NOTE(review): these names are presumably the exact keys emitted by the
# annotator — confirm against the annotator before adding or renaming entries.
FEATURE_NAMES: tuple[str, ...] = (
    "starts_with_vowel",
    "starts_with_cluster",
    "starts_with_heavy_cluster",
    "contains_plosive",
    "contains_fricative",
    "contains_liquid",
    "contains_nasal",
    "short_vowel",
    "long_vowel",
    "ends_with_vowel",
    "ends_with_nasal",
    "ends_with_stop",
)



[docs]
@dataclass(frozen=True)
class FeatureSaturation:
    """
    Immutable saturation record for one phonetic feature.

    Attributes:
        feature_name: Name of the feature being counted
        true_count: How many syllables have the feature set
        false_count: How many syllables do not have the feature set
        true_percentage: Share of the corpus with the feature set, on a
            0-100 scale
    """

    # Which feature this record describes.
    feature_name: str

    # Split of the corpus by feature value.
    true_count: int
    false_count: int

    # true_count expressed as a percentage of the corpus.
    true_percentage: float




[docs]
@dataclass(frozen=True)
class FeatureSaturationMetrics:
    """
    Saturation records for the full set of 12 phonetic features.

    The same FeatureSaturation records are exposed twice: as an ordered
    tuple for display and as a name-keyed dict for direct lookup.

    Attributes:
        total_syllables: Number of syllables that were analysed
        features: FeatureSaturation records in canonical feature order
        by_name: Feature name -> FeatureSaturation lookup table
    """

    # Size of the analysed corpus.
    total_syllables: int

    # Ordered view; an empty tuple is immutable, so it is a safe default.
    features: tuple[FeatureSaturation, ...] = ()

    # Lookup view; mutable, so each instance gets its own dict.
    by_name: dict[str, FeatureSaturation] = field(default_factory=dict)




[docs]
def compute_feature_saturation_metrics(
    annotated_data: Sequence[dict],
) -> FeatureSaturationMetrics:
    """
    Count, for every canonical feature, how many syllables carry it.

    Only the first entry is checked for a 'features' key. Later entries
    that lack the key, or lack an individual feature, count as False for
    the affected features.

    Args:
        annotated_data: List of dicts with 'syllable', 'frequency', 'features' keys

    Returns:
        FeatureSaturationMetrics with one FeatureSaturation per canonical
        feature, in FEATURE_NAMES order

    Raises:
        ValueError: If annotated_data is empty, or its first entry has no
            'features' key
    """
    if not annotated_data:
        raise ValueError("Cannot compute metrics for empty annotated data")

    # Structural sanity check on the first entry only.
    if "features" not in annotated_data[0]:
        raise ValueError("Annotated data entries must have 'features' key")

    n_entries = len(annotated_data)

    # One running tally per feature, kept in canonical order.
    tallies: dict[str, int] = dict.fromkeys(FEATURE_NAMES, 0)
    for record in annotated_data:
        flags = record.get("features", {})
        for feature in FEATURE_NAMES:
            if flags.get(feature, False):
                tallies[feature] += 1

    # n_entries is at least 1 here (empty input was rejected above), so the
    # percentage division is always defined.
    lookup: dict[str, FeatureSaturation] = {
        feature: FeatureSaturation(
            feature_name=feature,
            true_count=hits,
            false_count=n_entries - hits,
            true_percentage=(hits / n_entries) * 100.0,
        )
        for feature, hits in tallies.items()
    }

    return FeatureSaturationMetrics(
        total_syllables=n_entries,
        features=tuple(lookup.values()),
        by_name=lookup,
    )



# =============================================================================
# Terrain Metrics (Phonaesthetic Axes)
# =============================================================================

# Weights are defined in terrain_weights.py with full phonaesthetic rationale.
# See that module for documentation of each weight's justification.
#
# IMPORTANT: Each axis must be BIPOLAR - features pulling BOTH directions.
# Without this, axes measure "Englishness" not phonaesthetic shape.
# See Section 12 of _working/sfa_shapes_terrain_map.md for calibration findings.



[docs]
@dataclass(frozen=True)
class PoleExemplars:
    """
    Example syllables taken from the two ends of one terrain axis.

    Concrete examples make it easier to see what kind of syllable sits at
    each extreme of a phonaesthetic axis.

    Attributes:
        axis_name: Which axis these belong to ("shape", "craft", or "space")
        low_pole_exemplars: Examples from the low end (Round/Flowing/Open)
        high_pole_exemplars: Examples from the high end (Jagged/Worked/Dense)
    """

    # Axis identifier.
    axis_name: str

    # Sampled syllables, one tuple per pole.
    low_pole_exemplars: tuple[str, ...]
    high_pole_exemplars: tuple[str, ...]




[docs]
@dataclass(frozen=True)
class TerrainMetrics:
    """
    Position of a corpus on three phonaesthetic axes.

    Each axis is derived from feature saturation percentages:
    - Shape: Round (0.0) ↔ Jagged (1.0) - Bouba/Kiki dimension
    - Craft: Flowing (0.0) ↔ Worked (1.0) - Sung/Forged dimension
    - Space: Open (0.0) ↔ Dense (1.0) - Valley/Workshop dimension

    Every score lies in the 0.0-1.0 range, with 0.5 as the neutral point.

    Attributes:
        shape_score: Where the corpus sits on Round↔Jagged (0.0-1.0)
        craft_score: Where the corpus sits on Flowing↔Worked (0.0-1.0)
        space_score: Where the corpus sits on Open↔Dense (0.0-1.0)
        shape_label: Human-readable label for the shape position
        craft_label: Human-readable label for the craft position
        space_label: Human-readable label for the space position
        shape_exemplars: Example syllables for the shape axis, or None
        craft_exemplars: Example syllables for the craft axis, or None
        space_exemplars: Example syllables for the space axis, or None
    """

    # Numeric axis positions.
    shape_score: float
    craft_score: float
    space_score: float

    # Text labels matching the scores above.
    shape_label: str
    craft_label: str
    space_label: str

    # Optional per-axis examples; None when no exemplars were computed.
    shape_exemplars: PoleExemplars | None = None
    craft_exemplars: PoleExemplars | None = None
    space_exemplars: PoleExemplars | None = None



def _compute_axis_score(
    feature_saturation: FeatureSaturationMetrics,
    axis_weights: AxisWeights,
) -> float:
    """
    Compute a single axis score from weighted feature percentages.

    Args:
        feature_saturation: Feature saturation metrics
        axis_weights: AxisWeights containing feature-to-weight mappings

    Returns:
        Score normalized to 0.0-1.0 range (0.5 = neutral)
    """
    # Compute weighted sum of feature percentages (as 0-1 values)
    weighted_sum = 0.0
    total_weight = 0.0

    for feature_name, weight in axis_weights.items():
        if feature_name in feature_saturation.by_name:
            pct = feature_saturation.by_name[feature_name].true_percentage / 100.0
            weighted_sum += pct * weight
            total_weight += abs(weight)

    if total_weight == 0:
        return 0.5  # Neutral if no features match

    # Normalize: weighted_sum can range from -total_weight to +total_weight
    # Map to 0.0-1.0 where 0.5 is neutral
    normalized = (weighted_sum / total_weight + 1.0) / 2.0

    # Clamp to valid range
    return max(0.0, min(1.0, normalized))



[docs]
def score_syllable_on_axis(
    features: dict[str, bool],
    axis_weights: AxisWeights,
) -> float:
    """
    Score one syllable on an axis from its boolean features.

    This adds up the weights of the features the syllable has, treating
    each feature as 0 or 1. It is meant for ranking individual syllables,
    not for describing a whole corpus.

    Args:
        features: Dictionary of feature_name -> boolean
        axis_weights: AxisWeights containing feature-to-weight mappings

    Returns:
        Raw weighted sum (not normalized). Higher = more toward high pole.
    """
    total = 0.0
    for name, axis_weight in axis_weights.items():
        # Features that are absent or falsy contribute nothing.
        if not features.get(name, False):
            continue
        total += axis_weight
    return total




[docs]
def sample_pole_exemplars(
    annotated_data: Sequence[dict],
    axis_weights: AxisWeights,
    axis_name: str,
    n_exemplars: int = 3,
    rng: random.Random | None = None,
) -> PoleExemplars:
    """
    Sample exemplar syllables from each pole of an axis.

    Scores all syllables in the corpus and takes the lowest- and
    highest-scoring ones as concrete examples of each pole. If the corpus
    holds fewer than ``2 * n_exemplars`` syllables, the two poles overlap.

    Args:
        annotated_data: List of {"syllable": str, "features": dict} entries
        axis_weights: Weights for the axis
        axis_name: Name of axis ("shape", "craft", "space")
        n_exemplars: Number of exemplars per pole (default 3). Zero or a
            negative value yields empty exemplar tuples.
        rng: Optional RNG for shuffling within tails (isolated from generation)

    Returns:
        PoleExemplars with syllables from low and high poles

    Raises:
        KeyError: If an entry lacks the "syllable" or "features" key
    """
    # Nothing to sample from, or nothing requested. The explicit
    # n_exemplars guard matters: with n_exemplars == 0 the slice
    # scored[-0:] is the WHOLE list, so every syllable would otherwise be
    # reported as a high-pole exemplar. Negative values would likewise
    # produce "everything except the tail" slices.
    if not annotated_data or n_exemplars <= 0:
        return PoleExemplars(
            axis_name=axis_name,
            low_pole_exemplars=(),
            high_pole_exemplars=(),
        )

    # Score all syllables
    scored = [
        (entry["syllable"], score_syllable_on_axis(entry["features"], axis_weights))
        for entry in annotated_data
    ]

    # Shuffle BEFORE sorting if an RNG is provided - this randomizes
    # tie-breaking. Python's sort is stable, so equal scores would otherwise
    # stay in input order (for an alphabetically ordered corpus that means
    # the same early-alphabet syllables at the low pole and late-alphabet
    # syllables at the high pole every time).
    if rng is not None:
        rng.shuffle(scored)

    # Sort by score (ascending: low pole first, high pole last)
    scored.sort(key=lambda pair: pair[1])

    # Take exemplars directly from the sorted tails
    low_exemplars = tuple(syllable for syllable, _ in scored[:n_exemplars])
    high_exemplars = tuple(syllable for syllable, _ in scored[-n_exemplars:])

    return PoleExemplars(
        axis_name=axis_name,
        low_pole_exemplars=low_exemplars,
        high_pole_exemplars=high_exemplars,
    )



def _score_to_label(score: float, low_label: str, high_label: str) -> str:
    """
    Convert a 0-1 score to a human-readable label.

    Args:
        score: Value from 0.0 to 1.0
        low_label: Label for low end (e.g., "ROUND")
        high_label: Label for high end (e.g., "JAGGED")

    Returns:
        Appropriate label based on score position
    """
    if score < 0.35:
        return low_label
    elif score > 0.65:
        return high_label
    else:
        return "BALANCED"



[docs]
def compute_terrain_metrics(
    feature_saturation: FeatureSaturationMetrics,
    weights: TerrainWeights | None = None,
    annotated_data: Sequence[dict] | None = None,
    exemplar_rng: random.Random | None = None,
    n_exemplars: int = 3,
) -> TerrainMetrics:
    """
    Place a corpus on the three phonaesthetic axes (shape, craft, space).

    Each axis score is derived from the feature saturation percentages and
    the weights for that axis. The result is descriptive rather than
    prescriptive: it characterizes the acoustic terrain without imposing
    meaning.

    Args:
        feature_saturation: Computed feature saturation metrics
        weights: Optional TerrainWeights configuration. If None, uses
                 DEFAULT_TERRAIN_WEIGHTS from terrain_weights module.
                 Custom weights allow calibration for different phonaesthetic
                 models or user preferences.
        annotated_data: Optional list of {"syllable": str, "features": dict}
                        entries. If provided (and non-empty), pole exemplars
                        are computed for every axis.
        exemplar_rng: Optional RNG for shuffling exemplars. Isolated from
                      name generation to maintain determinism.
        n_exemplars: Number of exemplars per pole (default 3)

    Returns:
        TerrainMetrics with scores and labels for all three axes; the
        exemplar fields stay None when no annotated data is supplied

    Example:
        >>> terrain = compute_terrain_metrics(feature_saturation)
        >>> print(f"Shape: {terrain.shape_score:.2f} ({terrain.shape_label})")
        >>> print(f"Craft: {terrain.craft_score:.2f} ({terrain.craft_label})")

        # With custom weights:
        >>> from build_tools.syllable_walk_tui.services.terrain_weights import (
        ...     TerrainWeights, AxisWeights
        ... )
        >>> custom = TerrainWeights(shape=AxisWeights({"contains_plosive": 1.5}))
        >>> terrain = compute_terrain_metrics(feature_saturation, weights=custom)

        # With exemplars:
        >>> terrain = compute_terrain_metrics(
        ...     feature_saturation, annotated_data=corpus_data
        ... )
        >>> print(terrain.shape_exemplars.low_pole_exemplars)
    """
    active_weights = DEFAULT_TERRAIN_WEIGHTS if weights is None else weights

    # Axis table in a fixed order. The order matters when an RNG is shared:
    # exemplars are sampled for shape, then craft, then space.
    axes = (
        ("shape", active_weights.shape),
        ("craft", active_weights.craft),
        ("space", active_weights.space),
    )

    scores = {
        axis: _compute_axis_score(feature_saturation, axis_weights)
        for axis, axis_weights in axes
    }

    # Exemplars are optional and only sampled when there is data to draw from.
    exemplars: dict[str, PoleExemplars | None] = {axis: None for axis, _ in axes}
    if annotated_data:
        for axis, axis_weights in axes:
            exemplars[axis] = sample_pole_exemplars(
                annotated_data, axis_weights, axis, n_exemplars, exemplar_rng
            )

    return TerrainMetrics(
        shape_score=scores["shape"],
        craft_score=scores["craft"],
        space_score=scores["space"],
        shape_label=_score_to_label(scores["shape"], "ROUND", "JAGGED"),
        craft_label=_score_to_label(scores["craft"], "FLOWING", "WORKED"),
        space_label=_score_to_label(scores["space"], "OPEN", "DENSE"),
        shape_exemplars=exemplars["shape"],
        craft_exemplars=exemplars["craft"],
        space_exemplars=exemplars["space"],
    )



# =============================================================================
# Composite Corpus Shape Metrics
# =============================================================================



[docs]
@dataclass(frozen=True)
class CorpusShapeMetrics:
    """
    Bundle of every metric category computed for one corpus.

    This is the top-level result of a corpus analysis. Each field holds
    the complete metrics record for one category.

    Attributes:
        inventory: Inventory metrics (counts, lengths)
        frequency: Frequency distribution metrics
        feature_saturation: Per-feature saturation metrics
        terrain: Phonaesthetic terrain metrics (derived from features)
    """

    # What exists in the corpus.
    inventory: InventoryMetrics

    # How occurrences are distributed.
    frequency: FrequencyMetrics

    # Per-feature coverage.
    feature_saturation: FeatureSaturationMetrics

    # Axis positions derived from feature coverage.
    terrain: TerrainMetrics




[docs]
def compute_corpus_shape_metrics(
    syllables: Sequence[str],
    frequencies: dict[str, int],
    annotated_data: Sequence[dict],
) -> CorpusShapeMetrics:
    """
    Run every metric category and bundle the results.

    This is the main entry point for corpus analysis. Feature saturation is
    computed first because the terrain metrics are derived from it.

    Args:
        syllables: List of unique syllables
        frequencies: Dictionary mapping syllable to frequency count
        annotated_data: List of annotated syllable dicts

    Returns:
        CorpusShapeMetrics containing all computed metrics

    Raises:
        ValueError: If any input is empty or malformed

    Example:
        >>> metrics = compute_corpus_shape_metrics(syllables, frequencies, annotated_data)
        >>> print(f"Corpus has {metrics.inventory.total_count} syllables")
        >>> print(f"Hapax legomena: {metrics.frequency.hapax_count}")
        >>> vowel_pct = metrics.feature_saturation.by_name['starts_with_vowel'].true_percentage
        >>> print(f"Starts with vowel: {vowel_pct:.1f}%")
        >>> print(f"Terrain: {metrics.terrain.shape_label}")
    """
    # Each step may raise ValueError; the order below decides which input
    # is reported first when several are invalid.
    saturation = compute_feature_saturation_metrics(annotated_data)
    inventory = compute_inventory_metrics(syllables)
    frequency = compute_frequency_metrics(frequencies)
    terrain = compute_terrain_metrics(saturation, annotated_data=annotated_data)

    return CorpusShapeMetrics(
        inventory=inventory,
        frequency=frequency,
        feature_saturation=saturation,
        terrain=terrain,
    )