"""
Corpus shape metrics computation.
This module provides dataclasses and pure functions for computing raw,
objective metrics about corpus shape. These metrics characterize the
statistical structure of a syllable corpus without interpretation.
Design Philosophy:
- Raw numbers only, no interpretation or judgment
- Pure functions (no side effects, no I/O)
- All metrics are observable facts about the corpus
- Users draw their own conclusions from the data
Metric Categories:
- Inventory: What exists (counts, lengths)
- Frequency: Weight distribution (how syllables are distributed)
- Feature Saturation: Phonetic feature coverage (per-feature counts)
Usage:
>>> from build_tools.syllable_walk_tui.services.metrics import (
... compute_corpus_shape_metrics
... )
>>> metrics = compute_corpus_shape_metrics(syllables, frequencies, annotated_data)
>>> print(f"Total syllables: {metrics.inventory.total_count}")
>>> print(f"Hapax count: {metrics.frequency.hapax_count}")
"""
from __future__ import annotations
import random
import statistics
from dataclasses import dataclass, field
from typing import TYPE_CHECKING
import numpy as np
from build_tools.syllable_walk_tui.services.terrain_weights import (
DEFAULT_TERRAIN_WEIGHTS,
AxisWeights,
TerrainWeights,
)
if TYPE_CHECKING:
from collections.abc import Sequence
# =============================================================================
# Inventory Metrics
# =============================================================================
[docs]
@dataclass(frozen=True)
class InventoryMetrics:
    """
    Immutable summary of the syllable inventory (what exists in the corpus).

    Every field is a plain count or descriptive statistic over syllable
    lengths, where length is measured in characters (``len`` of the string).

    Attributes:
        total_count: Number of syllables in the inventory (callers are
            expected to supply unique syllables)
        length_min: Shortest syllable length
        length_max: Longest syllable length
        length_mean: Arithmetic mean of syllable lengths
        length_median: Median of syllable lengths
        length_std: Sample standard deviation of syllable lengths
        length_distribution: Histogram of lengths as {length: count};
            defaults to an empty dict
    """

    total_count: int
    length_min: int
    length_max: int
    length_mean: float
    length_median: float
    length_std: float
    length_distribution: dict[int, int] = field(default_factory=dict)
[docs]
def compute_inventory_metrics(syllables: Sequence[str]) -> InventoryMetrics:
    """
    Summarize the lengths of the given syllables.

    Args:
        syllables: Syllables to analyze (expected to be unique)

    Returns:
        InventoryMetrics holding the count, min/max/mean/median/stdev of
        the character lengths, and a length histogram sorted by length

    Raises:
        ValueError: If syllables is empty
    """
    if not syllables:
        raise ValueError("Cannot compute metrics for empty syllable list")

    sizes = [len(item) for item in syllables]

    # Walking the lengths in ascending order means the histogram keys are
    # inserted already sorted, so no re-sorting of the dict is needed.
    histogram: dict[int, int] = {}
    for size in sorted(sizes):
        histogram[size] = histogram.get(size, 0) + 1

    # statistics.stdev needs at least two data points; report 0.0 otherwise.
    spread = statistics.stdev(sizes) if len(sizes) >= 2 else 0.0

    return InventoryMetrics(
        total_count=len(syllables),
        length_min=min(sizes),
        length_max=max(sizes),
        length_mean=statistics.mean(sizes),
        length_median=statistics.median(sizes),
        length_std=spread,
        length_distribution=histogram,
    )
# =============================================================================
# Frequency Metrics
# =============================================================================
[docs]
@dataclass(frozen=True)
class FrequencyMetrics:
    """
    Immutable summary of how occurrence counts are spread over syllables.

    Attributes:
        total_occurrences: Sum of every frequency count
        freq_min: Smallest frequency value
        freq_max: Largest frequency value
        freq_mean: Mean frequency
        freq_median: Median frequency
        freq_std: Standard deviation of the frequencies
        percentile_10: 10th percentile frequency
        percentile_25: 25th percentile frequency (Q1)
        percentile_50: 50th percentile frequency (median)
        percentile_75: 75th percentile frequency (Q3)
        percentile_90: 90th percentile frequency
        percentile_99: 99th percentile frequency
        unique_freq_count: How many distinct frequency values occur
        hapax_count: How many syllables occur exactly once
        top_10: Up to ten (syllable, freq) pairs with the highest frequency
        bottom_10: Up to ten (syllable, freq) pairs with the lowest frequency
    """

    total_occurrences: int
    freq_min: int
    freq_max: int
    freq_mean: float
    freq_median: float
    freq_std: float
    percentile_10: int
    percentile_25: int
    percentile_50: int
    percentile_75: int
    percentile_90: int
    percentile_99: int
    unique_freq_count: int
    hapax_count: int
    top_10: tuple[tuple[str, int], ...] = field(default_factory=tuple)
    bottom_10: tuple[tuple[str, int], ...] = field(default_factory=tuple)
[docs]
def compute_frequency_metrics(frequencies: dict[str, int]) -> FrequencyMetrics:
    """
    Summarize the distribution of syllable frequency counts.

    Args:
        frequencies: Mapping of syllable -> frequency count

    Returns:
        FrequencyMetrics with totals, descriptive statistics, percentiles
        and the highest/lowest ranked syllables

    Raises:
        ValueError: If frequencies is empty
    """
    if not frequencies:
        raise ValueError("Cannot compute metrics for empty frequencies dict")

    counts = list(frequencies.values())

    # Percentiles come back from NumPy as floats; they are truncated with
    # int() because the dataclass stores them as integers.
    p10, p25, p50, p75, p90, p99 = (
        int(value)
        for value in np.percentile(
            np.array(counts, dtype=np.int64), [10, 25, 50, 75, 90, 99]
        )
    )

    # Rank descending by frequency. The sort is stable, so ties keep the
    # dict's iteration order.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    highest = tuple(ranked[:10])
    # Tail of the descending ranking, flipped so the lowest entry is first.
    lowest = tuple(reversed(ranked[-10:]))

    # statistics.stdev needs at least two data points; report 0.0 otherwise.
    spread = statistics.stdev(counts) if len(counts) >= 2 else 0.0

    return FrequencyMetrics(
        total_occurrences=sum(counts),
        freq_min=min(counts),
        freq_max=max(counts),
        freq_mean=statistics.mean(counts),
        freq_median=statistics.median(counts),
        freq_std=spread,
        percentile_10=p10,
        percentile_25=p25,
        percentile_50=p50,
        percentile_75=p75,
        percentile_90=p90,
        percentile_99=p99,
        unique_freq_count=len(set(counts)),
        # Hapax legomena: entries whose frequency equals 1.
        hapax_count=counts.count(1),
        top_10=highest,
        bottom_10=lowest,
    )
# =============================================================================
# Feature Saturation Metrics
# =============================================================================
# Canonical feature order (matches annotator output).
# The 12 boolean phonetic feature keys looked up in each entry's "features"
# dict. compute_feature_saturation_metrics() iterates this tuple, so its
# order is also the order of FeatureSaturationMetrics.features.
FEATURE_NAMES: tuple[str, ...] = (
    # Onset features
    "starts_with_vowel",
    "starts_with_cluster",
    "starts_with_heavy_cluster",
    # Consonant-class features
    "contains_plosive",
    "contains_fricative",
    "contains_liquid",
    "contains_nasal",
    # Vowel-length features
    "short_vowel",
    "long_vowel",
    # Coda features
    "ends_with_vowel",
    "ends_with_nasal",
    "ends_with_stop",
)
[docs]
@dataclass(frozen=True)
class FeatureSaturation:
    """
    Immutable saturation record for one phonetic feature.

    Attributes:
        feature_name: The feature this record describes
        true_count: How many syllables have the feature set
        false_count: How many syllables do not have the feature set
        true_percentage: Share of the corpus with the feature set, on a
            0-100 scale
    """

    feature_name: str
    true_count: int
    false_count: int
    true_percentage: float
[docs]
@dataclass(frozen=True)
class FeatureSaturationMetrics:
    """
    Immutable bundle of per-feature saturation records for a corpus.

    Attributes:
        total_syllables: Number of syllables that were analyzed
        features: One FeatureSaturation per feature, in canonical order;
            defaults to an empty tuple
        by_name: The same records keyed by feature name for direct lookup;
            defaults to an empty dict
    """

    total_syllables: int
    features: tuple[FeatureSaturation, ...] = field(default_factory=tuple)
    by_name: dict[str, FeatureSaturation] = field(default_factory=dict)
[docs]
def compute_feature_saturation_metrics(
    annotated_data: Sequence[dict],
) -> FeatureSaturationMetrics:
    """
    Count how often each canonical feature is set across the corpus.

    Args:
        annotated_data: Sequence of dicts carrying a 'features' mapping of
            feature name -> truthy/falsy flag (entries may also carry
            'syllable' and 'frequency' keys, which are not read here)

    Returns:
        FeatureSaturationMetrics with one record per name in FEATURE_NAMES

    Raises:
        ValueError: If annotated_data is empty, or its first entry has no
            'features' key
    """
    if not annotated_data:
        raise ValueError("Cannot compute metrics for empty annotated data")

    # Only the first entry is structurally checked; later entries lacking
    # 'features' are treated as having every feature unset.
    if "features" not in annotated_data[0]:
        raise ValueError("Annotated data entries must have 'features' key")

    corpus_size = len(annotated_data)

    # Tally truthy flags per feature; absent flags count as False.
    tallies = dict.fromkeys(FEATURE_NAMES, 0)
    for record in annotated_data:
        flags = record.get("features", {})
        for name in FEATURE_NAMES:
            if flags.get(name, False):
                tallies[name] += 1

    # tallies preserves FEATURE_NAMES order, so the lookup dict (and the
    # tuple derived from it) are in canonical order as well.
    lookup: dict[str, FeatureSaturation] = {}
    for name, hits in tallies.items():
        share = (hits / corpus_size) * 100.0 if corpus_size > 0 else 0.0
        lookup[name] = FeatureSaturation(
            feature_name=name,
            true_count=hits,
            false_count=corpus_size - hits,
            true_percentage=share,
        )

    return FeatureSaturationMetrics(
        total_syllables=corpus_size,
        features=tuple(lookup.values()),
        by_name=lookup,
    )
# =============================================================================
# Terrain Metrics (Phonaesthetic Axes)
# =============================================================================
# Weights are defined in terrain_weights.py with full phonaesthetic rationale.
# See that module for documentation of each weight's justification.
#
# IMPORTANT: Each axis must be BIPOLAR - features pulling BOTH directions.
# Without this, axes measure "Englishness" not phonaesthetic shape.
# See Section 12 of _working/sfa_shapes_terrain_map.md for calibration findings.
[docs]
@dataclass(frozen=True)
class PoleExemplars:
    """
    Example syllables taken from the two ends of one terrain axis.

    Concrete examples make it easier to see what kind of syllable sits at
    each end of a phonaesthetic spectrum.

    Attributes:
        axis_name: Which axis these belong to ("shape", "craft", or "space")
        low_pole_exemplars: Syllables from the low end (Round/Flowing/Open)
        high_pole_exemplars: Syllables from the high end (Jagged/Worked/Dense)
    """

    axis_name: str
    low_pole_exemplars: tuple[str, ...]
    high_pole_exemplars: tuple[str, ...]
[docs]
@dataclass(frozen=True)
class TerrainMetrics:
    """
    Immutable position of a corpus on three phonaesthetic axes.

    The axes are derived from feature saturation percentages:

    - Shape: Round (0.0) ↔ Jagged (1.0) - Bouba/Kiki dimension
    - Craft: Flowing (0.0) ↔ Worked (1.0) - Sung/Forged dimension
    - Space: Open (0.0) ↔ Dense (1.0) - Valley/Workshop dimension

    Each score lies in the 0.0-1.0 range, with 0.5 meaning neutral.

    Attributes:
        shape_score: Position on the Round↔Jagged axis (0.0-1.0)
        craft_score: Position on the Flowing↔Worked axis (0.0-1.0)
        space_score: Position on the Open↔Dense axis (0.0-1.0)
        shape_label: Human-readable label for the shape score
        craft_label: Human-readable label for the craft score
        space_label: Human-readable label for the space score
        shape_exemplars: Exemplar syllables for the shape axis, or None
        craft_exemplars: Exemplar syllables for the craft axis, or None
        space_exemplars: Exemplar syllables for the space axis, or None
    """

    shape_score: float
    craft_score: float
    space_score: float
    shape_label: str
    craft_label: str
    space_label: str
    shape_exemplars: PoleExemplars | None = None
    craft_exemplars: PoleExemplars | None = None
    space_exemplars: PoleExemplars | None = None
def _compute_axis_score(
    feature_saturation: FeatureSaturationMetrics,
    axis_weights: AxisWeights,
) -> float:
    """
    Turn weighted feature percentages into one normalized axis score.

    Args:
        feature_saturation: Feature saturation metrics for the corpus
        axis_weights: Feature-name -> weight pairs for the axis (iterated
            via ``.items()``)

    Returns:
        Score in the 0.0-1.0 range, where 0.5 is neutral. 0.5 is also
        returned when none of the weighted features is present in
        ``feature_saturation.by_name``.
    """
    records = feature_saturation.by_name
    signed_total = 0.0
    magnitude_total = 0.0

    for name, weight in axis_weights.items():
        record = records.get(name)
        if record is None:
            # Weighted features with no saturation record are ignored.
            continue
        # Percentages are stored on a 0-100 scale; rescale to 0-1.
        signed_total += (record.true_percentage / 100.0) * weight
        magnitude_total += abs(weight)

    if magnitude_total == 0:
        return 0.5  # Neutral if no features match

    # signed_total / magnitude_total falls between -1 and +1; shift and
    # halve it so that 0 maps to the neutral 0.5.
    position = (signed_total / magnitude_total + 1.0) / 2.0

    # Clamp into the valid range.
    capped = min(1.0, position)
    return max(0.0, capped)
[docs]
def score_syllable_on_axis(
    features: dict[str, bool],
    axis_weights: AxisWeights,
) -> float:
    """
    Score one syllable on an axis from its boolean features.

    Where _compute_axis_score() works from corpus-wide percentages, this
    treats each feature as simply present or absent, which makes the result
    suitable for ranking individual syllables.

    Args:
        features: Mapping of feature name -> boolean flag
        axis_weights: Feature-name -> weight pairs for the axis (iterated
            via ``.items()``)

    Returns:
        The un-normalized sum of the weights of all features that are set.
        Higher means closer to the axis's high pole.
    """
    total = 0.0
    for name, weight in axis_weights.items():
        if not features.get(name, False):
            # Missing or falsy features contribute nothing.
            continue
        total += weight
    return total
[docs]
def sample_pole_exemplars(
    annotated_data: Sequence[dict],
    axis_weights: AxisWeights,
    axis_name: str,
    n_exemplars: int = 3,
    rng: random.Random | None = None,
) -> PoleExemplars:
    """
    Sample exemplar syllables from each pole of an axis.

    Scores every syllable with score_syllable_on_axis(), sorts by score and
    takes the lowest- and highest-scoring entries as concrete examples of
    each pole. If the corpus has fewer than ``2 * n_exemplars`` entries the
    two poles may share syllables.

    Args:
        annotated_data: Sequence of {"syllable": str, "features": dict}
            entries
        axis_weights: Weights for the axis
        axis_name: Name of axis ("shape", "craft", "space")
        n_exemplars: Number of exemplars per pole (default 3). A value of
            zero or less yields no exemplars.
        rng: Optional RNG used to randomize which syllables win ties
            (isolated from generation)

    Returns:
        PoleExemplars with syllables from the low and high poles. Both
        tuples are in ascending score order. Both are empty when
        annotated_data is empty or n_exemplars is not positive.

    Raises:
        KeyError: If an entry lacks the "syllable" or "features" key
    """
    # A non-positive count must yield nothing. Without this guard,
    # scored[-0:] is the same slice as scored[0:], so n_exemplars=0 would
    # return the WHOLE corpus for the high pole (and negative counts would
    # return inverted slices).
    if not annotated_data or n_exemplars <= 0:
        return PoleExemplars(
            axis_name=axis_name,
            low_pole_exemplars=(),
            high_pole_exemplars=(),
        )

    # Score all syllables
    scored = [
        (entry["syllable"], score_syllable_on_axis(entry["features"], axis_weights))
        for entry in annotated_data
    ]

    # Shuffle BEFORE sorting if an RNG is provided: Python's sort is stable,
    # so equal scores would otherwise keep their input order and the same
    # syllables would always be picked for each pole.
    if rng is not None:
        rng.shuffle(scored)

    # Sort by score (ascending: low pole first, high pole last)
    scored.sort(key=lambda pair: pair[1])

    # Take exemplars directly from the sorted tails
    low_exemplars = tuple(syllable for syllable, _ in scored[:n_exemplars])
    high_exemplars = tuple(syllable for syllable, _ in scored[-n_exemplars:])

    return PoleExemplars(
        axis_name=axis_name,
        low_pole_exemplars=low_exemplars,
        high_pole_exemplars=high_exemplars,
    )
def _score_to_label(score: float, low_label: str, high_label: str) -> str:
    """
    Map a 0-1 axis score onto a human-readable label.

    Args:
        score: Value from 0.0 to 1.0
        low_label: Label to use near the low end (e.g., "ROUND")
        high_label: Label to use near the high end (e.g., "JAGGED")

    Returns:
        ``low_label`` for scores below 0.35, ``high_label`` for scores
        above 0.65, and "BALANCED" for everything in between (both
        thresholds inclusive)
    """
    if score < 0.35:
        return low_label
    if score > 0.65:
        return high_label
    return "BALANCED"
[docs]
def compute_terrain_metrics(
    feature_saturation: FeatureSaturationMetrics,
    weights: TerrainWeights | None = None,
    annotated_data: Sequence[dict] | None = None,
    exemplar_rng: random.Random | None = None,
    n_exemplars: int = 3,
) -> TerrainMetrics:
    """
    Place a corpus on the three phonaesthetic axes.

    One score per axis (shape, craft, space) is derived from the feature
    saturation. The scores are descriptive, not prescriptive - they
    characterize the acoustic terrain without imposing meaning.

    Args:
        feature_saturation: Computed feature saturation metrics
        weights: Optional TerrainWeights configuration. When None,
            DEFAULT_TERRAIN_WEIGHTS from the terrain_weights module is
            used. Custom weights allow calibration for different
            phonaesthetic models or user preferences.
        annotated_data: Optional sequence of
            {"syllable": str, "features": dict} entries. When provided and
            non-empty, pole exemplars are computed for every axis.
        exemplar_rng: Optional RNG for shuffling exemplars. Isolated from
            name generation to maintain determinism.
        n_exemplars: Number of exemplars per pole (default 3)

    Returns:
        TerrainMetrics with scores and labels for all three axes, plus
        exemplars when annotated_data was supplied

    Example:
        >>> terrain = compute_terrain_metrics(feature_saturation)
        >>> print(f"Shape: {terrain.shape_score:.2f} ({terrain.shape_label})")
        >>> print(f"Craft: {terrain.craft_score:.2f} ({terrain.craft_label})")

        # With custom weights:
        >>> from build_tools.syllable_walk_tui.services.terrain_weights import (
        ...     TerrainWeights, AxisWeights
        ... )
        >>> custom = TerrainWeights(shape=AxisWeights({"contains_plosive": 1.5}))
        >>> terrain = compute_terrain_metrics(feature_saturation, weights=custom)

        # With exemplars:
        >>> terrain = compute_terrain_metrics(
        ...     feature_saturation, annotated_data=corpus_data
        ... )
        >>> print(terrain.shape_exemplars.low_pole_exemplars)
    """
    active = DEFAULT_TERRAIN_WEIGHTS if weights is None else weights

    # Axis table in a fixed order (shape, craft, space). The order matters
    # for exemplar sampling because all three axes draw from the same RNG.
    axes = (
        ("shape", active.shape),
        ("craft", active.craft),
        ("space", active.space),
    )

    scores = {
        name: _compute_axis_score(feature_saturation, axis_weights)
        for name, axis_weights in axes
    }

    # Exemplars stay None unless annotated data was provided.
    exemplars: dict[str, PoleExemplars | None] = {name: None for name, _ in axes}
    if annotated_data:
        for name, axis_weights in axes:
            exemplars[name] = sample_pole_exemplars(
                annotated_data, axis_weights, name, n_exemplars, exemplar_rng
            )

    return TerrainMetrics(
        shape_score=scores["shape"],
        craft_score=scores["craft"],
        space_score=scores["space"],
        shape_label=_score_to_label(scores["shape"], "ROUND", "JAGGED"),
        craft_label=_score_to_label(scores["craft"], "FLOWING", "WORKED"),
        space_label=_score_to_label(scores["space"], "OPEN", "DENSE"),
        shape_exemplars=exemplars["shape"],
        craft_exemplars=exemplars["craft"],
        space_exemplars=exemplars["space"],
    )
# =============================================================================
# Composite Corpus Shape Metrics
# =============================================================================
[docs]
@dataclass(frozen=True)
class CorpusShapeMetrics:
    """
    Immutable container bundling every category of corpus shape metrics.

    This is the primary result object for corpus analysis; it carries all
    the raw metrics needed to understand corpus structure.

    Attributes:
        inventory: Inventory metrics (counts, lengths)
        frequency: Frequency distribution metrics
        feature_saturation: Per-feature saturation metrics
        terrain: Phonaesthetic terrain metrics (derived from features)
    """

    inventory: InventoryMetrics
    frequency: FrequencyMetrics
    feature_saturation: FeatureSaturationMetrics
    terrain: TerrainMetrics
[docs]
def compute_corpus_shape_metrics(
    syllables: Sequence[str],
    frequencies: dict[str, int],
    annotated_data: Sequence[dict],
) -> CorpusShapeMetrics:
    """
    Compute every metric category and bundle the results.

    This is the main entry point for corpus analysis. Terrain metrics are
    computed with the default weights and with exemplars taken from
    annotated_data.

    Args:
        syllables: Syllables to analyze (expected to be unique)
        frequencies: Mapping of syllable -> frequency count
        annotated_data: Sequence of annotated syllable dicts

    Returns:
        CorpusShapeMetrics containing all computed metrics

    Raises:
        ValueError: If any input is empty or malformed

    Example:
        >>> metrics = compute_corpus_shape_metrics(syllables, frequencies, annotated_data)
        >>> print(f"Corpus has {metrics.inventory.total_count} syllables")
        >>> print(f"Hapax legomena: {metrics.frequency.hapax_count}")
        >>> vowel_pct = metrics.feature_saturation.by_name['starts_with_vowel'].true_percentage
        >>> print(f"Starts with vowel: {vowel_pct:.1f}%")
        >>> print(f"Terrain: {metrics.terrain.shape_label}")
    """
    # Saturation comes first: terrain is derived from it, and an empty or
    # malformed annotated_data should be reported before anything else.
    saturation = compute_feature_saturation_metrics(annotated_data)
    inventory = compute_inventory_metrics(syllables)
    frequency = compute_frequency_metrics(frequencies)
    terrain = compute_terrain_metrics(saturation, annotated_data=annotated_data)

    return CorpusShapeMetrics(
        inventory=inventory,
        frequency=frequency,
        feature_saturation=saturation,
        terrain=terrain,
    )