# Source code for build_tools.syllable_analysis.dimensionality.feature_matrix

"""Feature matrix extraction for dimensionality reduction.

This module provides utilities for extracting numerical feature matrices from
annotated syllable records. The matrices are suitable for dimensionality reduction
algorithms like t-SNE, PCA, UMAP, etc.
"""

from typing import Dict, List, Tuple

import numpy as np  # type: ignore[import-not-found]

# Canonical, ordered list of every feature flag tracked by the annotator.
# The order is significant: it is the default column order used by
# extract_feature_matrix() and get_feature_vector(), so the same syllable
# always produces the same feature vector.
# NOTE: validate_feature_matrix() defaults to expected_features=12, which
# matches the current length of this list; keep the two in sync when adding
# or removing entries.
ALL_FEATURES = [
    "contains_liquid",
    "contains_plosive",
    "contains_fricative",
    "contains_nasal",
    "long_vowel",
    "short_vowel",
    "starts_with_vowel",
    "starts_with_cluster",
    "starts_with_heavy_cluster",
    "ends_with_vowel",
    "ends_with_stop",
    "ends_with_nasal",
]


def extract_feature_matrix(
    records: List[Dict], feature_names: List[str] = ALL_FEATURES
) -> Tuple[np.ndarray, List[int]]:
    """Extract binary feature matrix from annotated syllable records.

    Converts feature dictionaries to a numerical matrix suitable for
    dimensionality reduction algorithms. Each row represents a syllable,
    each column represents a feature (0 or 1).

    Args:
        records: List of annotated syllable records with 'features' and
            'frequency' keys. Each record should have structure::

                {
                    "syllable": "ka",
                    "frequency": 187,
                    "features": {"contains_liquid": False, "contains_plosive": True, ...}
                }

        feature_names: Ordered list of feature names to extract
            (default: ALL_FEATURES). Order determines column order in the
            output matrix.

    Returns:
        Tuple of (feature_matrix, frequencies):
            - feature_matrix: numpy array of shape (n_syllables, n_features)
              holding the int-converted feature values
            - frequencies: List of frequency counts, one per syllable, in the
              same order as the matrix rows

    Raises:
        KeyError: If a record has no 'features' or no 'frequency' key.

    Example:
        >>> records = [
        ...     {
        ...         "syllable": "ka",
        ...         "frequency": 187,
        ...         "features": {"contains_liquid": False, "contains_plosive": True},
        ...     }
        ... ]
        >>> matrix, freqs = extract_feature_matrix(records)
        >>> matrix.shape
        (1, 12)
        >>> freqs
        [187]

    Notes:
        - Missing features default to False (0)
        - Feature values are converted with int() (True→1, False→0)
        - Output matrix dtype is int
        - Empty record list returns a (0, n_features) shaped array
    """
    feature_matrix = []
    frequencies = []

    for record in records:
        # Look the feature dict up once per record rather than once per feature.
        features = record["features"]
        # Build the row in the caller-supplied (canonical) column order.
        feature_matrix.append([int(features.get(feat, False)) for feat in feature_names])
        frequencies.append(record["frequency"])

    # np.array([]) would have shape (0,); build the empty case explicitly so
    # the result is always 2-D with the expected number of columns.
    if not feature_matrix:
        return np.empty((0, len(feature_names)), dtype=int), frequencies

    return np.array(feature_matrix, dtype=int), frequencies
def validate_feature_matrix(feature_matrix: np.ndarray, expected_features: int = 12) -> None:
    """Validate feature matrix shape and contents.

    Ensures the feature matrix has the expected structure for dimensionality
    reduction algorithms. Checks run in this order and the first failure
    raises: dimensionality, column count, non-empty row count, binary values.

    Args:
        feature_matrix: Binary feature matrix of shape (n_samples, n_features)
        expected_features: Expected number of columns (default: 12)

    Raises:
        ValueError: If the matrix is not 2D, has the wrong number of columns,
            has zero rows, or contains any value other than 0 or 1.

    Example:
        >>> matrix = np.array([[1, 0, 1], [0, 1, 0]])
        >>> validate_feature_matrix(matrix, expected_features=3)  # OK
        >>> validate_feature_matrix(matrix, expected_features=4)
        Traceback (most recent call last):
            ...
        ValueError: Expected 4 features, got 3 features
    """
    # The shape checks below index shape[1], so the rank must be verified first.
    if feature_matrix.ndim != 2:
        raise ValueError(
            f"Feature matrix must be 2D, got {feature_matrix.ndim}D with shape {feature_matrix.shape}"
        )

    if feature_matrix.shape[1] != expected_features:
        raise ValueError(
            f"Expected {expected_features} features, got {feature_matrix.shape[1]} features"
        )

    if feature_matrix.shape[0] == 0:
        raise ValueError("Feature matrix has no samples (0 rows)")

    # Check for binary values (0 or 1 only). The distinct values are kept so
    # the error message can report exactly what was found.
    unique_values = np.unique(feature_matrix)
    if not np.all(np.isin(unique_values, [0, 1])):
        raise ValueError(
            f"Feature matrix must contain only binary values (0, 1), found: {unique_values}"
        )
def get_feature_vector(
    features: Dict[str, bool], feature_names: List[str] = ALL_FEATURES
) -> List[int]:
    """Extract a single feature vector from a feature dictionary.

    Converts a dictionary of feature flags to an ordered binary vector.
    Useful for extracting vectors from individual syllables.

    Args:
        features: Dictionary of feature name → boolean value
        feature_names: Ordered list of feature names (default: ALL_FEATURES)

    Returns:
        Binary feature vector matching feature_names order

    Example:
        >>> features = {"contains_liquid": True, "contains_plosive": False}
        >>> vector = get_feature_vector(features, ["contains_liquid", "contains_plosive"])
        >>> vector
        [1, 0]

    Notes:
        - Missing features default to False (0)
        - Order of output matches order of feature_names
        - Output is a Python list, not a numpy array
    """
    # Iterate feature_names (not the dict) so the output order is fixed by the
    # caller and keys absent from `features` fall back to 0.
    return [int(features.get(feat, False)) for feat in feature_names]