Source code for build_tools.syllable_analysis.dimensionality.tsne_core

"""t-SNE dimensionality reduction core functionality.

This module provides the core t-SNE application logic, isolated from visualization
and I/O concerns. It can be used for any dimensionality reduction task on feature
matrices.
"""

import numpy as np  # type: ignore[import-not-found]


def apply_tsne(
    feature_matrix: np.ndarray,
    n_components: int = 2,
    perplexity: int = 30,
    random_state: int = 42,
    metric: str = "hamming",
) -> np.ndarray:
    """Apply t-SNE dimensionality reduction to a feature matrix.

    t-SNE (t-distributed Stochastic Neighbor Embedding) is a technique for
    dimensionality reduction that projects high-dimensional data into lower
    dimensions while preserving local structure.

    Args:
        feature_matrix: Input feature matrix (n_samples, n_features).
            For binary features, should contain only 0s and 1s.
        n_components: Number of dimensions for output (default: 2).
            2D is typical for visualization, 3D also common.
        perplexity: t-SNE perplexity parameter (default: 30).
            Controls balance between local and global structure.
            Typical range: 5-50. Higher values consider more neighbors.
            Must be less than n_samples.
        random_state: Random seed for reproducibility (default: 42).
            Same seed ensures identical output for same input.
        metric: Distance metric (default: 'hamming').
            'hamming' is optimal for binary features (counts # of differences).
            Other options: 'euclidean', 'manhattan', 'cosine', etc.

    Returns:
        Reduced coordinates array of shape (n_samples, n_components).
        For default n_components=2, output is (n_samples, 2) with x,y coordinates.

    Raises:
        ImportError: If scikit-learn is not installed.
        ValueError: If perplexity is greater than or equal to the number of
            samples (rows of ``feature_matrix``).

    Example:
        >>> import numpy as np
        >>> from build_tools.syllable_analysis.dimensionality import apply_tsne
        >>> # Create sample binary feature matrix (100 samples, 12 features)
        >>> feature_matrix = np.random.randint(0, 2, size=(100, 12))
        >>> # Apply t-SNE to reduce to 2D
        >>> coords_2d = apply_tsne(feature_matrix, n_components=2, perplexity=30)
        >>> coords_2d.shape
        (100, 2)

    Notes:
        - Processing time scales roughly O(n²) with sample size
        - Perplexity should be less than n_samples (typically n_samples/3 max)
        - Hamming distance is best for binary features (our use case)
        - Fixed random_state ensures reproducible results
        - For large datasets (>10,000 samples), consider using approximate methods
    """
    # Imported lazily so the module can be imported without scikit-learn;
    # the dependency is only needed when t-SNE is actually run.
    try:
        from sklearn.manifold import TSNE  # type: ignore[import-not-found]
    except ImportError as e:
        raise ImportError(
            "scikit-learn is required for t-SNE. Install with: pip install scikit-learn"
        ) from e

    # Validate perplexity is reasonable for sample size
    n_samples = feature_matrix.shape[0]
    if perplexity >= n_samples:
        if n_samples < 2:
            # perplexity < n_samples <= 1 leaves no positive integer to suggest.
            hint = "At least 2 samples are needed for a positive integer perplexity."
        else:
            # Floor the suggestion at 1: n_samples // 3 is 0 for n_samples == 2,
            # and "perplexity <= 0" would be a useless hint.
            hint = f"Suggested: perplexity <= {max(1, n_samples // 3)}"
        raise ValueError(
            f"Perplexity ({perplexity}) must be less than number of samples ({n_samples}). "
            f"{hint}"
        )

    # Apply t-SNE
    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        random_state=random_state,
        metric=metric,
    )
    reduced_coords = tsne.fit_transform(feature_matrix)

    return reduced_coords


def calculate_optimal_perplexity(
    n_samples: int, min_perplexity: int = 5, max_perplexity: int = 50
) -> int:
    """Suggest a perplexity value based on dataset size.

    Perplexity is a key t-SNE parameter that balances local vs global structure.
    This function provides a reasonable default based on dataset size.

    Rule of thumb:
        - Perplexity should be between 5 and 50
        - Perplexity should be less than n_samples
        - Common heuristic: perplexity ≈ sqrt(n_samples), clamped to [5, 50]

    Args:
        n_samples: Number of samples in dataset (must be non-negative).
        min_perplexity: Minimum perplexity value (default: 5)
        max_perplexity: Maximum perplexity value (default: 50)

    Returns:
        Suggested perplexity value. When ``n_samples`` is greater than 1 the
        result is always strictly less than ``n_samples``; this cap takes
        precedence over ``min_perplexity``.

    Raises:
        ValueError: If ``n_samples`` is negative.

    Example:
        >>> calculate_optimal_perplexity(100)
        10
        >>> calculate_optimal_perplexity(1000)
        31
        >>> calculate_optimal_perplexity(10000)
        50
        >>> calculate_optimal_perplexity(10)
        5
        >>> calculate_optimal_perplexity(4)
        3

    Notes:
        - For small datasets (<25 samples): use min_perplexity (5), lowered to
          n_samples - 1 when the dataset has min_perplexity samples or fewer
        - For large datasets (>2500 samples): use max_perplexity (50)
        - For medium datasets: use sqrt(n_samples)
        - For n_samples of 0 or 1 no perplexity below n_samples is usable, so
          the plain clamped value is returned unchanged
        - This is a heuristic, not a strict rule - experiment for best results
    """
    if n_samples < 0:
        # sqrt of a negative number yields NaN, which int() rejects with an
        # opaque message; fail early with a clear one instead.
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    # Use square root heuristic
    suggested = int(np.sqrt(n_samples))

    # Clamp to valid range
    clamped = max(min_perplexity, min(suggested, max_perplexity))

    # Keep the suggestion strictly below n_samples so it honours the
    # "perplexity < n_samples" rule stated above for tiny datasets.
    if 1 < n_samples <= clamped:
        return int(n_samples - 1)
    return clamped