"""t-SNE Visualization for Feature Signature Space
This build-time analysis tool creates a t-SNE (t-distributed Stochastic Neighbor Embedding)
visualization of the feature signature space in the annotated syllable corpus.
t-SNE is a dimensionality reduction technique that projects high-dimensional feature vectors
into 2D space while preserving local structure. This visualization helps identify:
- Clustering patterns in the feature space
- Syllable similarity based on phonetic features
- Natural groupings and outliers in the corpus
The visualization uses:
- Position (x, y): t-SNE projection of 12-dimensional feature vectors
- Size: Syllable frequency (larger points = more common syllables)
- Color: Syllable frequency (warmer colors = more common syllables)
Technical Details:
- Uses Hamming distance metric (optimal for binary feature vectors)
- Perplexity=30 (balances local vs global structure)
- Fixed random seed for reproducibility (seed=42)
Output Formats:
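- Static PNG: High-resolution matplotlib visualization (always generated)
- Metadata TXT: Run parameters and processing time for the PNG (always generated)
- Interactive HTML: Plotly-based interactive visualization (optional, requires --interactive flag)
- Mapping JSON: Syllable→features→coordinates mapping (optional, requires --save-mapping flag)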
Usage::
# Generate static PNG visualization with default paths
python -m build_tools.syllable_analysis.tsne_visualizer
# Generate static PNG, interactive HTML, and the coordinate mapping JSON
python -m build_tools.syllable_analysis.tsne_visualizer \\
--interactive \\
--save-mapping
# Custom input/output paths
python -m build_tools.syllable_analysis.tsne_visualizer \\
--input data/annotated/syllables_annotated.json \\
--output _working/analysis/tsne/ \\
--interactive
# Adjust t-SNE parameters
python -m build_tools.syllable_analysis.tsne_visualizer \\
--perplexity 50 \\
--random-state 123 \\
--interactive
# High-resolution output with interactive HTML
python -m build_tools.syllable_analysis.tsne_visualizer \\
--dpi 600 \\
--interactive \\
--save-mapping
Programmatic Usage:
>>> from pathlib import Path
>>> from build_tools.syllable_analysis import (
... run_tsne_visualization,
... extract_feature_matrix
... )
>>> result = run_tsne_visualization(
... input_path=Path("data/annotated/syllables_annotated.json"),
... output_dir=Path("_working/analysis/tsne/"),
... perplexity=30,
... random_state=42,
... interactive=True,
... save_mapping=True
... )
>>> print(f"Static visualization: {result['output_path']}")
>>> print(f"Interactive HTML: {result['interactive_path']}")
Architecture:
This module orchestrates calls to specialized modules:
- common.data_io: Load annotated syllables
- common.paths: Default path configuration
- common.output: Output directory and file management
- dimensionality.feature_matrix: Extract feature matrices
- dimensionality.tsne_core: Apply t-SNE reduction
- dimensionality.mapping: Create and save coordinate mappings
- plotting.static: Create and save matplotlib PNG visualizations
- plotting.interactive: Create and save Plotly HTML visualizations
"""
from __future__ import annotations
import argparse
import time
from pathlib import Path
# Configure matplotlib to use non-interactive backend (for headless environments like CI)
import matplotlib # type: ignore[import-not-found]
matplotlib.use("Agg")
import matplotlib.pyplot as plt # type: ignore[import-not-found]
# Import from refactored modules
from build_tools.syllable_analysis.common import (
default_paths,
ensure_output_dir,
load_annotated_syllables,
)
from build_tools.syllable_analysis.dimensionality import (
ALL_FEATURES,
apply_tsne,
create_tsne_mapping,
extract_feature_matrix,
save_tsne_mapping,
)
from build_tools.syllable_analysis.plotting import (
PLOTLY_AVAILABLE,
create_metadata_text,
create_tsne_scatter,
save_static_plot,
)
# Conditional import for interactive plotting
if PLOTLY_AVAILABLE:
from build_tools.syllable_analysis.plotting import (
create_interactive_scatter,
save_interactive_html,
)
[docs]
def run_tsne_visualization(
    input_path: Path,
    output_dir: Path,
    perplexity: int = 30,
    random_state: int = 42,
    dpi: int = 300,
    verbose: bool = False,
    save_mapping: bool = False,
    interactive: bool = False,
) -> dict:
    """Run the complete t-SNE visualization pipeline.

    This is the main entry point for programmatic use. The steps are:

    1. Load annotated syllables from ``input_path``.
    2. Extract the feature matrix and the per-syllable frequencies.
    3. Project the feature matrix to 2D with t-SNE.
    4. Render the static matplotlib scatter plot.
    5. Save the PNG and a metadata text file, plus the optional mapping JSON
       and interactive HTML.

    All output files share one ``YYYYMMDD_HHMMSS`` timestamp prefix taken just
    before the PNG is written. The matplotlib figure is closed before this
    function exits, whether it returns normally or a save step raises, so a
    failed run does not leave an open figure behind.

    Args:
        input_path: Path to syllables_annotated.json
        output_dir: Directory to save visualization outputs (created if needed
            via ``ensure_output_dir``)
        perplexity: t-SNE perplexity parameter (default: 30)
        random_state: Random seed for reproducibility (default: 42)
        dpi: Output resolution in dots per inch (default: 300)
        verbose: Print detailed progress information
        save_mapping: Save syllable→features→coordinates mapping as JSON (default: False)
        interactive: Generate interactive HTML visualization (requires Plotly, default: False).
            When Plotly is unavailable a warning is printed and the HTML is skipped.

    Returns:
        Dictionary containing:
            - syllable_count: Number of loaded syllable records
            - feature_count: Number of features (``len(ALL_FEATURES)``)
            - output_path: Path to saved visualization PNG
            - metadata_path: Path to saved metadata file
            - tsne_coordinates: 2D coordinates returned by ``apply_tsne``
            - mapping_path: Path to mapping JSON (None if save_mapping=False)
            - interactive_path: Path to interactive HTML (None if interactive=False
              or Plotly unavailable)
            - processing_time: Seconds elapsed from the start of the call until the
              static PNG has been saved; the optional mapping and HTML outputs are
              written afterwards and are not included

    Raises:
        Exceptions raised by the loading, reduction and saving helpers propagate
        unchanged. Presumably FileNotFoundError for a missing input file and
        ValueError for invalid input data; confirm against those helpers.

    Example:
        >>> from pathlib import Path
        >>> result = run_tsne_visualization(
        ...     input_path=Path("data/annotated/syllables_annotated.json"),
        ...     output_dir=Path("_working/analysis/tsne/"),
        ...     interactive=True,
        ...     save_mapping=True
        ... )
        >>> print(f"Visualized {result['syllable_count']} syllables")
        >>> print(f"Interactive HTML: {result['interactive_path']}")
    """
    from datetime import datetime

    start_time = time.time()

    if verbose:
        print(f"Loading data from: {input_path}")

    # Load annotated syllables using common module
    records = load_annotated_syllables(input_path)

    if verbose:
        print(f"Loaded {len(records):,} annotated syllables")
        print("Extracting feature matrix...")

    # Extract feature matrix and frequencies using dimensionality module
    feature_matrix, frequencies = extract_feature_matrix(records)

    if verbose:
        print(f"Feature matrix shape: {feature_matrix.shape}")
        print("Running t-SNE (this may take a minute)...")

    # Apply t-SNE using dimensionality module
    tsne_coords = apply_tsne(
        feature_matrix, n_components=2, perplexity=perplexity, random_state=random_state
    )

    if verbose:
        print("Creating static visualization...")

    # Create static matplotlib visualization using plotting module
    fig = create_tsne_scatter(tsne_coords, frequencies)

    # From here on the figure is registered with pyplot; the finally clause
    # guarantees it is released even if one of the save steps fails.
    try:
        # Ensure output directory exists
        ensure_output_dir(output_dir)

        if verbose:
            print("Saving visualization...")

        # One timestamp for the whole run so the output files sort together
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        viz_path = output_dir / f"{timestamp}.tsne_visualization.png"
        meta_path = output_dir / f"{timestamp}.tsne_metadata.txt"

        # Save static plot using plotting module
        save_static_plot(fig, viz_path, dpi=dpi)

        # Measured here, before the optional outputs, because the metadata
        # file written next needs the value.
        processing_time = time.time() - start_time
        metadata_text = create_metadata_text(
            output_filename=viz_path.name,
            dpi=dpi,
            perplexity=perplexity,
            random_state=random_state,
            processing_time=processing_time,
        )
        meta_path.write_text(metadata_text, encoding="utf-8")

        # Conditionally save mapping file
        mapping_path = None
        if save_mapping:
            mapping = create_tsne_mapping(records, tsne_coords)
            mapping_path = output_dir / f"{timestamp}.tsne_mapping.json"
            save_tsne_mapping(mapping, mapping_path)
            if verbose:
                print(f"✓ Mapping saved to: {mapping_path}")

        # Conditionally save interactive HTML visualization
        interactive_path = None
        if interactive:
            if not PLOTLY_AVAILABLE:
                # Degrade gracefully: the static outputs are still valid.
                print("Warning: Plotly not available. Skipping interactive visualization.")
                print("Install with: pip install plotly")
            else:
                if verbose:
                    print("Creating interactive visualization...")
                interactive_fig = create_interactive_scatter(records, tsne_coords)
                interactive_path = output_dir / f"{timestamp}.tsne_interactive.html"
                save_interactive_html(interactive_fig, interactive_path, perplexity, random_state)
                if verbose:
                    print(f"✓ Interactive HTML saved to: {interactive_path}")

        return {
            "syllable_count": len(records),
            "feature_count": len(ALL_FEATURES),
            "output_path": viz_path,
            "metadata_path": meta_path,
            "tsne_coordinates": tsne_coords,
            "mapping_path": mapping_path,
            "interactive_path": interactive_path,
            "processing_time": processing_time,
        }
    finally:
        # Clean up matplotlib figure on success and on failure alike
        plt.close(fig)
[docs]
def create_argument_parser() -> argparse.ArgumentParser:
    """
    Build the command-line parser for the t-SNE visualization tool.

    The parser is only constructed here, never run. Keeping construction
    separate from parsing lets documentation tooling such as Sphinx
    introspect the options and auto-generate CLI documentation.

    Returns
    -------
    argparse.ArgumentParser
        Parser configured with ``--input``, ``--output``, ``--perplexity``,
        ``--random-state``, ``--dpi``, ``--save-mapping``, ``--interactive``
        and ``--verbose``.
    """
    # Written as reStructuredText (note the code-block directive) and passed
    # through unwrapped by RawDescriptionHelpFormatter.
    usage_examples = """
Examples
========
.. code-block:: bash
# Generate visualization with default settings
python -m build_tools.syllable_analysis.tsne_visualizer
# Custom input/output paths
python -m build_tools.syllable_analysis.tsne_visualizer \\
--input data/annotated/syllables_annotated.json \\
--output _working/analysis/tsne/
# Adjust t-SNE parameters
python -m build_tools.syllable_analysis.tsne_visualizer \\
--perplexity 50 \\
--random-state 123
# High-resolution output
python -m build_tools.syllable_analysis.tsne_visualizer \\
--dpi 600
# Verbose output
python -m build_tools.syllable_analysis.tsne_visualizer --verbose
"""

    cli = argparse.ArgumentParser(
        description="Generate t-SNE visualization of feature signature space",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Path-valued options; their defaults come from the shared path configuration.
    cli.add_argument(
        "--input",
        type=Path,
        default=default_paths.annotated_syllables,
        help=f"Path to syllables_annotated.json (default: {default_paths.annotated_syllables})",
    )
    cli.add_argument(
        "--output",
        type=Path,
        default=default_paths.analysis_output_dir("tsne"),
        help=f"Output directory for visualizations (default: {default_paths.analysis_output_dir('tsne')})",
    )

    # Integer-valued tuning options: (flag, default, help text).
    integer_options = (
        ("--perplexity", 30, "t-SNE perplexity parameter (default: 30, range: 5-50)"),
        ("--random-state", 42, "Random seed for reproducibility (default: 42)"),
        ("--dpi", 300, "Output resolution in DPI (default: 300)"),
    )
    for flag, default_value, description in integer_options:
        cli.add_argument(flag, type=int, default=default_value, help=description)

    # Boolean switches, all off unless given on the command line.
    boolean_switches = (
        ("--save-mapping", "Save syllable→features→coordinates mapping as JSON (default: False)"),
        (
            "--interactive",
            "Generate interactive HTML visualization in addition to static PNG (requires Plotly)",
        ),
        ("--verbose", "Print detailed progress information"),
    )
    for flag, description in boolean_switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli
[docs]
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the current process.

    Builds the parser with ``create_argument_parser`` and runs it with no
    explicit argument list, so argparse reads ``sys.argv``. Only argparse's
    own type conversion is applied here; the input-file and perplexity
    checks are performed by ``main``.

    Returns:
        Namespace holding the parsed option values
    """
    return create_argument_parser().parse_args()
[docs]
def main() -> None:
    """Command-line entry point for the t-SNE visualization tool.

    Parses the CLI options, checks that the input file exists, warns about an
    unusual perplexity, runs ``run_tsne_visualization`` and prints a summary
    of the files written.

    A missing input file and any exception raised while running the pipeline
    are reported with ``print``, after which the function returns normally.
    It does not set a process exit status itself.
    """
    args = parse_args()

    # Nothing to do without the annotated corpus; point the user at the fix.
    if not args.input.exists():
        print(f"Error: Input file not found: {args.input}")
        print("Have you run the syllable feature annotator yet?")
        print("Expected path: data/annotated/syllables_annotated.json")
        return

    # Advisory only: an out-of-range perplexity is still passed through.
    if args.perplexity < 5 or args.perplexity > 50:
        print(f"Warning: Perplexity {args.perplexity} is outside typical range (5-50)")
        print("This may produce suboptimal results.")

    # Add helpful note if --interactive used without --save-mapping
    if args.interactive and not args.save_mapping:
        print("Note: Interactive visualization works best with --save-mapping enabled")
        print(" to enable coordinate reuse and feature exploration.\n")

    # In verbose mode the pipeline prints its own progress, so this short
    # banner is shown only for quiet runs.
    if not args.verbose:
        print(f"Generating t-SNE visualization from: {args.input}")
        print(f"Output directory: {args.output}")
        print()

    try:
        pipeline_options = {
            "input_path": args.input,
            "output_dir": args.output,
            "perplexity": args.perplexity,
            "random_state": args.random_state,
            "dpi": args.dpi,
            "verbose": args.verbose,
            "save_mapping": args.save_mapping,
            "interactive": args.interactive,
        }
        summary = run_tsne_visualization(**pipeline_options)

        # Display summary
        print(f"✓ Visualized {summary['syllable_count']:,} syllables")
        print(f"✓ Projected {summary['feature_count']} features into 2D space")
        print(f"✓ Visualization saved to: {summary['output_path']}")
        print(f"✓ Metadata saved to: {summary['metadata_path']}")
        # The optional outputs are None when they were not produced.
        if summary["mapping_path"]:
            print(f"✓ Mapping saved to: {summary['mapping_path']}")
        if summary["interactive_path"]:
            print(f"✓ Interactive HTML saved to: {summary['interactive_path']}")
        print(f"\nTotal processing time: {summary['processing_time']:.2f} seconds")
    except Exception as exc:
        print(f"Error: {exc}")
        # A missing dependency gets an extra installation hint.
        if isinstance(exc, ImportError):
            print("\nRequired dependencies:")
            print(" pip install scikit-learn matplotlib numpy pandas")
# Run the CLI only when this module is executed as a script (for example via
# ``python -m build_tools.syllable_analysis.tsne_visualizer``), not on import.
if __name__ == "__main__":
    main()