Source code for build_tools.syllable_analysis.dimensionality.mapping

"""Coordinate mapping utilities for dimensionality reduction results.

This module provides functions for creating and saving mappings between syllables
and their reduced-dimension coordinates (e.g., from t-SNE, PCA, UMAP).
"""

from __future__ import annotations

import json
from pathlib import Path

import numpy as np  # type: ignore[import-not-found]


[docs] def create_tsne_mapping(records: list[dict], tsne_coords: np.ndarray) -> list[dict]: """Create syllable→features→coordinates mapping. Combines annotated syllable records with their t-SNE coordinates to create a comprehensive mapping structure. This is useful for: - Post-hoc cluster analysis - Cross-referencing visualizations - Interactive exploration - Sharing visualizations with collaborators Args: records: Original annotated syllable records from load_annotated_syllables(). Each record should have: - syllable (str): The syllable text - frequency (int): Occurrence count - features (dict): Boolean feature flags tsne_coords: t-SNE coordinate array (n_syllables × n_dimensions). Typically 2D for visualization, but can be 3D or higher. Returns: List of mapping records with structure: [ { "syllable": "kran", "frequency": 7, "tsne_x": -2.34, "tsne_y": 5.67, "features": {...} }, ... ] Raises: ValueError: If records and tsne_coords have mismatched lengths Example: >>> records = [ ... {"syllable": "ka", "frequency": 187, "features": {...}}, ... {"syllable": "ran", "frequency": 42, "features": {...}} ... ] >>> coords = np.array([[-2.1, 3.4], [1.5, -0.8]]) >>> mapping = create_tsne_mapping(records, coords) >>> mapping[0]["tsne_x"] -2.1 >>> mapping[0]["syllable"] 'ka' Notes: - Array indices preserve order from input records - Coordinates are converted from numpy float64 to Python float for JSON compatibility - All original record fields are preserved in the mapping - For 2D t-SNE: creates tsne_x and tsne_y fields - For 3D+ t-SNE: creates tsne_x, tsne_y, tsne_z, ... fields """ if len(records) != len(tsne_coords): raise ValueError( f"Records count ({len(records)}) does not match coordinates count ({len(tsne_coords)})" ) n_dimensions = tsne_coords.shape[1] # Build mapping: combine records with their t-SNE coordinates mapping = [] for i, record in enumerate(records): entry = { "syllable": record["syllable"], "frequency": record["frequency"], } # Add coordinate fields (tsne_x, tsne_y, tsne_z, ...) coord_labels = ["tsne_x", "tsne_y", "tsne_z", "tsne_w"] # Support up to 4D for dim in range(n_dimensions): label = coord_labels[dim] if dim < len(coord_labels) else f"tsne_dim{dim}" entry[label] = float(tsne_coords[i, dim]) # Convert numpy float to Python float # Add features entry["features"] = record["features"] mapping.append(entry) return mapping
[docs] def save_tsne_mapping(mapping: list[dict], output_path: Path, indent: int = 2) -> None: """Save t-SNE mapping to JSON file. Writes the syllable→coordinates mapping as formatted JSON for human readability and programmatic access. Args: mapping: Mapping data from create_tsne_mapping() output_path: Output file path (should end in .json) indent: JSON indentation for readability (default: 2) Example: >>> from pathlib import Path >>> mapping = [{"syllable": "ka", "tsne_x": -2.1, "tsne_y": 3.4, "features": {...}}] >>> save_tsne_mapping(mapping, Path("output.json")) Notes: - Output is formatted with indentation for human readability - Uses ensure_ascii=False to preserve Unicode characters - UTF-8 encoding ensures international character support - Parent directories are created if they don't exist """ # Ensure parent directory exists output_path.parent.mkdir(parents=True, exist_ok=True) # Save as formatted JSON output_path.write_text(json.dumps(mapping, indent=indent, ensure_ascii=False), encoding="utf-8")