Source code for build_tools.syllable_walk_tui.modules.analyzer.exporter

"""
Analysis export functionality.

This module provides functions to export corpus shape metrics to text format
for sharing and discussion. Exports are human-readable and include all metrics
displayed on the AnalysisScreen.

Design Philosophy:
    - Mirror the screen display in text form
    - Include timestamps and corpus paths for provenance
    - Pure formatting functions (no side effects except final write)
    - Percentages shown in parentheses for contextual understanding

Percentage Display:
    Exported metrics include percentages where they add meaningful context:

    - **Length distribution**: Each length count shown as "length:count (pct%)"
      where pct is the share of total inventory at that length.
      Example: "2:120 (9.7%), 3:456 (37.0%)"

    - **Hapax rate**: Syllables appearing exactly once, shown as "count (pct%)"
      where pct is hapax_count / total_syllables * 100.
      Example: "Hapax (freq=1):     456 (37.0%)"

    - **Top 5 frequency**: Each top syllable shown as "syllable: count (pct%)"
      where pct is count / total_occurrences * 100.
      Example: "the: 500 (4.1%)"

    These percentages help users quickly assess:
    - Syllable shape preferences (length distribution)
    - Vocabulary diversity vs. concentration (hapax rate)
    - Zipfian distribution characteristics (top N coverage)

Export Format:
    CORPUS SHAPE METRICS EXPORT
    Generated: YYYY-MM-DD HH:MM:SS

    ==================================================
    PATCH A
    ==================================================
    Corpus: corpus_name

    INVENTORY
      Total syllables:    1,234
      Length dist:        2:120 (9.7%), 3:456 (37.0%), ...

    FREQUENCY
      Hapax (freq=1):     456 (37.0%)
      Top 5 by frequency:
        the: 500 (4.1%)
        ...

    [FEATURE SATURATION, TERRAIN sections follow]

    ==================================================
    PATCH B
    ==================================================
    [Same format as Patch A]
"""

from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from build_tools.syllable_walk_tui.services.metrics import (
        CorpusShapeMetrics,
        FeatureSaturationMetrics,
        FrequencyMetrics,
        InventoryMetrics,
        PoleExemplars,
        TerrainMetrics,
    )


def format_inventory_metrics(inv: InventoryMetrics) -> str:
    """
    Format inventory metrics as text.

    Displays raw counts and derived percentages for length distribution.
    Percentages show each length's share of total inventory
    (``count / inv.total_count * 100``).

    Args:
        inv: Inventory metrics to format. The attributes read are
            ``total_count``, ``length_min``, ``length_max``, ``length_mean``,
            ``length_median``, ``length_std`` and ``length_distribution``
            (a mapping of length -> count).

    Returns:
        Formatted text block, one metric per line, ending with the length
        distribution line. If ``inv.total_count`` is not positive, every
        share is reported as ``0.0%`` instead of raising ZeroDivisionError.
    """
    lines = [
        "INVENTORY",
        f" Total syllables: {inv.total_count:,}",
        f" Length min: {inv.length_min}",
        f" Length max: {inv.length_max}",
        f" Length mean: {inv.length_mean:.2f}",
        f" Length median: {inv.length_median:.1f}",
        f" Length std: {inv.length_std:.2f}",
    ]

    # Length distribution with percentages, ordered by syllable length.
    # The division is guarded the same way format_frequency_metrics guards
    # its "share of total occurrences" computation.
    total = inv.total_count
    dist_parts = []
    for length, count in sorted(inv.length_distribution.items()):
        share = (count / total * 100) if total > 0 else 0.0
        dist_parts.append(f"{length}:{count} ({share:.1f}%)")
    lines.append(f" Length dist: {', '.join(dist_parts)}")

    return "\n".join(lines)
def format_frequency_metrics(freq: FrequencyMetrics, total_syllables: int | None = None) -> str:
    """
    Render the frequency section of the export.

    The block lists the raw frequency statistics, the percentile table and
    the five most frequent syllables. Two derived percentages are shown in
    parentheses:

    - Hapax rate: ``freq.hapax_count / total_syllables * 100``, only when
      ``total_syllables`` is given and positive.
    - Top-5 share: each entry's count divided by ``freq.total_occurrences``
      (reported as 0.0 when the total is not positive).

    Args:
        freq: Frequency metrics to format.
        total_syllables: Total unique syllable count used as the hapax rate
            denominator. If None (or not positive) the percentage is omitted.

    Returns:
        Formatted text block, lines joined with newlines.
    """
    # Start from the bare hapax line and append the rate only when a usable
    # denominator was supplied.
    hapax_text = f" Hapax (freq=1): {freq.hapax_count:,}"
    if total_syllables and total_syllables > 0:
        hapax_share = freq.hapax_count / total_syllables * 100
        hapax_text = f"{hapax_text} ({hapax_share:.1f}%)"

    lower_percentiles = (
        f" P10={freq.percentile_10:,} P25={freq.percentile_25:,} "
        f"P50={freq.percentile_50:,}"
    )
    upper_percentiles = (
        f" P75={freq.percentile_75:,} P90={freq.percentile_90:,} "
        f"P99={freq.percentile_99:,}"
    )

    out = [
        "FREQUENCY",
        f" Total occurrences: {freq.total_occurrences:,}",
        f" Freq min: {freq.freq_min:,}",
        f" Freq max: {freq.freq_max:,}",
        f" Freq mean: {freq.freq_mean:.2f}",
        f" Freq median: {freq.freq_median:.1f}",
        f" Freq std: {freq.freq_std:.2f}",
        f" Unique freq values: {freq.unique_freq_count:,}",
        hapax_text,
        "",
        " Percentiles:",
        lower_percentiles,
        upper_percentiles,
        "",
        " Top 5 by frequency:",
    ]

    # Only the first five entries of top_10 are exported; each is shown with
    # its share of all occurrences to expose how concentrated the corpus is.
    for syllable, occurrences in freq.top_10[:5]:
        if freq.total_occurrences > 0:
            share = occurrences / freq.total_occurrences * 100
        else:
            share = 0.0
        out.append(f" {syllable}: {occurrences:,} ({share:.1f}%)")

    return "\n".join(out)
def format_feature_saturation(feat: FeatureSaturationMetrics) -> str:
    """
    Render the feature saturation section of the export.

    Features are listed under four fixed headings (Onset, Internal, Nucleus,
    Coda). Each feature line shows a shortened display name, the count of
    syllables for which the feature is true, and that count's percentage as
    supplied by the metrics object.

    Args:
        feat: Feature saturation metrics to format. ``feat.by_name`` must
            contain an entry for every feature listed below; a missing
            feature raises KeyError.

    Returns:
        Formatted text block, lines joined with newlines.
    """
    # Fixed display grouping: (heading, feature keys looked up in by_name).
    grouped = (
        ("Onset", ("starts_with_vowel", "starts_with_cluster", "starts_with_heavy_cluster")),
        ("Internal", ("contains_plosive", "contains_fricative", "contains_liquid", "contains_nasal")),
        ("Nucleus", ("short_vowel", "long_vowel")),
        ("Coda", ("ends_with_vowel", "ends_with_nasal", "ends_with_stop")),
    )
    # Fragments stripped from feature keys to get the short display name.
    noise_fragments = ("starts_with_", "ends_with_", "contains_")

    out = [
        "FEATURE SATURATION",
        f" Total analyzed: {feat.total_syllables:,}",
        "",
    ]

    for heading, feature_keys in grouped:
        out.append(f" {heading}:")
        for key in feature_keys:
            stats = feat.by_name[key]
            label = key
            for fragment in noise_fragments:
                label = label.replace(fragment, "")
            label = label.replace("_", " ")
            out.append(f" {label:18} {stats.true_count:>6,} ({stats.true_percentage:5.1f}%)")

    return "\n".join(out)
def _format_exemplars_line(
    exemplars: PoleExemplars | None,
    low_label: str,
    high_label: str,
) -> str | None:
    """
    Build the one-line exemplar summary for both poles of an axis.

    Args:
        exemplars: Object exposing ``low_pole_exemplars`` and
            ``high_pole_exemplars`` (iterables of strings), or None.
        low_label: Label for the low pole (e.g., "round").
        high_label: Label for the high pole (e.g., "jagged").

    Returns:
        ``" <low_label>: a, b <high_label>: c, d"`` with ``(none)`` standing
        in for an empty pole, or None when ``exemplars`` is None.
    """
    if exemplars is None:
        return None

    low_text = ", ".join(exemplars.low_pole_exemplars)
    if not low_text:
        low_text = "(none)"

    high_text = ", ".join(exemplars.high_pole_exemplars)
    if not high_text:
        high_text = "(none)"

    segments = (f"{low_label}: {low_text}", f"{high_label}: {high_text}")
    return " " + " ".join(segments)
def format_terrain_metrics(terrain: TerrainMetrics) -> str:
    """
    Format terrain metrics as text with ASCII bars.

    Each of the three axes (Shape, Craft, Space) is rendered as a title
    line, a 30-character bar followed by the axis label and the signed
    delta of the score from the 0.5 midpoint, and (when exemplars are
    available) a line listing exemplar syllables for both poles.

    Args:
        terrain: Terrain metrics to format. For each axis the attributes
            ``<axis>_score``, ``<axis>_label`` and ``<axis>_exemplars`` are
            read.

    Returns:
        Formatted text block with visualization.
    """
    bar_width = 30  # Hi-fi resolution
    bar_filled = "█"
    bar_empty = "░"

    def format_delta(score: float) -> str:
        # Signed distance from the 0.5 midpoint; negatives already carry "-".
        delta = score - 0.5
        sign = "+" if delta >= 0 else ""
        return f"{sign}{delta:.3f}"

    def render_bar(score: float, label: str) -> str:
        # Clamp the fill so a score outside [0, 1] cannot make the bar
        # shorter or longer than bar_width; the delta still shows the raw
        # score so out-of-range values stay visible.
        filled_count = min(max(int(score * bar_width), 0), bar_width)
        empty_count = bar_width - filled_count
        bar = bar_filled * filled_count + bar_empty * empty_count
        return f"{bar} {label:8} {format_delta(score)}"

    # (title, score, label, exemplars, low pole name, high pole name)
    axes = (
        (
            " Shape: Round <-> Jagged (Bouba/Kiki)",
            terrain.shape_score,
            terrain.shape_label,
            terrain.shape_exemplars,
            "round",
            "jagged",
        ),
        (
            " Craft: Flowing <-> Worked (Sung/Forged)",
            terrain.craft_score,
            terrain.craft_label,
            terrain.craft_exemplars,
            "flowing",
            "worked",
        ),
        (
            " Space: Open <-> Dense (Valley/Workshop)",
            terrain.space_score,
            terrain.space_label,
            terrain.space_exemplars,
            "open",
            "dense",
        ),
    )

    lines = ["TERRAIN"]
    for title, score, label, exemplars, low_name, high_name in axes:
        # A blank line precedes every axis, including the first one.
        lines.append("")
        lines.append(title)
        lines.append(f" {render_bar(score, label)}")
        exemplar_line = _format_exemplars_line(exemplars, low_name, high_name)
        if exemplar_line:
            lines.append(exemplar_line)

    return "\n".join(lines)
def format_patch_metrics(
    patch_name: str,
    metrics: CorpusShapeMetrics | None,
    corpus_path: Path | None = None,
) -> str:
    """
    Render the complete export block for one patch.

    The block starts with a 50-character ``=`` rule, the ``PATCH <name>``
    header and a second rule, optionally followed by the corpus directory
    name. When metrics are present, the inventory, frequency, feature
    saturation and terrain sections follow, separated by blank lines. The
    inventory's ``total_count`` is handed to the frequency formatter so it
    can compute the hapax rate percentage.

    Args:
        patch_name: "A" or "B"
        metrics: Corpus shape metrics, or None if not loaded
        corpus_path: Optional path to corpus directory; only its final
            component (``.name``) is printed.

    Returns:
        Formatted text block for the patch, or the header followed by
        "(no corpus loaded)" when ``metrics`` is None.
    """
    rule = "=" * 50
    out = [rule, f"PATCH {patch_name}", rule]

    if corpus_path:
        out.append(f"Corpus: {corpus_path.name}")
    out.append("")

    # Guard clause: nothing to format without metrics.
    if metrics is None:
        out.append("(no corpus loaded)")
        return "\n".join(out)

    sections = (
        format_inventory_metrics(metrics.inventory),
        # total_count enables the hapax rate percentage
        format_frequency_metrics(metrics.frequency, metrics.inventory.total_count),
        format_feature_saturation(metrics.feature_saturation),
        format_terrain_metrics(metrics.terrain),
    )
    # A blank line between consecutive sections.
    out.append("\n\n".join(sections))

    return "\n".join(out)
def format_analysis_export(
    metrics_a: CorpusShapeMetrics | None,
    metrics_b: CorpusShapeMetrics | None,
    corpus_path_a: Path | None = None,
    corpus_path_b: Path | None = None,
) -> str:
    """
    Assemble the full export document covering both patches.

    The document consists of a title and a "Generated:" timestamp taken
    from the local clock at call time, the Patch A block, a blank line, the
    Patch B block, and a fixed footer.

    Args:
        metrics_a: Metrics for Patch A, or None if not loaded
        metrics_b: Metrics for Patch B, or None if not loaded
        corpus_path_a: Optional path to Patch A corpus
        corpus_path_b: Optional path to Patch B corpus

    Returns:
        Complete formatted export text
    """
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    block_a = format_patch_metrics("A", metrics_a, corpus_path_a)
    block_b = format_patch_metrics("B", metrics_b, corpus_path_b)

    document = [
        # Header
        "CORPUS SHAPE METRICS EXPORT",
        f"Generated: {generated_at}",
        "",
        # Patches, separated by one blank line
        block_a,
        "",
        block_b,
        # Footer
        "",
        "=" * 50,
        "Export generated by Syllable Walker TUI",
        "https://github.com/aa-parky/pipeworks_name_generation",
    ]
    return "\n".join(document)
def export_analysis_to_file(
    filepath: Path,
    metrics_a: CorpusShapeMetrics | None,
    metrics_b: CorpusShapeMetrics | None,
    corpus_path_a: Path | None = None,
    corpus_path_b: Path | None = None,
) -> Path:
    """
    Write the formatted analysis export to ``filepath`` as UTF-8 text.

    Args:
        filepath: Path to write the export file
        metrics_a: Metrics for Patch A, or None if not loaded
        metrics_b: Metrics for Patch B, or None if not loaded
        corpus_path_a: Optional path to Patch A corpus
        corpus_path_b: Optional path to Patch B corpus

    Returns:
        The same ``filepath`` that was passed in.

    Raises:
        OSError: If file cannot be written
    """
    rendered = format_analysis_export(metrics_a, metrics_b, corpus_path_a, corpus_path_b)

    # Text mode with an explicit encoding so the bar glyphs used in the
    # terrain section are written consistently regardless of locale.
    with filepath.open("w", encoding="utf-8") as handle:
        handle.write(rendered)

    return filepath
def generate_export_filename() -> str:
    """
    Build a timestamped export filename from the current local time.

    Returns:
        Filename like "corpus_metrics_20260118_143022.txt"
    """
    # datetime supports strftime directives directly in a format spec.
    return f"corpus_metrics_{datetime.now():%Y%m%d_%H%M%S}.txt"