Source code for build_tools.syllable_walk_tui.modules.analyzer.exporter
"""
Analysis export functionality.
This module provides functions to export corpus shape metrics to text format
for sharing and discussion. Exports are human-readable and include all metrics
displayed on the AnalysisScreen.
Design Philosophy:
- Mirror the screen display in text form
- Include timestamps and corpus paths for provenance
- Pure formatting functions (no side effects except final write)
- Percentages shown in parentheses for contextual understanding
Percentage Display:
Exported metrics include percentages where they add meaningful context:
- **Length distribution**: Each length count shown as "length:count (pct%)"
where pct is the share of total inventory at that length.
Example: "2:120 (9.7%), 3:456 (37.0%)"
- **Hapax rate**: Syllables appearing exactly once, shown as "count (pct%)"
where pct is hapax_count / total_syllables * 100.
Example: "Hapax (freq=1): 456 (37.0%)"
- **Top 5 frequency**: Each top syllable shown as "syllable: count (pct%)"
where pct is count / total_occurrences * 100.
Example: "the: 500 (4.1%)"
These percentages help users quickly assess:
- Syllable shape preferences (length distribution)
- Vocabulary diversity vs. concentration (hapax rate)
- Zipfian distribution characteristics (top N coverage)
Export Format:
CORPUS SHAPE METRICS EXPORT
Generated: YYYY-MM-DD HH:MM:SS
==================================================
PATCH A
==================================================
Corpus: corpus_name
INVENTORY
Total syllables: 1,234
Length dist: 2:120 (9.7%), 3:456 (37.0%), ...
FREQUENCY
Hapax (freq=1): 456 (37.0%)
Top 5 by frequency:
the: 500 (4.1%)
...
[FEATURE SATURATION, TERRAIN sections follow]
==================================================
PATCH B
==================================================
[Same format as Patch A]
"""
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from build_tools.syllable_walk_tui.services.metrics import (
CorpusShapeMetrics,
FeatureSaturationMetrics,
FrequencyMetrics,
InventoryMetrics,
PoleExemplars,
TerrainMetrics,
)
[docs]
def format_inventory_metrics(inv: InventoryMetrics) -> str:
    """
    Format inventory metrics as text.

    Displays raw counts and derived percentages for length distribution.
    Percentages show each length's share of total inventory
    (``count / inv.total_count * 100``).

    Args:
        inv: Inventory metrics to format. Reads ``total_count``,
            ``length_min``, ``length_max``, ``length_mean``,
            ``length_median``, ``length_std`` and ``length_distribution``
            (iterated via ``.items()`` as length -> count pairs).

    Returns:
        Formatted text block with length distribution percentages. When
        ``inv.total_count`` is not positive, every share is reported as
        0.0% instead of raising ``ZeroDivisionError``.

    Example output:
        INVENTORY
        Total syllables: 1,234
        Length min: 2
        Length max: 8
        Length mean: 3.45
        Length median: 3.0
        Length std: 1.23
        Length dist: 2:120 (9.7%), 3:456 (37.0%), 4:389 (31.5%), ...
    """
    lines = [
        "INVENTORY",
        f" Total syllables: {inv.total_count:,}",
        f" Length min: {inv.length_min}",
        f" Length max: {inv.length_max}",
        f" Length mean: {inv.length_mean:.2f}",
        f" Length median: {inv.length_median:.1f}",
        f" Length std: {inv.length_std:.2f}",
    ]

    # Length distribution with percentages.
    # Each count is shown as both raw value and percentage of total inventory.
    # The denominator is guarded the same way the frequency formatter guards
    # total_occurrences, so an empty/inconsistent inventory cannot crash the
    # export with a division by zero.
    total = inv.total_count
    dist_parts = []
    for length, count in sorted(inv.length_distribution.items()):
        share = (count / total * 100) if total > 0 else 0.0
        dist_parts.append(f"{length}:{count} ({share:.1f}%)")

    lines.append(f" Length dist: {', '.join(dist_parts)}")
    return "\n".join(lines)
[docs]
def format_frequency_metrics(freq: FrequencyMetrics, total_syllables: int | None = None) -> str:
    """
    Render the FREQUENCY section of the export.

    The block lists the raw frequency statistics, a percentile summary and
    the first five entries of ``freq.top_10``. Two derived percentages are
    added in parentheses:

    - Hapax rate: ``freq.hapax_count / total_syllables * 100``, shown only
      when ``total_syllables`` is a positive number.
    - Top-5 share: ``count / freq.total_occurrences * 100`` for each listed
      syllable, reported as 0.0% when ``total_occurrences`` is not positive.

    Args:
        freq: Frequency metrics to format.
        total_syllables: Denominator for the hapax rate. When ``None``, zero
            or negative, the hapax line carries the raw count only.

    Returns:
        The section as a newline-joined string.

    Example output:
        FREQUENCY
        Total occurrences: 12,345
        ...
        Hapax (freq=1): 456 (37.0%)
        ...
        Top 5 by frequency:
        the: 500 (4.1%)
        and: 350 (2.8%)
    """
    # The hapax line always shows the raw count; the rate is an optional suffix.
    hapax_text = f" Hapax (freq=1): {freq.hapax_count:,}"
    if total_syllables is not None and total_syllables > 0:
        rate = freq.hapax_count / total_syllables * 100
        hapax_text = f"{hapax_text} ({rate:.1f}%)"

    lower_percentiles = (
        f" P10={freq.percentile_10:,} P25={freq.percentile_25:,} P50={freq.percentile_50:,}"
    )
    upper_percentiles = (
        f" P75={freq.percentile_75:,} P90={freq.percentile_90:,} P99={freq.percentile_99:,}"
    )

    out = [
        "FREQUENCY",
        f" Total occurrences: {freq.total_occurrences:,}",
        f" Freq min: {freq.freq_min:,}",
        f" Freq max: {freq.freq_max:,}",
        f" Freq mean: {freq.freq_mean:.2f}",
        f" Freq median: {freq.freq_median:.1f}",
        f" Freq std: {freq.freq_std:.2f}",
        f" Unique freq values: {freq.unique_freq_count:,}",
        hapax_text,
        "",
        " Percentiles:",
        lower_percentiles,
        upper_percentiles,
        "",
        " Top 5 by frequency:",
    ]

    # Share of all occurrences taken by each of the five most frequent
    # syllables; indicates how strongly the head of the list dominates.
    occurrences = freq.total_occurrences
    for syllable, hits in freq.top_10[:5]:
        if occurrences > 0:
            share = hits / occurrences * 100
        else:
            share = 0.0
        out.append(f" {syllable}: {hits:,} ({share:.1f}%)")

    return "\n".join(out)
[docs]
def format_feature_saturation(feat: FeatureSaturationMetrics) -> str:
    """
    Render the FEATURE SATURATION section of the export.

    Features are listed under four fixed headings (Onset, Internal, Nucleus,
    Coda). Each feature row shows a shortened display name, the
    ``true_count`` and the ``true_percentage`` of the corresponding entry in
    ``feat.by_name``.

    Args:
        feat: Feature saturation metrics to format. Every feature name in
            the fixed table below is looked up with ``feat.by_name[name]``,
            so a missing name propagates the lookup error.

    Returns:
        The section as a newline-joined string.
    """
    # Fixed presentation order: heading -> feature keys shown beneath it.
    layout = (
        ("Onset", ("starts_with_vowel", "starts_with_cluster", "starts_with_heavy_cluster")),
        (
            "Internal",
            ("contains_plosive", "contains_fricative", "contains_liquid", "contains_nasal"),
        ),
        ("Nucleus", ("short_vowel", "long_vowel")),
        ("Coda", ("ends_with_vowel", "ends_with_nasal", "ends_with_stop")),
    )
    # Positional markers removed from the key to obtain the display label.
    noise_tokens = ("starts_with_", "ends_with_", "contains_")

    out = [
        "FEATURE SATURATION",
        f" Total analyzed: {feat.total_syllables:,}",
        "",
    ]

    for heading, keys in layout:
        out.append(f" {heading}:")
        for key in keys:
            stats = feat.by_name[key]
            label = key
            for token in noise_tokens:
                label = label.replace(token, "")
            label = label.replace("_", " ")
            out.append(f" {label:18} {stats.true_count:>6,} ({stats.true_percentage:5.1f}%)")

    return "\n".join(out)
def _format_exemplars_line(
    exemplars: PoleExemplars | None,
    low_label: str,
    high_label: str,
) -> str | None:
    """
    Build the one-line exemplar summary for both poles of an axis.

    Args:
        exemplars: Object exposing ``low_pole_exemplars`` and
            ``high_pole_exemplars`` (iterables of strings), or None.
        low_label: Label printed before the low-pole syllables (e.g., "round").
        high_label: Label printed before the high-pole syllables (e.g., "jagged").

    Returns:
        The formatted line, or None when ``exemplars`` is None. A pole whose
        comma-joined text is empty is rendered as "(none)".
    """
    if exemplars is not None:
        low_text = ", ".join(exemplars.low_pole_exemplars)
        if not low_text:
            low_text = "(none)"

        high_text = ", ".join(exemplars.high_pole_exemplars)
        if not high_text:
            high_text = "(none)"

        return f" {low_label}: {low_text} {high_label}: {high_text}"

    return None
[docs]
def format_terrain_metrics(terrain: TerrainMetrics) -> str:
    """
    Render the TERRAIN section of the export with ASCII bars.

    Each of the three axes (shape, craft, space) produces a title line, a
    30-character bar followed by the axis label and the signed offset of the
    score from 0.5, and, when exemplars are available, one line listing
    exemplar syllables for both poles.

    Args:
        terrain: Terrain metrics to format. For each axis the attributes
            ``<axis>_score``, ``<axis>_label`` and ``<axis>_exemplars`` are read.

    Returns:
        The section as a newline-joined string.
    """
    width = 30  # Number of cells in every bar
    solid, hollow = "█", "░"

    def bar_row(score: float, label: str) -> str:
        # Filled cells are the truncated share of the bar width; the rest are
        # hollow. The trailing number is the score's signed distance from 0.5.
        solid_cells = int(score * width)
        cells = solid * solid_cells + hollow * (width - solid_cells)
        offset = score - 0.5
        prefix = "+" if offset >= 0 else ""
        return f"{cells} {label:8} {prefix}{offset:.3f}"

    # (attribute prefix, title line, low-pole label, high-pole label)
    axes = (
        ("shape", " Shape: Round <-> Jagged (Bouba/Kiki)", "round", "jagged"),
        ("craft", " Craft: Flowing <-> Worked (Sung/Forged)", "flowing", "worked"),
        ("space", " Space: Open <-> Dense (Valley/Workshop)", "open", "dense"),
    )

    out = ["TERRAIN"]
    for prefix_name, title, low_label, high_label in axes:
        # A blank line separates the header and each axis group.
        out.append("")
        out.append(title)
        score = getattr(terrain, f"{prefix_name}_score")
        label = getattr(terrain, f"{prefix_name}_label")
        out.append(f" {bar_row(score, label)}")

        poles = _format_exemplars_line(
            getattr(terrain, f"{prefix_name}_exemplars"), low_label, high_label
        )
        if poles:
            out.append(poles)

    return "\n".join(out)
[docs]
def format_patch_metrics(
    patch_name: str,
    metrics: CorpusShapeMetrics | None,
    corpus_path: Path | None = None,
) -> str:
    """
    Render the full text block for one patch.

    The block opens with a "PATCH <name>" banner framed by 50-character
    rules, optionally followed by the corpus directory name. When metrics are
    present, the inventory, frequency, feature saturation and terrain
    sections follow, separated by blank lines. The inventory's
    ``total_count`` is handed to the frequency formatter so it can compute
    the hapax rate percentage.

    Args:
        patch_name: "A" or "B"
        metrics: Corpus shape metrics, or None if not loaded
        corpus_path: Optional path to corpus directory; only its ``name`` is printed

    Returns:
        Formatted text block for the patch, or the banner followed by
        "(no corpus loaded)" when ``metrics`` is None.
    """
    rule = "=" * 50
    out = [rule, f"PATCH {patch_name}", rule]

    if corpus_path:
        out.append(f"Corpus: {corpus_path.name}")
    out.append("")

    # Nothing to report for an empty patch slot.
    if metrics is None:
        return "\n".join([*out, "(no corpus loaded)"])

    inventory = metrics.inventory
    sections = (
        format_inventory_metrics(inventory),
        # total_count enables the hapax rate percentage in the frequency section
        format_frequency_metrics(metrics.frequency, inventory.total_count),
        format_feature_saturation(metrics.feature_saturation),
        format_terrain_metrics(metrics.terrain),
    )
    # Sections are separated by exactly one blank line.
    out.append("\n\n".join(sections))
    return "\n".join(out)
[docs]
def format_analysis_export(
    metrics_a: CorpusShapeMetrics | None,
    metrics_b: CorpusShapeMetrics | None,
    corpus_path_a: Path | None = None,
    corpus_path_b: Path | None = None,
) -> str:
    """
    Assemble the complete export text for both patches.

    The document consists of a title, a "Generated:" line stamped with the
    current local time (``datetime.now()``), the Patch A block, the Patch B
    block and a closing footer.

    Args:
        metrics_a: Metrics for Patch A, or None if not loaded
        metrics_b: Metrics for Patch B, or None if not loaded
        corpus_path_a: Optional path to Patch A corpus
        corpus_path_b: Optional path to Patch B corpus

    Returns:
        Complete formatted export text
    """
    # Taken before the patch blocks are rendered.
    generated_at = f"{datetime.now():%Y-%m-%d %H:%M:%S}"

    document = [
        # Header
        "CORPUS SHAPE METRICS EXPORT",
        f"Generated: {generated_at}",
        "",
        # Body: one block per patch, separated by a blank line
        format_patch_metrics("A", metrics_a, corpus_path_a),
        "",
        format_patch_metrics("B", metrics_b, corpus_path_b),
        # Footer
        "",
        "=" * 50,
        "Export generated by Syllable Walker TUI",
        "https://github.com/aa-parky/pipeworks_name_generation",
    ]
    return "\n".join(document)
[docs]
def export_analysis_to_file(
    filepath: Path,
    metrics_a: CorpusShapeMetrics | None,
    metrics_b: CorpusShapeMetrics | None,
    corpus_path_a: Path | None = None,
    corpus_path_b: Path | None = None,
) -> Path:
    """
    Write the two-patch analysis export to a UTF-8 text file.

    Args:
        filepath: Destination of the export file
        metrics_a: Metrics for Patch A, or None if not loaded
        metrics_b: Metrics for Patch B, or None if not loaded
        corpus_path_a: Optional path to Patch A corpus
        corpus_path_b: Optional path to Patch B corpus

    Returns:
        ``filepath``, unchanged, after the file has been written

    Raises:
        OSError: If file cannot be written
    """
    # All formatting happens first; the write is the only side effect.
    report = format_analysis_export(metrics_a, metrics_b, corpus_path_a, corpus_path_b)

    with filepath.open("w", encoding="utf-8") as handle:
        handle.write(report)

    return filepath
[docs]
def generate_export_filename() -> str:
    """
    Build an export filename stamped with the current local time.

    Returns:
        Filename like "corpus_metrics_20260118_143022.txt"
        (``corpus_metrics_<YYYYMMDD>_<HHMMSS>.txt``)
    """
    # datetime's format spec is handed to strftime, so this yields the same
    # text as an explicit strftime("%Y%m%d_%H%M%S") call.
    return f"corpus_metrics_{datetime.now():%Y%m%d_%H%M%S}.txt"