Source code for build_tools.syllable_walk_web.services.metrics

"""
Corpus metrics service for the web application.

Computes inventory, frequency, feature saturation, and terrain metrics
for the analysis screen.
"""

from __future__ import annotations

from typing import Any, Sequence


[docs] def compute_analysis( annotated_data: Sequence[dict[str, Any]], frequencies: dict[str, int], ) -> dict[str, Any]: """Compute corpus analysis metrics for a patch. Args: annotated_data: Annotated syllable records. frequencies: Syllable frequency map. Returns: JSON-serialisable dict with inventory, frequency, terrain metrics. """ from build_tools.syllable_walk_tui.services.metrics import ( compute_corpus_shape_metrics, ) syllables = [s["syllable"] for s in annotated_data] metrics = compute_corpus_shape_metrics(syllables, frequencies, annotated_data) # Flatten to JSON-serialisable dict inv = metrics.inventory freq = metrics.frequency terrain = metrics.terrain # Syllables longer than 5 chars are rare; grouping them into a single # "5+" bucket keeps the UI histogram clean. len_dist: dict[str, list[int | float]] = {} for length, count in sorted(inv.length_distribution.items()): key = str(length) if length < 5 else "5+" if key in len_dist: len_dist[key][0] += count else: pct = (count / inv.total_count * 100) if inv.total_count else 0 len_dist[key] = [count, round(pct, 1)] # When multiple lengths are merged into the 5+ bucket, the initial # percentage was calculated from one length only — recompute from the # merged count. if "5+" in len_dist: len_dist["5+"][1] = round(len_dist["5+"][0] / inv.total_count * 100, 1) # Each terrain axis has low-pole and high-pole exemplars (e.g. "shape" # axis: simple syllables vs complex syllables). Merging both poles # into a flat list gives the UI a representative sample. def _exemplars(axis_exemplars: Any) -> list[str]: if axis_exemplars is None: return [] low = axis_exemplars.low_pole_exemplars or [] high = axis_exemplars.high_pole_exemplars or [] return low + high return { "total": inv.total_count, "unique": inv.total_count, # all are already unique "hapax": freq.hapax_count, "hapax_rate": round(freq.hapax_count / inv.total_count, 3) if inv.total_count else 0, "length_distribution": len_dist, "terrain": { "shape": { "score": round(terrain.shape_score, 3), "label": terrain.shape_label, "pct": round(terrain.shape_score * 100, 0), "exemplars": _exemplars(terrain.shape_exemplars), }, "craft": { "score": round(terrain.craft_score, 3), "label": terrain.craft_label, "pct": round(terrain.craft_score * 100, 0), "exemplars": _exemplars(terrain.craft_exemplars), }, "space": { "score": round(terrain.space_score, 3), "label": terrain.space_label, "pct": round(terrain.space_score * 100, 0), "exemplars": _exemplars(terrain.space_exemplars), }, }, }