Source code for build_tools.pyphen_syllable_normaliser.models

"""
Data models for syllable normalization.

This module defines the data structures used to represent normalization
configuration, statistics, and results.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path



[docs]
@dataclass
class NormalizationConfig:
    """
    Configuration for syllable normalization process.

    This dataclass stores all parameters that control how syllables are
    normalized to canonical form.

    Attributes:
        min_length: Minimum syllable length (characters). Syllables shorter
            than this are rejected. Default: 2
        max_length: Maximum syllable length (characters). Syllables longer
            than this are rejected. Default: 20
        allowed_charset: String of allowed characters. Only syllables
            containing these characters (after normalization) are kept.
            Default: "abcdefghijklmnopqrstuvwxyz"
        unicode_form: Unicode normalization form. Options: "NFC", "NFD",
            "NFKC", "NFKD". Default: "NFKD" (compatibility decomposition)

    Example:
        >>> config = NormalizationConfig(min_length=3, max_length=10)
        >>> config.min_length
        3
        >>> config.allowed_charset
        'abcdefghijklmnopqrstuvwxyz'
    """

    min_length: int = 2
    max_length: int = 20
    allowed_charset: str = "abcdefghijklmnopqrstuvwxyz"
    unicode_form: str = "NFKD"


[docs]
    def __post_init__(self):
        """Validate configuration parameters after initialization."""
        if self.min_length < 1:
            raise ValueError(f"min_length must be >= 1, got {self.min_length}")
        if self.max_length < self.min_length:
            raise ValueError(
                f"max_length ({self.max_length}) must be >= min_length ({self.min_length})"
            )
        if self.unicode_form not in ("NFC", "NFD", "NFKC", "NFKD"):
            raise ValueError(
                f"unicode_form must be one of NFC, NFD, NFKC, NFKD, got {self.unicode_form}"
            )





[docs]
@dataclass
class NormalizationStats:
    """
    Statistics from the syllable normalization process.

    This dataclass tracks counts and metrics throughout the normalization
    pipeline, useful for understanding data quality and processing results.

    Attributes:
        raw_count: Total number of syllables in raw input (before normalization)
        after_canonicalization: Number of syllables after normalization
        rejected_charset: Syllables rejected due to invalid characters
        rejected_length: Syllables rejected due to length constraints
        rejected_empty: Syllables that became empty after normalization
        unique_canonical: Number of unique canonical syllables
        processing_time: Total processing time in seconds

    Example:
        >>> stats = NormalizationStats(
        ...     raw_count=1000,
        ...     after_canonicalization=950,
        ...     rejected_charset=30,
        ...     rejected_length=20,
        ...     rejected_empty=0,
        ...     unique_canonical=412,
        ...     processing_time=1.5
        ... )
        >>> stats.rejection_rate
        5.0
    """

    raw_count: int = 0
    after_canonicalization: int = 0
    rejected_charset: int = 0
    rejected_length: int = 0
    rejected_empty: int = 0
    unique_canonical: int = 0
    processing_time: float = 0.0

    @property
    def total_rejected(self) -> int:
        """Calculate total number of rejected syllables."""
        return self.rejected_charset + self.rejected_length + self.rejected_empty

    @property
    def rejection_rate(self) -> float:
        """Calculate rejection rate as percentage of raw count."""
        if self.raw_count == 0:
            return 0.0
        return (self.total_rejected / self.raw_count) * 100




[docs]
@dataclass
class FrequencyEntry:
    """
    Single syllable with frequency and ranking information.

    This dataclass represents one syllable in the frequency analysis,
    including its occurrence count and relative ranking.

    Attributes:
        canonical: The canonical form of the syllable (e.g., "ka")
        frequency: Number of times this syllable appears
        rank: Frequency rank (1 = most common, 2 = second most common, etc.)
        percentage: Percentage of total syllables (0-100)

    Example:
        >>> entry = FrequencyEntry(canonical="ka", frequency=187, rank=1, percentage=10.2)
        >>> print(f"{entry.canonical}: {entry.frequency} ({entry.percentage:.1f}%)")
        ka: 187 (10.2%)
    """

    canonical: str
    frequency: int
    rank: int
    percentage: float




[docs]
@dataclass
class NormalizationResult:
    """
    Complete result from the syllable normalization pipeline.

    This dataclass encapsulates all outputs from the normalization process,
    including configuration, statistics, frequencies, and file paths.

    Attributes:
        config: Configuration used for normalization
        stats: Statistics from the processing
        frequencies: Dictionary mapping canonical syllable to frequency count
        unique_syllables: Sorted list of unique canonical syllables
        input_files: List of input file paths that were processed
        output_dir: Directory where output files were saved
        timestamp: When the normalization was performed
        raw_file: Path to raw aggregated file (syllables_raw.txt)
        canonical_file: Path to canonicalized file (syllables_canonicalised.txt)
        frequency_file: Path to frequency JSON (syllables_frequencies.json)
        unique_file: Path to unique syllables (syllables_unique.txt)
        meta_file: Path to metadata report (normalization_meta.txt)

    Example:
        >>> result = NormalizationResult(
        ...     config=NormalizationConfig(),
        ...     stats=NormalizationStats(raw_count=1000),
        ...     frequencies={"ka": 187, "ra": 162},
        ...     unique_syllables=["ka", "ra"],
        ...     input_files=[Path("file1.txt")],
        ...     output_dir=Path("_working/normalized"),
        ...     timestamp=datetime.now(),
        ...     raw_file=Path("syllables_raw.txt"),
        ...     canonical_file=Path("syllables_canonicalised.txt"),
        ...     frequency_file=Path("syllables_frequencies.json"),
        ...     unique_file=Path("syllables_unique.txt"),
        ...     meta_file=Path("normalization_meta.txt")
        ... )
        >>> result.stats.raw_count
        1000
    """

    config: NormalizationConfig
    stats: NormalizationStats
    frequencies: dict[str, int]
    unique_syllables: list[str]
    input_files: list[Path]
    output_dir: Path
    timestamp: datetime = field(default_factory=datetime.now)
    raw_file: Path = field(default=Path("syllables_raw.txt"))
    canonical_file: Path = field(default=Path("syllables_canonicalised.txt"))
    frequency_file: Path = field(default=Path("syllables_frequencies.json"))
    unique_file: Path = field(default=Path("syllables_unique.txt"))
    meta_file: Path = field(default=Path("normalization_meta.txt"))


[docs]
    def format_metadata(self) -> str:
        """
        Format normalization metadata as a human-readable string.

        Creates a detailed report including statistics, rejection breakdown,
        and top frequencies.

        Returns:
            Multi-line string containing all normalization metadata formatted
            for display or file output.

        Example:
            >>> result = NormalizationResult(...)
            >>> print(result.format_metadata())
            ======================================================================
            SYLLABLE NORMALIZATION METADATA
            ======================================================================
            Timestamp:           2026-01-05 17:30:22
            ...
        """
        lines = []
        lines.append("=" * 70)
        lines.append("SYLLABLE NORMALIZATION METADATA")
        lines.append("=" * 70)
        lines.append(f"Timestamp:           {self.timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
        lines.append(f"Input Files:         {len(self.input_files)} files processed")
        lines.append(f"Output Directory:    {self.output_dir}")
        lines.append("=" * 70)

        # Processing statistics
        lines.append("\nProcessing Statistics:")
        lines.append(f"  Raw Syllables:           {self.stats.raw_count:,}")
        lines.append(f"  After Canonicalization:  {self.stats.after_canonicalization:,}")
        lines.append(f"  Total Rejected:          {self.stats.total_rejected:,}")
        lines.append(f"  Unique Canonical:        {self.stats.unique_canonical:,}")
        lines.append(f"  Processing Time:         {self.stats.processing_time:.2f}s")

        # Rejection breakdown
        if self.stats.total_rejected > 0:
            lines.append("\nRejection Breakdown:")
            if self.stats.rejected_charset > 0:
                lines.append(f"  Invalid charset:    {self.stats.rejected_charset:,} syllables")
            if self.stats.rejected_length > 0:
                lines.append(f"  Length constraint:  {self.stats.rejected_length:,} syllables")
            if self.stats.rejected_empty > 0:
                lines.append(f"  Empty after norm:   {self.stats.rejected_empty:,} syllables")
            lines.append(f"  Rejection Rate:     {self.stats.rejection_rate:.1f}%")

        # Configuration
        lines.append("\nNormalization Configuration:")
        lines.append(f"  Min Length:         {self.config.min_length}")
        lines.append(f"  Max Length:         {self.config.max_length}")
        lines.append(f"  Unicode Form:       {self.config.unicode_form}")
        lines.append(f"  Allowed Charset:    {self.config.allowed_charset}")

        # Top frequencies
        if self.frequencies:
            # Sort by frequency descending
            sorted_freqs = sorted(self.frequencies.items(), key=lambda x: x[1], reverse=True)
            top_n = min(20, len(sorted_freqs))
            total_count = sum(self.frequencies.values())

            lines.append(f"\nTop {top_n} Most Frequent Syllables:")
            for i, (syllable, count) in enumerate(sorted_freqs[:top_n], 1):
                percentage = (count / total_count * 100) if total_count > 0 else 0
                lines.append(
                    f"  {i:2d}. {syllable:10s} ({count:5,} occurrences, {percentage:5.1f}%)"
                )

            if len(sorted_freqs) > top_n:
                lines.append(f"  ... and {len(sorted_freqs) - top_n} more")

        # Output files
        lines.append("\nOutput Files:")
        lines.append(f"  Raw:              {self.raw_file.name}")
        lines.append(f"  Canonicalized:    {self.canonical_file.name}")
        lines.append(f"  Frequencies:      {self.frequency_file.name}")
        lines.append(f"  Unique:           {self.unique_file.name}")
        lines.append(f"  Metadata:         {self.meta_file.name}")

        lines.append("\n" + "=" * 70)
        return "\n".join(lines)