Source code for build_tools.pyphen_syllable_normaliser.models

"""
Data models for syllable normalization.

This module defines the data structures used to represent normalization
configuration, statistics, and results.
"""

from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List



[docs]
@dataclass
class NormalizationConfig:
    """
    Configuration for syllable normalization process.

    This dataclass stores all parameters that control how syllables are
    normalized to canonical form. The values are validated as soon as an
    instance is created (see ``__post_init__``), so an invalid combination
    never produces a usable object.

    Attributes:
        min_length: Minimum syllable length (characters). Syllables shorter
            than this are rejected. Must be >= 1. Default: 2
        max_length: Maximum syllable length (characters). Syllables longer
            than this are rejected. Must be >= ``min_length``. Default: 20
        allowed_charset: String of allowed characters. Only syllables
            containing these characters (after normalization) are kept.
            Default: "abcdefghijklmnopqrstuvwxyz"
        unicode_form: Unicode normalization form. Options: "NFC", "NFD",
            "NFKC", "NFKD". Default: "NFKD" (compatibility decomposition)

    Raises:
        ValueError: If ``min_length`` is below 1, ``max_length`` is below
            ``min_length``, or ``unicode_form`` is not a recognised form.

    Example:
        >>> config = NormalizationConfig(min_length=3, max_length=10)
        >>> config.min_length
        3
        >>> config.allowed_charset
        'abcdefghijklmnopqrstuvwxyz'
    """

    min_length: int = 2
    max_length: int = 20
    allowed_charset: str = "abcdefghijklmnopqrstuvwxyz"
    unicode_form: str = "NFKD"

    def __post_init__(self) -> None:
        """
        Validate configuration parameters after initialization.

        Raises:
            ValueError: If any length bound or the Unicode form is invalid.
        """
        if self.min_length < 1:
            raise ValueError(f"min_length must be >= 1, got {self.min_length}")
        # Checked after min_length so the message compares against a sane bound.
        if self.max_length < self.min_length:
            raise ValueError(
                f"max_length ({self.max_length}) must be >= min_length ({self.min_length})"
            )
        if self.unicode_form not in ("NFC", "NFD", "NFKC", "NFKD"):
            raise ValueError(
                f"unicode_form must be one of NFC, NFD, NFKC, NFKD, got {self.unicode_form}"
            )





[docs]
@dataclass
class NormalizationStats:
    """
    Counters collected while syllables pass through normalization.

    Every field defaults to zero, so an instance can be created empty and
    filled in as the pipeline runs. Two derived values, ``total_rejected``
    and ``rejection_rate``, are computed on demand from the stored counters.

    Attributes:
        raw_count: Total number of syllables in raw input (before normalization)
        after_canonicalization: Number of syllables after normalization
        rejected_charset: Syllables rejected due to invalid characters
        rejected_length: Syllables rejected due to length constraints
        rejected_empty: Syllables that became empty after normalization
        unique_canonical: Number of unique canonical syllables
        processing_time: Total processing time in seconds

    Example:
        >>> stats = NormalizationStats(
        ...     raw_count=1000,
        ...     after_canonicalization=950,
        ...     rejected_charset=30,
        ...     rejected_length=20,
        ...     rejected_empty=0,
        ...     unique_canonical=412,
        ...     processing_time=1.5
        ... )
        >>> stats.rejection_rate
        5.0
    """

    raw_count: int = 0
    after_canonicalization: int = 0
    rejected_charset: int = 0
    rejected_length: int = 0
    rejected_empty: int = 0
    unique_canonical: int = 0
    processing_time: float = 0.0

    @property
    def total_rejected(self) -> int:
        """Sum of the charset, length and empty rejection counters."""
        by_charset_and_length = self.rejected_charset + self.rejected_length
        return by_charset_and_length + self.rejected_empty

    @property
    def rejection_rate(self) -> float:
        """Rejected syllables as a percentage of ``raw_count`` (0.0 when there is no input)."""
        seen = self.raw_count
        # Guard against dividing by zero when nothing was processed.
        if seen == 0:
            return 0.0
        fraction = self.total_rejected / seen
        return fraction * 100




[docs]
@dataclass
class FrequencyEntry:
    """
    One row of the syllable frequency table.

    Bundles a canonical syllable with how often it occurred, where it sits
    in the frequency ordering, and its share of all syllables. All four
    fields are required; the class performs no validation or computation.

    Attributes:
        canonical: The canonical form of the syllable (e.g., "ka")
        frequency: Number of times this syllable appears
        rank: Frequency rank (1 = most common, 2 = second most common, etc.)
        percentage: Percentage of total syllables (0-100)

    Example:
        >>> entry = FrequencyEntry(canonical="ka", frequency=187, rank=1, percentage=10.2)
        >>> print(f"{entry.canonical}: {entry.frequency} ({entry.percentage:.1f}%)")
        ka: 187 (10.2%)
    """

    canonical: str
    frequency: int
    rank: int
    percentage: float




[docs]
@dataclass
class NormalizationResult:
    """
    Complete result from the syllable normalization pipeline.

    This dataclass encapsulates all outputs from the normalization process,
    including configuration, statistics, frequencies, and file paths.

    Attributes:
        config: Configuration used for normalization
        stats: Statistics from the processing
        frequencies: Dictionary mapping canonical syllable to frequency count
        unique_syllables: Sorted list of unique canonical syllables
        input_files: List of input file paths that were processed
        output_dir: Directory where output files were saved
        timestamp: When the normalization was performed. Defaults to the
            time the instance is created.
        raw_file: Path to raw aggregated file (syllables_raw.txt)
        canonical_file: Path to canonicalized file (syllables_canonicalised.txt)
        frequency_file: Path to frequency JSON (syllables_frequencies.json)
        unique_file: Path to unique syllables (syllables_unique.txt)
        meta_file: Path to metadata report (normalization_meta.txt)

    Example:
        >>> result = NormalizationResult(
        ...     config=NormalizationConfig(),
        ...     stats=NormalizationStats(raw_count=1000),
        ...     frequencies={"ka": 187, "ra": 162},
        ...     unique_syllables=["ka", "ra"],
        ...     input_files=[Path("file1.txt")],
        ...     output_dir=Path("_working/normalized"),
        ...     timestamp=datetime.now(),
        ...     raw_file=Path("syllables_raw.txt"),
        ...     canonical_file=Path("syllables_canonicalised.txt"),
        ...     frequency_file=Path("syllables_frequencies.json"),
        ...     unique_file=Path("syllables_unique.txt"),
        ...     meta_file=Path("normalization_meta.txt")
        ... )
        >>> result.stats.raw_count
        1000
    """

    config: NormalizationConfig
    stats: NormalizationStats
    frequencies: Dict[str, int]
    unique_syllables: List[str]
    input_files: List[Path]
    output_dir: Path
    timestamp: datetime = field(default_factory=datetime.now)
    raw_file: Path = field(default=Path("syllables_raw.txt"))
    canonical_file: Path = field(default=Path("syllables_canonicalised.txt"))
    frequency_file: Path = field(default=Path("syllables_frequencies.json"))
    unique_file: Path = field(default=Path("syllables_unique.txt"))
    meta_file: Path = field(default=Path("normalization_meta.txt"))

    def format_metadata(self) -> str:
        """
        Format normalization metadata as a human-readable string.

        Creates a detailed report including statistics, rejection breakdown,
        configuration, top frequencies, and output file names. The rejection
        breakdown is only included when at least one syllable was rejected,
        and the frequency table only when ``frequencies`` is non-empty. At
        most the 20 most frequent syllables are listed; syllables with equal
        counts keep the order they have in ``frequencies``.

        Returns:
            Multi-line string containing all normalization metadata formatted
            for display or file output.

        Example:
            >>> result = NormalizationResult(...)
            >>> print(result.format_metadata())
            ======================================================================
            SYLLABLE NORMALIZATION METADATA
            ======================================================================
            Timestamp:           2026-01-05 17:30:22
            ...
        """
        rule = "=" * 70
        stats = self.stats
        config = self.config

        # Header
        lines = [
            rule,
            "SYLLABLE NORMALIZATION METADATA",
            rule,
            f"Timestamp:           {self.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
            f"Input Files:         {len(self.input_files)} files processed",
            f"Output Directory:    {self.output_dir}",
            rule,
        ]

        # Processing statistics. Section titles carry a leading newline so the
        # final "\n".join leaves a blank line before each section.
        lines.extend(
            [
                "\nProcessing Statistics:",
                f"  Raw Syllables:           {stats.raw_count:,}",
                f"  After Canonicalization:  {stats.after_canonicalization:,}",
                f"  Total Rejected:          {stats.total_rejected:,}",
                f"  Unique Canonical:        {stats.unique_canonical:,}",
                f"  Processing Time:         {stats.processing_time:.2f}s",
            ]
        )

        # Rejection breakdown: only reasons that actually occurred are listed.
        if stats.total_rejected > 0:
            lines.append("\nRejection Breakdown:")
            if stats.rejected_charset > 0:
                lines.append(f"  Invalid charset:    {stats.rejected_charset:,} syllables")
            if stats.rejected_length > 0:
                lines.append(f"  Length constraint:  {stats.rejected_length:,} syllables")
            if stats.rejected_empty > 0:
                lines.append(f"  Empty after norm:   {stats.rejected_empty:,} syllables")
            lines.append(f"  Rejection Rate:     {stats.rejection_rate:.1f}%")

        # Configuration
        lines.extend(
            [
                "\nNormalization Configuration:",
                f"  Min Length:         {config.min_length}",
                f"  Max Length:         {config.max_length}",
                f"  Unicode Form:       {config.unicode_form}",
                f"  Allowed Charset:    {config.allowed_charset}",
            ]
        )

        # Top frequencies
        if self.frequencies:
            # Stable sort with reverse=True keeps insertion order among ties.
            sorted_freqs = sorted(self.frequencies.items(), key=lambda item: item[1], reverse=True)
            top_n = min(20, len(sorted_freqs))
            total_count = sum(self.frequencies.values())

            lines.append(f"\nTop {top_n} Most Frequent Syllables:")
            for i, (syllable, count) in enumerate(sorted_freqs[:top_n], 1):
                # Avoid dividing by zero if every recorded count is zero.
                percentage = (count / total_count * 100) if total_count > 0 else 0
                lines.append(
                    f"  {i:2d}. {syllable:10s} ({count:5,} occurrences, {percentage:5.1f}%)"
                )

            if len(sorted_freqs) > top_n:
                lines.append(f"  ... and {len(sorted_freqs) - top_n} more")

        # Output files: only the file names are shown, not full paths.
        lines.extend(
            [
                "\nOutput Files:",
                f"  Raw:              {self.raw_file.name}",
                f"  Canonicalized:    {self.canonical_file.name}",
                f"  Frequencies:      {self.frequency_file.name}",
                f"  Unique:           {self.unique_file.name}",
                f"  Metadata:         {self.meta_file.name}",
            ]
        )

        lines.append("\n" + rule)
        return "\n".join(lines)