Source code for build_tools.pyphen_syllable_normaliser.normalizer

"""
Core normalization logic for syllable canonicalization.

This module provides the SyllableNormalizer class which handles the transformation
of raw syllables into canonical form through Unicode normalization, diacritic
stripping, lowercase conversion, and validation.
"""

import unicodedata
from typing import Literal, Optional, cast

from .models import NormalizationConfig



[docs]
class SyllableNormalizer:
    """
    Normalizes syllables to canonical form.

    This class applies a multi-step normalization pipeline to transform raw
    syllables into a standardized canonical representation. The pipeline
    includes Unicode normalization, diacritic removal, case normalization,
    and validation against charset and length constraints.

    Attributes:
        config: Configuration specifying normalization parameters such as
            allowed charset, length constraints, and Unicode normalization form.

    Example:
        >>> from build_tools.pyphen_syllable_normaliser import NormalizationConfig
        >>> config = NormalizationConfig(min_length=2, max_length=8)
        >>> normalizer = SyllableNormalizer(config)
        >>> normalizer.normalize("Café")
        'cafe'
        >>> normalizer.normalize("x") is None  # Too short
        True
        >>> normalizer.normalize("résumé123") is None  # Invalid characters
        True
    """

    def __init__(self, config: NormalizationConfig):
        """
        Initialize normalizer with configuration.

        Args:
            config: NormalizationConfig instance specifying normalization
                parameters including charset, length constraints, and
                Unicode normalization form.

        Example:
            >>> config = NormalizationConfig(
            ...     min_length=3,
            ...     max_length=10,
            ...     allowed_charset="abcdefghijklmnopqrstuvwxyz",
            ...     unicode_form="NFKD"
            ... )
            >>> normalizer = SyllableNormalizer(config)
        """
        self.config = config

    def normalize(self, syllable: str) -> Optional[str]:
        """
        Normalize a single syllable to canonical form.

        Applies the complete normalization pipeline:
        1. Unicode normalization to ``config.unicode_form``
        2. Strip diacritics (works for composed and decomposed forms alike)
        3. Lowercase conversion
        4. Trim whitespace
        5. Validate charset (only allowed characters)
        6. Check length constraints

        Args:
            syllable: Raw syllable string to normalize.

        Returns:
            Normalized canonical syllable string, or None if the syllable
            is rejected due to:
            - Becoming empty after normalization
            - Containing invalid characters
            - Not meeting length constraints

        Example:
            >>> config = NormalizationConfig(min_length=2, max_length=8)
            >>> normalizer = SyllableNormalizer(config)
            >>> normalizer.normalize("Café")
            'cafe'
            >>> normalizer.normalize("  HELLO  ")
            'hello'
            >>> normalizer.normalize("résumé")
            'resume'
            >>> normalizer.normalize("") is None  # Empty
            True
            >>> normalizer.normalize("x") is None  # Too short
            True
            >>> normalizer.normalize("hello123") is None  # Invalid chars
            True
        """
        # Step 1: Unicode normalization. The cast only informs the type
        # checker; an unsupported form makes unicodedata.normalize raise.
        form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], self.config.unicode_form)
        normalized = unicodedata.normalize(form, syllable)

        # Step 2: Strip diacritics. strip_diacritics decomposes accented
        # characters itself, so this also works when `form` is NFC/NFKC.
        normalized = self.strip_diacritics(normalized)

        # Steps 3-4: Lowercase conversion, then trim surrounding whitespace
        normalized = normalized.lower().strip()

        # Reject syllables that became empty after normalization
        if not normalized:
            return None

        # Step 5: Validate charset (only allowed characters)
        if not self._is_valid_charset(normalized):
            return None

        # Step 6: Check length constraints
        if not self._is_valid_length(normalized):
            return None

        return normalized

    def strip_diacritics(self, text: str) -> str:
        """
        Remove diacritics (accent marks) from Unicode text.

        Each character is canonically decomposed (NFD) into its base
        character and combining marks, and the nonspacing marks (Unicode
        category ``Mn``) are dropped. This converts accented characters
        like 'é' → 'e', 'ñ' → 'n', etc., whether the input is precomposed
        (NFC/NFKC) or already decomposed (NFD/NFKD).

        Characters that carry no nonspacing marks are returned untouched,
        so the rest of the text keeps the form it arrived in.

        Args:
            text: Unicode string potentially containing diacritics.

        Returns:
            String with all nonspacing combining marks removed.

        Example:
            >>> normalizer = SyllableNormalizer(NormalizationConfig())
            >>> normalizer.strip_diacritics("café")
            'cafe'
            >>> normalizer.strip_diacritics("naïve")
            'naive'
            >>> normalizer.strip_diacritics("Zürich")
            'Zurich'
            >>> normalizer.strip_diacritics("São Paulo")
            'Sao Paulo'
        """
        stripped: list[str] = []
        for char in text:
            # Decompose one character at a time so a precomposed letter
            # such as 'é' exposes its combining mark.
            decomposed = unicodedata.normalize("NFD", char)
            base = "".join(
                part for part in decomposed if unicodedata.category(part) != "Mn"
            )
            # Substitute only when a mark was actually removed; otherwise keep
            # the original character so it is not left needlessly decomposed.
            stripped.append(char if base == decomposed else base)
        return "".join(stripped)

    def _is_valid_charset(self, syllable: str) -> bool:
        """
        Check if syllable contains only allowed characters.

        Args:
            syllable: Syllable string to validate.

        Returns:
            True if all characters are in allowed_charset, False otherwise.

        Example:
            >>> config = NormalizationConfig(allowed_charset="abcdefghijklmnopqrstuvwxyz")
            >>> normalizer = SyllableNormalizer(config)
            >>> normalizer._is_valid_charset("hello")
            True
            >>> normalizer._is_valid_charset("hello123")
            False
            >>> normalizer._is_valid_charset("hello-world")
            False
        """
        return all(char in self.config.allowed_charset for char in syllable)

    def _is_valid_length(self, syllable: str) -> bool:
        """
        Check if syllable meets length constraints.

        Args:
            syllable: Syllable string to validate.

        Returns:
            True if syllable length is between min_length and max_length
            (inclusive), False otherwise.

        Example:
            >>> config = NormalizationConfig(min_length=2, max_length=8)
            >>> normalizer = SyllableNormalizer(config)
            >>> normalizer._is_valid_length("ab")
            True
            >>> normalizer._is_valid_length("hello")
            True
            >>> normalizer._is_valid_length("x")
            False
            >>> normalizer._is_valid_length("verylongword")
            False
        """
        length = len(syllable)
        return self.config.min_length <= length <= self.config.max_length




[docs]
def normalize_batch(
    syllables: list[str], config: NormalizationConfig
) -> tuple[list[str], dict[str, int]]:
    """
    Normalize many syllables and tally why the rejected ones were dropped.

    Each syllable goes through the same steps as
    ``SyllableNormalizer.normalize`` (Unicode normalization, diacritic
    stripping, lowercasing, trimming, charset check, length check), but the
    first failing check is recorded instead of being collapsed into ``None``.

    Args:
        syllables: List of raw syllable strings to normalize.
        config: NormalizationConfig specifying normalization parameters.

    Returns:
        Tuple of (normalized_syllables, rejection_stats) where:
        - normalized_syllables: Accepted syllables, in input order
        - rejection_stats: Dictionary with rejection counts:
            - "rejected_empty": Syllables that became empty after normalization
            - "rejected_charset": Syllables with invalid characters
            - "rejected_length": Syllables outside length constraints

    Example:
        >>> config = NormalizationConfig(min_length=2, max_length=8)
        >>> syllables = ["Café", "x", "Hello", "world123", "  résumé  "]
        >>> normalized, stats = normalize_batch(syllables, config)
        >>> normalized
        ['cafe', 'hello', 'resume']
        >>> stats
        {'rejected_empty': 0, 'rejected_charset': 1, 'rejected_length': 1}

    Note:
        Input order is preserved and duplicates are kept; no deduplication
        or counting of accepted syllables happens here.
    """
    normalizer = SyllableNormalizer(config)
    accepted: list[str] = []
    rejection_stats = dict.fromkeys(
        ("rejected_empty", "rejected_charset", "rejected_length"), 0
    )

    for raw in syllables:
        # Steps 1-4: Unicode normalization, diacritic stripping, lowercase, trim
        form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], config.unicode_form)
        candidate = normalizer.strip_diacritics(unicodedata.normalize(form, raw))
        candidate = candidate.lower().strip()

        # Checks run in pipeline order; only the first failure is counted.
        if not candidate:
            reason = "rejected_empty"
        elif not normalizer._is_valid_charset(candidate):
            reason = "rejected_charset"
        elif not normalizer._is_valid_length(candidate):
            reason = "rejected_length"
        else:
            # All checks passed
            accepted.append(candidate)
            continue

        rejection_stats[reason] += 1

    return accepted, rejection_stats