Source code for build_tools.pyphen_syllable_normaliser.normalizer

"""
Core normalization logic for syllable canonicalization.

This module provides the SyllableNormalizer class which handles the transformation
of raw syllables into canonical form through Unicode normalization, diacritic
stripping, lowercase conversion, and validation.
"""

import unicodedata
from typing import Literal, Optional, cast

from .models import NormalizationConfig


class SyllableNormalizer:
    """
    Normalize raw syllables into a canonical form.

    The pipeline applied by :meth:`normalize` is: Unicode normalization,
    diacritic removal, lowercasing, whitespace trimming, charset validation
    and length validation.

    Attributes:
        config: Object providing ``unicode_form``, ``allowed_charset``,
            ``min_length`` and ``max_length``; these are the only
            attributes this class reads.

    Example:
        >>> from build_tools.pyphen_syllable_normaliser import NormalizationConfig
        >>> config = NormalizationConfig(
        ...     min_length=2,
        ...     max_length=8,
        ...     allowed_charset="abcdefghijklmnopqrstuvwxyz",
        ...     unicode_form="NFKD",
        ... )
        >>> normalizer = SyllableNormalizer(config)
        >>> normalizer.normalize("Café")
        'cafe'
        >>> normalizer.normalize("x") is None  # Too short
        True
        >>> normalizer.normalize("résumé123") is None  # Invalid characters
        True
    """

    def __init__(self, config: "NormalizationConfig"):
        """
        Store the configuration used by every normalization call.

        Args:
            config: NormalizationConfig instance specifying the charset,
                length constraints and Unicode normalization form.
        """
        self.config = config

    def normalize(self, syllable: str) -> Optional[str]:
        """
        Normalize a single syllable to canonical form.

        Steps, in order:

        1. Unicode normalization using ``config.unicode_form``
        2. Strip diacritics (see :meth:`strip_diacritics`)
        3. Lowercase conversion
        4. Trim surrounding whitespace
        5. Validate charset
        6. Validate length

        Args:
            syllable: Raw syllable string to normalize.

        Returns:
            The canonical syllable, or None when the syllable is empty after
            steps 1-4, contains a character outside
            ``config.allowed_charset``, or falls outside the configured
            length bounds.

        Example:
            >>> config = NormalizationConfig(
            ...     min_length=2,
            ...     max_length=8,
            ...     allowed_charset="abcdefghijklmnopqrstuvwxyz",
            ...     unicode_form="NFKD",
            ... )
            >>> normalizer = SyllableNormalizer(config)
            >>> normalizer.normalize(" HELLO ")
            'hello'
            >>> normalizer.normalize("résumé")
            'resume'
            >>> normalizer.normalize("") is None
            True
            >>> normalizer.normalize("hello123") is None
            True
        """
        # cast() only informs the type checker; it does nothing at runtime.
        form = cast(Literal["NFC", "NFD", "NFKC", "NFKD"], self.config.unicode_form)
        normalized = unicodedata.normalize(form, syllable)

        # strip_diacritics decomposes internally, so accents are removed
        # even when the configured form is a composed one (NFC/NFKC).
        normalized = self.strip_diacritics(normalized)
        normalized = normalized.lower().strip()

        if not normalized:
            return None
        if not self._is_valid_charset(normalized):
            return None
        if not self._is_valid_length(normalized):
            return None
        return normalized

    def strip_diacritics(self, text: str) -> str:
        """
        Remove diacritics (accent marks) from Unicode text.

        The text is first canonically decomposed (NFD) so that precomposed
        characters such as 'é' are split into a base character plus
        combining marks; nonspacing marks (category ``Mn``) are then
        dropped. Because the decomposition happens here, the method works
        on input in any normalization form. Characters that have no
        canonical decomposition are returned unchanged.

        Args:
            text: Unicode string potentially containing diacritics.

        Returns:
            The NFD-decomposed text with all nonspacing marks removed.

        Example:
            >>> normalizer = SyllableNormalizer(None)
            >>> normalizer.strip_diacritics("café")
            'cafe'
            >>> normalizer.strip_diacritics("Zürich")
            'Zurich'
            >>> normalizer.strip_diacritics("São Paulo")
            'Sao Paulo'
        """
        # NFD is idempotent on already-decomposed (NFD/NFKD) input, so this
        # is a no-op for callers that pre-normalized the text.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            char for char in decomposed if unicodedata.category(char) != "Mn"
        )

    def _is_valid_charset(self, syllable: str) -> bool:
        """
        Check whether every character is in ``config.allowed_charset``.

        Args:
            syllable: Syllable string to validate.

        Returns:
            True if all characters are allowed (vacuously True for an empty
            string), False otherwise.
        """
        return all(char in self.config.allowed_charset for char in syllable)

    def _is_valid_length(self, syllable: str) -> bool:
        """
        Check whether the syllable length is within the configured bounds.

        Args:
            syllable: Syllable string to validate.

        Returns:
            True if ``config.min_length <= len(syllable) <= config.max_length``
            (both bounds inclusive), False otherwise.
        """
        length = len(syllable)
        return self.config.min_length <= length <= self.config.max_length
def normalize_batch(
    syllables: list[str], config: NormalizationConfig
) -> tuple[list[str], dict[str, int]]:
    """
    Normalize many syllables and count why the rejected ones failed.

    Each input goes through the same steps as
    ``SyllableNormalizer.normalize`` (Unicode normalization, diacritic
    stripping, lowercasing, trimming, charset check, length check), but the
    checks are run one at a time here so that the first failing check can
    be recorded.

    Args:
        syllables: Raw syllable strings, processed in order.
        config: NormalizationConfig specifying normalization parameters.

    Returns:
        Tuple ``(normalized_syllables, rejection_stats)``:

        - normalized_syllables: accepted syllables in input order, with
          duplicates preserved.
        - rejection_stats: dict with the keys ``"rejected_empty"``,
          ``"rejected_charset"`` and ``"rejected_length"``. Each rejected
          input is counted once, under the first check it failed.

    Example:
        >>> config = NormalizationConfig(
        ...     min_length=2,
        ...     max_length=8,
        ...     allowed_charset="abcdefghijklmnopqrstuvwxyz",
        ...     unicode_form="NFKD",
        ... )
        >>> normalized, stats = normalize_batch(
        ...     ["Café", "x", "Hello", "world123", " résumé "], config
        ... )
        >>> normalized
        ['cafe', 'hello', 'resume']
        >>> stats
        {'rejected_empty': 0, 'rejected_charset': 1, 'rejected_length': 1}
    """
    normalizer = SyllableNormalizer(config)
    accepted: list[str] = []
    empty_count = 0
    charset_count = 0
    length_count = 0

    for raw in syllables:
        # Steps 1-4: Unicode form, diacritics, lowercase, trim.
        decomposed = unicodedata.normalize(
            cast(Literal["NFC", "NFD", "NFKC", "NFKD"], config.unicode_form), raw
        )
        candidate = normalizer.strip_diacritics(decomposed).lower().strip()

        # Only the first failing check is counted for a given syllable.
        if not candidate:
            empty_count += 1
        elif not normalizer._is_valid_charset(candidate):
            charset_count += 1
        elif not normalizer._is_valid_length(candidate):
            length_count += 1
        else:
            accepted.append(candidate)

    return accepted, {
        "rejected_empty": empty_count,
        "rejected_charset": charset_count,
        "rejected_length": length_count,
    }