# Source code for build_tools.nltk_syllable_extractor.extractor

"""
Core CMUDict-based syllable extraction functionality.

This module provides the NltkSyllableExtractor class for extracting syllables
from text using CMU Pronouncing Dictionary with phonetically-guided
orthographic syllabification based on onset/coda principles.
"""

from __future__ import annotations

import re
from pathlib import Path

# Optional dependency - only needed at runtime, not for documentation builds.
# When the import fails, ``cmudict`` is bound to None and CMUDICT_AVAILABLE is
# set to False; NltkSyllableExtractor.__init__ checks that flag and raises an
# ImportError with installation instructions.
try:
    import cmudict

    CMUDICT_AVAILABLE = True
except ImportError:
    cmudict = None  # type: ignore[assignment]
    CMUDICT_AVAILABLE = False



# [docs]
class NltkSyllableExtractor:
    """
    Extracts syllables from text using CMU Pronouncing Dictionary.

    This class uses phonetic information from CMUDict to decide whether a word
    has more than one syllable, and orthographic vowel groups plus English
    onset/coda principles to decide where to split its spelling.

    The extractor works by:
    1. Reading text input (string or file)
    2. Tokenizing into words using regex
    3. Looking up phonetic transcriptions in CMUDict
    4. Using vowel phonemes to count phonetic syllables
    5. Locating orthographic vowel groups in the spelling
    6. Applying onset/coda rules to split consonant clusters between groups
    7. Filtering syllables by length constraints
    8. Returning all syllables in order (lowercase, duplicates preserved)

    Key Differences from pyphen:
        - Uses phonetic information (CMUDict) rather than typographic rules
        - Respects consonant cluster constraints (onset/coda principles)
        - Produces more "natural" phonetic splits
        - English only (CMUDict limitation)
        - Includes fallback for out-of-vocabulary words

    Typical Usage:
        >>> # Basic extraction
        >>> extractor = NltkSyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
        >>> syllables, stats = extractor.extract_syllables_from_text("Hello wonderful world")
        >>> print(sorted(syllables))
        ['der', 'ful', 'hel', 'lo', 'won', 'world']

        >>> # Extract from file and save
        >>> syllables, stats = extractor.extract_syllables_from_file(Path('input.txt'))
        >>> extractor.save_syllables(syllables, Path('output.txt'))

    Attributes:
        language_code: The language code (always 'en_US' for this extractor)
        min_syllable_length: Minimum syllable length to include in results
        max_syllable_length: Maximum syllable length to include in results
        cmu_dict: The loaded CMU Pronouncing Dictionary

    Note:
        This is a build-time tool. The cmudict dependency should not be used
        at runtime in the core name generation system.

        Split positions are derived from the spelling, so a multi-syllable
        word whose spelling has more vowel groups than its pronunciation has
        vowel phonemes (e.g. a silent 'e') can yield more orthographic
        syllables than phonetic ones.
    """

    # Valid English onset clusters (for onset/coda split decisions)
    VALID_ONSETS = {
        "bl",
        "br",
        "cl",
        "cr",
        "dr",
        "fl",
        "fr",
        "gl",
        "gr",
        "pl",
        "pr",
        "sl",
        "sm",
        "sn",
        "sp",
        "st",
        "sw",
        "tr",
        "tw",
        "thr",
        "scr",
        "shr",
        "spl",
        "spr",
        "str",
        "squ",
        "ch",
        "sh",
        "th",
        "wh",
        "ph",
        "gh",
    }

    VOWELS = "aeiouy"

    # Compiled once: words are maximal runs of ASCII letters.
    _WORD_PATTERN = re.compile(r"\b[a-zA-Z]+\b")
    _NON_LOWER_ALPHA_PATTERN = re.compile(r"[^a-z]")

    def __init__(
        self, language_code: str, min_syllable_length: int = 1, max_syllable_length: int = 999
    ):
        """
        Initialize the syllable extractor and load CMUDict.

        Args:
            language_code: Language code (must be 'en_US' for this extractor)
            min_syllable_length: Minimum syllable length to include (default: 1, no filtering)
            max_syllable_length: Maximum syllable length to include (default: 999, no filtering)

        Raises:
            ImportError: If cmudict is not installed
            ValueError: If the language code is not 'en_US'
        """
        if not CMUDICT_AVAILABLE:
            raise ImportError(
                "cmudict is not installed. This is a build-time dependency.\n"
                "Install it with: pip install cmudict"
            )

        if language_code != "en_US":
            raise ValueError(
                f"Language code '{language_code}' is not supported by nltk_syllable_extractor.\n"
                "Only 'en_US' is supported (CMUDict limitation)."
            )

        self.language_code = language_code
        self.min_syllable_length = min_syllable_length
        self.max_syllable_length = max_syllable_length
        self.cmu_dict = cmudict.dict()

    def extract_syllables_from_text(
        self, text: str, only_hyphenated: bool = True
    ) -> tuple[list[str], dict[str, int]]:
        """
        Extract all syllables from a block of text (preserves duplicates).

        This method processes input text by tokenizing it into words, applying
        CMUDict phonetic lookup and onset/coda principles to extract individual
        syllables that meet the configured length constraints.

        Args:
            text: Input text to process. Can contain any characters, but only
                  alphabetic sequences will be processed as words.
            only_hyphenated: If True, only include syllables from words found
                           in CMUDict. Set to False to include fallback
                           syllabification for unknown words.

        Returns:
            Tuple of (syllables, statistics) where:
                - syllables: List of all lowercase syllable strings (includes duplicates)
                - statistics: Dict with the following keys:
                    - 'total_words': Total number of words found in source text
                    - 'processed_words': Words whose syllables were considered
                    - 'fallback_count': Words not in CMUDict (counted even
                      when they are skipped because only_hyphenated is True)
                    - 'rejected_syllables': Syllables rejected due to length constraints

        Note:
            - Only processes words containing alphabetic characters (a-z, A-Z)
            - Case-insensitive processing (all output is lowercase)
            - Automatically removes punctuation and special characters
            - Filters syllables by configured min/max length constraints
            - When only_hyphenated=True, excludes words not in CMUDict
            - Deterministic: same input always produces same output
            - Uses first pronunciation when multiple exist (deterministic)
            - Words are extracted using regex pattern: \\b[a-zA-Z]+\\b

        Example:
            >>> extractor = NltkSyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
            >>> syllables, stats = extractor.extract_syllables_from_text("Hello world!")
            >>> print(syllables)
            ['hel', 'lo', 'world']
            >>> print(stats['total_words'])
            2
        """
        # Extract words using regex (alphabetic sequences only)
        words = self._WORD_PATTERN.findall(text)

        syllables: list[str] = []
        stats = {
            "total_words": len(words),
            "fallback_count": 0,
            "rejected_syllables": 0,
            "processed_words": 0,
        }

        for word in words:
            # CMUDict keys are lowercase
            word_lower = word.lower()

            if word_lower not in self.cmu_dict:
                stats["fallback_count"] += 1
                # The fallback splitter always returns at least [word], so the
                # exclusion has to be decided here rather than by testing for
                # an empty syllable list afterwards.
                if only_hyphenated:
                    continue

            word_syllables = self._extract_orthographic_syllables(word_lower)
            if not word_syllables:
                continue

            stats["processed_words"] += 1

            # Filter syllables by length and add to list (preserves duplicates)
            for syllable in word_syllables:
                if self.min_syllable_length <= len(syllable) <= self.max_syllable_length:
                    syllables.append(syllable)
                else:
                    stats["rejected_syllables"] += 1

        return syllables, stats

    def _extract_orthographic_syllables(self, word: str) -> list[str]:
        """
        Extract orthographic syllables from a word using CMUDict.

        The pronunciation decides whether the word is split at all (a word
        with at most one phonetic syllable is returned whole); the split
        positions themselves come from the spelling.

        Args:
            word: Word to syllabify (lowercased internally)

        Returns:
            List of orthographic syllables, empty list if the word contains
            no letters a-z
        """
        # Normalize word: remove non-alphabetic characters
        word_clean = self._NON_LOWER_ALPHA_PATTERN.sub("", word.lower())

        if not word_clean:
            return []

        if len(word_clean) == 1:
            return [word_clean]

        # .get() avoids inserting a key if cmu_dict is a defaultdict, and the
        # falsy check also covers an entry with no pronunciations.
        pronunciations = self.cmu_dict.get(word_clean)
        if not pronunciations:
            # Fallback: simple vowel-based splitting
            return self._fallback_split(word_clean)

        # Use first pronunciation (deterministic)
        phonetic_syllables = self._extract_phonetic_syllables(pronunciations[0])

        if len(phonetic_syllables) <= 1:
            return [word_clean]

        # Map phonetic syllables to orthographic positions
        return self._map_to_orthographic(word_clean, phonetic_syllables)

    def _extract_phonetic_syllables(self, phonemes: list[str]) -> list[list[str]]:
        """
        Group a phoneme list into phonetic syllables.

        Each syllable ends with a vowel phoneme (marked with a stress digit);
        consonants after the final vowel are attached to the last syllable so
        the number of syllables equals the number of vowel phonemes.

        Args:
            phonemes: List of phoneme strings from CMUDict

        Returns:
            List of syllables, where each syllable is a list of phonemes. A
            pronunciation without any vowel phoneme yields a single group.
        """
        syllables: list[list[str]] = []
        current_syllable: list[str] = []

        for phoneme in phonemes:
            current_syllable.append(phoneme)
            # Vowel phonemes end with stress digits (0, 1, or 2)
            if phoneme[-1:].isdigit():
                syllables.append(current_syllable)
                current_syllable = []

        # Trailing consonants are the coda of the last syllable, not a
        # syllable of their own.
        if current_syllable:
            if syllables:
                syllables[-1].extend(current_syllable)
            else:
                syllables.append(current_syllable)

        return syllables

    def _map_to_orthographic(self, word: str, phonetic_syllables: list[list[str]]) -> list[str]:
        """
        Map phonetic syllables to orthographic character positions.

        Strategy:
        1. Compare the number of vowel letters with the number of vowel
           phonemes; if the spelling has fewer, use the fallback splitter
        2. Otherwise split between consecutive orthographic vowel groups,
           using onset/coda principles for consonant clusters

        Args:
            word: The word to split
            phonetic_syllables: List of phonetic syllables from CMUDict

        Returns:
            List of orthographic syllables
        """
        vowel_letter_count = sum(1 for char in word if char in self.VOWELS)
        vowel_phoneme_count = sum(
            1 for syllable in phonetic_syllables for phoneme in syllable if phoneme[-1:].isdigit()
        )

        # Spelling cannot account for every vowel phoneme: fall back
        if vowel_letter_count < vowel_phoneme_count:
            return self._fallback_split(word)

        return self._split_by_vowel_groups(word)

    def _fallback_split(self, word: str) -> list[str]:
        """
        Fallback syllable splitting using vowel groups and onset/coda rules.

        This is used when CMUDict lookup fails.

        Args:
            word: The word to split

        Returns:
            List of syllables using heuristic rules
        """
        return self._split_by_vowel_groups(word)

    def _find_vowel_groups(self, word: str) -> list[tuple[int, int]]:
        """Return (start, end) spans, end exclusive, of consecutive vowel letters."""
        groups: list[tuple[int, int]] = []
        i = 0
        while i < len(word):
            if word[i] in self.VOWELS:
                start = i
                while i < len(word) and word[i] in self.VOWELS:
                    i += 1
                groups.append((start, i))
            else:
                i += 1
        return groups

    def _coda_length(self, consonants_between: str) -> int:
        """Return how many leading consonants of a cluster stay with the previous syllable."""
        num_consonants = len(consonants_between)

        # No consonant, or a single one: everything goes to the next syllable
        # (onset principle).
        if num_consonants <= 1:
            return 0

        # Prefer the longest valid onset (3, 2, then 1 letters) at the end of
        # the cluster; whatever precedes it is the coda.
        for cluster_len in range(min(3, num_consonants), 0, -1):
            if self._is_valid_onset(consonants_between[-cluster_len:]):
                return num_consonants - cluster_len

        # Default: split after the first consonant
        return 1

    def _split_by_vowel_groups(self, word: str) -> list[str]:
        """Split a word between consecutive vowel groups using onset/coda rules."""
        vowel_groups = self._find_vowel_groups(word)

        split_points = []
        for (_, current_end), (next_start, _) in zip(vowel_groups, vowel_groups[1:]):
            consonants_between = word[current_end:next_start]
            split_points.append(current_end + self._coda_length(consonants_between))

        return self._build_syllables(word, split_points)

    def _is_valid_onset(self, consonant_cluster: str) -> bool:
        """
        Check if a consonant cluster is a valid English onset.

        Args:
            consonant_cluster: String of consonants to check

        Returns:
            True if the lowercased cluster is in VALID_ONSETS, False otherwise
        """
        return consonant_cluster.lower() in self.VALID_ONSETS

    def _build_syllables(self, word: str, split_points: list[int]) -> list[str]:
        """
        Build syllables from split points.

        Args:
            word: The word to split
            split_points: List of character positions where splits occur.
                Duplicates and positions outside (0, len(word)) are ignored.

        Returns:
            List of non-empty syllable strings; [word] when no usable split
            point remains
        """
        cuts = sorted({point for point in split_points if 0 < point < len(word)})
        if not cuts:
            return [word]

        bounds = [0, *cuts, len(word)]
        return [word[start:end] for start, end in zip(bounds, bounds[1:])]

    def extract_syllables_from_file(
        self, input_path: Path, only_hyphenated: bool = True
    ) -> tuple[list[str], dict[str, int]]:
        """
        Extract all syllables from a text file (preserves duplicates).

        This is a convenience wrapper around extract_syllables_from_text() that
        reads the file as UTF-8 and converts read failures into IOError.

        Args:
            input_path: Path to the input text file. File should be UTF-8 encoded
                       plain text. Binary files or non-text formats will cause errors.
            only_hyphenated: Forwarded to extract_syllables_from_text(); if True,
                           words not in CMUDict are excluded.

        Returns:
            Tuple of (syllables, statistics) where:
                - syllables: List of all lowercase syllable strings (includes duplicates)
                - statistics: Dict with processing statistics (see extract_syllables_from_text)

        Raises:
            FileNotFoundError: If the input file doesn't exist at the specified path
            IOError: If there's an error reading the file (permissions, encoding, etc.)

        Example:
            >>> from pathlib import Path
            >>> extractor = NltkSyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
            >>> syllables, stats = extractor.extract_syllables_from_file(Path('book.txt'))
            >>> print(f"Extracted {len(syllables)} syllables from {stats['total_words']} words")
            Extracted 1250 syllables from 50000 words
        """
        input_path = Path(input_path)

        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        try:
            text = input_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            # Chain the cause so the original traceback is preserved
            raise IOError(f"Error reading file {input_path}: {e}") from e

        return self.extract_syllables_from_text(text, only_hyphenated=only_hyphenated)

    def save_syllables(self, syllables: list[str], output_path: Path) -> None:
        """
        Save syllables to a text file (one syllable per line, preserves all).

        Writes syllables with UTF-8 encoding, one syllable per line. Syllables
        are written in the order they appear in the list (preserving duplicates).
        This format is ideal for downstream processing by normalizer tools.

        Args:
            syllables: List of syllables to save (may contain duplicates).
                      Written in the order provided.
            output_path: Path to the output file. Parent directories must exist.
                        If the file exists, it will be overwritten.

        Raises:
            IOError: If there's an error writing the file (permissions, disk space, etc.)

        Example:
            >>> from pathlib import Path
            >>> extractor = NltkSyllableExtractor('en_US')
            >>> syllables = ['hel', 'lo', 'world', 'hel']  # Note: 'hel' appears twice
            >>> extractor.save_syllables(syllables, Path('output.txt'))
            # Creates file with content (preserving duplicates and order):
            # hel
            # lo
            # world
            # hel

        Note:
            The output file uses UTF-8 encoding with Unix-style line endings (\\n).
            Each syllable is written on its own line exactly as given.
            Duplicates are preserved. Use downstream tools for deduplication if needed.
        """
        try:
            # newline="\n" disables platform newline translation so the file
            # really has Unix-style line endings on every OS.
            with open(output_path, "w", encoding="utf-8", newline="\n") as f:
                f.writelines(f"{syllable}\n" for syllable in syllables)
        except (OSError, UnicodeError) as e:
            raise IOError(f"Error writing file {output_path}: {e}") from e