Source code for build_tools.pyphen_syllable_extractor.extractor

"""
Core syllable extraction functionality.

This module provides the main SyllableExtractor class for extracting syllables
from text using pyphen's dictionary-based hyphenation.
"""

import re
from pathlib import Path
from typing import Dict, Set

# Optional dependency - only needed at runtime, not for documentation builds
try:
    import pyphen  # type: ignore[import-not-found, import-untyped]

    PYPHEN_AVAILABLE = True
except ImportError:
    pyphen = None  # type: ignore[assignment]
    PYPHEN_AVAILABLE = False



[docs]
class SyllableExtractor:
    """
    Extracts syllables from text using pyphen hyphenation dictionaries.

    This class provides methods to process text and text files and extract
    individual syllables based on language-specific hyphenation rules from
    LibreOffice's dictionary collection.

    The extractor works by:
    1. Reading text input (string or file)
    2. Tokenizing into words using regex
    3. Applying language-specific hyphenation rules via pyphen
    4. Splitting hyphenated words into syllables
    5. Filtering syllables by length constraints
    6. Returning unique lowercase syllables together with processing statistics

    Key Features:
        - Support for 40+ languages via pyphen
        - Configurable syllable length constraints
        - Option to include/exclude non-hyphenated words
        - Case-insensitive processing
        - Unicode support for accented characters
        - Deterministic extraction (same input = same output)

    Typical Usage:
        >>> # Basic extraction. The extraction methods return a
        >>> # (syllables, statistics) tuple, so unpack both values.
        >>> # only_hyphenated=False keeps words pyphen could not split
        >>> # (such as 'world'); with the default they are skipped.
        >>> extractor = SyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
        >>> syllables, stats = extractor.extract_syllables_from_text(
        ...     "Hello wonderful world", only_hyphenated=False
        ... )
        >>> print(sorted(syllables))
        ['der', 'ful', 'hel', 'lo', 'won', 'world']

        >>> # Extract from file and save
        >>> syllables, stats = extractor.extract_syllables_from_file(Path('input.txt'))
        >>> extractor.save_syllables(syllables, Path('output.txt'))

        The exact syllable splits shown above depend on the installed pyphen
        dictionary and are illustrative.

    Attributes:
        dictionary: Pyphen hyphenation dictionary for the selected language
        language_code: The pyphen language/locale code (e.g., 'en_US', 'de_DE')
        min_syllable_length: Minimum syllable length to include in results
        max_syllable_length: Maximum syllable length to include in results

    Note:
        This is a build-time tool. The pyphen dependency should not be used
        at runtime in the core name generation system.
    """


[docs]
    def __init__(
        self, language_code: str, min_syllable_length: int = 1, max_syllable_length: int = 10
    ):
        """
        Initialize the syllable extractor with a specific language.

        Args:
            language_code: Pyphen language/locale code (e.g., 'en_US', 'de_DE')
            min_syllable_length: Minimum syllable length to include (default: 1)
            max_syllable_length: Maximum syllable length to include (default: 10)

        Raises:
            ImportError: If pyphen is not installed (the module-level import failed)
            ValueError: If pyphen has no dictionary for ``language_code``. The
                original ``KeyError`` is attached as ``__cause__``.
        """
        if not PYPHEN_AVAILABLE:
            raise ImportError(
                "pyphen is not installed. This is a build-time dependency.\n"
                "Install it with: pip install pyphen"
            )

        # Keep the try body down to the one call that can raise the KeyError
        # being translated; the attribute assignments below cannot raise it.
        try:
            dictionary = pyphen.Pyphen(lang=language_code)
        except KeyError as err:
            available = ", ".join(sorted(pyphen.LANGUAGES))
            # Chain explicitly so the traceback reads "direct cause" rather
            # than "another exception occurred during handling".
            raise ValueError(
                f"Language code '{language_code}' is not supported by pyphen.\n"
                f"Available codes: {available}"
            ) from err

        self.dictionary = dictionary
        self.language_code = language_code
        self.min_syllable_length = min_syllable_length
        self.max_syllable_length = max_syllable_length



[docs]
    def extract_syllables_from_text(
        self, text: str, only_hyphenated: bool = True
    ) -> tuple[Set[str], Dict[str, int]]:
        """
        Extract unique syllables from a block of text.

        This method processes input text by tokenizing it into words, applying
        hyphenation rules via pyphen, and extracting individual syllables that
        meet the configured length constraints.

        Args:
            text: Input text to process. Can contain any characters, but only
                  alphabetic sequences (including accented characters) will be
                  processed as words.
            only_hyphenated: If True, only include syllables from words that pyphen
                           actually hyphenated (default: True). This filters out
                           whole words that couldn't be syllabified. Set to False
                           to include all words, even if they can't be split.

        Returns:
            Tuple of (syllables, statistics) where:
                - syllables: Set of unique lowercase syllable strings
                - statistics: Dict with the following keys:
                    - 'total_words': Total number of words found in source text
                    - 'processed_words': Words that were successfully hyphenated/processed
                    - 'skipped_unhyphenated': Words skipped (only when only_hyphenated=True)
                    - 'rejected_syllables': Syllables rejected due to length constraints

        Note:
            - Only processes words containing alphabetic characters (a-z, A-Z, À-ÿ)
            - Case-insensitive processing (all output is lowercase)
            - Automatically removes punctuation and special characters
            - Filters syllables by configured min/max length constraints
            - When only_hyphenated=True, excludes words pyphen couldn't split
            - Deterministic: same input always produces same output
            - Words are extracted using regex pattern: \\b[a-zA-ZÀ-ÿ]+\\b

        Example:
            >>> extractor = SyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
            >>> syllables, stats = extractor.extract_syllables_from_text("Hello world!")
            >>> print(sorted(syllables))
            ['hel', 'lo', 'world']
            >>> print(stats['total_words'])
            2
        """
        # Extract words using regex (alphanumeric sequences)
        words = re.findall(r"\b[a-zA-ZÀ-ÿ]+\b", text)

        syllables: Set[str] = set()
        stats = {
            "total_words": len(words),
            "skipped_unhyphenated": 0,
            "rejected_syllables": 0,
            "processed_words": 0,
        }

        for word in words:
            # Convert to lowercase for consistency
            word_lower = word.lower()

            # Get hyphenated version of the word
            # pyphen.inserted() returns the word with hyphens at syllable boundaries
            hyphenated = self.dictionary.inserted(word_lower, hyphen="-")

            # Check if the word was actually hyphenated
            # If no hyphens were inserted, the word couldn't be syllabified
            if only_hyphenated and "-" not in hyphenated:
                stats["skipped_unhyphenated"] += 1
                continue

            stats["processed_words"] += 1

            # Split on hyphens to get individual syllables
            word_syllables = hyphenated.split("-")

            # Filter syllables by length and add to set
            for syllable in word_syllables:
                if self.min_syllable_length <= len(syllable) <= self.max_syllable_length:
                    syllables.add(syllable)
                else:
                    stats["rejected_syllables"] += 1

        return syllables, stats



[docs]
    def extract_syllables_from_file(self, input_path: Path) -> tuple[Set[str], Dict[str, int]]:
        """
        Extract unique syllables from a text file.

        This is a convenience wrapper around extract_syllables_from_text() that
        handles file reading with proper encoding (UTF-8) and error handling.

        Args:
            input_path: Path to the input text file. File should be UTF-8 encoded
                       plain text. Binary files or non-text formats will cause errors.

        Returns:
            Tuple of (syllables, statistics) where:
                - syllables: Set of unique lowercase syllable strings
                - statistics: Dict with processing statistics (see extract_syllables_from_text)

        Raises:
            FileNotFoundError: If the input file doesn't exist at the specified path
            IOError: If there's an error reading the file (permissions, encoding, etc.)

        Example:
            >>> from pathlib import Path
            >>> extractor = SyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
            >>> syllables, stats = extractor.extract_syllables_from_file(Path('book.txt'))
            >>> print(f"Extracted {len(syllables)} unique syllables from {stats['total_words']} words")
            Extracted 1250 unique syllables from 50000 words
        """
        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        try:
            with open(input_path, "r", encoding="utf-8") as f:
                text = f.read()
        except Exception as e:
            raise IOError(f"Error reading file {input_path}: {e}")

        return self.extract_syllables_from_text(text)



[docs]
    def save_syllables(self, syllables: Set[str], output_path: Path) -> None:
        """
        Save syllables to a text file (one syllable per line, sorted).

        Writes the syllables in sorted order with UTF-8 encoding, one syllable
        per line, each line terminated by a single line feed.

        Args:
            syllables: Set of syllables to save. Each syllable should be a string.
                      The set is sorted with ``sorted()`` before writing.
            output_path: Path to the output file. Parent directories must exist.
                        If the file exists, it will be overwritten.

        Raises:
            IOError: If the file cannot be opened, written or encoded. The
                underlying exception is attached as ``__cause__``.

        Example:
            >>> from pathlib import Path
            >>> extractor = SyllableExtractor('en_US')
            >>> syllables = {'hel', 'lo', 'world'}
            >>> extractor.save_syllables(syllables, Path('output.txt'))
            # Creates file with content:
            # hel
            # lo
            # world

        Note:
            The output file uses UTF-8 encoding with Unix-style line endings (\\n)
            on every platform; newline translation is explicitly disabled.
        """
        # Build the lines before touching the file so that an unsortable input
        # surfaces as its own error rather than as a file-writing failure.
        lines = [f"{syllable}\n" for syllable in sorted(syllables)]

        try:
            # newline="\n" stops text mode from rewriting "\n" as the platform
            # line separator (e.g. "\r\n" on Windows).
            with open(output_path, "w", encoding="utf-8", newline="\n") as f:
                f.writelines(lines)
        except (OSError, UnicodeError) as err:
            raise IOError(f"Error writing file {output_path}: {err}") from err



[docs]
    @staticmethod
    def extract_with_auto_language(
        text: str,
        min_syllable_length: int = 1,
        max_syllable_length: int = 10,
        only_hyphenated: bool = True,
        default_language: str = "en_US",
        min_detection_length: int = 20,
        suppress_warnings: bool = False,
    ) -> tuple[Set[str], Dict[str, int], str]:
        """
        Detect the language of ``text`` and extract its syllables.

        The language code is obtained from
        ``language_detection.detect_language_code``; a SyllableExtractor is
        then built for that code and run over the same text.

        Args:
            text: Input text used both for detection and for extraction.
            min_syllable_length: Minimum syllable length to include (default: 1)
            max_syllable_length: Maximum syllable length to include (default: 10)
            only_hyphenated: Forwarded to extract_syllables_from_text()
                           (default: True)
            default_language: Passed to the detector as ``default``
                            (default: "en_US")
            min_detection_length: Passed to the detector as
                                ``min_confidence_length`` (default: 20)
            suppress_warnings: Passed to the detector unchanged (default: False)

        Returns:
            Tuple of (syllables, statistics, language_code) where:
                - syllables: Set of unique lowercase syllable strings
                - statistics: Dict with processing statistics
                - language_code: The code returned by the detector and used
                  to build the extractor

        Raises:
            ImportError: If pyphen is not installed (raised by the constructor)
            ValueError: If the resolved language code is not supported by pyphen

        Example:
            >>> text = "Hello beautiful world, this is wonderful"
            >>> syllables, stats, lang = SyllableExtractor.extract_with_auto_language(
            ...     text,
            ...     min_syllable_length=2,
            ...     max_syllable_length=8,
            ... )
            >>> print(f"Detected language: {lang}")

        Note:
            NOTE(review): fallback to ``default_language``, warning behavior and
            any langdetect requirement are implemented inside
            ``detect_language_code``, which is not visible here - confirm there.
        """
        from .language_detection import detect_language_code

        detector_options = {
            "default": default_language,
            "min_confidence_length": min_detection_length,
            "suppress_warnings": suppress_warnings,
        }
        resolved_code = detect_language_code(text, **detector_options)

        # Build an extractor for whatever code the detector settled on.
        language_extractor = SyllableExtractor(
            resolved_code, min_syllable_length, max_syllable_length
        )
        found, statistics = language_extractor.extract_syllables_from_text(
            text, only_hyphenated=only_hyphenated
        )

        return found, statistics, resolved_code



[docs]
    @staticmethod
    def extract_file_with_auto_language(
        input_path: Path,
        min_syllable_length: int = 1,
        max_syllable_length: int = 10,
        only_hyphenated: bool = True,
        default_language: str = "en_US",
        min_detection_length: int = 20,
        suppress_warnings: bool = False,
    ) -> tuple[Set[str], Dict[str, int], str]:
        """
        Extract syllables from a file with automatic language detection.

        This convenience method reads a file, detects its language, and extracts
        syllables using the appropriate language-specific hyphenation rules.

        Args:
            input_path: Path to the input text file
            min_syllable_length: Minimum syllable length to include (default: 1)
            max_syllable_length: Maximum syllable length to include (default: 10)
            only_hyphenated: If True, only include syllables from hyphenated words
                           (default: True)
            default_language: Language code to use if detection fails (default: "en_US")
            min_detection_length: Minimum text length for detection attempt (default: 20)
            suppress_warnings: If True, suppress language detection warnings (default: False)

        Returns:
            Tuple of (syllables, statistics, detected_language_code) where:
                - syllables: Set of unique lowercase syllable strings
                - statistics: Dict with processing statistics
                - detected_language_code: The pyphen language code that was used

        Raises:
            FileNotFoundError: If the input file doesn't exist
            IOError: If there's an error reading the file
            ImportError: If langdetect is not installed (unless suppress_warnings=True)

        Example:
            >>> from pathlib import Path
            >>> syllables, stats, lang = SyllableExtractor.extract_file_with_auto_language(
            ...     Path('document.txt'),
            ...     min_syllable_length=2,
            ...     max_syllable_length=8
            ... )
            >>> print(f"Detected: {lang}, Found: {len(syllables)} syllables")
            Detected: de_DE, Found: 1500 syllables
        """
        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        try:
            with open(input_path, "r", encoding="utf-8") as f:
                text = f.read()
        except Exception as e:
            raise IOError(f"Error reading file {input_path}: {e}")

        return SyllableExtractor.extract_with_auto_language(
            text=text,
            min_syllable_length=min_syllable_length,
            max_syllable_length=max_syllable_length,
            only_hyphenated=only_hyphenated,
            default_language=default_language,
            min_detection_length=min_detection_length,
            suppress_warnings=suppress_warnings,
        )