# Source code for build_tools.nltk_syllable_normaliser.fragment_cleaner

"""
Fragment cleaning logic for NLTK syllable normalization.

This module provides the FragmentCleaner class which handles reconstruction
of phonetically coherent syllables from NLTK's over-segmented output by
merging isolated single-letter fragments with their neighbors.
"""

# Lowercase letters treated as vowels by FragmentCleaner.is_single_vowel;
# "y" is deliberately included alongside a, e, i, o, u.
VOWELS = {"a", "e", "i", "o", "u", "y"}



[docs]
class FragmentCleaner:
    """
    Clean NLTK-produced syllable fragments by merging isolated letters.

    The NLTK syllable extractor uses phonetically-guided splitting with
    onset/coda principles, which can sometimes over-segment words into
    isolated single-letter fragments. This cleaner reconstructs more
    coherent syllables by applying mechanical merging rules.

    Merging Rules:
        1. Single vowels (a, e, i, o, u, y) merge with next fragment
        2. Single consonants merge with next fragment
        3. Multi-character fragments remain unchanged

    Example:
        >>> cleaner = FragmentCleaner()
        >>> fragments = ["i", "down", "the", "ra", "bbit"]
        >>> cleaner.clean_fragments(fragments)
        ['idown', 'the', 'ra', 'bbit']

    Note:
        This is NLTK-specific preprocessing. Pyphen output doesn't need
        fragment cleaning as it uses typographic hyphenation rules.
    """

    @staticmethod
    def is_single_letter(token: str) -> bool:
        """
        Check if token is a single alphabetic character.

        Args:
            token: String to check.

        Returns:
            True if token is exactly one alphabetic character, False otherwise.

        Example:
            >>> FragmentCleaner.is_single_letter("a")
            True
            >>> FragmentCleaner.is_single_letter("ab")
            False
            >>> FragmentCleaner.is_single_letter("1")
            False
        """
        return len(token) == 1 and token.isalpha()

    @staticmethod
    def is_single_vowel(token: str) -> bool:
        """
        Check if token is a single vowel character.

        The comparison is case-insensitive: the token is lowercased before
        being looked up in the module-level ``VOWELS`` set.

        Args:
            token: String to check.

        Returns:
            True if token is a single vowel (a, e, i, o, u, y), False otherwise.

        Example:
            >>> FragmentCleaner.is_single_vowel("a")
            True
            >>> FragmentCleaner.is_single_vowel("b")
            False
            >>> FragmentCleaner.is_single_vowel("ae")
            False
        """
        return len(token) == 1 and token.lower() in VOWELS

    def clean_fragments(self, fragments: list[str]) -> list[str]:
        """
        Perform mechanical cleanup by merging single-letter fragments.

        Any fragment that is a single alphabetic letter -- vowel or
        consonant alike -- is concatenated with the fragment that follows
        it. Vowels and consonants are merged in exactly the same way, so a
        single ``is_single_letter`` check covers both merging rules.

        Args:
            fragments: List of syllable fragments (possibly over-segmented).

        Returns:
            List of cleaned fragments with single letters merged.

        Example:
            >>> cleaner = FragmentCleaner()
            >>> # Example 1: Single vowel merging
            >>> cleaner.clean_fragments(["i", "down"])
            ['idown']
            >>>
            >>> # Example 2: Single consonant merging
            >>> cleaner.clean_fragments(["r", "abbit"])
            ['rabbit']
            >>>
            >>> # Example 3: Mixed fragments
            >>> cleaner.clean_fragments(["cha", "pter", "i", "down", "the", "r", "a"])
            ['cha', 'pter', 'idown', 'the', 'ra']
            >>>
            >>> # Example 4: Preserve multi-character fragments
            >>> cleaner.clean_fragments(["hel", "lo", "world"])
            ['hel', 'lo', 'world']
            >>>
            >>> # Example 5: Runs of single letters pair up left-to-right
            >>> cleaner.clean_fragments(["a", "b", "c"])
            ['ab', 'c']

        Note:
            - Fragments are processed left-to-right
            - Single letters merge with next fragment (if available)
            - A merge consumes the next fragment, so it is not re-examined
            - Last fragment never merges (no next fragment available)
            - An empty-string next fragment is never merged into
            - Empty input returns empty output
        """
        if not fragments:
            return []

        cleaned: list[str] = []
        total = len(fragments)
        i = 0

        while i < total:
            current = fragments[i]

            # Lookahead safely; None marks "no following fragment".
            next_frag = fragments[i + 1] if i + 1 < total else None

            # Truthiness (not "is not None") is intentional: an empty next
            # fragment must not swallow the merge.
            if next_frag and self.is_single_letter(current):
                cleaned.append(current + next_frag)
                i += 2  # Skip both current and next
            else:
                # Multi-character, non-alphabetic, or trailing fragment.
                cleaned.append(current)
                i += 1

        return cleaned

    def clean_fragments_from_file(self, input_path: str, output_path: str) -> tuple[int, int]:
        """
        Clean fragments from input file and write to output file.

        Convenience method for file-based processing. Reads one fragment
        per line from input file (surrounding whitespace is stripped and
        blank lines are skipped), applies cleaning, and writes cleaned
        fragments to output file (one per line). Both files are handled
        as UTF-8.

        Args:
            input_path: Path to input file (one fragment per line).
            output_path: Path to output file for cleaned fragments.

        Returns:
            Tuple of (original_count, cleaned_count); the difference is the
            number of merges that were performed.

        Raises:
            FileNotFoundError: If input file doesn't exist.
            IOError: If there's an error reading or writing files.

        Example:
            >>> # input.txt contains:
            >>> # i
            >>> # down
            >>> # the
            >>> # ra
            >>> # bbit
            >>>
            >>> cleaner = FragmentCleaner()
            >>> original, cleaned = cleaner.clean_fragments_from_file(
            ...     "input.txt", "output.txt"
            ... )
            >>> print(f"Cleaned {original} → {cleaned} fragments")
            Cleaned 5 → 4 fragments
            >>>
            >>> # output.txt now contains:
            >>> # idown
            >>> # the
            >>> # ra
            >>> # bbit
        """
        from pathlib import Path

        input_file = Path(input_path)
        output_file = Path(output_path)

        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Read fragments, dropping blank lines and surrounding whitespace.
        with input_file.open("r", encoding="utf-8") as f:
            fragments = [stripped for line in f if (stripped := line.strip())]

        original_count = len(fragments)

        # Clean fragments
        cleaned = self.clean_fragments(fragments)
        cleaned_count = len(cleaned)

        # Write cleaned output in one batched call, one fragment per line.
        with output_file.open("w", encoding="utf-8") as f:
            f.writelines(f"{frag}\n" for frag in cleaned)

        return original_count, cleaned_count