Source code for build_tools.pyphen_syllable_normaliser.aggregator

"""
File aggregation for syllable normalization pipeline.

This module handles Step 1 of the normalization pipeline: combining multiple
input files into a single raw syllable file while preserving all occurrences
and maintaining raw counts.
"""

from pathlib import Path
from typing import List


class FileAggregator:
    """
    Aggregates syllables from multiple input files.

    This class handles the first step of the normalization pipeline:
    combining syllables from multiple .txt files into a single raw
    aggregated file. All occurrences are preserved (no deduplication),
    maintaining the original frequency distribution from the input files.

    Example:
        >>> from pathlib import Path
        >>> aggregator = FileAggregator()
        >>> input_files = [Path("file1.txt"), Path("file2.txt")]
        >>> syllables = aggregator.aggregate_files(input_files)
        >>> len(syllables)  # Total from both files
        450
        >>> aggregator.save_raw_syllables(syllables, Path("syllables_raw.txt"))
    """

    def aggregate_files(self, input_files: List[Path]) -> List[str]:
        """
        Aggregate syllables from multiple input files.

        Reads every file via :meth:`read_syllables_from_file` and
        concatenates the results into a single list. Each non-empty line
        of each file is one syllable. All occurrences are preserved
        (no deduplication).

        Args:
            input_files: List of Path objects pointing to input .txt files.
                Each file should contain one syllable per line.

        Returns:
            List of all syllables from all input files, preserving
            duplicates and maintaining the original order (file by file).
            An empty ``input_files`` list yields an empty list.

        Raises:
            FileNotFoundError: If any input file does not exist.
            PermissionError: If any input file cannot be read.
            UnicodeDecodeError: If any input file contains invalid UTF-8.

        Example:
            >>> aggregator = FileAggregator()
            >>> files = [Path("corpus1.txt"), Path("corpus2.txt")]
            >>> syllables = aggregator.aggregate_files(files)
            >>> syllables[:3]
            ['hello', 'world', 'test']

        Note:
            Files are processed in the order provided. If deterministic
            ordering is required, ensure input_files is sorted before
            calling.
        """
        all_syllables: List[str] = []
        for file_path in input_files:
            # extend (not append) keeps the result a flat list of strings.
            all_syllables.extend(self.read_syllables_from_file(file_path))
        return all_syllables

    def read_syllables_from_file(self, file_path: Path) -> List[str]:
        """
        Read syllables from a single file.

        Reads a UTF-8 file line by line, treating each line as one
        syllable. Leading and trailing whitespace (including the line
        terminator) is stripped from each line, and lines that are empty
        after stripping are skipped. No other normalization or
        transformation is applied to the syllable content.

        Args:
            file_path: Path to the input file to read.

        Returns:
            List of syllable strings from the file, one per non-empty
            line, in file order.

        Raises:
            FileNotFoundError: If the file does not exist.
            PermissionError: If the file cannot be read.
            UnicodeDecodeError: If the file contains invalid UTF-8.

        Example:
            >>> aggregator = FileAggregator()
            >>> syllables = aggregator.read_syllables_from_file(Path("input.txt"))
            >>> syllables
            ['ka', 'ra', 'mi', 'ka', 'ta']

        Note:
            Stripping surrounding whitespace allows files with varying
            whitespace formatting to be processed consistently.
        """
        with file_path.open("r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            # Whitespace-only lines strip down to "" and are dropped here.
            return [syllable for syllable in stripped_lines if syllable]

    def save_raw_syllables(self, syllables: List[str], output_path: Path) -> None:
        """
        Save raw aggregated syllables to file.

        Writes syllables to the output file as UTF-8, one per line, in the
        order provided. This creates the syllables_raw.txt file for the
        pipeline. All syllables are written exactly as provided
        (no normalization).

        Args:
            syllables: List of syllable strings to write.
            output_path: Path where the raw syllables file should be saved.

        Raises:
            PermissionError: If the output file or its parent directory
                cannot be created or written.
            OSError: If there are filesystem issues (disk full, etc.).

        Example:
            >>> aggregator = FileAggregator()
            >>> syllables = ['ka', 'ra', 'mi', 'ka', 'ta']
            >>> aggregator.save_raw_syllables(syllables, Path("syllables_raw.txt"))
            # File contains:
            # ka
            # ra
            # mi
            # ka
            # ta

        Note:
            This method creates the output file if it doesn't exist and
            overwrites it if it does. Missing parent directories of
            ``output_path`` are created automatically.
        """
        # Create any missing parent directories so callers need not prepare them.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with output_path.open("w", encoding="utf-8") as f:
            # One buffered writelines call instead of a write per syllable.
            f.writelines(f"{syllable}\n" for syllable in syllables)
def discover_input_files(
    source_dir: Path, pattern: str = "*.txt", recursive: bool = False
) -> List[Path]:
    """
    Find the input files under a directory that match a glob pattern.

    The directory is scanned with the given pattern, anything that is not
    a regular file is discarded, and the remaining paths are returned in
    sorted order so that downstream processing is deterministic.

    Args:
        source_dir: Directory to scan for input files.
        pattern: Glob pattern for matching files. Default: "*.txt".
        recursive: If True, subdirectories are scanned as well (via
            ``Path.rglob``). Default: False (only the immediate directory
            is scanned, via ``Path.glob``).

    Returns:
        Sorted list of Path objects for all matching files.

    Raises:
        FileNotFoundError: If source_dir does not exist.
        ValueError: If source_dir exists but is not a directory.

    Example:
        >>> from pathlib import Path
        >>> # Non-recursive scan
        >>> files = discover_input_files(Path("data/"), pattern="*.txt")
        >>> files
        [Path('data/corpus1.txt'), Path('data/corpus2.txt')]
        >>>
        >>> # Recursive scan
        >>> files = discover_input_files(
        ...     Path("data/"),
        ...     pattern="*.txt",
        ...     recursive=True
        ... )
        >>> files
        [Path('data/corpus1.txt'), Path('data/subdir/corpus3.txt'),
         Path('data/subdir/corpus4.txt')]

    Note:
        Results are always sorted; reproducible normalization output
        depends on a stable file order.
    """
    # Validate the location up front: missing path first, then wrong type.
    if not source_dir.exists():
        raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
    if not source_dir.is_dir():
        raise ValueError(f"Source path is not a directory: {source_dir}")

    # Pick the scanning method once; rglob descends into subdirectories.
    scan = source_dir.rglob if recursive else source_dir.glob

    # Keep regular files only (a directory may match the pattern) and sort.
    return sorted(candidate for candidate in scan(pattern) if candidate.is_file())