"""
File aggregation for syllable normalization pipeline.
This module handles Step 1 of the normalization pipeline: combining multiple
input files into a single raw syllable file while preserving all occurrences
and maintaining raw counts.
"""
from pathlib import Path
from typing import List
class FileAggregator:
    """
    Aggregate syllables from multiple input files.

    This class handles the first step of the normalization pipeline:
    combining syllables from multiple .txt files into a single raw
    aggregated list/file. All occurrences are preserved (no deduplication),
    so the frequency distribution of the input files is maintained.

    The class holds no instance state; every method works only on its
    arguments.

    Example:
        >>> from pathlib import Path
        >>> aggregator = FileAggregator()
        >>> input_files = [Path("file1.txt"), Path("file2.txt")]
        >>> syllables = aggregator.aggregate_files(input_files)
        >>> len(syllables)  # Total from both files
        450
        >>> aggregator.save_raw_syllables(syllables, Path("syllables_raw.txt"))
    """

    def aggregate_files(self, input_files: List[Path]) -> List[str]:
        """
        Aggregate syllables from multiple input files.

        Reads every file via :meth:`read_syllables_from_file` and
        concatenates the results into a single list. Each non-empty line of
        each file is one syllable. All occurrences are preserved (no
        deduplication).

        Args:
            input_files: Paths of the input .txt files, one syllable per
                line. An empty list yields an empty result.

        Returns:
            All syllables from all input files, duplicates included, in
            file order and then line order within each file.

        Raises:
            FileNotFoundError: If any input file does not exist.
            PermissionError: If any input file cannot be read.
            UnicodeDecodeError: If any input file contains invalid UTF-8.

        Example:
            >>> aggregator = FileAggregator()
            >>> files = [Path("corpus1.txt"), Path("corpus2.txt")]
            >>> syllables = aggregator.aggregate_files(files)
            >>> syllables[:3]
            ['hello', 'world', 'test']

        Note:
            Files are processed in the order provided. If deterministic
            ordering is required, sort ``input_files`` before calling.
        """
        all_syllables: List[str] = []
        for file_path in input_files:
            all_syllables.extend(self.read_syllables_from_file(file_path))
        return all_syllables

    def read_syllables_from_file(self, file_path: Path) -> List[str]:
        """
        Read syllables from a single file.

        Reads the file line by line, treating each line as one syllable.
        Leading and trailing whitespace is stripped from every line and
        lines that are empty after stripping are skipped. No other
        normalization or transformation is applied.

        Args:
            file_path: Path to the input file to read.

        Returns:
            The syllables from the file, one per non-empty line, in file
            order.

        Raises:
            FileNotFoundError: If the file does not exist.
            PermissionError: If the file cannot be read.
            UnicodeDecodeError: If the file contains invalid UTF-8.

        Example:
            >>> aggregator = FileAggregator()
            >>> aggregator.read_syllables_from_file(Path("input.txt"))
            ['ka', 'ra', 'mi', 'ka', 'ta']

        Note:
            A leading UTF-8 byte-order mark, if present, is discarded so
            that it does not become part of the first syllable.
        """
        # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM.
        # With plain "utf-8" the BOM survives as U+FEFF, which str.strip()
        # does not treat as whitespace, so the first syllable of such a file
        # would be counted as a different token.
        with file_path.open("r", encoding="utf-8-sig") as f:
            stripped_lines = (line.strip() for line in f)
            return [syllable for syllable in stripped_lines if syllable]

    def save_raw_syllables(self, syllables: List[str], output_path: Path) -> None:
        """
        Save raw aggregated syllables to a file.

        Writes the syllables to ``output_path`` as UTF-8, one per line, in
        the order provided (this is how the pipeline's syllables_raw.txt is
        produced). Syllables are written exactly as given (no
        normalization).

        Args:
            syllables: Syllable strings to write. An empty list produces an
                empty file.
            output_path: Path where the raw syllables file should be saved.

        Raises:
            PermissionError: If the output file or its parent directory
                cannot be created or written.
            OSError: If there are filesystem issues (disk full, etc.).

        Example:
            >>> aggregator = FileAggregator()
            >>> syllables = ['ka', 'ra', 'mi', 'ka', 'ta']
            >>> aggregator.save_raw_syllables(syllables, Path("syllables_raw.txt"))

            The file then contains the lines ``ka``, ``ra``, ``mi``, ``ka``
            and ``ta``, in that order.

        Note:
            The output file is created if it doesn't exist and overwritten
            if it does. Missing parent directories are created
            automatically.
        """
        # Create any missing parent directories so callers need not prepare
        # the output location themselves.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # One syllable per line; every line, including the last, ends in "\n".
        with output_path.open("w", encoding="utf-8") as f:
            f.writelines(f"{syllable}\n" for syllable in syllables)