"""
Frequency analysis for canonical syllables.
This module handles Step 3 of the normalization pipeline: analyzing frequency
distribution of canonical syllables and generating frequency intelligence
data structures. This captures "how often each canonical syllable occurs
before we collapse identity" - essential for understanding natural language
patterns in the source corpus.
"""
import json
from collections import Counter
from pathlib import Path
from typing import Dict, List, cast
from .models import FrequencyEntry
[docs]
class FrequencyAnalyzer:
    """
    Analyzes frequency distribution of canonical syllables.

    This class handles the intelligence capture phase of the normalization
    pipeline. It counts occurrences of each canonical syllable, creates
    frequency rankings, and generates output files for downstream analysis
    and feature annotation.

    The class holds no instance state; every method works only on its
    arguments.

    Example:
        >>> from pathlib import Path
        >>> analyzer = FrequencyAnalyzer()
        >>> syllables = ['ka', 'ra', 'mi', 'ka', 'ta', 'ka']
        >>> frequencies = analyzer.calculate_frequencies(syllables)
        >>> frequencies
        {'ka': 3, 'ra': 1, 'mi': 1, 'ta': 1}
        >>> analyzer.save_frequencies(frequencies, Path("syllables_frequencies.json"))
        >>> unique = analyzer.extract_unique_syllables(syllables)
        >>> unique
        ['ka', 'mi', 'ra', 'ta']
    """

    def calculate_frequencies(self, syllables: List[str]) -> Dict[str, int]:
        """
        Calculate frequency counts for canonical syllables.

        Counts how many times each unique syllable appears in the input list.
        This captures the natural frequency distribution from the source corpus
        before deduplication.

        Args:
            syllables: List of canonical syllables (may contain duplicates).

        Returns:
            Dictionary mapping each unique syllable to its occurrence count.
            An empty input list yields an empty dictionary.

        Example:
            >>> analyzer = FrequencyAnalyzer()
            >>> syllables = ['ka', 'ra', 'mi', 'ka', 'ta', 'ka', 'ra']
            >>> frequencies = analyzer.calculate_frequencies(syllables)
            >>> frequencies
            {'ka': 3, 'ra': 2, 'mi': 1, 'ta': 1}
            >>> sum(frequencies.values())  # Total syllable count
            7

        Note:
            The returned dictionary is not sorted by count; keys appear in
            order of first occurrence. Use create_frequency_entries() to
            generate sorted frequency rankings.
        """
        return dict(Counter(syllables))

    def create_frequency_entries(self, frequencies: Dict[str, int]) -> "List[FrequencyEntry]":
        """
        Create ranked frequency entries from frequency counts.

        Converts a frequency dictionary into a list of FrequencyEntry objects
        with ranking information and percentage calculations. Entries are
        sorted by frequency (descending) then alphabetically (ascending).

        Args:
            frequencies: Dictionary mapping syllable to occurrence count.

        Returns:
            List of FrequencyEntry objects sorted by frequency (highest first),
            with alphabetical secondary sort for ties. Returns an empty list
            when the counts sum to zero (including an empty dictionary).

        Example:
            >>> analyzer = FrequencyAnalyzer()
            >>> frequencies = {'ka': 187, 'ra': 162, 'mi': 145, 'ta': 98}
            >>> entries = analyzer.create_frequency_entries(frequencies)
            >>> entries[0].canonical
            'ka'
            >>> entries[0].rank
            1
            >>> round(entries[0].percentage, 1)  # 187 / 592 * 100
            31.6

        Note:
            Percentage is calculated as (frequency / total_count) * 100 and
            is stored unrounded. Ranks start at 1 (most frequent syllable has
            rank=1); tied syllables still receive distinct consecutive ranks.
        """
        total_count = sum(frequencies.values())
        # Guard against division by zero when there is nothing to rank.
        if total_count == 0:
            return []

        # Negating the count gives descending frequency while keeping the
        # syllable itself as an ascending alphabetical tie-breaker.
        sorted_items = sorted(frequencies.items(), key=lambda item: (-item[1], item[0]))

        return [
            FrequencyEntry(
                canonical=syllable,
                frequency=count,
                rank=rank,
                percentage=(count / total_count) * 100,
            )
            for rank, (syllable, count) in enumerate(sorted_items, start=1)
        ]

    def extract_unique_syllables(self, syllables: List[str]) -> List[str]:
        """
        Extract the sorted, deduplicated syllable inventory.

        Args:
            syllables: List of canonical syllables (may contain duplicates).

        Returns:
            Alphabetically sorted list containing each distinct syllable
            exactly once. An empty input list yields an empty list.

        Example:
            >>> analyzer = FrequencyAnalyzer()
            >>> analyzer.extract_unique_syllables(['ka', 'ra', 'mi', 'ka', 'ta', 'ka'])
            ['ka', 'mi', 'ra', 'ta']
        """
        return sorted(set(syllables))

    def save_frequencies(self, frequencies: Dict[str, int], output_path: Path) -> None:
        """
        Save frequency dictionary to JSON file.

        Writes the frequency intelligence to a JSON file for downstream
        analysis. The output is formatted with indentation for readability
        and sorted by key for deterministic output. Missing parent
        directories of ``output_path`` are created.

        Args:
            frequencies: Dictionary mapping syllable to occurrence count.
            output_path: Path where the JSON file should be saved.

        Raises:
            PermissionError: If the output file cannot be written.
            OSError: If there are filesystem issues (disk full, etc.).

        Example:
            >>> analyzer = FrequencyAnalyzer()
            >>> frequencies = {'ka': 187, 'ra': 162, 'mi': 145}
            >>> analyzer.save_frequencies(frequencies, Path("syllables_frequencies.json"))

            The file then contains::

                {
                  "ka": 187,
                  "mi": 145,
                  "ra": 162
                }

        Note:
            The JSON is formatted with 2-space indentation and keys are
            sorted alphabetically for consistent diffs in version control.
            Non-ASCII syllables are written as-is (not \\u-escaped).
        """
        # Ensure parent directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write JSON with sorted keys and pretty formatting
        with output_path.open("w", encoding="utf-8") as f:
            json.dump(frequencies, f, indent=2, sort_keys=True, ensure_ascii=False)
            f.write("\n")  # Trailing newline for POSIX compliance

    def save_unique_syllables(self, unique_syllables: List[str], output_path: Path) -> None:
        """
        Save unique syllables to text file.

        Writes the deduplicated canonical syllable inventory to a text file,
        one syllable per line, in the order given. Missing parent directories
        of ``output_path`` are created.

        Args:
            unique_syllables: Sorted list of unique canonical syllables.
            output_path: Path where the text file should be saved.

        Raises:
            PermissionError: If the output file cannot be written.
            OSError: If there are filesystem issues (disk full, etc.).

        Example:
            >>> analyzer = FrequencyAnalyzer()
            >>> unique = ['ka', 'mi', 'ra', 'ta']
            >>> analyzer.save_unique_syllables(unique, Path("syllables_unique.txt"))

            The file then contains::

                ka
                mi
                ra
                ta

        Note:
            This method neither sorts nor deduplicates. Syllables should be
            pre-sorted (alphabetically) before calling it; use
            extract_unique_syllables(), which returns sorted output, or sort
            manually.
        """
        # Ensure parent directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write syllables one per line
        with output_path.open("w", encoding="utf-8") as f:
            f.writelines(f"{syllable}\n" for syllable in unique_syllables)
[docs]
def load_frequencies_from_file(file_path: Path) -> Dict[str, int]:
    """
    Read a syllable-frequency mapping back from a JSON file.

    The file is decoded as UTF-8 and parsed as JSON. The parsed value is
    returned as-is; its structure is not validated, so the declared return
    type is only accurate for files in the expected format.

    Args:
        file_path: Path to the JSON frequency file.

    Returns:
        Dictionary mapping syllable to occurrence count.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        PermissionError: If the file cannot be read.

    Example:
        >>> from pathlib import Path
        >>> frequencies = load_frequencies_from_file(Path("syllables_frequencies.json"))
        >>> frequencies['ka']
        187

    Note:
        The expected layout is a flat JSON object of the form
        {"syllable": count, ...}.
    """
    raw_text = file_path.read_text(encoding="utf-8")
    parsed = json.loads(raw_text)
    # cast() only informs the type checker; nothing is checked at runtime.
    return cast(Dict[str, int], parsed)
[docs]
def load_unique_syllables_from_file(file_path: Path) -> List[str]:
    """
    Read a one-syllable-per-line text file into a list.

    The file is decoded as UTF-8. Each line has surrounding whitespace
    removed, and lines that are empty after stripping are dropped. File
    order is preserved; no sorting or deduplication is performed.

    Args:
        file_path: Path to the text file containing unique syllables.

    Returns:
        List of syllable strings (one per non-blank line from file).

    Raises:
        FileNotFoundError: If the file does not exist.
        PermissionError: If the file cannot be read.
        UnicodeDecodeError: If the file contains invalid UTF-8.

    Example:
        >>> from pathlib import Path
        >>> syllables = load_unique_syllables_from_file(Path("syllables_unique.txt"))
        >>> syllables[:3]
        ['ka', 'mi', 'ra']
    """
    with file_path.open("r", encoding="utf-8") as handle:
        trimmed_lines = (raw_line.strip() for raw_line in handle)
        # Blank (or whitespace-only) lines become "" and are filtered out.
        return [text for text in trimmed_lines if text]