"""
Data models for syllable normalization.
This module defines the data structures used to represent normalization
configuration, statistics, and results.
"""
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List
[docs]
@dataclass
class NormalizationConfig:
    """
    Parameters that control how syllables are normalized to canonical form.

    Attributes:
        min_length: Minimum syllable length in characters. Syllables shorter
            than this are rejected. Default: 2
        max_length: Maximum syllable length in characters. Syllables longer
            than this are rejected. Default: 20
        allowed_charset: String of allowed characters. Only syllables made
            up of these characters (after normalization) are kept.
            Default: "abcdefghijklmnopqrstuvwxyz"
        unicode_form: Unicode normalization form, one of "NFC", "NFD",
            "NFKC" or "NFKD". Default: "NFKD" (compatibility decomposition)

    Raises:
        ValueError: On construction, if ``min_length`` is below 1, if
            ``max_length`` is below ``min_length``, or if ``unicode_form``
            is not one of the four supported forms.

    Example:
        >>> config = NormalizationConfig(min_length=3, max_length=10)
        >>> config.min_length
        3
        >>> config.allowed_charset
        'abcdefghijklmnopqrstuvwxyz'
    """

    min_length: int = 2
    max_length: int = 20
    allowed_charset: str = "abcdefghijklmnopqrstuvwxyz"
    unicode_form: str = "NFKD"

    def __post_init__(self):
        """Check the length bounds and the Unicode form; raise ValueError if invalid.

        ``allowed_charset`` is not validated here.
        """
        lower, upper = self.min_length, self.max_length

        # The lower bound is checked first so the range check below can
        # assume min_length itself is sane.
        if lower < 1:
            raise ValueError(f"min_length must be >= 1, got {self.min_length}")

        if upper < lower:
            message = (
                f"max_length ({self.max_length}) must be >= min_length ({self.min_length})"
            )
            raise ValueError(message)

        supported_forms = ("NFC", "NFD", "NFKC", "NFKD")
        if self.unicode_form in supported_forms:
            return
        raise ValueError(
            f"unicode_form must be one of NFC, NFD, NFKC, NFKD, got {self.unicode_form}"
        )
[docs]
@dataclass
class NormalizationStats:
    """
    Statistics from the syllable normalization process.

    Tracks counts and metrics gathered throughout the normalization
    pipeline, useful for understanding data quality and processing results.

    Attributes:
        raw_count: Total number of syllables in raw input (before normalization)
        after_canonicalization: Number of syllables after normalization
        rejected_charset: Syllables rejected due to invalid characters
        rejected_length: Syllables rejected due to length constraints
        rejected_empty: Syllables that became empty after normalization
        unique_canonical: Number of unique canonical syllables
        processing_time: Total processing time in seconds

    Example:
        >>> stats = NormalizationStats(
        ...     raw_count=1000,
        ...     after_canonicalization=950,
        ...     rejected_charset=30,
        ...     rejected_length=20,
        ...     rejected_empty=0,
        ...     unique_canonical=412,
        ...     processing_time=1.5
        ... )
        >>> stats.total_rejected
        50
        >>> stats.rejection_rate
        5.0
        >>> NormalizationStats(raw_count=100, rejected_length=7).rejection_rate
        7.0
    """

    raw_count: int = 0
    after_canonicalization: int = 0
    rejected_charset: int = 0
    rejected_length: int = 0
    rejected_empty: int = 0
    unique_canonical: int = 0
    processing_time: float = 0.0

    @property
    def total_rejected(self) -> int:
        """Return the sum of the charset, length and empty rejection counts."""
        return self.rejected_charset + self.rejected_length + self.rejected_empty

    @property
    def rejection_rate(self) -> float:
        """Return rejected syllables as a percentage of ``raw_count``.

        Returns:
            ``0.0`` when ``raw_count`` is zero (avoids division by zero),
            otherwise ``total_rejected * 100 / raw_count``.
        """
        if self.raw_count == 0:
            return 0.0
        # Scale the integer count before dividing so only one floating-point
        # rounding occurs. Dividing first and then multiplying by 100 rounds
        # twice and yields results such as 7.000000000000001 for 7 out of 100.
        return (self.total_rejected * 100) / self.raw_count
[docs]
@dataclass
class FrequencyEntry:
    """
    One syllable's row in the frequency analysis.

    A plain record pairing a canonical syllable with its occurrence count
    and relative ranking. All four fields are required and no validation
    is performed on construction.

    Attributes:
        canonical: The canonical form of the syllable (e.g., "ka")
        frequency: Number of times this syllable appears
        rank: Frequency rank (1 = most common, 2 = second most common, etc.)
        percentage: Percentage of total syllables (0-100)

    Example:
        >>> entry = FrequencyEntry(canonical="ka", frequency=187, rank=1, percentage=10.2)
        >>> print(f"{entry.canonical}: {entry.frequency} ({entry.percentage:.1f}%)")
        ka: 187 (10.2%)
    """

    canonical: str
    frequency: int
    rank: int
    percentage: float
[docs]
@dataclass
class NormalizationResult:
    """
    Complete result from the syllable normalization pipeline.

    Bundles every output of a normalization run: the configuration used,
    the collected statistics, the frequency table, and the locations of
    the input and output files. The first six fields are required; the
    timestamp and the five output file paths have defaults.

    Attributes:
        config: Configuration used for normalization
        stats: Statistics from the processing
        frequencies: Dictionary mapping canonical syllable to frequency count
        unique_syllables: Sorted list of unique canonical syllables
        input_files: List of input file paths that were processed
        output_dir: Directory where output files were saved
        timestamp: When the normalization was performed. Defaults to
            ``datetime.now()`` evaluated when the instance is created.
        raw_file: Path to raw aggregated file.
            Default: ``Path("syllables_raw.txt")``
        canonical_file: Path to canonicalized file.
            Default: ``Path("syllables_canonicalised.txt")``
        frequency_file: Path to frequency JSON.
            Default: ``Path("syllables_frequencies.json")``
        unique_file: Path to unique syllables.
            Default: ``Path("syllables_unique.txt")``
        meta_file: Path to metadata report.
            Default: ``Path("normalization_meta.txt")``

    Example:
        >>> result = NormalizationResult(
        ...     config=NormalizationConfig(),
        ...     stats=NormalizationStats(raw_count=1000),
        ...     frequencies={"ka": 187, "ra": 162},
        ...     unique_syllables=["ka", "ra"],
        ...     input_files=[Path("file1.txt")],
        ...     output_dir=Path("_working/normalized"),
        ... )
        >>> result.stats.raw_count
        1000
        >>> result.frequency_file.name
        'syllables_frequencies.json'
    """

    # Required inputs describing the run.
    config: NormalizationConfig
    stats: NormalizationStats
    frequencies: Dict[str, int]
    unique_syllables: List[str]
    input_files: List[Path]
    output_dir: Path

    # Evaluated per instance, so each result records its own creation time.
    timestamp: datetime = field(default_factory=datetime.now)

    # Default output file names, given as relative paths.
    raw_file: Path = Path("syllables_raw.txt")
    canonical_file: Path = Path("syllables_canonicalised.txt")
    frequency_file: Path = Path("syllables_frequencies.json")
    unique_file: Path = Path("syllables_unique.txt")
    meta_file: Path = Path("normalization_meta.txt")