"""
Data models for syllable normalization.
This module defines the data structures used to represent normalization
configuration, statistics, and results.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
@dataclass
class NormalizationConfig:
    """
    Parameters that steer how syllables are normalized to canonical form.

    The values are checked once, right after construction, so an invalid
    combination raises ``ValueError`` immediately instead of surfacing later.

    Attributes:
        min_length: Minimum syllable length (characters). Syllables shorter
            than this are rejected. Default: 2
        max_length: Maximum syllable length (characters). Syllables longer
            than this are rejected. Default: 20
        allowed_charset: String of allowed characters. Only syllables
            containing these characters (after normalization) are kept.
            Default: "abcdefghijklmnopqrstuvwxyz"
        unicode_form: Unicode normalization form. Options: "NFC", "NFD",
            "NFKC", "NFKD". Default: "NFKD" (compatibility decomposition)

    Raises:
        ValueError: If ``min_length`` is below 1, if ``max_length`` is below
            ``min_length``, or if ``unicode_form`` is not one of the four
            accepted form names.

    Example:
        >>> config = NormalizationConfig(min_length=3, max_length=10)
        >>> config.min_length
        3
        >>> config.allowed_charset
        'abcdefghijklmnopqrstuvwxyz'
    """

    min_length: int = 2
    max_length: int = 20
    allowed_charset: str = "abcdefghijklmnopqrstuvwxyz"
    unicode_form: str = "NFKD"

    def __post_init__(self):
        """Reject out-of-range lengths and unknown Unicode form names."""
        shortest, longest = self.min_length, self.max_length
        form = self.unicode_form
        accepted_forms = ("NFC", "NFD", "NFKC", "NFKD")

        # Checks run in a fixed order: lower bound, then bound ordering,
        # then the form name. The first failing check is the one reported.
        if shortest < 1:
            raise ValueError(f"min_length must be >= 1, got {shortest}")

        if longest < shortest:
            message = f"max_length ({longest}) must be >= min_length ({shortest})"
            raise ValueError(message)

        if form not in accepted_forms:
            message = f"unicode_form must be one of NFC, NFD, NFKC, NFKD, got {form}"
            raise ValueError(message)
@dataclass
class NormalizationStats:
    """
    Counters and timing collected while syllables are normalized.

    Every field defaults to zero, so an instance can be created empty and
    filled in as processing proceeds. Two derived values, ``total_rejected``
    and ``rejection_rate``, are computed on access from the stored counters.

    Attributes:
        raw_count: Total number of syllables in raw input (before normalization)
        after_canonicalization: Number of syllables after normalization
        rejected_charset: Syllables rejected due to invalid characters
        rejected_length: Syllables rejected due to length constraints
        rejected_empty: Syllables that became empty after normalization
        unique_canonical: Number of unique canonical syllables
        processing_time: Total processing time in seconds

    Example:
        >>> stats = NormalizationStats(
        ...     raw_count=1000,
        ...     after_canonicalization=950,
        ...     rejected_charset=30,
        ...     rejected_length=20,
        ...     rejected_empty=0,
        ...     unique_canonical=412,
        ...     processing_time=1.5
        ... )
        >>> stats.rejection_rate
        5.0
    """

    raw_count: int = 0
    after_canonicalization: int = 0
    rejected_charset: int = 0
    rejected_length: int = 0
    rejected_empty: int = 0
    unique_canonical: int = 0
    processing_time: float = 0.0

    @property
    def total_rejected(self) -> int:
        """Sum of the charset, length, and empty rejection counters."""
        per_reason = (
            self.rejected_charset,
            self.rejected_length,
            self.rejected_empty,
        )
        return sum(per_reason)

    @property
    def rejection_rate(self) -> float:
        """Rejected syllables as a percentage of ``raw_count`` (0.0 if none)."""
        total = self.raw_count
        # A zero raw count would divide by zero; report 0.0 for that case.
        return 0.0 if total == 0 else (self.total_rejected / total) * 100
@dataclass
class FrequencyEntry:
    """
    One row of a syllable frequency table.

    Bundles a canonical syllable with how often it occurs, where it ranks,
    and its share of the total. All four fields are required; no defaults
    are provided and no validation is performed on construction.

    Attributes:
        canonical: The canonical form of the syllable (e.g., "ka")
        frequency: Number of times this syllable appears
        rank: Frequency rank (1 = most common, 2 = second most common, etc.)
        percentage: Percentage of total syllables (0-100)

    Example:
        >>> entry = FrequencyEntry(canonical="ka", frequency=187, rank=1, percentage=10.2)
        >>> print(f"{entry.canonical}: {entry.frequency} ({entry.percentage:.1f}%)")
        ka: 187 (10.2%)
    """

    canonical: str
    frequency: int
    rank: int
    percentage: float
@dataclass
class NormalizationResult:
    """
    Everything produced by one run of the syllable normalization pipeline.

    Groups the configuration, the collected statistics, the frequency data,
    and the paths of the files involved. The first six fields are required.
    ``timestamp`` defaults to ``datetime.now()`` evaluated at construction
    time, and each ``*_file`` field defaults to a bare relative file name.

    Attributes:
        config: Configuration used for normalization
        stats: Statistics from the processing
        frequencies: Dictionary mapping canonical syllable to frequency count
        unique_syllables: Sorted list of unique canonical syllables
        input_files: List of input file paths that were processed
        output_dir: Directory where output files were saved
        timestamp: When the normalization was performed
        raw_file: Path to raw aggregated file (syllables_raw.txt)
        canonical_file: Path to canonicalized file (syllables_canonicalised.txt)
        frequency_file: Path to frequency JSON (syllables_frequencies.json)
        unique_file: Path to unique syllables (syllables_unique.txt)
        meta_file: Path to metadata report (normalization_meta.txt)

    Example:
        >>> result = NormalizationResult(
        ...     config=NormalizationConfig(),
        ...     stats=NormalizationStats(raw_count=1000),
        ...     frequencies={"ka": 187, "ra": 162},
        ...     unique_syllables=["ka", "ra"],
        ...     input_files=[Path("file1.txt")],
        ...     output_dir=Path("_working/normalized"),
        ...     timestamp=datetime.now(),
        ...     raw_file=Path("syllables_raw.txt"),
        ...     canonical_file=Path("syllables_canonicalised.txt"),
        ...     frequency_file=Path("syllables_frequencies.json"),
        ...     unique_file=Path("syllables_unique.txt"),
        ...     meta_file=Path("normalization_meta.txt")
        ... )
        >>> result.stats.raw_count
        1000
    """

    # Required inputs and computed data.
    config: NormalizationConfig
    stats: NormalizationStats
    frequencies: dict[str, int]
    unique_syllables: list[str]
    input_files: list[Path]
    output_dir: Path

    # A factory is used so each instance gets its own construction-time value.
    timestamp: datetime = field(default_factory=datetime.now)

    # File locations; Path objects are immutable, so plain defaults suffice.
    raw_file: Path = Path("syllables_raw.txt")
    canonical_file: Path = Path("syllables_canonicalised.txt")
    frequency_file: Path = Path("syllables_frequencies.json")
    unique_file: Path = Path("syllables_unique.txt")
    meta_file: Path = Path("normalization_meta.txt")