Source code for build_tools.nltk_syllable_extractor.extractor
"""
Core CMUDict-based syllable extraction functionality.
This module provides the NltkSyllableExtractor class for extracting syllables
from text using CMU Pronouncing Dictionary with phonetically-guided
orthographic syllabification based on onset/coda principles.
"""
import re
from pathlib import Path
from typing import Dict, List
# Optional dependency - only needed at runtime, not for documentation builds
try:
import cmudict
CMUDICT_AVAILABLE = True
except ImportError:
cmudict = None # type: ignore[assignment]
CMUDICT_AVAILABLE = False
[docs]
class NltkSyllableExtractor:
"""
Extracts syllables from text using CMU Pronouncing Dictionary.
This class uses phonetic information from CMUDict to guide orthographic
syllable splitting, respecting English phonotactic constraints via
onset/coda principles.
The extractor works by:
1. Reading text input (string or file)
2. Tokenizing into words using regex
3. Looking up phonetic transcriptions in CMUDict
4. Using vowel phonemes to identify syllable boundaries
5. Mapping phonetic structure back to orthographic positions
6. Applying onset/coda rules to split consonant clusters
7. Filtering syllables by length constraints
8. Returning unique syllables (case-insensitive)
Key Differences from pyphen:
- Uses phonetic information (CMUDict) rather than typographic rules
- Respects consonant cluster constraints (onset/coda principles)
- Produces more "natural" phonetic splits
- English only (CMUDict limitation)
- Includes fallback for out-of-vocabulary words
Typical Usage:
>>> # Basic extraction
>>> extractor = NltkSyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
>>> syllables = extractor.extract_syllables_from_text("Hello wonderful world")
>>> print(sorted(syllables))
['der', 'ful', 'hel', 'lo', 'won', 'world']
>>> # Extract from file and save
>>> syllables = extractor.extract_syllables_from_file(Path('input.txt'))
>>> extractor.save_syllables(syllables, Path('output.txt'))
Attributes:
language_code: The language code (always 'en_US' for NLTK extractor)
min_syllable_length: Minimum syllable length to include in results
max_syllable_length: Maximum syllable length to include in results
cmu_dict: The loaded CMU Pronouncing Dictionary
Note:
This is a build-time tool. The nltk dependency should not be used
at runtime in the core name generation system.
"""
# Valid English onset clusters (for onset/coda split decisions)
VALID_ONSETS = {
"bl",
"br",
"cl",
"cr",
"dr",
"fl",
"fr",
"gl",
"gr",
"pl",
"pr",
"sl",
"sm",
"sn",
"sp",
"st",
"sw",
"tr",
"tw",
"thr",
"scr",
"shr",
"spl",
"spr",
"str",
"squ",
"ch",
"sh",
"th",
"wh",
"ph",
"gh",
}
VOWELS = "aeiouy"
[docs]
def __init__(
self, language_code: str, min_syllable_length: int = 1, max_syllable_length: int = 999
):
"""
Initialize the NLTK syllable extractor.
Args:
language_code: Language code (must be 'en_US' for NLTK extractor)
min_syllable_length: Minimum syllable length to include (default: 1, no filtering)
max_syllable_length: Maximum syllable length to include (default: 999, no filtering)
Raises:
ImportError: If cmudict is not installed
ValueError: If the language code is not 'en_US'
"""
if not CMUDICT_AVAILABLE:
raise ImportError(
"cmudict is not installed. This is a build-time dependency.\n"
"Install it with: pip install cmudict"
)
if language_code != "en_US":
raise ValueError(
f"Language code '{language_code}' is not supported by nltk_syllable_extractor.\n"
"Only 'en_US' is supported (CMUDict limitation)."
)
self.language_code = language_code
self.min_syllable_length = min_syllable_length
self.max_syllable_length = max_syllable_length
self.cmu_dict = cmudict.dict()
[docs]
def extract_syllables_from_text(
self, text: str, only_hyphenated: bool = True
) -> tuple[List[str], Dict[str, int]]:
"""
Extract all syllables from a block of text (preserves duplicates).
This method processes input text by tokenizing it into words, applying
CMUDict phonetic lookup and onset/coda principles to extract individual
syllables that meet the configured length constraints.
Args:
text: Input text to process. Can contain any characters, but only
alphabetic sequences will be processed as words.
only_hyphenated: If True, only include syllables from words that were
successfully split (CMUDict lookup succeeded). Set to False
to include fallback syllabification for unknown words.
Returns:
Tuple of (syllables, statistics) where:
- syllables: List of all lowercase syllable strings (includes duplicates)
- statistics: Dict with the following keys:
- 'total_words': Total number of words found in source text
- 'processed_words': Words that were successfully processed
- 'fallback_count': Words not in CMUDict (used fallback heuristics)
- 'rejected_syllables': Syllables rejected due to length constraints
Note:
- Only processes words containing alphabetic characters (a-z, A-Z)
- Case-insensitive processing (all output is lowercase)
- Automatically removes punctuation and special characters
- Filters syllables by configured min/max length constraints
- When only_hyphenated=True, excludes words not in CMUDict
- Deterministic: same input always produces same output
- Uses first pronunciation when multiple exist (deterministic)
- Words are extracted using regex pattern: \\b[a-zA-Z]+\\b
Example:
>>> extractor = NltkSyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
>>> syllables, stats = extractor.extract_syllables_from_text("Hello world!")
>>> print(syllables)
['hel', 'lo', 'world']
>>> print(stats['total_words'])
2
"""
# Extract words using regex (alphanumeric sequences)
words = re.findall(r"\b[a-zA-Z]+\b", text)
syllables: List[str] = []
stats = {
"total_words": len(words),
"fallback_count": 0,
"rejected_syllables": 0,
"processed_words": 0,
}
for word in words:
# Convert to lowercase for CMUDict lookup
word_lower = word.lower()
# Extract syllables from word
word_syllables = self._extract_orthographic_syllables(word_lower)
# Track if CMUDict lookup failed (word not in dictionary)
if word_lower not in self.cmu_dict:
stats["fallback_count"] += 1
# If CMUDict lookup failed and only_hyphenated is True, skip
if not word_syllables and only_hyphenated:
continue
# If we got syllables (either from CMUDict or fallback)
if word_syllables:
stats["processed_words"] += 1
# Filter syllables by length and add to list (preserves duplicates)
for syllable in word_syllables:
if self.min_syllable_length <= len(syllable) <= self.max_syllable_length:
syllables.append(syllable)
else:
stats["rejected_syllables"] += 1
return syllables, stats
def _extract_orthographic_syllables(self, word: str) -> list[str]:
"""
Extract orthographic syllables from a word using CMUDict.
Uses vowel positions and phonetic syllable structure to determine
boundaries, then maps back to spelling.
Args:
word: Lowercase word to syllabify
Returns:
List of orthographic syllables, empty list if word cannot be processed
"""
# Normalize word: remove non-alphabetic characters
word_clean = re.sub(r"[^a-z]", "", word.lower())
if not word_clean:
return []
if len(word_clean) == 1:
return [word_clean]
# Get pronunciation from CMU Dictionary
if word_clean not in self.cmu_dict:
# Fallback: simple vowel-based splitting
return self._fallback_split(word_clean)
# Use first pronunciation (deterministic)
pronunciation = self.cmu_dict[word_clean][0]
# Extract phonetic syllables
phonetic_syllables = self._extract_phonetic_syllables(pronunciation)
if len(phonetic_syllables) <= 1:
return [word_clean]
# Map phonetic syllables to orthographic positions
return self._map_to_orthographic(word_clean, phonetic_syllables)
def _extract_phonetic_syllables(self, phonemes: list[str]) -> list[list[str]]:
"""
Extract phonetic syllables from phoneme list.
Each syllable ends with a vowel phoneme (marked with stress digit).
Args:
phonemes: List of phoneme strings from CMUDict
Returns:
List of syllables, where each syllable is a list of phonemes
"""
syllables = []
current_syllable = []
for phoneme in phonemes:
current_syllable.append(phoneme)
# Vowel phonemes end with stress digits (0, 1, or 2)
if phoneme[-1].isdigit():
syllables.append(current_syllable)
current_syllable = []
# Add any remaining consonants to last syllable
if current_syllable:
syllables.append(current_syllable)
return syllables
def _map_to_orthographic(self, word: str, phonetic_syllables: list[list[str]]) -> list[str]:
"""
Map phonetic syllables to orthographic character positions.
Strategy:
1. Find orthographic vowel positions
2. For each pair of vowel positions, determine split point using:
- Consonants between vowels
- Onset/coda principles
- Phonetic syllable structure for guidance
Args:
word: The word to split
phonetic_syllables: List of phonetic syllables from CMUDict
Returns:
List of orthographic syllables
"""
# Find orthographic vowel positions
vowel_positions = [i for i, c in enumerate(word) if c in self.VOWELS]
# Count vowel phonemes (should equal number of vowel positions)
vowel_phoneme_count = sum(1 for syl in phonetic_syllables for p in syl if p[-1].isdigit())
# If counts don't match, fall back
if len(vowel_positions) < vowel_phoneme_count:
return self._fallback_split(word)
# Build split points between consecutive vowel positions
split_points = []
for i in range(len(vowel_positions) - 1):
current_vowel_pos = vowel_positions[i]
next_vowel_pos = vowel_positions[i + 1]
# Find end of current vowel group (consecutive vowels)
vowel_end = current_vowel_pos
while vowel_end + 1 < len(word) and word[vowel_end + 1] in self.VOWELS:
vowel_end += 1
# Get consonants between current and next vowel
consonants_between = word[vowel_end + 1 : next_vowel_pos]
num_consonants = len(consonants_between)
if num_consonants == 0:
# Adjacent vowels - split between them
split_point = vowel_end + 1
elif num_consonants == 1:
# Single consonant: keep with next vowel (onset principle)
split_point = vowel_end + 1
else:
# Multiple consonants: use onset/coda principles
# Try to find a valid onset cluster at the end
split_point = vowel_end + 1 # Default: split after first consonant
# Check for valid onset clusters (try 3, 2, then 1 consonant)
for cluster_len in range(min(3, num_consonants), 0, -1):
potential_onset = consonants_between[-cluster_len:]
if self._is_valid_onset(potential_onset):
split_point = vowel_end + 1 + (num_consonants - cluster_len)
break
if 0 < split_point < len(word):
split_points.append(split_point)
return self._build_syllables(word, split_points)
def _fallback_split(self, word: str) -> list[str]:
"""
Fallback syllable splitting using vowel groups and onset/coda rules.
This is used when CMUDict lookup fails.
Args:
word: The word to split
Returns:
List of syllables using heuristic rules
"""
# Find vowel groups
vowel_groups = []
i = 0
while i < len(word):
if word[i] in self.VOWELS:
start = i
while i < len(word) and word[i] in self.VOWELS:
i += 1
vowel_groups.append((start, i))
else:
i += 1
if len(vowel_groups) <= 1:
return [word]
split_points = []
for i in range(len(vowel_groups) - 1):
current_vowel_end = vowel_groups[i][1]
next_vowel_start = vowel_groups[i + 1][0]
consonants_between = word[current_vowel_end:next_vowel_start]
num_consonants = len(consonants_between)
if num_consonants == 0:
split_points.append(current_vowel_end)
elif num_consonants == 1:
split_points.append(current_vowel_end)
else:
split_pos = current_vowel_end + 1
for cluster_len in range(min(3, num_consonants), 0, -1):
potential_onset = consonants_between[-cluster_len:]
if self._is_valid_onset(potential_onset):
split_pos = current_vowel_end + num_consonants - cluster_len
break
split_points.append(split_pos)
return self._build_syllables(word, split_points)
def _is_valid_onset(self, consonant_cluster: str) -> bool:
"""
Check if a consonant cluster is a valid English onset.
Args:
consonant_cluster: String of consonants to check
Returns:
True if valid onset cluster, False otherwise
"""
return consonant_cluster.lower() in self.VALID_ONSETS
def _build_syllables(self, word: str, split_points: list[int]) -> list[str]:
"""
Build syllables from split points.
Args:
word: The word to split
split_points: List of character positions where splits occur
Returns:
List of syllable strings
"""
if not split_points:
return [word]
syllables = []
start = 0
# Sort and deduplicate split points
split_points = sorted(set(split_points))
for split_point in split_points:
if 0 < split_point < len(word):
syllables.append(word[start:split_point])
start = split_point
# Add final syllable
if start < len(word):
syllables.append(word[start:])
return [s for s in syllables if s]
[docs]
def extract_syllables_from_file(self, input_path: Path) -> tuple[List[str], Dict[str, int]]:
"""
Extract all syllables from a text file (preserves duplicates).
This is a convenience wrapper around extract_syllables_from_text() that
handles file reading with proper encoding (UTF-8) and error handling.
Args:
input_path: Path to the input text file. File should be UTF-8 encoded
plain text. Binary files or non-text formats will cause errors.
Returns:
Tuple of (syllables, statistics) where:
- syllables: List of all lowercase syllable strings (includes duplicates)
- statistics: Dict with processing statistics (see extract_syllables_from_text)
Raises:
FileNotFoundError: If the input file doesn't exist at the specified path
IOError: If there's an error reading the file (permissions, encoding, etc.)
Example:
>>> from pathlib import Path
>>> extractor = NltkSyllableExtractor('en_US', min_syllable_length=2, max_syllable_length=8)
>>> syllables, stats = extractor.extract_syllables_from_file(Path('book.txt'))
>>> print(f"Extracted {len(syllables)} unique syllables from {stats['total_words']} words")
Extracted 1250 unique syllables from 50000 words
"""
if not input_path.exists():
raise FileNotFoundError(f"Input file not found: {input_path}")
try:
with open(input_path, "r", encoding="utf-8") as f:
text = f.read()
except Exception as e:
raise IOError(f"Error reading file {input_path}: {e}")
return self.extract_syllables_from_text(text)
[docs]
def save_syllables(self, syllables: List[str], output_path: Path) -> None:
"""
Save syllables to a text file (one syllable per line, preserves all).
Writes syllables with UTF-8 encoding, one syllable per line. Syllables
are written in the order they appear in the list (preserving duplicates).
This format is ideal for downstream processing by normalizer tools.
Args:
syllables: List of syllables to save (may contain duplicates).
Written in the order provided.
output_path: Path to the output file. Parent directories must exist.
If the file exists, it will be overwritten.
Raises:
IOError: If there's an error writing the file (permissions, disk space, etc.)
Example:
>>> from pathlib import Path
>>> extractor = NltkSyllableExtractor('en_US')
>>> syllables = ['hel', 'lo', 'world', 'hel'] # Note: 'hel' appears twice
>>> extractor.save_syllables(syllables, Path('output.txt'))
# Creates file with content (preserving duplicates and order):
# hel
# lo
# world
# hel
Note:
The output file uses UTF-8 encoding with Unix-style line endings (\\n).
Each line contains exactly one syllable with no leading/trailing whitespace.
Duplicates are preserved. Use downstream tools for deduplication if needed.
"""
try:
with open(output_path, "w", encoding="utf-8") as f:
for syllable in syllables:
f.write(f"{syllable}\n")
except Exception as e:
raise IOError(f"Error writing file {output_path}: {e}")