# Source code for build_tools.nltk_syllable_normaliser.cli

"""
Command-line interface for NLTK syllable normalization pipeline.

This module provides the main CLI entry point for the nltk_syllable_normaliser tool,
which processes NLTK extractor output with fragment cleaning + normalization pipeline.
"""

from __future__ import annotations

import argparse
import sys
import time
from datetime import datetime
from pathlib import Path

# Import shared components from pyphen normaliser
from build_tools.pyphen_syllable_normaliser import (
    FileAggregator,
    FrequencyAnalyzer,
    NormalizationConfig,
    NormalizationResult,
    NormalizationStats,
    discover_input_files,
    normalize_batch,
)

from .fragment_cleaner import FragmentCleaner


def detect_nltk_run_directories(source_dir: Path) -> list[Path]:
    """
    Find the NLTK run directories located directly inside ``source_dir``.

    A child entry qualifies when it is a directory whose name ends with
    ``_nltk`` (e.g. ``YYYYMMDD_HHMMSS_nltk/``) and which itself contains a
    ``syllables/`` subdirectory. Only immediate children are inspected.

    Args:
        source_dir: Directory to search for NLTK run directories.

    Returns:
        The qualifying run directories as Path objects, sorted. For
        timestamp-prefixed names this is chronological order.

    Raises:
        FileNotFoundError: If ``source_dir`` does not exist.
        ValueError: If ``source_dir`` exists but is not a directory.

    Example:
        >>> source = Path("_working/output/")
        >>> runs = detect_nltk_run_directories(source)
        >>> for run in runs:
        ...     print(run.name)
        20260110_095213_nltk
        20260110_143022_nltk
    """
    if not source_dir.exists():
        raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
    if not source_dir.is_dir():
        raise ValueError(f"Path is not a directory: {source_dir}")

    def _looks_like_nltk_run(candidate: Path) -> bool:
        # Directory with the "_nltk" suffix ...
        if not (candidate.is_dir() and candidate.name.endswith("_nltk")):
            return False
        # ... that also holds a syllables/ subdirectory (is_dir() is already
        # False for a path that does not exist).
        return (candidate / "syllables").is_dir()

    return sorted(filter(_looks_like_nltk_run, source_dir.iterdir()))
def run_full_pipeline(
    run_directory: Path,
    config: NormalizationConfig,
    verbose: bool = False,
    skip_fragment_cleaning: bool = False,
) -> NormalizationResult:
    """
    Run complete NLTK normalization pipeline with in-place processing.

    Executes the full NLTK-specific workflow:

    1. Aggregate syllables from run_directory/syllables/*.txt
    2. Fragment cleaning (NLTK-specific preprocessing)
    3. Canonicalize syllables (Unicode normalization, etc.)
    4. Frequency analysis
    5. Write 5 output files to run_directory (in-place)

    Progress is always printed to stdout; ``verbose`` only adds extra detail
    (samples, per-reason rejection counts, top frequencies).

    Args:
        run_directory: NLTK run directory
            (e.g., _working/output/20260110_095213_nltk/).
        config: NormalizationConfig specifying normalization parameters.
        verbose: If True, print detailed progress information.
        skip_fragment_cleaning: If True, skip fragment cleaning step
            (for comparison).

    Returns:
        NormalizationResult containing all outputs, statistics, and file paths.

    Raises:
        FileNotFoundError: If run_directory or syllables/ subdirectory
            doesn't exist.
        ValueError: If run_directory is not a directory.

    Example:
        >>> from pathlib import Path
        >>> config = NormalizationConfig(min_length=2, max_length=8)
        >>> run_dir = Path("_working/output/20260110_095213_nltk/")
        >>> result = run_full_pipeline(
        ...     run_directory=run_dir,
        ...     config=config,
        ...     verbose=True
        ... )
        >>> result.stats.raw_count
        15234
        >>> result.stats.unique_canonical
        4821
    """
    start_time = time.time()
    timestamp = datetime.now()

    # Validate run directory
    if not run_directory.exists():
        raise FileNotFoundError(f"Run directory does not exist: {run_directory}")
    if not run_directory.is_dir():
        raise ValueError(f"Path is not a directory: {run_directory}")

    syllables_dir = run_directory / "syllables"
    if not syllables_dir.exists():
        raise FileNotFoundError(
            f"Syllables directory does not exist: {syllables_dir}. "
            "Expected NLTK run directory structure with syllables/ subdirectory."
        )

    # Define output file paths (in run directory, with nltk_ prefix)
    raw_file = run_directory / "nltk_syllables_raw.txt"
    canonical_file = run_directory / "nltk_syllables_canonicalised.txt"
    frequency_file = run_directory / "nltk_syllables_frequencies.json"
    unique_file = run_directory / "nltk_syllables_unique.txt"
    meta_file = run_directory / "nltk_normalization_meta.txt"

    print("\n" + "=" * 70)
    print("NLTK SYLLABLE NORMALIZATION PIPELINE")
    print("=" * 70)
    print(f"Run Directory: {run_directory.name}")
    print(f"Syllables Source: {syllables_dir}")
    print(f"Fragment Cleaning: {'Disabled' if skip_fragment_cleaning else 'Enabled'}")
    print(f"Timestamp: {timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 70)

    # Discover input files from syllables/ subdirectory
    print("\n⏳ Discovering input files...")
    input_files = discover_input_files(syllables_dir, pattern="*.txt", recursive=False)
    print(f"✓ Found {len(input_files)} syllable files")
    if verbose:
        for f in input_files[:5]:
            print(f" - {f.name}")
        if len(input_files) > 5:
            print(f" ... and {len(input_files) - 5} more")

    # Step 1: Aggregate files
    print("\n⏳ Step 1: Aggregating syllable files...")
    aggregator = FileAggregator()
    raw_syllables = aggregator.aggregate_files(input_files)
    aggregator.save_raw_syllables(raw_syllables, raw_file)
    raw_count = len(raw_syllables)
    print(f"✓ Aggregated {raw_count:,} syllables → {raw_file.name}")
    if verbose:
        print(f" Sample raw: {raw_syllables[:5]}")

    # Step 2: Fragment Cleaning (NLTK-specific)
    # When cleaning is skipped the "after" count equals the raw count, so the
    # merged-fragment figure below is simply zero (and is never reported).
    after_fragment_cleaning = raw_count
    if not skip_fragment_cleaning:
        print("\n⏳ Step 2: Cleaning fragments (NLTK-specific)...")
        cleaner = FragmentCleaner()
        cleaned_syllables = cleaner.clean_fragments(raw_syllables)
        after_fragment_cleaning = len(cleaned_syllables)
        # The two counts need a visible separator; without it they run
        # together into a single meaningless number.
        print(f"✓ Cleaned {raw_count:,} → {after_fragment_cleaning:,} syllables")
        print(f" Merged {raw_count - after_fragment_cleaning:,} single-letter fragments")
        if verbose:
            print(f" Sample cleaned: {cleaned_syllables[:5]}")
        # Use cleaned syllables for canonicalization
        syllables_for_canon = cleaned_syllables
    else:
        print("\n⏳ Step 2: Fragment cleaning skipped")
        syllables_for_canon = raw_syllables
    fragments_merged = raw_count - after_fragment_cleaning

    # Step 3: Canonicalization
    print("\n⏳ Step 3: Canonicalizing syllables...")
    canonical_syllables, rejection_stats = normalize_batch(syllables_for_canon, config)

    # Save canonicalized syllables, one per line
    with canonical_file.open("w", encoding="utf-8") as file_handle:
        for syllable in canonical_syllables:
            file_handle.write(f"{syllable}\n")

    after_canonicalization = len(canonical_syllables)
    print(f"✓ Canonicalized {after_canonicalization:,} syllables → {canonical_file.name}")

    rejected_total = (
        rejection_stats["rejected_empty"]
        + rejection_stats["rejected_charset"]
        + rejection_stats["rejected_length"]
    )
    print(f" Rejected: {rejected_total:,} syllables")
    if verbose:
        print(f" Empty: {rejection_stats['rejected_empty']:,}")
        print(f" Invalid charset: {rejection_stats['rejected_charset']:,}")
        print(f" Length constraint: {rejection_stats['rejected_length']:,}")
        print(f" Sample canonical: {canonical_syllables[:5]}")

    # Step 4: Frequency analysis
    print("\n⏳ Step 4: Analyzing frequencies...")
    analyzer = FrequencyAnalyzer()

    # Calculate frequencies
    frequencies = analyzer.calculate_frequencies(canonical_syllables)
    analyzer.save_frequencies(frequencies, frequency_file)
    print(f"✓ Saved frequency data → {frequency_file.name}")

    # Extract unique syllables
    unique_syllables = analyzer.extract_unique_syllables(canonical_syllables)
    analyzer.save_unique_syllables(unique_syllables, unique_file)
    unique_count = len(unique_syllables)
    print(f"✓ Extracted {unique_count:,} unique syllables → {unique_file.name}")

    if verbose:
        # Show top 5 most frequent
        entries = analyzer.create_frequency_entries(frequencies)
        print("\n Top 5 most frequent:")
        for entry in entries[:5]:
            print(
                f" {entry.canonical:10s} ({entry.frequency:5,} occurrences, {entry.percentage:5.1f}%)"
            )

    # Create statistics object. Fragment-cleaning counts are not part of
    # NormalizationStats; they are appended to the metadata report below.
    stats = NormalizationStats(
        raw_count=raw_count,
        after_canonicalization=after_canonicalization,
        rejected_charset=rejection_stats["rejected_charset"],
        rejected_length=rejection_stats["rejected_length"],
        rejected_empty=rejection_stats["rejected_empty"],
        unique_canonical=unique_count,
        processing_time=time.time() - start_time,
    )

    # Create result object
    result = NormalizationResult(
        config=config,
        stats=stats,
        frequencies=frequencies,
        unique_syllables=unique_syllables,
        input_files=input_files,
        output_dir=run_directory,  # Output is in run directory (in-place)
        timestamp=timestamp,
        raw_file=raw_file,
        canonical_file=canonical_file,
        frequency_file=frequency_file,
        unique_file=unique_file,
        meta_file=meta_file,
    )

    # Save metadata report
    print("\n⏳ Generating metadata report...")
    metadata_content = result.format_metadata()

    # Add NLTK-specific metadata
    nltk_metadata = "\n\nNLTK-Specific Processing:\n"
    nltk_metadata += (
        f" Fragment Cleaning: {'Disabled' if skip_fragment_cleaning else 'Enabled'}\n"
    )
    if not skip_fragment_cleaning:
        nltk_metadata += f" Before Cleaning: {raw_count:,} syllables\n"
        nltk_metadata += f" After Cleaning: {after_fragment_cleaning:,} syllables\n"
        nltk_metadata += f" Fragments Merged: {fragments_merged:,}\n"
    metadata_content += nltk_metadata

    with meta_file.open("w", encoding="utf-8") as file_handle:
        file_handle.write(metadata_content)
    print(f"✓ Saved metadata report → {meta_file.name}")

    # Print summary
    print("\n" + "=" * 70)
    print("PIPELINE COMPLETE")
    print("=" * 70)
    print(f"Total Time: {stats.processing_time:.2f}s")
    print(f"Raw Syllables: {stats.raw_count:,}")
    if not skip_fragment_cleaning:
        print(f"After Cleaning: {after_fragment_cleaning:,}")
    print(f"Canonical: {stats.after_canonicalization:,}")
    print(f"Unique: {stats.unique_canonical:,}")
    print(f"Rejection Rate: {stats.rejection_rate:.1f}%")
    print("=" * 70 + "\n")

    return result
def create_argument_parser() -> argparse.ArgumentParser:
    """
    Build the command-line parser for the NLTK syllable normaliser.

    The parser requires exactly one input selector (``--run-dir`` or
    ``--source``) and accepts the normalization settings (``--min``,
    ``--max``, ``--charset``, ``--unicode-form``), the NLTK-specific
    ``--no-fragment-cleaning`` switch, and the ``--verbose`` / ``--quiet``
    output switches.

    Returns:
        Configured ArgumentParser ready to parse command-line arguments.
    """
    usage_examples = """
Examples::

    # Process specific NLTK run directory
    python -m build_tools.nltk_syllable_normaliser --run-dir _working/output/20260110_095213_nltk/

    # Auto-detect and process all NLTK run directories
    python -m build_tools.nltk_syllable_normaliser --source _working/output/

    # Custom normalization config
    python -m build_tools.nltk_syllable_normaliser \\
        --run-dir _working/output/20260110_095213_nltk/ \\
        --min 2 --max 8

    # Skip fragment cleaning (for comparison with pyphen)
    python -m build_tools.nltk_syllable_normaliser \\
        --run-dir _working/output/20260110_095213_nltk/ \\
        --no-fragment-cleaning
        """

    parser = argparse.ArgumentParser(
        description="NLTK Syllable Normaliser - Fragment cleaning + 3-step normalization pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Input specification (mutually exclusive): both selectors are Path-typed.
    input_selectors = parser.add_mutually_exclusive_group(required=True)
    for flag, description in (
        (
            "--run-dir",
            "Specific NLTK run directory to process (e.g., _working/output/20260110_095213_nltk/)",
        ),
        (
            "--source",
            "Directory to scan for NLTK run directories (auto-detects *_nltk/ directories)",
        ),
    ):
        input_selectors.add_argument(flag, type=Path, help=description)

    # Remaining options as (flags, keyword settings), registered in the order
    # they should appear in --help.
    option_table = (
        # Normalization configuration
        (
            ("--min",),
            {"type": int, "default": 2, "help": "Minimum syllable length (characters). Default: 2"},
        ),
        (
            ("--max",),
            {"type": int, "default": 20, "help": "Maximum syllable length (characters). Default: 20"},
        ),
        (
            ("--charset",),
            {
                "type": str,
                "default": "abcdefghijklmnopqrstuvwxyz",
                "help": "Allowed character set for syllables. Default: a-z",
            },
        ),
        (
            ("--unicode-form",),
            {
                "type": str,
                "choices": ["NFC", "NFD", "NFKC", "NFKD"],
                "default": "NFKD",
                "help": "Unicode normalization form. Default: NFKD",
            },
        ),
        # NLTK-specific options
        (
            ("--no-fragment-cleaning",),
            {
                "action": "store_true",
                "help": "Skip fragment cleaning step (for comparison purposes)",
            },
        ),
        # Output control
        (
            ("--verbose", "-v"),
            {
                "action": "store_true",
                "help": "Enable verbose output with detailed progress information",
            },
        ),
        (
            ("--quiet", "-q"),
            {"action": "store_true", "help": "Suppress all output except errors"},
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return parser
def parse_arguments(args: list[str] | None = None) -> argparse.Namespace:
    """
    Parse command-line arguments with the normaliser's argument parser.

    Args:
        args: Argument strings to parse, passed straight through to
            ``ArgumentParser.parse_args``; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The populated argparse Namespace.
    """
    return create_argument_parser().parse_args(args)
def main(args: list[str] | None = None) -> int:
    """
    Main entry point for CLI.

    Parses and validates the arguments, builds the NormalizationConfig,
    resolves the run directories (either the single ``--run-dir`` or every
    directory detected under ``--source``) and runs the full pipeline on each
    one in turn.

    With ``--quiet``, everything written to stdout while the pipelines run is
    discarded, including the progress output that ``run_full_pipeline`` prints
    unconditionally; error messages go to stderr and are always shown.
    ``--verbose`` additionally prints a traceback on failure.

    Args:
        args: Command-line arguments (for testing). If None, uses sys.argv.

    Returns:
        Exit code (0 for success, 1 for error).
    """
    # Function-scope imports: only needed for the --quiet stdout redirection.
    import contextlib
    import os

    parsed = parse_arguments(args)

    # Validate arguments
    if parsed.min < 1:
        print("ERROR: --min must be >= 1", file=sys.stderr)
        return 1
    if parsed.max < parsed.min:
        print(f"ERROR: --max ({parsed.max}) must be >= --min ({parsed.min})", file=sys.stderr)
        return 1

    # Create normalization config
    config = NormalizationConfig(
        min_length=parsed.min,
        max_length=parsed.max,
        allowed_charset=parsed.charset,
        unicode_form=parsed.unicode_form,
    )

    try:
        # Determine run directories to process
        if parsed.run_dir is not None:
            run_dirs = [parsed.run_dir]
        else:
            # Auto-detect NLTK run directories
            run_dirs = detect_nltk_run_directories(parsed.source)
            if not run_dirs:
                print(f"No NLTK run directories found in: {parsed.source}", file=sys.stderr)
                print(
                    "NLTK run directories should match pattern: YYYYMMDD_HHMMSS_nltk/",
                    file=sys.stderr,
                )
                return 1
            if not parsed.quiet:
                print(f"Found {len(run_dirs)} NLTK run directories:")
                for run_dir in run_dirs:
                    print(f" - {run_dir.name}")

        # Process each run directory. The pipeline prints progress
        # unconditionally, so honouring --quiet means silencing stdout for
        # the duration of the runs; stderr is left alone.
        with contextlib.ExitStack() as stack:
            if parsed.quiet:
                sink = stack.enter_context(open(os.devnull, "w", encoding="utf-8"))
                stack.enter_context(contextlib.redirect_stdout(sink))

            for run_dir in run_dirs:
                run_full_pipeline(
                    run_directory=run_dir,
                    config=config,
                    verbose=parsed.verbose,
                    skip_fragment_cleaning=parsed.no_fragment_cleaning,
                )
                if not parsed.quiet:
                    print(f"\n✓ Successfully processed: {run_dir.name}")
                    print(f" Outputs written to: {run_dir}")

        return 0

    except Exception as e:
        # Top-level CLI boundary: report the failure and turn it into an
        # exit code instead of letting the traceback escape.
        print(f"ERROR: {e}", file=sys.stderr)
        if parsed.verbose:
            import traceback

            traceback.print_exc()
        return 1
# Script entry point: run the CLI and hand its return code to the interpreter
# as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())