Source code for build_tools.nltk_syllable_extractor.interactive

"""
Interactive mode for the NLTK syllable extractor.

This module provides the interactive CLI workflow for single-file extraction
using NLTK's CMU Pronouncing Dictionary. Unlike the pyphen extractor, this
tool only supports English (CMUDict limitation).
"""

from __future__ import annotations

import sys

from build_tools.tui_common.interactive import (
    print_banner,
    print_extraction_complete,
    print_section,
    prompt_extraction_settings,
    prompt_input_file,
)
from build_tools.tui_common.ledger import ExtractionLedgerContext

from .extractor import NltkSyllableExtractor
from .file_io import DEFAULT_OUTPUT_DIR, generate_output_filename, save_metadata
from .models import ExtractionResult

# Version for ledger
try:
    from build_tools.nltk_syllable_extractor import __version__ as _extractor_version
except (ImportError, AttributeError):
    _extractor_version = "unknown"


[docs] def run_interactive() -> None: """ Interactive mode entry point for the NLTK syllable extractor CLI. Workflow: 1. Display tool information and CMUDict notice 2. Configure extraction parameters (min/max syllable length) 3. Prompt for input file path 4. Extract syllables using CMUDict + onset/coda principles 5. Generate timestamped output filenames 6. Save syllables and metadata to separate files 7. Display summary to console Output Files: - YYYYMMDD_HHMMSS.syllables.en_US.txt: One syllable per line, sorted - YYYYMMDD_HHMMSS.meta.en_US.txt: Extraction metadata and statistics Both files are saved to _working/output/ by default. """ # Display banner print_banner( "NLTK SYLLABLE EXTRACTOR", [ "This tool extracts syllables using NLTK's CMU Pronouncing Dictionary", "with phonetically-guided orthographic splitting (onset/coda principles).", "", "⚠️ English only (CMUDict limitation)", "Output is saved to _working/output/ by default.", ], ) # Configure extraction parameters print_section("EXTRACTION SETTINGS") min_len, max_len = prompt_extraction_settings( default_min=1, default_max=999, min_label="Minimum syllable length (1 = no filtering)", max_label="Maximum syllable length (999 = no filtering)", ) # Initialize extractor try: extractor = NltkSyllableExtractor("en_US", min_len, max_len) print("✓ CMU Pronouncing Dictionary loaded") except (ImportError, LookupError) as e: print(f"\nError: {e}") sys.exit(1) # Get input file path print_section("INPUT FILE SELECTION") input_path = prompt_input_file() # Use ledger context for corpus DB integration with ExtractionLedgerContext( extractor_tool="nltk_syllable_extractor", extractor_version=_extractor_version, pyphen_lang=None, # Not applicable for NLTK min_len=min_len, max_len=max_len, ) as ledger_ctx: # Record input ledger_ctx.record_input(input_path) # Extract syllables print(f"\n⏳ Processing {input_path}...") try: syllables, stats = extractor.extract_syllables_from_file(input_path) print(f"✓ Extracted {len(syllables)} unique syllables") except Exception as e: print(f"\nError during extraction: {e}") ledger_ctx.set_result(success=False) sys.exit(1) # Generate output filenames and create result object syllables_path, metadata_path = generate_output_filename(language_code="en_US") result = ExtractionResult( syllables=syllables, language_code="en_US", min_syllable_length=min_len, max_syllable_length=max_len, input_path=input_path, only_hyphenated=True, total_words=stats["total_words"], fallback_count=stats["fallback_count"], rejected_syllables=stats["rejected_syllables"], processed_words=stats["processed_words"], ) # Save syllables print(f"\n⏳ Saving syllables to {syllables_path}...") try: extractor.save_syllables(syllables, syllables_path) print("✓ Syllables saved successfully") except Exception as e: print(f"\nError saving syllables: {e}") ledger_ctx.set_result(success=False) sys.exit(1) # Save metadata print(f"⏳ Saving metadata to {metadata_path}...") try: save_metadata(result, metadata_path) print("✓ Metadata saved successfully") except Exception as e: print(f"\nError saving metadata: {e}") ledger_ctx.set_result(success=False) sys.exit(1) # Record output ledger_ctx.record_output( output_path=syllables_path, unique_syllable_count=len(syllables), meta_path=metadata_path, ) ledger_ctx.set_result(success=True) # Display summary to console print("\n" + result.format_metadata()) print_extraction_complete(syllables_path, metadata_path, DEFAULT_OUTPUT_DIR)