# Source code for build_tools.nltk_syllable_extractor.interactive

"""
Interactive mode for the NLTK syllable extractor.

This module provides the interactive CLI workflow for single-file extraction
using NLTK's CMU Pronouncing Dictionary. Unlike the pyphen extractor, this
tool only supports English (CMUDict limitation).
"""

from __future__ import annotations

import sys

from build_tools.tui_common.interactive import (
    print_banner,
    print_extraction_complete,
    print_section,
    prompt_extraction_settings,
    prompt_input_file,
)
from build_tools.tui_common.ledger import ExtractionLedgerContext

from .extractor import NltkSyllableExtractor
from .file_io import DEFAULT_OUTPUT_DIR, generate_output_filename, save_metadata
from .models import ExtractionResult

# Version string handed to the extraction ledger as ``extractor_version``.
# Fall back to "unknown" when the package cannot be imported or does not
# expose ``__version__``, so that this module still loads.
try:
    from build_tools.nltk_syllable_extractor import __version__ as _extractor_version
except (ImportError, AttributeError):
    _extractor_version = "unknown"



# [docs]
def run_interactive() -> None:
    """
    Run one interactive, single-file NLTK syllable extraction session.

    Steps, in order:
        1. Print the tool banner, including the English-only notice.
        2. Prompt for the minimum and maximum syllable length.
        3. Build the extractor; an ImportError or LookupError is reported
           and the process exits with status 1.
        4. Prompt for the input file path.
        5. Inside a ledger context: record the input, extract syllables,
           generate the output paths, save the syllables and the metadata,
           then record the output and mark the run successful.
        6. Print the formatted metadata and the completion message.

    A failure while extracting or saving is printed to the console, flagged
    on the ledger with ``success=False``, and ends the process with exit
    status 1.

    Output Files:
        - YYYYMMDD_HHMMSS.syllables.en_US.txt: One syllable per line, sorted
        - YYYYMMDD_HHMMSS.meta.en_US.txt: Extraction metadata and statistics

    Both files are saved to _working/output/ by default.
    """
    # The only language code this workflow uses (see the banner notice).
    language = "en_US"

    # --- Banner -----------------------------------------------------------
    banner_lines = [
        "This tool extracts syllables using NLTK's CMU Pronouncing Dictionary",
        "with phonetically-guided orthographic splitting (onset/coda principles).",
        "",
        "⚠️  English only (CMUDict limitation)",
        "Output is saved to _working/output/ by default.",
    ]
    print_banner("NLTK SYLLABLE EXTRACTOR", banner_lines)

    # --- Length limits ----------------------------------------------------
    print_section("EXTRACTION SETTINGS")
    length_bounds = prompt_extraction_settings(
        default_min=1,
        default_max=999,
        min_label="Minimum syllable length (1 = no filtering)",
        max_label="Maximum syllable length (999 = no filtering)",
    )
    shortest, longest = length_bounds

    # --- Extractor --------------------------------------------------------
    try:
        extractor = NltkSyllableExtractor(language, shortest, longest)
        print("✓ CMU Pronouncing Dictionary loaded")
    except (ImportError, LookupError) as load_error:
        print(f"\nError: {load_error}")
        sys.exit(1)

    # --- Input file -------------------------------------------------------
    print_section("INPUT FILE SELECTION")
    source_path = prompt_input_file()

    # --- Extraction and persistence, tracked by the ledger ----------------
    with ExtractionLedgerContext(
        extractor_tool="nltk_syllable_extractor",
        extractor_version=_extractor_version,
        pyphen_lang=None,  # Not applicable for NLTK
        min_len=shortest,
        max_len=longest,
    ) as ledger:

        def report_failure(message: str) -> None:
            """Print *message* and flag the ledger run as unsuccessful."""
            print(message)
            ledger.set_result(success=False)

        ledger.record_input(source_path)

        print(f"\n⏳ Processing {source_path}...")
        try:
            syllables, stats = extractor.extract_syllables_from_file(source_path)
            print(f"✓ Extracted {len(syllables)} unique syllables")
        except Exception as exc:
            report_failure(f"\nError during extraction: {exc}")
            sys.exit(1)

        # Output paths are generated before the result object is assembled.
        syllables_path, metadata_path = generate_output_filename(language_code=language)

        # Copy the statistics the extractor reported into the result.
        stat_keys = ("total_words", "fallback_count", "rejected_syllables", "processed_words")
        counters = {key: stats[key] for key in stat_keys}
        result = ExtractionResult(
            syllables=syllables,
            language_code=language,
            min_syllable_length=shortest,
            max_syllable_length=longest,
            input_path=source_path,
            only_hyphenated=True,
            **counters,
        )

        print(f"\n⏳ Saving syllables to {syllables_path}...")
        try:
            extractor.save_syllables(syllables, syllables_path)
            print("✓ Syllables saved successfully")
        except Exception as exc:
            report_failure(f"\nError saving syllables: {exc}")
            sys.exit(1)

        print(f"⏳ Saving metadata to {metadata_path}...")
        try:
            save_metadata(result, metadata_path)
            print("✓ Metadata saved successfully")
        except Exception as exc:
            report_failure(f"\nError saving metadata: {exc}")
            sys.exit(1)

        # Both files are written: register them and close the run as a success.
        ledger.record_output(
            output_path=syllables_path,
            unique_syllable_count=len(syllables),
            meta_path=metadata_path,
        )
        ledger.set_result(success=True)

    # --- Console summary --------------------------------------------------
    summary = result.format_metadata()
    print("\n" + summary)
    print_extraction_complete(syllables_path, metadata_path, DEFAULT_OUTPUT_DIR)