# Source code for build_tools.pyphen_syllable_extractor.interactive

"""
Interactive mode for the pyphen syllable extractor.

This module provides the interactive CLI workflow for single-file extraction
with language selection and user prompts.
"""

from __future__ import annotations

import sys

from build_tools.tui_common.interactive import (
    print_banner,
    print_extraction_complete,
    print_section,
    prompt_extraction_settings,
    prompt_input_file,
)
from build_tools.tui_common.ledger import ExtractionLedgerContext

from .extractor import SyllableExtractor
from .file_io import DEFAULT_OUTPUT_DIR, generate_output_filename, save_metadata
from .language_detection import is_detection_available
from .languages import SUPPORTED_LANGUAGES
from .models import ExtractionResult

# Version string recorded in the extraction ledger: run_interactive() passes
# it to ExtractionLedgerContext as ``extractor_version``. When the package
# cannot be imported or does not expose ``__version__``, the placeholder
# "unknown" is used instead so this module still imports.
try:
    from build_tools.pyphen_syllable_extractor import __version__ as _extractor_version
except (ImportError, AttributeError):
    _extractor_version = "unknown"



[docs]
def select_language() -> str:
    """
    Interactive prompt to select a language from supported options.

    Prints the numbered list of ``SUPPORTED_LANGUAGES`` (sorted by name) and
    then prompts until the user enters a valid selection. A selection may be
    a list number, a language name, a language code, 'auto' (only when
    language detection is available) or 'quit'. Names and codes are matched
    exactly first and then case-insensitively, consistent with the
    case-insensitive 'auto' and 'quit' keywords.

    Returns:
        The pyphen language code for the selected language, or "auto"
        for automatic language detection

    Note:
        Invalid input never terminates the program: an error message is
        printed and the prompt repeats. The program exits (status 0) only
        when the user types 'quit'.
    """
    print("\n" + "=" * 70)
    print("SYLLABLE EXTRACTOR - Language Selection")
    print("=" * 70)

    # Check if auto-detection is available
    auto_available = is_detection_available()
    if auto_available:
        print("\n💡 Auto-detection available! Type 'auto' to automatically detect language.")

    print("\nSupported Languages:")
    print("-" * 70)

    # Display languages in a formatted list
    languages = sorted(SUPPORTED_LANGUAGES.items())
    for idx, (name, code) in enumerate(languages, 1):
        print(f"{idx:2d}. {name:25s} ({code})")

    print("-" * 70)
    print("\nYou can select by:")
    print("  - Number (e.g., '13' for English UK)")
    print("  - Language name (e.g., 'English (US)')")
    print("  - Language code (e.g., 'en_US')")
    if auto_available:
        print("  - Type 'auto' for automatic language detection")
    print("  - Type 'quit' to exit")
    print("=" * 70)

    # Reverse lookup (code -> first name that declares it), built once so the
    # prompt loop does not rescan SUPPORTED_LANGUAGES on every attempt.
    name_by_code = {}
    for name, code in SUPPORTED_LANGUAGES.items():
        name_by_code.setdefault(code, name)

    # Case-insensitive fallback table. Names are inserted before codes and
    # setdefault keeps the first entry, mirroring the name-then-code order of
    # the exact-match checks below.
    folded = {}
    for name, code in SUPPORTED_LANGUAGES.items():
        folded.setdefault(name.casefold(), (name, code))
    for code, name in name_by_code.items():
        folded.setdefault(code.casefold(), (name, code))

    while True:
        selection = input("\nSelect a language: ").strip()
        keyword = selection.lower()

        if keyword == "quit":
            print("Exiting.")
            sys.exit(0)

        # Check for auto-detection
        if keyword == "auto":
            if not auto_available:
                print(
                    "Error: Auto-detection not available. "
                    "Install langdetect: pip install langdetect"
                )
                continue
            print("\n✓ Selected: Automatic language detection")
            return "auto"

        # Try to match by number. isdecimal() rather than isdigit(): isdigit()
        # also accepts characters such as superscript digits, which int()
        # rejects with ValueError and would crash the prompt.
        if selection.isdecimal():
            try:
                idx = int(selection) - 1
            except ValueError:
                # int() refuses extremely long digit strings (conversion
                # length limit); treat them as out of range.
                idx = -1
            if 0 <= idx < len(languages):
                selected_name, selected_code = languages[idx]
                print(f"\nSelected: {selected_name} ({selected_code})")
                return selected_code
            print(f"Error: Please enter a number between 1 and {len(languages)}")
            continue

        # Try to match by exact language name
        if selection in SUPPORTED_LANGUAGES:
            selected_code = SUPPORTED_LANGUAGES[selection]
            print(f"\nSelected: {selection} ({selected_code})")
            return selected_code

        # Try to match by exact language code
        if selection in name_by_code:
            print(f"\nSelected: {name_by_code[selection]} ({selection})")
            return selection

        # Fall back to a case-insensitive name/code match; always return the
        # canonical code as spelled in SUPPORTED_LANGUAGES.
        match = folded.get(selection.casefold())
        if match is not None:
            selected_name, selected_code = match
            print(f"\nSelected: {selected_name} ({selected_code})")
            return selected_code

        print("Error: Invalid selection. Please try again or type 'quit' to exit.")




[docs]
def run_interactive() -> None:
    """
    Run the interactive single-file workflow of the pyphen syllable extractor.

    Steps, in order:
        1. Show the banner and ask for a language ('auto' requests detection)
        2. Ask for the minimum/maximum syllable length
        3. Load the hyphenation dictionary (skipped when 'auto' was chosen)
        4. Ask for the input file path
        5. Extract syllables from the file inside an extraction-ledger context
        6. Generate the output filenames and build the result record
        7. Save the syllables file, then the metadata file
        8. Record the outputs in the ledger and print the summary

    Language Detection:
        - With 'auto' (requires langdetect), the language of the input text
          is detected and then used for the output names and metadata
        - Detection requires at least 20-50 characters for reliable results
        - Falls back to English (en_US) if detection fails

    Output Files:
        - YYYYMMDD_HHMMSS.syllables.LANG.txt: One syllable per line, sorted
        - YYYYMMDD_HHMMSS.meta.LANG.txt: Extraction metadata and statistics

    Both files are saved to _working/output/ by default.

    Any failure while loading the dictionary, extracting, or saving prints an
    error and exits with status 1; failures inside the ledger context are
    first recorded with ``set_result(success=False)``.
    """
    print_banner(
        "PYPHEN SYLLABLE EXTRACTOR",
        [
            "This tool extracts syllables from text files using dictionary-based",
            "hyphenation rules. Output is saved to _working/output/ by default.",
        ],
    )

    # Language choice; remember whether detection was requested, because the
    # language variable is later overwritten with the detected code.
    lang = select_language()
    auto_mode = lang == "auto"

    # Syllable length limits
    print_section("EXTRACTION SETTINGS")
    shortest, longest = prompt_extraction_settings(default_min=2, default_max=8)

    # With an explicit language the dictionary can be loaded up front
    if not auto_mode:
        try:
            extractor = SyllableExtractor(lang, shortest, longest)
            print(f"✓ Hyphenation dictionary loaded for: {lang}")
        except ValueError as exc:
            print(f"\nError: {exc}")
            sys.exit(1)

    # Input file
    print_section("INPUT FILE SELECTION")
    source_file = prompt_input_file()

    # The ledger receives no language when detection was requested
    with ExtractionLedgerContext(
        extractor_tool="pyphen_syllable_extractor",
        extractor_version=_extractor_version,
        pyphen_lang=None if auto_mode else lang,
        min_len=shortest,
        max_len=longest,
    ) as ledger:

        def abort(message: str) -> None:
            """Print the error, mark the ledger run as failed, and exit(1)."""
            print(message)
            ledger.set_result(success=False)
            sys.exit(1)

        ledger.record_input(source_file)

        # Extraction
        print(f"\n⏳ Processing {source_file}...")
        try:
            if not auto_mode:
                # Explicit language: reuse the extractor created above
                syllables, stats = extractor.extract_syllables_from_file(source_file)
                print(f"✓ Extracted {len(syllables)} unique syllables")
            else:
                # Detection path: the class-level helper also reports the language
                syllables, stats, detected = SyllableExtractor.extract_file_with_auto_language(
                    source_file,
                    min_syllable_length=shortest,
                    max_syllable_length=longest,
                    suppress_warnings=True,
                )
                lang = detected  # used for filenames and metadata from here on
                print(f"✓ Detected language: {detected}")
                print(f"✓ Extracted {len(syllables)} unique syllables")
                # An instance for the detected language is needed for saving
                extractor = SyllableExtractor(lang, shortest, longest)
        except Exception as exc:
            abort(f"\nError during extraction: {exc}")

        # Output locations and the result record
        syllables_path, metadata_path = generate_output_filename(language_code=lang)

        counters = {
            key: stats[key]
            for key in (
                "total_words",
                "skipped_unhyphenated",
                "rejected_syllables",
                "processed_words",
            )
        }
        result = ExtractionResult(
            syllables=syllables,
            language_code=lang,
            min_syllable_length=shortest,
            max_syllable_length=longest,
            input_path=source_file,
            only_hyphenated=True,
            **counters,
        )

        # Syllables file
        print(f"\n⏳ Saving syllables to {syllables_path}...")
        try:
            extractor.save_syllables(syllables, syllables_path)
            print("✓ Syllables saved successfully")
        except Exception as exc:
            abort(f"\nError saving syllables: {exc}")

        # Metadata file
        print(f"⏳ Saving metadata to {metadata_path}...")
        try:
            save_metadata(result, metadata_path)
            print("✓ Metadata saved successfully")
        except Exception as exc:
            abort(f"\nError saving metadata: {exc}")

        # Both files exist: register them and mark the run successful
        ledger.record_output(
            output_path=syllables_path,
            unique_syllable_count=len(syllables),
            meta_path=metadata_path,
        )
        ledger.set_result(success=True)

    # Console summary, printed after the ledger context has closed
    print("\n" + result.format_metadata())
    print_extraction_complete(syllables_path, metadata_path, DEFAULT_OUTPUT_DIR)