Source code for build_tools.pyphen_syllable_extractor.batch

"""
Batch mode for the pyphen syllable extractor.

This module provides batch processing functionality for extracting syllables
from multiple files using pyphen hyphenation.
"""

from __future__ import annotations

import argparse
import sys
import time
from pathlib import Path

from build_tools.tui_common.batch import (
    collect_files_from_args,
    run_batch_extraction,
    validate_extraction_params,
)
from build_tools.tui_common.ledger import ExtractionLedgerContext

from .extractor import SyllableExtractor
from .file_io import DEFAULT_OUTPUT_DIR, generate_output_filename, save_metadata
from .language_detection import is_detection_available
from .models import BatchResult, ExtractionResult, FileProcessingResult

# Version string reported to the extraction ledger: run_batch passes it as
# ``extractor_version`` to ExtractionLedgerContext. If importing
# ``__version__`` from the package raises ImportError or AttributeError, the
# placeholder "unknown" is used instead so this module still imports.
try:
    from build_tools.pyphen_syllable_extractor import __version__ as _extractor_version
except (ImportError, AttributeError):
    _extractor_version = "unknown"



[docs]
def process_single_file(
    input_path: Path,
    language_code: str,
    min_len: int,
    max_len: int,
    output_dir: Path,
    run_timestamp: str,
    verbose: bool = False,
) -> FileProcessingResult:
    """
    Extract syllables from one file, write its outputs, and report the outcome.

    Extraction, output naming, and the saving of syllables and metadata all
    run inside a single ``try`` block. Any exception raised there is turned
    into a failed FileProcessingResult rather than propagated, so a batch
    can keep going after an individual file fails.

    Args:
        input_path: Path to the input text file to process
        language_code: Language code (e.g., "en_US"), or "auto" to let the
                      extractor detect the language of the file
        min_len: Minimum syllable length handed to the extractor
        max_len: Maximum syllable length handed to the extractor
        output_dir: Directory passed to generate_output_filename
        run_timestamp: Timestamp of the batch run, passed to
                      generate_output_filename together with the input name
        verbose: If True, print a progress line before processing and a
                 result line afterwards (default: False)

    Returns:
        On success, a FileProcessingResult with success=True, the number of
        syllables, the language actually used, and both output paths. On
        failure, a FileProcessingResult with success=False, a syllable count
        of 0, the requested language_code, and str(exception) as the error
        message. The elapsed processing time is set in both cases.
    """
    started = time.perf_counter()

    try:
        if verbose:
            print(f"⏳ Processing {input_path.name}...")

        # Stays None on the auto-detection path until the language is known.
        extractor: SyllableExtractor | None = None

        if language_code == "auto":
            # Detection and extraction happen in one call; no extractor
            # instance is handed back, only the detected language.
            syllables, stats, actual_language = (
                SyllableExtractor.extract_file_with_auto_language(
                    input_path,
                    min_syllable_length=min_len,
                    max_syllable_length=max_len,
                    suppress_warnings=True,
                )
            )
        else:
            actual_language = language_code
            extractor = SyllableExtractor(language_code, min_len, max_len)
            syllables, stats = extractor.extract_syllables_from_file(input_path)

        # Output names are derived from the input filename and the run timestamp.
        syllables_path, metadata_path = generate_output_filename(
            output_dir=output_dir,
            run_timestamp=run_timestamp,
            input_filename=input_path.name,
        )

        # Saving needs an instance, so the auto path builds one now for the
        # detected language.
        if extractor is None:
            extractor = SyllableExtractor(actual_language, min_len, max_len)
        extractor.save_syllables(syllables, syllables_path)

        # Copy the counters from the extraction statistics into the metadata record.
        stat_fields = {
            key: stats[key]
            for key in (
                "total_words",
                "skipped_unhyphenated",
                "rejected_syllables",
                "processed_words",
            )
        }
        save_metadata(
            ExtractionResult(
                syllables=syllables,
                language_code=actual_language,
                min_syllable_length=min_len,
                max_syllable_length=max_len,
                input_path=input_path,
                only_hyphenated=True,
                **stat_fields,
            ),
            metadata_path,
        )

        elapsed = time.perf_counter() - started
        syllable_total = len(syllables)

        if verbose:
            print(f"  ✓ Extracted {syllable_total} syllables ({actual_language})")

        return FileProcessingResult(
            input_path=input_path,
            success=True,
            syllables_count=syllable_total,
            language_code=actual_language,
            syllables_output_path=syllables_path,
            metadata_output_path=metadata_path,
            processing_time=elapsed,
        )

    except Exception as exc:
        # Deliberately broad: one bad file must not abort the whole batch.
        elapsed = time.perf_counter() - started
        failure_reason = str(exc)

        if verbose:
            print(f"  ✗ Failed: {failure_reason}")

        return FileProcessingResult(
            input_path=input_path,
            success=False,
            syllables_count=0,
            language_code=language_code,
            error_message=failure_reason,
            processing_time=elapsed,
        )




[docs]
def process_batch(
    files: list[Path],
    language_code: str,
    min_len: int,
    max_len: int,
    output_dir: Path,
    quiet: bool = False,
    verbose: bool = False,
) -> BatchResult:
    """
    Run the pyphen extractor over a list of files.

    Thin, backwards-compatible wrapper: it binds the extraction settings
    into a per-file processor and delegates the actual batch run to
    run_batch_extraction.

    Args:
        files: Input file paths to process
        language_code: Language code (e.g., "en_US") or "auto" for detection;
                       also forwarded as the language shown by the batch runner
        min_len: Minimum syllable length to include
        max_len: Maximum syllable length to include
        output_dir: Output directory forwarded to the batch runner
        quiet: Forwarded to the batch runner (default: False)
        verbose: Forwarded to the batch runner (default: False)

    Returns:
        The value returned by run_batch_extraction, which is given
        BatchResult as its result class.
    """
    return run_batch_extraction(  # type: ignore[no-any-return]
        files=files,
        output_dir=output_dir,
        # Per-file callable with language and length bounds already bound.
        process_file_func=_create_file_processor(
            language_code=language_code,
            min_len=min_len,
            max_len=max_len,
        ),
        batch_result_class=BatchResult,
        extractor_name="pyphen",
        language_display=language_code,
        min_len=min_len,
        max_len=max_len,
        quiet=quiet,
        verbose=verbose,
    )



def _create_file_processor(
    language_code: str,
    min_len: int,
    max_len: int,
):
    """Create a single-file processor function with bound parameters."""

    def processor(
        input_path: Path,
        output_dir: Path,
        run_timestamp: str,
        verbose: bool,
    ) -> FileProcessingResult:
        return process_single_file(
            input_path=input_path,
            language_code=language_code,
            min_len=min_len,
            max_len=max_len,
            output_dir=output_dir,
            run_timestamp=run_timestamp,
            verbose=verbose,
        )

    return processor



[docs]
def run_batch(args: argparse.Namespace) -> None:
    """
    Run the pyphen syllable extractor over the files selected on the CLI.

    Steps, in order: validate the length bounds, choose the language,
    collect the input files, resolve the output directory, print the batch
    banner, process the files inside an ExtractionLedgerContext, print the
    summary, and exit with a status code.

    Args:
        args: Parsed command-line namespace. The attributes read here are:
            - min, max: Syllable length bounds
            - auto: Truthy to force automatic language detection
            - lang: Explicit language code, consulted only when auto is falsy
            - file, files, source, pattern, recursive: Forwarded to
              collect_files_from_args to select the input files
            - output: Output directory; DEFAULT_OUTPUT_DIR is used when falsy
            - quiet: Truthy to suppress the informational messages, the
              banner and the summary
            - verbose: Forwarded to the batch runner

    Exit Codes:
        0: No file failed
        1: At least one file failed, or collecting the input files raised
           ValueError

    Raises:
        SystemExit: Raised through sys.exit once processing completes, or
            earlier when file collection fails.
    """
    validate_extraction_params(args.min, args.max)

    # Language precedence: explicit auto flag, then explicit language, then
    # auto-detection when it is available, and finally en_US.
    if args.auto:
        language_code = "auto"
    elif args.lang:
        language_code = args.lang
    elif is_detection_available():
        language_code = "auto"
        if not args.quiet:
            print("ℹ️  No language specified - using automatic detection (--auto)")
    else:
        language_code = "en_US"
        if not args.quiet:
            print("ℹ️  No language specified - defaulting to English US (en_US)")

    # A ValueError from file collection is reported and ends the run.
    try:
        files_to_process, source_dir = collect_files_from_args(
            file_arg=args.file,
            files_arg=args.files,
            source_arg=args.source,
            pattern=args.pattern,
            recursive=args.recursive,
        )
    except ValueError as collect_error:
        print(f"Error: {collect_error}")
        sys.exit(1)

    if args.output:
        output_dir = Path(args.output).expanduser().resolve()
    else:
        output_dir = DEFAULT_OUTPUT_DIR

    if not args.quiet:
        rule = "=" * 70
        banner_lines = (
            "\n" + rule,
            "BATCH SYLLABLE EXTRACTION",
            rule,
            f"Files to process:   {len(files_to_process)}",
            f"Language:           {language_code}",
            f"Syllable length:    {args.min}-{args.max} characters",
            f"Output directory:   {output_dir}",
            rule + "\n",
        )
        for banner_line in banner_lines:
            print(banner_line)

    with ExtractionLedgerContext(
        extractor_tool="pyphen_syllable_extractor",
        extractor_version=_extractor_version,
        # The ledger receives None instead of the "auto" placeholder.
        pyphen_lang=None if language_code == "auto" else language_code,
        min_len=args.min,
        max_len=args.max,
        recursive=args.recursive,
        pattern=args.pattern,
        quiet=args.quiet,
    ) as ledger_ctx:
        ledger_ctx.record_inputs(files_to_process, source_dir=source_dir)

        # process_batch binds the settings into a per-file processor and
        # hands the run to run_batch_extraction.
        batch_result = process_batch(
            files=files_to_process,
            language_code=language_code,
            min_len=args.min,
            max_len=args.max,
            output_dir=output_dir,
            quiet=args.quiet,
            verbose=args.verbose,
        )

        # Only successful files that have a syllables output path are recorded.
        for file_result in batch_result.results:
            if not file_result.success or file_result.syllables_output_path is None:
                continue
            ledger_ctx.record_output(
                output_path=file_result.syllables_output_path,
                unique_syllable_count=file_result.syllables_count,
                meta_path=file_result.metadata_output_path,
            )

        ledger_ctx.set_result(success=batch_result.failed == 0)

    if not args.quiet:
        print("\n" + batch_result.format_summary())

    # Non-zero exit status as soon as any file failed.
    sys.exit(1 if batch_result.failed > 0 else 0)