"""
Batch mode for the pyphen syllable extractor.
This module provides batch processing functionality for extracting syllables
from multiple files using pyphen hyphenation.
"""
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
from build_tools.tui_common.batch import (
collect_files_from_args,
run_batch_extraction,
validate_extraction_params,
)
from build_tools.tui_common.ledger import ExtractionLedgerContext
from .extractor import SyllableExtractor
from .file_io import DEFAULT_OUTPUT_DIR, generate_output_filename, save_metadata
from .language_detection import is_detection_available
from .models import BatchResult, ExtractionResult, FileProcessingResult
# Version for ledger
try:
from build_tools.pyphen_syllable_extractor import __version__ as _extractor_version
except (ImportError, AttributeError):
_extractor_version = "unknown"
[docs]
def process_single_file(
    input_path: Path,
    language_code: str,
    min_len: int,
    max_len: int,
    output_dir: Path,
    run_timestamp: str,
    verbose: bool = False,
) -> FileProcessingResult:
    """
    Extract syllables from one file, write the outputs, and report the outcome.

    The whole pipeline (extraction, output-path generation, saving syllables,
    saving metadata) runs inside a single ``try`` block. Any ``Exception``
    raised along the way is converted into a failed ``FileProcessingResult``
    instead of propagating, so a batch driver can move on to the next file.

    Args:
        input_path: Text file to extract syllables from.
        language_code: Hyphenation language code (e.g., "en_US", "de_DE"),
            or the literal "auto" to let the extractor detect the language.
        min_len: Minimum syllable length passed to the extractor.
        max_len: Maximum syllable length passed to the extractor.
        output_dir: Directory handed to ``generate_output_filename``.
        run_timestamp: Timestamp identifying the batch run; passed to
            ``generate_output_filename`` together with the input filename.
        verbose: If True, print a progress line before processing and a
            success/failure line afterwards (default: False).

    Returns:
        FileProcessingResult with ``success=True``, the syllable count, the
        language actually used and both output paths on success; or with
        ``success=False``, a zero count, the requested ``language_code`` and
        ``error_message`` set to ``str(exception)`` on failure. Both variants
        carry the elapsed ``processing_time`` in seconds.

    Note:
        Exceptions deriving from ``Exception`` are never re-raised; they are
        only reported through the returned object.
    """
    started_at = time.perf_counter()
    try:
        auto_detect = language_code == "auto"

        if verbose:
            print(f"⏳ Processing {input_path.name}...")

        # Step 1: extraction. Auto mode goes through the class-level helper,
        # which also reports the language it settled on.
        if auto_detect:
            syllables, stats, actual_language = (
                SyllableExtractor.extract_file_with_auto_language(
                    input_path,
                    min_syllable_length=min_len,
                    max_syllable_length=max_len,
                    suppress_warnings=True,
                )
            )
        else:
            extractor = SyllableExtractor(language_code, min_len, max_len)
            syllables, stats = extractor.extract_syllables_from_file(input_path)
            actual_language = language_code

        # Step 2: derive both output paths from the input name and run timestamp.
        syllables_path, metadata_path = generate_output_filename(
            output_dir=output_dir,
            run_timestamp=run_timestamp,
            input_filename=input_path.name,
        )

        # Step 3: write the syllables. Auto mode has no extractor instance yet,
        # so one is built for the detected language purely to save the output.
        if auto_detect:
            extractor = SyllableExtractor(actual_language, min_len, max_len)
        extractor.save_syllables(syllables, syllables_path)

        # Step 4: write the metadata describing this extraction.
        extraction_record = ExtractionResult(
            syllables=syllables,
            language_code=actual_language,
            min_syllable_length=min_len,
            max_syllable_length=max_len,
            input_path=input_path,
            only_hyphenated=True,
            total_words=stats["total_words"],
            skipped_unhyphenated=stats["skipped_unhyphenated"],
            rejected_syllables=stats["rejected_syllables"],
            processed_words=stats["processed_words"],
        )
        save_metadata(extraction_record, metadata_path)

        elapsed = time.perf_counter() - started_at
        syllable_total = len(syllables)

        if verbose:
            print(f" ✓ Extracted {syllable_total} syllables ({actual_language})")

        return FileProcessingResult(
            input_path=input_path,
            success=True,
            syllables_count=syllable_total,
            language_code=actual_language,
            syllables_output_path=syllables_path,
            metadata_output_path=metadata_path,
            processing_time=elapsed,
        )
    except Exception as exc:
        # Deliberately broad: one bad file must not abort the whole batch.
        elapsed = time.perf_counter() - started_at
        failure_text = str(exc)

        if verbose:
            print(f" ✗ Failed: {failure_text}")

        return FileProcessingResult(
            input_path=input_path,
            success=False,
            syllables_count=0,
            language_code=language_code,
            error_message=failure_text,
            processing_time=elapsed,
        )
[docs]
def process_batch(
    files: list[Path],
    language_code: str,
    min_len: int,
    max_len: int,
    output_dir: Path,
    quiet: bool = False,
    verbose: bool = False,
) -> BatchResult:
    """
    Run the pyphen extractor over a list of files.

    Kept as a backwards-compatible wrapper: it binds the extraction settings
    into a per-file processor and hands everything to the shared
    ``run_batch_extraction`` driver.

    Args:
        files: Input file paths to process.
        language_code: Language code (e.g., "en_US") or "auto" for detection.
        min_len: Minimum syllable length to include.
        max_len: Maximum syllable length to include.
        output_dir: Output directory forwarded to the batch driver.
        quiet: Forwarded to the batch driver as its ``quiet`` flag
            (default: False).
        verbose: Forwarded to the batch driver as its ``verbose`` flag
            (default: False).

    Returns:
        The BatchResult produced by ``run_batch_extraction``.
    """
    return run_batch_extraction(  # type: ignore[no-any-return]
        files=files,
        output_dir=output_dir,
        # The driver calls this once per file; the language and length bounds
        # are already baked in.
        process_file_func=_create_file_processor(
            language_code=language_code,
            min_len=min_len,
            max_len=max_len,
        ),
        batch_result_class=BatchResult,
        extractor_name="pyphen",
        language_display=language_code,
        min_len=min_len,
        max_len=max_len,
        quiet=quiet,
        verbose=verbose,
    )
def _create_file_processor(
    language_code: str,
    min_len: int,
    max_len: int,
):
    """
    Build a per-file callback with the extraction settings pre-bound.

    Args:
        language_code: Language code or "auto", passed through unchanged.
        min_len: Minimum syllable length, passed through unchanged.
        max_len: Maximum syllable length, passed through unchanged.

    Returns:
        A function ``processor(input_path, output_dir, run_timestamp, verbose)``
        that calls ``process_single_file`` with those four per-file arguments
        plus the three settings captured here, and returns its
        FileProcessingResult.
    """

    def processor(
        input_path: Path,
        output_dir: Path,
        run_timestamp: str,
        verbose: bool,
    ) -> FileProcessingResult:
        return process_single_file(
            # Per-file arguments supplied by the caller of the callback.
            input_path=input_path,
            output_dir=output_dir,
            run_timestamp=run_timestamp,
            verbose=verbose,
            # Settings captured from the enclosing scope.
            language_code=language_code,
            min_len=min_len,
            max_len=max_len,
        )

    return processor
[docs]
def run_batch(args: argparse.Namespace) -> None:
    """
    CLI entry point for batch mode of the pyphen syllable extractor.

    Validates the length bounds, resolves the language, collects the input
    files, runs the batch inside an ``ExtractionLedgerContext`` (recording
    inputs, successful outputs and the overall result), prints a summary
    unless quiet, and terminates the process with an exit code.

    Args:
        args: Parsed command-line arguments. The attributes read here are:
            - file: Single file path (optional)
            - files: List of file paths (optional)
            - source: Directory path for scanning (optional)
            - pattern: File pattern for directory scanning
            - recursive: Whether to scan directories recursively
            - lang: Manual language code
            - auto: Use automatic language detection (takes precedence over lang)
            - min: Minimum syllable length
            - max: Maximum syllable length
            - output: Output directory (falls back to DEFAULT_OUTPUT_DIR)
            - quiet: Suppress informational output
            - verbose: Show detailed processing information

    Exit Codes:
        0: No file failed
        1: One or more files failed, or file collection raised ValueError

    Raises:
        SystemExit: Always; either after file-collection failure or once the
            batch has finished.
    """
    validate_extraction_params(args.min, args.max)

    # Language resolution order: --auto, then --lang, then auto-detection if
    # the detector is available, and finally en_US as the last resort.
    if args.auto:
        language_code = "auto"
    elif args.lang:
        language_code = args.lang
    elif is_detection_available():
        language_code = "auto"
        if not args.quiet:
            print("ℹ️ No language specified - using automatic detection (--auto)")
    else:
        language_code = "en_US"
        if not args.quiet:
            print("ℹ️ No language specified - defaulting to English US (en_US)")

    # Gather the inputs; a ValueError from the collector is a usage error.
    try:
        files_to_process, source_dir = collect_files_from_args(
            file_arg=args.file,
            files_arg=args.files,
            source_arg=args.source,
            pattern=args.pattern,
            recursive=args.recursive,
        )
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    if args.output:
        output_dir = Path(args.output).expanduser().resolve()
    else:
        output_dir = DEFAULT_OUTPUT_DIR

    # Configuration banner.
    if not args.quiet:
        rule = "=" * 70
        print("\n" + rule)
        print("BATCH SYLLABLE EXTRACTION")
        print(rule)
        print(f"Files to process: {len(files_to_process)}")
        print(f"Language: {language_code}")
        print(f"Syllable length: {args.min}-{args.max} characters")
        print(f"Output directory: {output_dir}")
        print(rule + "\n")

    # The ledger gets no language when auto-detection is in use.
    ledger_language = language_code if language_code != "auto" else None

    with ExtractionLedgerContext(
        extractor_tool="pyphen_syllable_extractor",
        extractor_version=_extractor_version,
        pyphen_lang=ledger_language,
        min_len=args.min,
        max_len=args.max,
        recursive=args.recursive,
        pattern=args.pattern,
        quiet=args.quiet,
    ) as ledger_ctx:
        ledger_ctx.record_inputs(files_to_process, source_dir=source_dir)

        # process_batch binds the per-file processor and calls the shared
        # batch driver with exactly these settings.
        batch_result = process_batch(
            files=files_to_process,
            language_code=language_code,
            min_len=args.min,
            max_len=args.max,
            output_dir=output_dir,
            quiet=args.quiet,
            verbose=args.verbose,
        )

        # Only successful files that actually produced a syllables file are
        # recorded as outputs.
        for file_result in batch_result.results:
            if not file_result.success or file_result.syllables_output_path is None:
                continue
            ledger_ctx.record_output(
                output_path=file_result.syllables_output_path,
                unique_syllable_count=file_result.syllables_count,
                meta_path=file_result.metadata_output_path,
            )

        ledger_ctx.set_result(success=(batch_result.failed == 0))

    if not args.quiet:
        print("\n" + batch_result.format_summary())

    # Non-zero exit status as soon as any file failed.
    sys.exit(1 if batch_result.failed > 0 else 0)