Source code for build_tools.corpus_sqlite_builder.cli

"""
Command-line interface for corpus SQLite builder.

This module provides the CLI for converting annotated JSON files to SQLite databases.
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

from .converter import convert_json_to_sqlite, find_annotated_json


[docs] def create_argument_parser() -> argparse.ArgumentParser: """ Create and return the argument parser for this tool. Returns: Configured ArgumentParser ready to parse command-line arguments """ parser = argparse.ArgumentParser( prog="python -m build_tools.corpus_sqlite_builder", description=( "Convert annotated JSON files to SQLite databases for efficient querying.\n\n" "This tool converts large JSON files (*_syllables_annotated.json) into " "optimized SQLite databases (corpus.db) for memory-efficient, high-performance " "queries in interactive tools like the TUI.\n\n" "The JSON file remains the canonical source of truth. The SQLite database " "is derived data that can be regenerated at any time." ), formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples ======== .. code-block:: bash # Convert single corpus (auto-discovers JSON in data/ subdirectory) python -m build_tools.corpus_sqlite_builder \\ _working/output/20260110_115453_pyphen/ # Force overwrite existing database python -m build_tools.corpus_sqlite_builder \\ _working/output/20260110_115453_pyphen/ --force # Dry run to check what would be converted python -m build_tools.corpus_sqlite_builder \\ _working/output/20260110_115453_pyphen/ --dry-run # Batch convert all corpora in output directory python -m build_tools.corpus_sqlite_builder --batch _working/output/ Output ====== Creates corpus.db in the data/ subdirectory of the corpus:: _working/output/20260110_115453_pyphen/data/corpus.db """, ) parser.add_argument( "corpus_dir", type=Path, nargs="?", help=( "Path to corpus directory containing data/ subdirectory with annotated JSON. " "Example: _working/output/20260110_115453_pyphen/" ), ) parser.add_argument( "--force", action="store_true", help=( "Overwrite existing corpus.db file if it exists. " "Default: False (raise error if database exists)" ), ) parser.add_argument( "--batch", type=Path, metavar="OUTPUT_DIR", help=( "Batch convert all corpora in the specified output directory. " "Example: --batch _working/output/ " "Will discover and convert all corpus directories." ), ) parser.add_argument( "--batch-size", type=int, default=10000, metavar="N", help=( "Number of records to insert per database transaction. " "Larger values are faster but use more memory. " "Default: 10000" ), ) parser.add_argument( "--dry-run", action="store_true", help=( "Show what would be converted without actually creating the database. " "Useful for checking if JSON files are discoverable." ), ) return parser
[docs] def parse_arguments(args: list[str] | None = None) -> argparse.Namespace: """ Parse command-line arguments. Args: args: List of argument strings to parse. If None, uses sys.argv. Returns: Parsed arguments namespace """ parser = create_argument_parser() return parser.parse_args(args)
[docs] def main(args: list[str] | None = None) -> int: """ Main entry point for the CLI. Args: args: List of argument strings. If None, uses sys.argv. Returns: Exit code (0 for success, non-zero for error) """ parsed_args = parse_arguments(args) # Handle batch mode if parsed_args.batch: return run_batch_conversion(parsed_args) # Single corpus mode requires corpus_dir if not parsed_args.corpus_dir: print("Error: corpus_dir is required (or use --batch)", file=sys.stderr) return 1 return run_single_conversion(parsed_args)
[docs] def run_single_conversion(args: argparse.Namespace) -> int: """ Convert a single corpus directory. Args: args: Parsed command-line arguments Returns: Exit code (0 for success, non-zero for error) """ corpus_dir = args.corpus_dir if not corpus_dir.exists(): print(f"Error: Corpus directory not found: {corpus_dir}", file=sys.stderr) return 1 # Check for annotated JSON data_dir = corpus_dir / "data" json_path = find_annotated_json(data_dir) if json_path is None: print(f"Error: No annotated JSON found in {data_dir}", file=sys.stderr) print("Looking for: *_syllables_annotated.json", file=sys.stderr) return 1 # Dry run mode if args.dry_run: print(f"Would convert: {json_path}") print(f"Output: {data_dir / 'corpus.db'}") db_path = data_dir / "corpus.db" if db_path.exists(): print("Note: Database already exists (would need --force)") return 0 # Perform conversion try: db_path = convert_json_to_sqlite( corpus_dir=corpus_dir, force=args.force, batch_size=args.batch_size, ) print(f"\n✓ Conversion complete: {db_path}") return 0 except FileExistsError as e: print(f"Error: {e}", file=sys.stderr) print("Use --force to overwrite existing database.", file=sys.stderr) return 1 except Exception as e: print(f"Error during conversion: {e}", file=sys.stderr) return 1
[docs] def run_batch_conversion(args: argparse.Namespace) -> int: """ Convert multiple corpus directories. Args: args: Parsed command-line arguments Returns: Exit code (0 for success, non-zero for error) """ output_dir = args.batch if not output_dir.exists() or not output_dir.is_dir(): print(f"Error: Output directory not found: {output_dir}", file=sys.stderr) return 1 # Discover corpus directories corpus_dirs = [] for item in output_dir.iterdir(): if not item.is_dir(): continue # Check if it has a data/ subdirectory with annotated JSON data_dir = item / "data" if find_annotated_json(data_dir): corpus_dirs.append(item) if not corpus_dirs: print(f"No corpus directories found in {output_dir}", file=sys.stderr) return 1 print(f"Found {len(corpus_dirs)} corpus directories") print() # Convert each corpus success_count = 0 error_count = 0 for i, corpus_dir in enumerate(corpus_dirs, 1): print(f"[{i}/{len(corpus_dirs)}] Converting: {corpus_dir.name}") print("-" * 60) try: db_path = convert_json_to_sqlite( corpus_dir=corpus_dir, force=args.force, batch_size=args.batch_size, ) print(f"✓ Success: {db_path}") success_count += 1 except FileExistsError: print("⊗ Skipped: Database already exists (use --force to overwrite)") error_count += 1 except Exception as e: print(f"✗ Error: {e}", file=sys.stderr) error_count += 1 print() # Summary print("=" * 60) print("Batch conversion complete:") print(f" Success: {success_count}") print(f" Errors/Skipped: {error_count}") print(f" Total: {len(corpus_dirs)}") return 0 if error_count == 0 else 1