Source code for build_tools.syllable_walk_tui.services.corpus

"""
Corpus directory validation and utilities for Syllable Walker TUI.

This module provides functions for validating and loading corpus data
from normalized syllable extraction output directories.
"""

import json
import sqlite3
from pathlib import Path



[docs]
def validate_corpus_directory(path: Path) -> tuple[bool, str, str]:
    """
    Validate that a directory contains valid corpus files.

    Checks for either NLTK or Pyphen corpus structure:
    - nltk_syllables_unique.txt + nltk_syllables_frequencies.json
    - pyphen_syllables_unique.txt + pyphen_syllables_frequencies.json

    Args:
        path: Directory path to validate

    Returns:
        Tuple of (is_valid, corpus_type, message)
        - is_valid: True if valid corpus directory
        - corpus_type: "NLTK" or "Pyphen" if valid, empty string otherwise
        - message: Description if valid, error description if invalid

    Examples:
        >>> validate_corpus_directory(Path("/path/to/20260110_115601_nltk"))
        (True, "NLTK", "Valid NLTK corpus")

        >>> validate_corpus_directory(Path("/invalid/path"))
        (False, "", "Directory does not exist")
    """
    # Check directory exists
    if not path.exists():
        return (False, "", "Directory does not exist")

    if not path.is_dir():
        return (False, "", "Path is not a directory")

    # Check for NLTK corpus
    nltk_syllables = path / "nltk_syllables_unique.txt"
    nltk_frequencies = path / "nltk_syllables_frequencies.json"

    if nltk_syllables.exists() and nltk_frequencies.exists():
        # Validate both are files
        if not nltk_syllables.is_file():
            return (False, "", "nltk_syllables_unique.txt is not a file")
        if not nltk_frequencies.is_file():
            return (False, "", "nltk_syllables_frequencies.json is not a file")

        return (True, "NLTK", "Valid NLTK corpus")

    # Check for Pyphen corpus
    pyphen_syllables = path / "pyphen_syllables_unique.txt"
    pyphen_frequencies = path / "pyphen_syllables_frequencies.json"

    if pyphen_syllables.exists() and pyphen_frequencies.exists():
        # Validate both are files
        if not pyphen_syllables.is_file():
            return (False, "", "pyphen_syllables_unique.txt is not a file")
        if not pyphen_frequencies.is_file():
            return (False, "", "pyphen_syllables_frequencies.json is not a file")

        return (True, "Pyphen", "Valid Pyphen corpus")

    # No valid corpus found
    return (
        False,
        "",
        "No corpus files found. Directory must contain either:\n"
        "  - nltk_syllables_unique.txt + nltk_syllables_frequencies.json\n"
        "  - pyphen_syllables_unique.txt + pyphen_syllables_frequencies.json",
    )




[docs]
def get_corpus_info(path: Path) -> str:
    """
    Get display-friendly corpus information string.

    Args:
        path: Path to corpus directory

    Returns:
        Short description string for UI display

    Examples:
        >>> get_corpus_info(Path("/path/to/20260110_115601_nltk"))
        "NLTK (20260110_115601_nltk)"
    """
    is_valid, corpus_type, error = validate_corpus_directory(path)

    if not is_valid:
        return f"Invalid: {error}"

    # Extract directory name for display
    dir_name = path.name

    return f"{corpus_type} ({dir_name})"




[docs]
def load_corpus_data(path: Path) -> tuple[list[str], dict[str, int]]:
    """
    Load syllables and frequencies from a validated corpus directory.

    Args:
        path: Path to validated corpus directory

    Returns:
        Tuple of (syllables_list, frequencies_dict)
        - syllables_list: List of unique syllables (one per line from .txt file)
        - frequencies_dict: Dictionary mapping syllable to frequency count

    Raises:
        ValueError: If directory is invalid or files cannot be loaded
        FileNotFoundError: If expected corpus files are missing
        json.JSONDecodeError: If frequencies JSON is malformed

    Examples:
        >>> syllables, freqs = load_corpus_data(Path("/path/to/20260110_115601_nltk"))
        >>> len(syllables)
        15234
        >>> freqs["hello"]
        42

    Note:
        This function assumes the directory has already been validated with
        validate_corpus_directory(). It will attempt to load from either
        NLTK or Pyphen corpus files based on what exists.
    """
    # Validate directory first
    is_valid, corpus_type, error = validate_corpus_directory(path)
    if not is_valid:
        raise ValueError(f"Invalid corpus directory: {error}")

    # Determine which corpus files to load
    if corpus_type == "NLTK":
        syllables_file = path / "nltk_syllables_unique.txt"
        frequencies_file = path / "nltk_syllables_frequencies.json"
    elif corpus_type == "Pyphen":
        syllables_file = path / "pyphen_syllables_unique.txt"
        frequencies_file = path / "pyphen_syllables_frequencies.json"
    else:
        raise ValueError(f"Unknown corpus type: {corpus_type}")

    # Load syllables (one per line)
    try:
        with open(syllables_file, encoding="utf-8") as f:
            syllables = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        raise FileNotFoundError(f"Syllables file not found: {syllables_file}")
    except Exception as e:
        raise ValueError(f"Error reading syllables file: {e}") from e

    # Load frequencies (JSON dict)
    try:
        with open(frequencies_file, encoding="utf-8") as f:
            frequencies = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Frequencies file not found: {frequencies_file}")
    except json.JSONDecodeError as e:
        raise json.JSONDecodeError(
            f"Invalid JSON in frequencies file: {e.msg}", e.doc, e.pos
        ) from e
    except Exception as e:
        raise ValueError(f"Error reading frequencies file: {e}") from e

    # Validate data integrity
    if not syllables:
        raise ValueError("Syllables file is empty")

    if not frequencies:
        raise ValueError("Frequencies file is empty")

    # Sanity check: all syllables should have frequency data
    missing_freqs = [s for s in syllables if s not in frequencies]
    if missing_freqs:
        # This is a warning, not a fatal error - some syllables might be legitimately rare
        print(
            f"Warning: {len(missing_freqs)} syllables missing frequency data "
            f"(out of {len(syllables)} total)"
        )

    return syllables, frequencies




[docs]
def load_annotated_data_from_sqlite(db_path: Path) -> list[dict]:
    """
    Load phonetic feature annotations from a SQLite corpus database.

    This function loads syllable data from an optimized SQLite database,
    which is much faster and more memory-efficient than loading from JSON.

    Args:
        db_path: Path to corpus.db file

    Returns:
        List of dictionaries, each containing:
        - syllable: The syllable string
        - frequency: Occurrence count in source corpus
        - features: Dict of boolean phonetic feature flags

    Raises:
        FileNotFoundError: If database file doesn't exist
        sqlite3.Error: If database is corrupted or incompatible

    Performance Notes:
        - Much faster than JSON loading (<100ms vs 1-2s)
        - Memory-efficient (loads on-demand)
        - Can be called on main thread without freezing UI

    Examples:
        >>> db_path = Path("/path/to/20260110_115601_nltk/data/corpus.db")
        >>> data = load_annotated_data_from_sqlite(db_path)
        >>> len(data)
        33640
    """
    if not db_path.exists():
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    try:
        # Open database in read-only mode
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Query all syllables with features, ordered by syllable for determinism
        cursor.execute("""
            SELECT
                syllable, frequency,
                starts_with_vowel, starts_with_cluster, starts_with_heavy_cluster,
                contains_plosive, contains_fricative, contains_liquid, contains_nasal,
                short_vowel, long_vowel,
                ends_with_vowel, ends_with_nasal, ends_with_stop
            FROM syllables
            ORDER BY syllable
            """)

        # Convert rows to the expected dictionary format
        data = []
        for row in cursor.fetchall():
            data.append(
                {
                    "syllable": row["syllable"],
                    "frequency": row["frequency"],
                    "features": {
                        "starts_with_vowel": bool(row["starts_with_vowel"]),
                        "starts_with_cluster": bool(row["starts_with_cluster"]),
                        "starts_with_heavy_cluster": bool(row["starts_with_heavy_cluster"]),
                        "contains_plosive": bool(row["contains_plosive"]),
                        "contains_fricative": bool(row["contains_fricative"]),
                        "contains_liquid": bool(row["contains_liquid"]),
                        "contains_nasal": bool(row["contains_nasal"]),
                        "short_vowel": bool(row["short_vowel"]),
                        "long_vowel": bool(row["long_vowel"]),
                        "ends_with_vowel": bool(row["ends_with_vowel"]),
                        "ends_with_nasal": bool(row["ends_with_nasal"]),
                        "ends_with_stop": bool(row["ends_with_stop"]),
                    },
                }
            )

        conn.close()
        return data

    except sqlite3.Error as e:
        raise sqlite3.Error(f"Error reading SQLite database {db_path}: {e}") from e




[docs]
def load_annotated_data(path: Path) -> tuple[list[dict], dict[str, str]]:
    """
    Load phonetic feature annotations from a validated corpus directory.

    This function intelligently loads from either SQLite (if available) or
    JSON (fallback for backwards compatibility). SQLite loading is much faster
    and more memory-efficient.

    Data structure (same for both sources):
    [
      {
        "syllable": "aa",
        "frequency": 1022,
        "features": {
          "starts_with_vowel": true,
          "starts_with_cluster": false,
          "starts_with_heavy_cluster": false,
          "contains_plosive": false,
          "contains_fricative": false,
          "contains_liquid": false,
          "contains_nasal": false,
          "short_vowel": false,
          "long_vowel": true,
          "ends_with_vowel": true,
          "ends_with_nasal": false,
          "ends_with_stop": false
        }
      },
      ...
    ]

    Args:
        path: Path to validated corpus directory

    Returns:
        Tuple of (data, metadata):
        - data: List of dictionaries, each containing:
            - syllable: The syllable string
            - frequency: Occurrence count in source corpus
            - features: Dict of boolean phonetic feature flags
        - metadata: Dictionary with loading information:
            - source: "sqlite" or "json"
            - file_name: Name of the file loaded from
            - load_time_ms: Approximate load time in milliseconds

    Raises:
        ValueError: If directory is invalid or file cannot be loaded
        FileNotFoundError: If neither SQLite nor JSON data is available
        json.JSONDecodeError: If JSON is malformed (when loading from JSON)

    Performance Notes:
        - SQLite: <100ms load time, memory-efficient (preferred)
        - JSON: 1-2s load time, loads entire file into memory (fallback)
        - Automatically chooses best available format

    Examples:
        >>> data, meta = load_annotated_data(Path("/path/to/20260110_115601_nltk"))
        >>> len(data)
        33640
        >>> meta["source"]
        "sqlite"
        >>> data[0]["syllable"]
        "aa"
        >>> data[0]["features"]["starts_with_vowel"]
        True
    """
    # Validate directory first to determine corpus type
    is_valid, corpus_type, error = validate_corpus_directory(path)
    if not is_valid:
        raise ValueError(f"Invalid corpus directory: {error}")

    # Check for SQLite database first (preferred, fast and memory-efficient)
    db_path = path / "data" / "corpus.db"
    if db_path.exists():
        try:
            import time

            start_time = time.time()
            data = load_annotated_data_from_sqlite(db_path)
            load_time_ms = int((time.time() - start_time) * 1000)

            metadata = {
                "source": "sqlite",
                "file_name": "corpus.db",
                "load_time_ms": str(load_time_ms),
            }
            return data, metadata
        except Exception as e:
            # If SQLite fails, fall back to JSON
            print(f"Warning: SQLite loading failed ({e}), falling back to JSON")

    # Fall back to JSON loading (backwards compatibility)
    # Determine which annotated file to load based on corpus type
    # These files live in the data/ subdirectory
    if corpus_type == "NLTK":
        annotated_file = path / "data" / "nltk_syllables_annotated.json"
    elif corpus_type == "Pyphen":
        annotated_file = path / "data" / "pyphen_syllables_annotated.json"
    else:
        raise ValueError(f"Unknown corpus type: {corpus_type}")

    # Check that the annotated data file exists
    # (not all corpus directories may have been annotated yet)
    if not annotated_file.exists():
        raise FileNotFoundError(
            f"No annotated data found in {path / 'data'}\n"
            f"Looked for:\n"
            f"  - corpus.db (preferred, SQLite format)\n"
            f"  - {annotated_file.name} (JSON format)\n"
            f"\n"
            f"This corpus directory may not have been processed with "
            f"syllable_feature_annotator yet, or you may need to run:\n"
            f"  python -m build_tools.corpus_sqlite_builder {path}"
        )

    # Load the JSON file
    # Note: This is a potentially slow operation (1-2 seconds for 15MB files)
    # The caller should run this in a background worker to avoid blocking the UI
    print(f"Loading from JSON (slower): {annotated_file.name}")
    try:
        import time

        start_time = time.time()
        with open(annotated_file, encoding="utf-8") as f:
            annotated_data = json.load(f)
        load_time_ms = int((time.time() - start_time) * 1000)
    except FileNotFoundError:
        # Already checked above, but handle race condition
        raise FileNotFoundError(f"Annotated data file not found: {annotated_file}")
    except json.JSONDecodeError as e:
        raise json.JSONDecodeError(
            f"Invalid JSON in annotated data file: {e.msg}", e.doc, e.pos
        ) from e
    except Exception as e:
        raise ValueError(f"Error reading annotated data file: {e}") from e

    # Validate that we got a non-empty list
    if not isinstance(annotated_data, list):
        raise ValueError(
            f"Annotated data should be a JSON array, got {type(annotated_data).__name__}"
        )

    if not annotated_data:
        raise ValueError("Annotated data file is empty")

    # Sanity check: verify first entry has expected structure
    # (don't check all entries for performance - that would be expensive)
    first_entry = annotated_data[0]
    if not isinstance(first_entry, dict):
        raise ValueError(
            f"Annotated data entries should be objects, got {type(first_entry).__name__}"
        )

    required_keys = {"syllable", "frequency", "features"}
    missing_keys = required_keys - set(first_entry.keys())
    if missing_keys:
        raise ValueError(
            f"Annotated data entries missing required keys: {missing_keys}\n"
            f"Found keys: {set(first_entry.keys())}"
        )

    metadata = {
        "source": "json",
        "file_name": annotated_file.name,
        "load_time_ms": str(load_time_ms),
    }
    return annotated_data, metadata