Source code for build_tools.syllable_walk_tui.services.corpus

"""
Corpus directory validation and utilities for Syllable Walker TUI.

This module provides functions for validating and loading corpus data
from normalized syllable extraction output directories.
"""

import json
import sqlite3
from pathlib import Path


def validate_corpus_directory(path: Path) -> tuple[bool, str, str]:
    """
    Decide whether ``path`` is a usable corpus directory.

    A directory qualifies when it holds one of these file pairs (NLTK is
    checked before Pyphen):

    - nltk_syllables_unique.txt + nltk_syllables_frequencies.json
    - pyphen_syllables_unique.txt + pyphen_syllables_frequencies.json

    Args:
        path: Directory path to validate

    Returns:
        Tuple of (is_valid, corpus_type, message)
        - is_valid: True if valid corpus directory
        - corpus_type: "NLTK" or "Pyphen" if valid, empty string otherwise
        - message: Description if valid, error description if invalid

    Examples:
        >>> validate_corpus_directory(Path("/path/to/20260110_115601_nltk"))
        (True, "NLTK", "Valid NLTK corpus")

        >>> validate_corpus_directory(Path("/invalid/path"))
        (False, "", "Directory does not exist")
    """
    # Guard clauses: the path itself must be an existing directory.
    if not path.exists():
        return (False, "", "Directory does not exist")
    if not path.is_dir():
        return (False, "", "Path is not a directory")

    # (file-name prefix, corpus type label), in lookup priority order.
    for prefix, label in (("nltk", "NLTK"), ("pyphen", "Pyphen")):
        required = (
            path / f"{prefix}_syllables_unique.txt",
            path / f"{prefix}_syllables_frequencies.json",
        )

        # A flavour is only considered when both of its entries exist.
        if not all(entry.exists() for entry in required):
            continue

        # Both names exist; each must be a regular file, not e.g. a directory.
        for entry in required:
            if not entry.is_file():
                return (False, "", f"{entry.name} is not a file")
        return (True, label, f"Valid {label} corpus")

    # Neither flavour was present.
    return (
        False,
        "",
        "No corpus files found. Directory must contain either:\n"
        " - nltk_syllables_unique.txt + nltk_syllables_frequencies.json\n"
        " - pyphen_syllables_unique.txt + pyphen_syllables_frequencies.json",
    )
def get_corpus_info(path: Path) -> str:
    """
    Build a short, UI-friendly label describing a corpus directory.

    Args:
        path: Path to corpus directory

    Returns:
        "<corpus type> (<directory name>)" when the directory validates,
        otherwise "Invalid: <validation message>".

    Examples:
        >>> get_corpus_info(Path("/path/to/20260110_115601_nltk"))
        "NLTK (20260110_115601_nltk)"
    """
    valid, kind, message = validate_corpus_directory(path)
    # Only the final path component is shown to keep the label compact.
    return f"{kind} ({path.name})" if valid else f"Invalid: {message}"
def load_corpus_data(path: Path) -> tuple[list[str], dict[str, int]]:
    """
    Load syllables and frequencies from a corpus directory.

    The directory is validated with validate_corpus_directory() before any
    file is opened, so callers do not need to validate it beforehand. The
    NLTK or Pyphen file pair is chosen from the detected corpus type.

    Args:
        path: Path to corpus directory

    Returns:
        Tuple of (syllables_list, frequencies_dict)
        - syllables_list: Non-blank lines of the syllables .txt file, stripped
        - frequencies_dict: The JSON object loaded from the frequencies file

    Raises:
        ValueError: If the directory is invalid, a file cannot be read, a file
            is empty, or the frequencies JSON is not an object
        FileNotFoundError: If a corpus file disappears between validation and
            loading
        json.JSONDecodeError: If frequencies JSON is malformed

    Examples:
        >>> syllables, freqs = load_corpus_data(Path("/path/to/20260110_115601_nltk"))
        >>> len(syllables)
        15234
        >>> freqs["hello"]
        42
    """
    # Validate directory first
    is_valid, corpus_type, error = validate_corpus_directory(path)
    if not is_valid:
        raise ValueError(f"Invalid corpus directory: {error}")

    # Determine which corpus files to load
    if corpus_type == "NLTK":
        syllables_file = path / "nltk_syllables_unique.txt"
        frequencies_file = path / "nltk_syllables_frequencies.json"
    elif corpus_type == "Pyphen":
        syllables_file = path / "pyphen_syllables_unique.txt"
        frequencies_file = path / "pyphen_syllables_frequencies.json"
    else:
        raise ValueError(f"Unknown corpus type: {corpus_type}")

    # Load syllables (one per line, blank lines skipped)
    try:
        with open(syllables_file, encoding="utf-8") as f:
            syllables = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        # Validation passed, so this only happens if the file vanished since.
        raise FileNotFoundError(f"Syllables file not found: {syllables_file}")
    except Exception as e:
        raise ValueError(f"Error reading syllables file: {e}") from e

    # Load frequencies (JSON dict)
    try:
        with open(frequencies_file, encoding="utf-8") as f:
            frequencies = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Frequencies file not found: {frequencies_file}")
    except json.JSONDecodeError as e:
        # Re-raise as the same type so callers can still catch JSONDecodeError,
        # but with a message that names the offending file's role.
        raise json.JSONDecodeError(
            f"Invalid JSON in frequencies file: {e.msg}", e.doc, e.pos
        ) from e
    except Exception as e:
        raise ValueError(f"Error reading frequencies file: {e}") from e

    # Validate data integrity
    if not syllables:
        raise ValueError("Syllables file is empty")
    if not frequencies:
        raise ValueError("Frequencies file is empty")

    # A JSON array/string/number would otherwise be returned as if it were the
    # syllable -> count mapping (or crash in the membership test below).
    if not isinstance(frequencies, dict):
        raise ValueError(
            f"Frequencies data should be a JSON object, got {type(frequencies).__name__}"
        )

    # Sanity check: all syllables should have frequency data
    missing_count = sum(1 for s in syllables if s not in frequencies)
    if missing_count:
        # This is a warning, not a fatal error - some syllables might be
        # legitimately rare
        print(
            f"Warning: {missing_count} syllables missing frequency data "
            f"(out of {len(syllables)} total)"
        )

    return syllables, frequencies
def load_annotated_data_from_sqlite(db_path: Path) -> list[dict]:
    """
    Load phonetic feature annotations from a SQLite corpus database.

    Reads every row of the ``syllables`` table (ordered by syllable) through a
    read-only connection and converts it to the same dictionary layout used by
    the annotated JSON files. All rows are materialized into one list.

    Args:
        db_path: Path to corpus.db file

    Returns:
        List of dictionaries, each containing:
        - syllable: The syllable string
        - frequency: Occurrence count in source corpus
        - features: Dict of boolean phonetic feature flags

    Raises:
        FileNotFoundError: If database file doesn't exist
        sqlite3.Error: If the database cannot be opened or queried

    Examples:
        >>> db_path = Path("/path/to/20260110_115601_nltk/data/corpus.db")
        >>> data = load_annotated_data_from_sqlite(db_path)
        >>> len(data)
        33640
    """
    if not db_path.exists():
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    # Feature columns, in the order they appear in each "features" dict.
    feature_columns = (
        "starts_with_vowel",
        "starts_with_cluster",
        "starts_with_heavy_cluster",
        "contains_plosive",
        "contains_fricative",
        "contains_liquid",
        "contains_nasal",
        "short_vowel",
        "long_vowel",
        "ends_with_vowel",
        "ends_with_nasal",
        "ends_with_stop",
    )

    # "%", "?" and "#" are URI syntax to SQLite; escape them so a path that
    # contains them still refers to the intended file. "%" must go first so
    # the escapes introduced for the other two are not escaped again.
    escaped_path = (
        str(db_path).replace("%", "%25").replace("?", "%3f").replace("#", "%23")
    )
    uri = f"file:{escaped_path}?mode=ro"

    # Column names are the fixed constants above, never external input.
    query = (
        f"SELECT syllable, frequency, {', '.join(feature_columns)} "
        "FROM syllables ORDER BY syllable"
    )

    try:
        # Open database in read-only mode
        conn = sqlite3.connect(uri, uri=True)
        try:
            conn.row_factory = sqlite3.Row
            # Ordered by syllable so the result is deterministic.
            rows = conn.execute(query).fetchall()
            # SQLite stores the flags as integers; expose them as real booleans.
            return [
                {
                    "syllable": row["syllable"],
                    "frequency": row["frequency"],
                    "features": {name: bool(row[name]) for name in feature_columns},
                }
                for row in rows
            ]
        finally:
            # Release the handle on both the success and the failure path.
            conn.close()
    except sqlite3.Error as e:
        raise sqlite3.Error(f"Error reading SQLite database {db_path}: {e}") from e
def load_annotated_data(path: Path) -> tuple[list[dict], dict[str, str]]:
    """
    Load phonetic feature annotations from a corpus directory.

    The directory is validated first. If ``data/corpus.db`` exists it is
    loaded through load_annotated_data_from_sqlite(); if that file is absent,
    or loading it fails for any reason, the corpus-specific annotated JSON
    file in ``data/`` is used instead (backwards compatibility).

    Data structure (same for both sources):
        [
            {
                "syllable": "aa",
                "frequency": 1022,
                "features": {
                    "starts_with_vowel": true,
                    "starts_with_cluster": false,
                    "starts_with_heavy_cluster": false,
                    "contains_plosive": false,
                    "contains_fricative": false,
                    "contains_liquid": false,
                    "contains_nasal": false,
                    "short_vowel": false,
                    "long_vowel": true,
                    "ends_with_vowel": true,
                    "ends_with_nasal": false,
                    "ends_with_stop": false
                }
            },
            ...
        ]

    Args:
        path: Path to corpus directory

    Returns:
        Tuple of (data, metadata):
        - data: List of dictionaries, each containing:
            - syllable: The syllable string
            - frequency: Occurrence count in source corpus
            - features: Dict of boolean phonetic feature flags
        - metadata: Dictionary with loading information (all values are
          strings):
            - source: "sqlite" or "json"
            - file_name: Name of the file loaded from
            - load_time_ms: Elapsed load time in whole milliseconds

    Raises:
        ValueError: If directory is invalid, the JSON file cannot be read, or
            its content does not have the expected shape
        FileNotFoundError: If neither SQLite nor JSON data is available
        json.JSONDecodeError: If JSON is malformed (when loading from JSON)

    Examples:
        >>> data, meta = load_annotated_data(Path("/path/to/20260110_115601_nltk"))
        >>> len(data)
        33640
        >>> meta["source"]
        "sqlite"
        >>> data[0]["syllable"]
        "aa"
        >>> data[0]["features"]["starts_with_vowel"]
        True
    """
    # Function-scope import, as in the original code; only used for timing.
    import time

    # Validate directory first to determine corpus type
    is_valid, corpus_type, error = validate_corpus_directory(path)
    if not is_valid:
        raise ValueError(f"Invalid corpus directory: {error}")

    # Check for SQLite database first (preferred)
    db_path = path / "data" / "corpus.db"
    if db_path.exists():
        try:
            # perf_counter is monotonic, so the elapsed value cannot be skewed
            # (or go negative) when the system clock is adjusted mid-load.
            start_time = time.perf_counter()
            data = load_annotated_data_from_sqlite(db_path)
            load_time_ms = int((time.perf_counter() - start_time) * 1000)

            metadata = {
                "source": "sqlite",
                "file_name": "corpus.db",
                "load_time_ms": str(load_time_ms),
            }
            return data, metadata
        except Exception as e:
            # Deliberate best-effort: any SQLite failure falls back to JSON
            print(f"Warning: SQLite loading failed ({e}), falling back to JSON")

    # Fall back to JSON loading (backwards compatibility)
    # Determine which annotated file to load based on corpus type
    # These files live in the data/ subdirectory
    if corpus_type == "NLTK":
        annotated_file = path / "data" / "nltk_syllables_annotated.json"
    elif corpus_type == "Pyphen":
        annotated_file = path / "data" / "pyphen_syllables_annotated.json"
    else:
        raise ValueError(f"Unknown corpus type: {corpus_type}")

    # Check that the annotated data file exists
    # (not all corpus directories may have been annotated yet)
    if not annotated_file.exists():
        raise FileNotFoundError(
            f"No annotated data found in {path / 'data'}\n"
            f"Looked for:\n"
            f" - corpus.db (preferred, SQLite format)\n"
            f" - {annotated_file.name} (JSON format)\n"
            f"\n"
            f"This corpus directory may not have been processed with "
            f"syllable_feature_annotator yet, or you may need to run:\n"
            f" python -m build_tools.corpus_sqlite_builder {path}"
        )

    # Load the JSON file
    # Note: This is a potentially slow operation for large files.
    # The caller should run this in a background worker to avoid blocking the UI
    print(f"Loading from JSON (slower): {annotated_file.name}")

    try:
        start_time = time.perf_counter()
        with open(annotated_file, encoding="utf-8") as f:
            annotated_data = json.load(f)
        load_time_ms = int((time.perf_counter() - start_time) * 1000)
    except FileNotFoundError:
        # Already checked above, but handle race condition
        raise FileNotFoundError(f"Annotated data file not found: {annotated_file}")
    except json.JSONDecodeError as e:
        # Same exception type, with a message naming the file's role.
        raise json.JSONDecodeError(
            f"Invalid JSON in annotated data file: {e.msg}", e.doc, e.pos
        ) from e
    except Exception as e:
        raise ValueError(f"Error reading annotated data file: {e}") from e

    # Validate that we got a non-empty list
    if not isinstance(annotated_data, list):
        raise ValueError(
            f"Annotated data should be a JSON array, got {type(annotated_data).__name__}"
        )

    if not annotated_data:
        raise ValueError("Annotated data file is empty")

    # Sanity check: verify first entry has expected structure
    # (don't check all entries for performance - that would be expensive)
    first_entry = annotated_data[0]
    if not isinstance(first_entry, dict):
        raise ValueError(
            f"Annotated data entries should be objects, got {type(first_entry).__name__}"
        )

    required_keys = {"syllable", "frequency", "features"}
    missing_keys = required_keys - set(first_entry.keys())
    if missing_keys:
        raise ValueError(
            f"Annotated data entries missing required keys: {missing_keys}\n"
            f"Found keys: {set(first_entry.keys())}"
        )

    metadata = {
        "source": "json",
        "file_name": annotated_file.name,
        "load_time_ms": str(load_time_ms),
    }
    return annotated_data, metadata