Source code for build_tools.syllable_walk.db

"""SQLite database access layer for the syllable walker web interface.

This module provides functions to query the corpus.db SQLite database
for syllable data, avoiding the need to load large JSON files into memory.

The database schema stores syllables with their 12 phonetic features and
frequency counts, with indexes optimized for common query patterns.

Functions:
    load_syllables_from_sqlite: Load all syllables with features
    get_syllable_count: Get total syllable count
    syllable_exists: Check if a syllable exists
    get_random_syllable: Get a random syllable
"""

from __future__ import annotations

import json
import random
import sqlite3
from pathlib import Path

# The 12 phonetic features in the database schema
FEATURE_COLUMNS = [
    "starts_with_vowel",
    "starts_with_cluster",
    "starts_with_heavy_cluster",
    "contains_plosive",
    "contains_fricative",
    "contains_liquid",
    "contains_nasal",
    "short_vowel",
    "long_vowel",
    "ends_with_vowel",
    "ends_with_nasal",
    "ends_with_stop",
]


def _get_connection(db_path: Path) -> sqlite3.Connection:
    """Get a read-only connection to the database.

    Args:
        db_path: Path to corpus.db

    Returns:
        SQLite connection with row_factory set to sqlite3.Row
    """
    conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
    conn.row_factory = sqlite3.Row
    return conn


[docs] def load_syllables_from_sqlite(db_path: Path) -> list[dict]: """Load all syllables with features from the database. This is the primary data loading function for the web interface. Returns data in the same format as the annotated JSON files. Args: db_path: Path to corpus.db Returns: List of dicts with 'syllable', 'frequency', and 'features' keys Example: >>> syllables = load_syllables_from_sqlite(Path("corpus.db")) >>> syllables[0] {'syllable': 'ab', 'frequency': 1, 'features': {'starts_with_vowel': True, ...}} """ conn = _get_connection(db_path) try: # FEATURE_COLUMNS is a constant defined in this module, not user input cursor = conn.execute(f""" SELECT syllable, frequency, {', '.join(FEATURE_COLUMNS)} FROM syllables ORDER BY frequency DESC """) # nosec B608 - FEATURE_COLUMNS is a module constant results = [] for row in cursor: features = {col: bool(row[col]) for col in FEATURE_COLUMNS} results.append( { "syllable": row["syllable"], "frequency": row["frequency"], "features": features, } ) return results finally: conn.close()
[docs] def get_syllable_count(db_path: Path) -> int: """Get the total number of syllables in the database. Args: db_path: Path to corpus.db Returns: Total syllable count """ conn = _get_connection(db_path) try: cursor = conn.execute("SELECT COUNT(*) FROM syllables") count: int = cursor.fetchone()[0] return count finally: conn.close()
[docs] def syllable_exists(db_path: Path, syllable: str) -> bool: """Check if a syllable exists in the database. Args: db_path: Path to corpus.db syllable: Syllable to check Returns: True if syllable exists, False otherwise """ conn = _get_connection(db_path) try: cursor = conn.execute("SELECT 1 FROM syllables WHERE syllable = ? LIMIT 1", (syllable,)) return cursor.fetchone() is not None finally: conn.close()
[docs] def get_syllable_data(db_path: Path, syllable: str) -> dict | None: """Get data for a specific syllable. Args: db_path: Path to corpus.db syllable: Syllable to look up Returns: Dict with syllable data, or None if not found """ conn = _get_connection(db_path) try: # FEATURE_COLUMNS is a constant defined in this module, not user input cursor = conn.execute( f""" SELECT syllable, frequency, {', '.join(FEATURE_COLUMNS)} FROM syllables WHERE syllable = ? """, # nosec B608 - FEATURE_COLUMNS is a module constant (syllable,), ) row = cursor.fetchone() if row is None: return None features = {col: bool(row[col]) for col in FEATURE_COLUMNS} return { "syllable": row["syllable"], "frequency": row["frequency"], "features": features, } finally: conn.close()
[docs] def get_random_syllable(db_path: Path, seed: int | None = None) -> str: """Get a random syllable from the database. Uses frequency-weighted random selection if seed is provided for reproducibility. Args: db_path: Path to corpus.db seed: Optional random seed for reproducibility Returns: Random syllable string """ conn = _get_connection(db_path) try: # Get all syllables with frequencies cursor = conn.execute("SELECT syllable, frequency FROM syllables") rows = cursor.fetchall() if not rows: raise ValueError("Database is empty") # Create RNG instance (deterministic if seed provided) rng = random.Random(seed) # nosec B311 - deterministic generation # Frequency-weighted selection syllables = [row["syllable"] for row in rows] weights = [row["frequency"] for row in rows] result: str = rng.choices(syllables, weights=weights, k=1)[0] return result finally: conn.close()
[docs] def load_syllables_from_json(json_path: Path) -> list[dict]: """Load syllables from annotated JSON file (fallback when no DB). Args: json_path: Path to *_syllables_annotated.json Returns: List of dicts with syllable data """ with open(json_path, encoding="utf-8") as f: data: list[dict] = json.load(f) return data
[docs] def load_syllables( db_path: Path | None = None, json_path: Path | None = None ) -> tuple[list[dict], str]: """Load syllables from database or JSON, with automatic fallback. Prefers SQLite database for performance, falls back to JSON if database is not available. Args: db_path: Path to corpus.db (optional) json_path: Path to annotated JSON (optional) Returns: Tuple of (syllables list, source description) Raises: ValueError: If neither db_path nor json_path is valid """ # Try database first if db_path and db_path.exists(): try: syllables = load_syllables_from_sqlite(db_path) return syllables, f"SQLite ({len(syllables):,} syllables)" except Exception: # nosec B110 - intentional fallback to JSON pass # Fall through to JSON # Try JSON fallback if json_path and json_path.exists(): syllables = load_syllables_from_json(json_path) return syllables, f"JSON ({len(syllables):,} syllables)" raise ValueError("No valid data source (corpus.db or annotated JSON) found")