# git » alan.git » master » tree
#
# [master] / language_spec.py

"""Canonical grammar spec and validator for ALAN."""
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Optional


# Role labels stored in NPFeature.role.
AGENT = "AGENT"  # the only role realized with the "mur" suffix; all others get the "na" prefix
RECIPIENT = "RECIPIENT"  # OBJ1
THEME = "THEME"  # OBJ2


@dataclass
class NPFeature:
    """Feature bundle describing a single noun phrase."""

    noun_id: str  # key into the spec's "nouns" lexicon
    feminine: bool  # request feminine marking; only honoured for nouns in FEMININE_NOUNS
    plural: bool  # realize the plural form instead of the singular
    adjectives: List[str]  # keys into the "adjectives" lexicon, realized after the noun in this order
    role: str  # AGENT/RECIPIENT/THEME
    use_irregular: bool = True  # let an irregular plural, if one exists, replace the regular form


@dataclass
class SentenceFeatures:
    """Feature bundle describing one sentence: its noun phrases, verb and tense."""

    subject: NPFeature  # realized first in the surface sentence
    obj1: NPFeature  # realized second; glossed as the "to" argument of "give"
    obj2: Optional[NPFeature]  # optional third noun phrase; glossed as the thing given
    verb_id: str  # key into the spec's "verbs" lexicon
    tense: str  # PRES/PAST
    use_irregular_verb: bool = True  # let an irregular form for this tense, if one exists, replace the regular one


@dataclass
class LanguageSpec:
    """Lexical data needed to realize sentences."""

    # category ("nouns" / "verbs" / "adjectives") -> {concept id -> stem}
    lexicon: Dict[str, Dict[str, str]]
    # verb id -> {tense -> complete irregular surface form}
    irregular_verbs: Dict[str, Dict[str, str]]
    # noun id -> complete irregular plural form
    irregular_noun_plurals: Dict[str, str]


def generate_language_instance(seed: int | None = None) -> LanguageSpec:
    """Build the LanguageSpec holding the vocabulary and irregular forms.

    ``seed`` is accepted but never read: every call returns the same fixed
    content, held in freshly created dictionaries.
    """
    nouns = {
        "man": "po",
        "woman": "rema",
        "boy": "tul",
        "girl": "siv",
        "ball": "kob",
        "house": "vut",
    }
    verbs = {"see": "dak", "give": "mep", "chase": "ror"}
    adjectives = {"tall": "sar", "red": "lin", "big": "mod", "fast": "par"}
    return LanguageSpec(
        lexicon={"nouns": nouns, "verbs": verbs, "adjectives": adjectives},
        # Irregular entries are complete surface forms, not stems.
        irregular_verbs={"chase": {"PAST": "rontmimu"}},
        irregular_noun_plurals={"boy": "letul"},
    )


# Noun ids eligible for feminine marking; NPFeature.feminine has no effect on
# any noun outside this set.
FEMININE_NOUNS = {"woman", "girl"}


def _plural_form(noun_id: str, spec: LanguageSpec, feminine: bool, use_irregular: bool) -> str:
    """Return the plural surface form of ``noun_id``, without role marking.

    An irregular plural, when allowed by ``use_irregular`` and present in the
    spec, is returned as-is. Otherwise the stem is prefixed with ``"memleko"``
    for a feminine noun listed in FEMININE_NOUNS and with ``"leko"`` for
    everything else.

    Raises:
        KeyError: if the regular form is needed and ``noun_id`` is not in the
            noun lexicon.
    """
    if use_irregular:
        irregulars = spec.irregular_noun_plurals
        if noun_id in irregulars:
            return irregulars[noun_id]

    stem = spec.lexicon["nouns"][noun_id]
    prefix = "memleko" if feminine and noun_id in FEMININE_NOUNS else "leko"
    return prefix + stem


def _noun_form(np: NPFeature, spec: LanguageSpec) -> str:
    """Return the surface form of one noun phrase.

    The noun is inflected for number and gender, then marked for role
    (``"mur"`` suffix for AGENT, ``"na"`` prefix for every other role), and
    finally followed by its adjectives, space-separated, in the given order.

    Raises:
        KeyError: if the noun or any adjective is missing from the lexicon.
    """
    stem = spec.lexicon["nouns"][np.noun_id]
    # The feminine flag only counts for nouns that can carry it.
    is_feminine = np.feminine and np.noun_id in FEMININE_NOUNS

    if not np.plural:
        noun = "mem" + stem if is_feminine else stem
    else:
        noun = _plural_form(np.noun_id, spec, is_feminine, np.use_irregular)

    # Role marking attaches directly to the inflected noun, before adjectives.
    noun = noun + "mur" if np.role == AGENT else "na" + noun

    if not np.adjectives:
        return noun
    adjective_lexicon = spec.lexicon["adjectives"]
    modifiers = " ".join(adjective_lexicon[adj_id] for adj_id in np.adjectives)
    return f"{noun} {modifiers}"


def realize_sentence(spec: LanguageSpec, sf: SentenceFeatures) -> str:
    """Return the surface sentence for ``sf``.

    Word order is subject, obj1, obj2 (when present), verb, joined by single
    spaces. The verb is the bare stem for tense ``"PRES"`` and the stem plus
    ``"mimu"`` for any other tense, unless an irregular form exists for this
    verb and tense and ``sf.use_irregular_verb`` is set.

    Raises:
        KeyError: if a noun, adjective or verb id is missing from the lexicon.
    """
    phrases = [sf.subject, sf.obj1]
    if sf.obj2:
        phrases.append(sf.obj2)
    words = [_noun_form(phrase, spec) for phrase in phrases]

    regular_stem = spec.lexicon["verbs"][sf.verb_id]
    override = spec.irregular_verbs.get(sf.verb_id, {}).get(sf.tense)
    if override and sf.use_irregular_verb:
        verb = override
    elif sf.tense == "PRES":
        verb = regular_stem
    else:
        verb = regular_stem + "mimu"

    words.append(verb)
    return " ".join(words)


def english_gloss(sf: SentenceFeatures) -> str:
    """Return an annotated English gloss of the sentence described by ``sf``.

    Each noun phrase is rendered as ``"the <adjectives> <noun>"`` followed by
    a note: ``" (feminine plural)"``, ``" (plural)"``, ``" (feminine)"`` or
    nothing. Feminine marking is only reported for nouns in FEMININE_NOUNS.
    The sentence ends with ``"(past)"`` when the tense is ``"PAST"`` and
    ``"(present)"`` for any other tense.

    "give" with an ``obj2`` is glossed as ``<subject> <verb> <obj2> to
    <obj1>``; every other case as ``<subject> <verb> <obj1>``. Verb/tense
    pairs missing from the table fall back to ``"<verb_id>s"``.
    """
    # English plurals that are not formed by appending "s".
    irregular_plurals = {"man": "men", "woman": "women"}
    verb_forms = {
        ("see", "PRES"): "sees",
        ("see", "PAST"): "saw",
        ("chase", "PRES"): "chases",
        ("chase", "PAST"): "chased",
        ("give", "PRES"): "gives",
        ("give", "PAST"): "gave",
    }

    def np_gloss(np: NPFeature) -> str:
        is_feminine = np.feminine and np.noun_id in FEMININE_NOUNS
        noun = np.noun_id
        if np.plural:
            # Pluralize the noun itself so "man" becomes "men", not "mans".
            noun = irregular_plurals.get(noun, noun + "s")
        adj = " ".join(np.adjectives)
        phrase = f"{adj} {noun}".strip()
        if is_feminine and np.plural:
            note = " (feminine plural)"
        elif np.plural:
            note = " (plural)"
        elif is_feminine:
            note = " (feminine)"
        else:
            note = ""
        return f"the {phrase}{note}".strip()

    subj = np_gloss(sf.subject)
    obj1 = np_gloss(sf.obj1)
    tense_note = "past" if sf.tense == "PAST" else "present"
    verb_en = verb_forms.get((sf.verb_id, sf.tense), f"{sf.verb_id}s")
    if sf.verb_id == "give" and sf.obj2:
        # Ditransitive: the thing given (obj2) precedes the recipient (obj1).
        obj2 = np_gloss(sf.obj2)
        return f"{subj} {verb_en} {obj2} to {obj1} ({tense_note})"
    return f"{subj} {verb_en} {obj1} ({tense_note})"


def validate_sentence_surface(sf: SentenceFeatures, sentence: str, spec: LanguageSpec) -> bool:
    """Return True when ``sentence`` is exactly the canonical realization of ``sf``.

    Leading and trailing whitespace in ``sentence`` is ignored; everything
    else, including internal spacing, must match.
    """
    candidate = sentence.strip()
    expected = realize_sentence(spec, sf)
    return candidate == expected