git » alan.git » commit 423981e

Enforce staged concept progression and dedup questions

author Alan Dipert
2025-12-04 15:58:18 UTC
committer Alan Dipert
2025-12-04 15:58:18 UTC
parent d903690f69c631c6e676e7d5c381c33b4e18b2ab

Enforce staged concept progression and dedup questions

test_generator.py +259 -69

diff --git a/test_generator.py b/test_generator.py
index 9909eb9..3b2ed6f 100644
--- a/test_generator.py
+++ b/test_generator.py
@@ -33,6 +33,20 @@ DIST_MIN = 1
 DIST_MAX = 1
 
 
+@dataclass(frozen=True)  # frozen: fields cannot be reassigned (the list contents themselves stay mutable)
+class SectionConstraints:  # what sentences in one section may use; built by section_constraints(), checked by respects_constraints()
+    allowed_verbs: List[str]  # verb ids a sentence may use
+    allowed_agent_nouns: List[str]  # noun ids accepted for AGENT-role noun phrases
+    allowed_recipient_nouns: List[str]  # noun ids accepted for non-AGENT noun phrases (checked together with themes)
+    allowed_theme_nouns: List[str]  # noun ids accepted for non-AGENT noun phrases (checked together with recipients)
+    allow_plural: bool  # False -> any plural noun phrase is rejected
+    allow_feminine: bool  # False -> any feminine noun phrase is rejected
+    allow_past: bool  # False -> tense "PAST" is rejected
+    allow_ditransitive: bool  # False -> a sentence with a second object (obj2) is rejected
+    allow_irregulars: bool  # False -> irregular plural nouns and irregular past verbs are rejected
+    allow_adjectives: bool  # False -> any noun phrase carrying adjectives is rejected
+
+
 def render_concept_explanation(concept: Concept, spec: LanguageSpec) -> str:
     """Simple explanation with one example."""
     if concept.id == "S_ORDER":
@@ -130,6 +144,35 @@ def question_valid(q: Question, spec: LanguageSpec) -> bool:
 # Feature utilities
 
 
+def section_constraints(unlocked: set[str]) -> SectionConstraints:  # derive the sentence constraints from the concept ids unlocked so far
+    allow_plural = "NOUN_NUMBER_MARKING" in unlocked
+    allow_feminine = "NOUN_GENDER_MARKING" in unlocked
+    allow_past = "VERB_TENSE_MARKING" in unlocked
+    # Keep ditransitives out of the very first section for simplicity; they unlock once plural or gender is known.
+    allow_ditransitive = allow_plural or allow_feminine
+    allow_irregulars = allow_past or allow_plural  # irregular forms unlock as soon as past tense or plural is unlocked
+    allow_adjectives = "NP_ORDER" in unlocked
+
+    def allowed_people(include_feminine: bool) -> List[str]:  # person nouns; "woman"/"girl" only when include_feminine is True
+        base = ["man", "boy"]
+        if include_feminine:
+            base += ["woman", "girl"]
+        return base  # a new list on every call, so the agent and recipient lists are separate objects
+
+    return SectionConstraints(
+        allowed_verbs=["see", "chase"] + (["give"] if allow_ditransitive else []),  # "give" is the only verb gated by allow_ditransitive
+        allowed_agent_nouns=allowed_people(allow_feminine),
+        allowed_recipient_nouns=allowed_people(allow_feminine),
+        allowed_theme_nouns=["ball", "house"],  # themes do not depend on what is unlocked
+        allow_plural=allow_plural,
+        allow_feminine=allow_feminine,
+        allow_past=allow_past,
+        allow_ditransitive=allow_ditransitive,
+        allow_irregulars=allow_irregulars,
+        allow_adjectives=allow_adjectives,
+    )
+
+
 def np_features(
     noun_id: str,
     role: str,
@@ -163,6 +206,35 @@ def sentence_features(
     return SentenceFeatures(subject=subj, obj1=obj1, obj2=obj2, verb_id=verb_id, tense=tense, use_irregular_verb=use_irregular_verb)
 
 
+def respects_constraints(sf: SentenceFeatures, constraints: Optional[SectionConstraints]) -> bool:  # True when sf uses only what constraints permit
+    if constraints is None:  # no constraints supplied: every sentence is accepted
+        return True
+    if sf.verb_id not in constraints.allowed_verbs:
+        return False
+    if not constraints.allow_past and sf.tense == "PAST":
+        return False
+    if not constraints.allow_ditransitive and sf.obj2 is not None:
+        return False
+    nps = [sf.subject, sf.obj1] + ([sf.obj2] if sf.obj2 else [])  # every noun phrase present in the sentence
+    for np in nps:
+        if np.role == AGENT and np.noun_id not in constraints.allowed_agent_nouns:
+            return False
+        if np.role != AGENT and np.noun_id not in constraints.allowed_recipient_nouns + constraints.allowed_theme_nouns:  # non-agents are checked against recipients and themes combined
+            return False
+        if not constraints.allow_plural and np.plural:
+            return False
+        if not constraints.allow_feminine and np.feminine:
+            return False
+        if (
+            not constraints.allow_irregulars
+            and ((np.use_irregular and np.plural) or (sf.use_irregular_verb and sf.tense == "PAST"))  # an irregular flag only counts when the plural / past form is actually used
+        ):
+            return False
+        if not constraints.allow_adjectives and np.adjectives:
+            return False
+    return True
+
+
 # ---------------------------------------------------------------------------
 # Perturbations for distractors (all grammatical)
 
@@ -273,7 +345,7 @@ def perturb_irregular(sf: SentenceFeatures) -> SentenceFeatures:
     )
 
 
-def build_distractors(spec: LanguageSpec, sf: SentenceFeatures, rng: random.Random) -> List[Option]:
+def build_distractors(spec: LanguageSpec, sf: SentenceFeatures, rng: random.Random, constraints: Optional[SectionConstraints] = None) -> List[Option]:
     """Generate three minimal-pair distractors (exactly one feature flipped)."""
     from semantic import to_meaning, semantic_distance
     target_meaning = to_meaning(sf)
@@ -383,17 +455,69 @@ def build_distractors(spec: LanguageSpec, sf: SentenceFeatures, rng: random.Rand
 
     flips.append(("Irregular vs regular.", flip_irregular))
 
+    # verb swap within allowed valence: a sentence with obj2 may only use "give", one without obj2 may only use the other verbs
+    def swap_verb(base: SentenceFeatures) -> SentenceFeatures:
+        allowed = constraints.allowed_verbs if constraints else ["see", "chase", "give"]
+        same_valence = [v for v in allowed if (base.obj2 is not None) == (v == "give")]  # keep verbs whose valence matches base
+        if len(same_valence) <= 1:  # no alternative verb with the same valence
+            return base
+        choices = [v for v in same_valence if v != base.verb_id]
+        new_verb = rng.choice(choices) if choices else base.verb_id
+        return replace(base, verb_id=new_verb)  # copy of base with only verb_id changed
+
+    flips.append(("Verb swap.", swap_verb))
+
+    # subject noun swap: pick a different allowed agent noun, keeping the subject's plural flag and adjectives
+    def swap_subj_noun(base: SentenceFeatures) -> SentenceFeatures:
+        allowed = constraints.allowed_agent_nouns if constraints else ["man", "woman", "boy", "girl"]
+        if len(allowed) <= 1:  # nothing to swap to
+            return base
+        new_noun = rng.choice([n for n in allowed if n != base.subject.noun_id] or [base.subject.noun_id])  # falls back to the current noun if no other is allowed
+        new_subj = np_features(new_noun, AGENT, plural=base.subject.plural, adjectives=base.subject.adjectives)  # NOTE(review): feminine/use_irregular are not passed, so np_features' defaults apply - confirm intended
+        return replace(base, subject=new_subj)
+
+    flips.append(("Doer noun swap.", swap_subj_noun))
+
+    # receiver noun swap: pick a different allowed recipient noun for obj1, keeping its role, plural flag and adjectives
+    def swap_obj1_noun(base: SentenceFeatures) -> SentenceFeatures:
+        allowed = constraints.allowed_recipient_nouns if constraints else ["boy", "girl", "man", "woman"]
+        if len(allowed) <= 1:  # nothing to swap to
+            return base
+        new_noun = rng.choice([n for n in allowed if n != base.obj1.noun_id] or [base.obj1.noun_id])  # falls back to the current noun if no other is allowed
+        new_obj1 = np_features(new_noun, base.obj1.role, plural=base.obj1.plural, adjectives=base.obj1.adjectives)  # NOTE(review): feminine/use_irregular are not passed, so np_features' defaults apply - confirm intended
+        return replace(base, obj1=new_obj1)
+
+    flips.append(("Receiver noun swap.", swap_obj1_noun))
+
+    # subject adjective toggle: remove all adjectives if the subject has any, otherwise give it "tall"
+    def flip_subj_adj(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)  # work on a clone so the subject can be reassigned without touching base
+        if new.subject.adjectives:
+            new.subject = replace(new.subject, adjectives=[])
+        else:
+            new.subject = replace(new.subject, adjectives=["tall"])
+        return new
+
+    flips.append(("Adjective swap (doer).", flip_subj_adj))
+
     # apply flips in shuffled order to diversify
     rng.shuffle(flips)
     for expl, fn in flips:
         if len(distractors) >= 3:
             break
-        add_if_valid(fn(sf), expl)
+        cand = fn(sf)
+        if constraints and not respects_constraints(cand, constraints):
+            continue
+        add_if_valid(cand, expl)
     # if still short, retry flips on shuffled order (different seeds) until filled or attempts exhausted
     attempts = 0
     while len(distractors) < 3 and attempts < 20:
         expl, fn = rng.choice(flips)
-        add_if_valid(fn(sf), expl)
+        cand = fn(sf)
+        if constraints and not respects_constraints(cand, constraints):
+            attempts += 1
+            continue
+        add_if_valid(cand, expl)
         attempts += 1
     return distractors if len(distractors) == 3 else []
 
@@ -402,40 +526,56 @@ def build_distractors(spec: LanguageSpec, sf: SentenceFeatures, rng: random.Rand
 # Item generation
 
 
-def _base_features(spec: LanguageSpec, rng: random.Random, difficulty: str) -> SentenceFeatures:
-    verb_id = rng.choice(["see", "chase", "give"])
-    tense = "PAST" if (difficulty == "late" or rng.random() < 0.4) else "PRES"
+def _base_features(
+    spec: LanguageSpec, rng: random.Random, difficulty: str, constraints: Optional[SectionConstraints] = None
+) -> SentenceFeatures:  # random sentence features; when constraints is given, its lists and flags replace the built-in defaults
+    cons = constraints
+    verbs = cons.allowed_verbs if cons else ["see", "chase", "give"]  # unconstrained fallback: all three verbs
+    verb_id = rng.choice(verbs)
+    allow_past = cons.allow_past if cons else True
+    tense = "PAST" if allow_past and (difficulty == "late" or rng.random() < 0.4) else "PRES"  # past only if allowed: always for "late", else when the draw is below 0.4
 
+    subj_nouns = cons.allowed_agent_nouns if cons else ["man", "woman"]
     subj = np_features(
-        noun_id=rng.choice(["man", "woman"]),
+        noun_id=rng.choice(subj_nouns),
         role=AGENT,
-        plural=difficulty != "early" and rng.random() < 0.4,
-        adjectives=["tall"] if rng.random() < 0.6 else [],
+        plural=(cons.allow_plural if cons else difficulty != "early") and rng.random() < 0.4,  # with constraints, allow_plural replaces the difficulty gate
+        adjectives=(["tall"] if ((cons.allow_adjectives if cons else True) and rng.random() < 0.6) else []),
     )
 
     if verb_id == "give":
+        rec_nouns = cons.allowed_recipient_nouns if cons else ["boy", "girl"]
         obj1 = np_features(
-            noun_id=rng.choice(["boy", "girl"]),
+            noun_id=rng.choice(rec_nouns),
             role=RECIPIENT,
-            plural=difficulty != "early" and rng.random() < 0.4,
-            adjectives=["fast"] if rng.random() < 0.4 else [],
+            plural=(cons.allow_plural if cons else difficulty != "early") and rng.random() < 0.4,
+            adjectives=(["fast"] if ((cons.allow_adjectives if cons else True) and rng.random() < 0.4) else []),
         )
+        theme_nouns = cons.allowed_theme_nouns if cons else ["ball", "house"]
         obj2 = np_features(
-            noun_id=rng.choice(["ball", "house"]),
+            noun_id=rng.choice(theme_nouns),
             role=THEME,
-            plural=difficulty == "late" and rng.random() < 0.5,
-            adjectives=["red"] if rng.random() < 0.6 else [],
+            plural=(cons.allow_plural if cons else difficulty == "late") and rng.random() < 0.5,  # unconstrained themes are plural only at "late" difficulty
+            adjectives=(["red"] if ((cons.allow_adjectives if cons else True) and rng.random() < 0.6) else []),
         )
     else:
+        rec_nouns = cons.allowed_recipient_nouns if cons else ["boy", "girl", "man", "woman"]
         obj1 = np_features(
-            noun_id=rng.choice(["boy", "girl", "man", "woman"]),
+            noun_id=rng.choice(rec_nouns),
             role=RECIPIENT,
-            plural=difficulty != "early" and rng.random() < 0.5,
-            adjectives=["red"] if rng.random() < 0.6 else [],
+            plural=(cons.allow_plural if cons else difficulty != "early") and rng.random() < 0.5,
+            adjectives=(["red"] if ((cons.allow_adjectives if cons else True) and rng.random() < 0.6) else []),
         )
         obj2 = None
 
-    return sentence_features(verb_id=verb_id, tense=tense, subj=subj, obj1=obj1, obj2=obj2)
+    return sentence_features(
+        verb_id=verb_id,
+        tense=tense,
+        subj=subj,
+        obj1=obj1,
+        obj2=obj2,
+        use_irregular_verb=True,  # now passed explicitly; the replaced call did not pass this argument
+    )
 
 
 def _planned_features(
@@ -445,71 +585,89 @@ def _planned_features(
     remaining: Dict[str, int],
     idx: int,
     items_left: int,
+    constraints: Optional[SectionConstraints] = None,
 ) -> tuple[SentenceFeatures, Dict[str, int]]:
     """Greedy planner to satisfy coverage quotas deterministically with overlap.
 
     Returns the planned SentenceFeatures and a delta dict for counters to apply only
     if the resulting item is accepted.
     """
-    subj_pool = [
-        np_features("man", AGENT, plural=False, adjectives=["tall"]),
-        np_features("woman", AGENT, plural=False, adjectives=["tall"]),
-        np_features("man", AGENT, plural=True, adjectives=["tall"]),
-        np_features("woman", AGENT, plural=True, adjectives=["tall"]),
-    ]
-    rec_pool = [
-        np_features("boy", RECIPIENT, plural=False, adjectives=["red"]),
-        np_features("girl", RECIPIENT, plural=False, adjectives=["red"]),
-        np_features("man", RECIPIENT, plural=False, adjectives=["red"]),
-        np_features("woman", RECIPIENT, plural=False, adjectives=["red"]),
-    ]
-    theme_pool = [
-        np_features("ball", THEME, plural=False, adjectives=["red"]),
-        np_features("house", THEME, plural=False, adjectives=["red"]),
-    ]
-
-    subj = subj_pool[idx % len(subj_pool)]
-    obj1 = rec_pool[idx % len(rec_pool)]
+    cons = constraints
+    allow_plural = cons.allow_plural if cons else True
+    allow_ditransitive = cons.allow_ditransitive if cons else True
+    allow_irregulars = cons.allow_irregulars if cons else True
+    allow_past = cons.allow_past if cons else True
+    allow_adjectives = cons.allow_adjectives if cons else True
+    allow_feminine = cons.allow_feminine if cons else True
+
+    def build_pool(nouns: List[str], role: str) -> List[NPFeature]:  # for each noun: a singular variant, plus a plural one when allow_plural
+        pool: List[NPFeature] = []
+        for noun in nouns:
+            base_adj = ["tall"] if role == AGENT else ["red"]  # agents default to "tall", every other role to "red"
+            adjs = [base_adj] if allow_adjectives else [[]]  # exactly one adjective list either way: the default one or an empty one
+            for adj_list in adjs:
+                pool.append(np_features(noun, role, plural=False, adjectives=adj_list))
+                if allow_plural:
+                    pool.append(np_features(noun, role, plural=True, adjectives=adj_list))
+        return pool
+
+    subj_pool = build_pool(cons.allowed_agent_nouns if cons else ["man", "woman", "boy"], AGENT)
+    rec_pool = build_pool(cons.allowed_recipient_nouns if cons else ["boy", "girl", "man", "woman"], RECIPIENT)
+    theme_pool = build_pool(cons.allowed_theme_nouns if cons else ["ball", "house"], THEME)
+
+    subj = rng.choice(subj_pool)
+    obj1 = rng.choice(rec_pool)
     obj2 = None
-    verb_id = "see"
+    verb_choices = cons.allowed_verbs if cons else ["see", "chase", "give"]
+    verb_id = rng.choice(verb_choices)
     tense = "PRES"
     use_irregular_verb = True
     delta: Dict[str, int] = {"irregular_verb": 0, "irregular_noun": 0, "ditransitive": 0, "fem_plural": 0, "plural": 0, "adjective": 0}
 
     # 0) Force ditransitive if quota equals items_left
-    if remaining.get("ditransitive", 0) >= items_left:
+    if allow_ditransitive and remaining.get("ditransitive", 0) >= items_left:
         verb_id = "give"
-        obj2 = theme_pool[idx % len(theme_pool)]
+        obj2 = rng.choice(theme_pool)
         delta["ditransitive"] = 1
     # 1) Irregular verb coverage (monotransitive chase past), forced if needed
-    if verb_id == "see" and remaining.get("irregular_verb", 0) >= items_left:
+    if (
+        verb_id == "see"
+        and allow_past
+        and allow_irregulars
+        and remaining.get("irregular_verb", 0) >= items_left
+    ):
         verb_id = "chase"
         tense = "PAST"
         use_irregular_verb = True
         delta["irregular_verb"] = 1
-    elif verb_id == "see" and remaining.get("irregular_verb", 0) > 0:
+    elif verb_id == "see" and allow_past and allow_irregulars and remaining.get("irregular_verb", 0) > 0:
         verb_id = "chase"
         tense = "PAST"
         use_irregular_verb = True
         delta["irregular_verb"] = 1
 
     # 2) Ditransitive coverage (can overlap with irregular noun)
-    if verb_id != "chase" and remaining.get("ditransitive", 0) > 0 and delta.get("ditransitive", 0) == 0:
+    if (
+        allow_ditransitive
+        and verb_id != "chase"
+        and remaining.get("ditransitive", 0) > 0
+        and delta.get("ditransitive", 0) == 0
+    ):
         verb_id = "give"
         obj2 = theme_pool[idx % len(theme_pool)]
         delta["ditransitive"] = 1
 
     # 3) Irregular noun coverage (can overlap with ditransitive)
-    if remaining.get("irregular_noun", 0) > 0:
-        obj1 = np_features("boy", RECIPIENT, plural=True, adjectives=["red"], use_irregular=True)
+    if allow_plural and allow_irregulars and remaining.get("irregular_noun", 0) > 0:
+        obj1 = np_features("boy", RECIPIENT, plural=True, adjectives=["red"] if allow_adjectives else [], use_irregular=True)
         delta["irregular_noun"] = 1
         # if we still need more irregular noun items, also set subject to irregular boy plural
         if remaining.get("irregular_noun", 0) - delta["irregular_noun"] > 0:
             subj = np_features("boy", AGENT, plural=True, adjectives=subj.adjectives or ["tall"], use_irregular=True)
 
     # 4) fem plural receiver (if applicable)
-    must_fem = remaining.get("fem_plural", 0) >= items_left
-    if (remaining.get("fem_plural", 0) > 0 or must_fem):
+    must_fem = remaining.get("fem_plural", 0) >= items_left if allow_plural and allow_feminine else False
+    if allow_plural and allow_feminine and (remaining.get("fem_plural", 0) > 0 or must_fem):
         if obj1.noun_id not in {"woman", "girl"}:
             obj1 = np_features("woman", RECIPIENT, feminine=True, plural=True, adjectives=obj1.adjectives or ["red"])
             delta["fem_plural"] = 1
@@ -518,18 +676,18 @@ def _planned_features(
             delta["fem_plural"] = 1
 
     # 5) plural coverage (if still needed)
-    must_plural = remaining.get("plural", 0) >= items_left
-    if (remaining.get("plural", 0) > 0 or must_plural) and not (obj1.plural or subj.plural or (obj2 and obj2.plural)):
+    must_plural = remaining.get("plural", 0) >= items_left if allow_plural else False
+    if allow_plural and (remaining.get("plural", 0) > 0 or must_plural) and not (obj1.plural or subj.plural or (obj2 and obj2.plural)):
         obj1 = np_features(obj1.noun_id, obj1.role, feminine=obj1.feminine, plural=True, adjectives=obj1.adjectives)
         delta["plural"] = 1
     # secondary plural if still needed and possible
-    if remaining.get("plural", 0) - delta.get("plural", 0) > 0 and not subj.plural:
+    if allow_plural and remaining.get("plural", 0) - delta.get("plural", 0) > 0 and not subj.plural:
         subj = NPFeature(subj.noun_id, subj.feminine, True, subj.adjectives, subj.role, subj.use_irregular)
         delta["plural"] = delta.get("plural", 0) + 1
 
     # 6) adjective coverage
-    must_adj = remaining.get("adjective", 0) >= items_left
-    if (remaining.get("adjective", 0) > 0 or must_adj):
+    must_adj = remaining.get("adjective", 0) >= items_left if allow_adjectives else False
+    if allow_adjectives and (remaining.get("adjective", 0) > 0 or must_adj):
         if not obj1.adjectives:
             obj1 = np_features(obj1.noun_id, obj1.role, feminine=obj1.feminine, plural=obj1.plural, adjectives=["red"])
             delta["adjective"] = 1
@@ -570,11 +728,15 @@ def _feature_load(sf: SentenceFeatures) -> int:
     return load
 
 
-def _boost_feature_load(sf: SentenceFeatures, target: int) -> SentenceFeatures:
+def _boost_feature_load(sf: SentenceFeatures, target: int, constraints: Optional[SectionConstraints] = None) -> SentenceFeatures:
     """Increase feature load deterministically without breaking rules."""
     current = _feature_load(sf)
     if current >= target:
         return sf
+    cons = constraints
+    allow_plural = cons.allow_plural if cons else True
+    allow_past = cons.allow_past if cons else True
+    allow_adjectives = cons.allow_adjectives if cons else True
     subj = sf.subject
     obj1 = sf.obj1
     obj2 = sf.obj2
@@ -589,17 +751,29 @@ def _boost_feature_load(sf: SentenceFeatures, target: int) -> SentenceFeatures:
                 return NPFeature(np.noun_id, np.feminine, np.plural, np.adjectives + [adj], np.role, np.use_irregular)
         return np
 
-    steps = [
-        "tense",
-        "obj1_adj",
-        "subj_adj",
-        "obj1_plural",
-        "subj_plural",
-        "obj2_adj",
-        "obj1_second_adj",
-        "subj_second_adj",
-        "obj2_second_adj",
-    ]
+    steps = []
+    if allow_past:
+        steps.append("tense")
+    if allow_adjectives:
+        steps.extend(
+            [
+                "obj1_adj",
+                "subj_adj",
+            ]
+        )
+    if allow_plural:
+        steps.extend(["obj1_plural", "subj_plural"])
+    if allow_adjectives:
+        steps.extend(
+            [
+                "obj2_adj",
+                "obj1_second_adj",
+                "subj_second_adj",
+                "obj2_second_adj",
+            ]
+        )
+    if not steps:
+        return SentenceFeatures(subj, obj1, obj2, verb_id, tense, use_irregular_verb)
     for i in range(24):
         current = _feature_load(SentenceFeatures(subj, obj1, obj2, verb_id, tense, use_irregular_verb))
         if current >= target:
@@ -635,12 +809,15 @@ def generate_item(
     difficulty: str = "mid",
     sf_override: Optional[SentenceFeatures] = None,
     min_feature_load: int = 1,
+    constraints: Optional[SectionConstraints] = None,
 ) -> Question:
-    sf = sf_override or _base_features(spec, rng, difficulty)
-    sf = _boost_feature_load(sf, min_feature_load)
+    sf = sf_override or _base_features(spec, rng, difficulty, constraints=constraints)
+    sf = _boost_feature_load(sf, min_feature_load, constraints=constraints)
+    if constraints and not respects_constraints(sf, constraints):
+        sf = sf_override or _base_features(spec, rng, difficulty, constraints=constraints)
     correct_text = realize_sentence(spec, sf)
     gloss = english_gloss(sf)
-    distractors = build_distractors(spec, sf, rng)
+    distractors = build_distractors(spec, sf, rng, constraints=constraints)
     options = [Option(label="", text=correct_text, is_correct=True, explanation="Correct", features=sf)] + distractors
     # ensure uniqueness
     texts = set()
@@ -715,9 +892,13 @@ def generate_test(
         "adjective": min(cfg.get("min_adjective", 12), total_items),
     }
 
+    unlocked: set[str] = set()
     for section in blueprint.sections:
+        unlocked |= set(section.introduce_concepts)
+        constraints = section_constraints(unlocked)
         questions: List[Question] = []
         section_intro = [render_concept_explanation(concepts[cid], spec) for cid in section.introduce_concepts]
+        section_meanings = set()
 
         idx = 0
         while len(questions) < section.num_items:
@@ -727,7 +908,9 @@ def generate_test(
             difficulty_tag = "early" if current_number <= 8 else "mid" if current_number <= 24 else "late"
 
             items_left = total_items - current_number + 1
-            sf_override, delta = _planned_features(spec, rng, difficulty_tag, remaining.copy(), current_number, items_left)
+            sf_override, delta = _planned_features(
+                spec, rng, difficulty_tag, remaining.copy(), current_number, items_left, constraints=constraints
+            )
             q = generate_item(
                 spec,
                 section.focus_concepts,
@@ -737,10 +920,17 @@ def generate_test(
                 difficulty=difficulty_tag,
                 sf_override=sf_override,
                 min_feature_load=cfg.get("min_feature_load", 1),
+                constraints=constraints,
             )
             # enforce invariants: one correct, 4 unique options
             if not question_valid(q, spec):
                 continue
+            correct_opt = next((o for o in q.options if o.is_correct), None)
+            if correct_opt:
+                meaning = to_meaning(correct_opt.features)
+                if meaning in section_meanings:
+                    continue
+                section_meanings.add(meaning)
             # apply deltas only when item accepted
             for key, dec in delta.items():
                 remaining[key] = max(0, remaining.get(key, 0) - dec)