git » alan.git » commit 88bd958

Unify backtracking generator and containerized runs

author Alan Dipert
2025-12-06 17:38:01 UTC
committer Alan Dipert
2025-12-06 17:39:41 UTC
parent efc7ba2399f939c595b867af2a39c5e6696fb3cb

Unify backtracking generator and containerized runs

.gitignore +5 -0
Containerfile +19 -0
Makefile +3 -7
README.md +18 -45
generator.py +613 -0
hypothesis_generator.py +0 -217
main.py +43 -54
python +0 -0
render_text.py +19 -8
requirements-dev.txt +0 -1
test_generator.py +0 -1021

diff --git a/.gitignore b/.gitignore
index b1c4696..10e1ae0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,12 @@ answer_key*.txt
 my_answers*.txt
 answers*.txt
 *.pdf
+runs/
 
 # OS/editor noise
 .DS_Store
 Thumbs.db
+
+# Local tooling
+.venv/
+.uv-cache/
diff --git a/Containerfile b/Containerfile
new file mode 100644
index 0000000..2bb7551
--- /dev/null
+++ b/Containerfile
@@ -0,0 +1,19 @@
+# Minimal build/run image for ALAN.
+FROM debian:bookworm-slim
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        make \
+        pandoc \
+        texlive-latex-base \
+        texlive-fonts-recommended \
+        fonts-dejavu-core && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Use the bundled ./python; no additional Python packages required.
+CMD ["make"]
diff --git a/Makefile b/Makefile
index 64955de..ac8e543 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,6 @@
-PYTHON ?= python3
+PYTHON ?= ./python
 
 GENERATED_JSON := generated_test.json
-BOOKLET := test_booklet.txt
-KEY := answer_key.txt
-BOOKLET_PDF := test_booklet.pdf
-KEY_PDF := answer_key.pdf
 
 .PHONY: generate render run clean test
 
@@ -14,12 +10,12 @@ generate:
 	$(PYTHON) main.py --out $(GENERATED_JSON) $(EXTRA_ARGS)
 
 render: generate
-	$(PYTHON) render_text.py --in $(GENERATED_JSON) --test-out $(BOOKLET) --key-out $(KEY) --test-pdf $(BOOKLET_PDF) --key-pdf $(KEY_PDF)
+	$(PYTHON) render_text.py --in $(GENERATED_JSON) --test-out test_booklet.txt --key-out answer_key.txt --test-pdf test_booklet.pdf --key-pdf answer_key.pdf
 
 run: generate render
 
 clean:
-	rm -f $(GENERATED_JSON) $(BOOKLET) $(KEY) $(BOOKLET_PDF) $(KEY_PDF)
+	rm -f generated_test.json test_booklet.txt answer_key.txt test_booklet.pdf answer_key.pdf
 	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
 
 test:
diff --git a/README.md b/README.md
index a82f154..4005580 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
 # ALAN — Alan's Language Aptitude iNstrument
 
-ALAN is a fully self-contained artificial-language aptitude assessment inspired by DLAB-style tasks. It generates a consistent micro-grammar, produces a 32-item multiple-choice test, renders a booklet and answer key, and validates every form against strict grammatical and psychometric properties. The goal is to measure how quickly and accurately someone can infer and apply unfamiliar language rules—skills that map to disciplined software reasoning (spec reading, edge-case handling, protocol compliance). Test generation now also guarantees that no correct answer meaning or surface repeats across items.
+ALAN is a fully self-contained artificial-language aptitude assessment inspired by DLAB-style tasks. It generates a consistent micro-grammar, produces a 32-item multiple-choice test, renders a booklet and answer key, and validates every form against strict grammatical and psychometric properties. The goal is to measure how quickly and accurately someone can infer and apply unfamiliar language rules—skills that map to disciplined software reasoning (spec reading, edge-case handling, protocol compliance). Test generation guarantees that no correct answer meaning or surface repeats across items.
 
 ## What This Is
 
 - **Purpose:** Measure rapid rule inference, pattern generalization, and attention to fine-grained grammatical cues—abilities correlated with learning new syntactic systems and with disciplined software engineering (spec reading, refactoring, edge-case handling).
 - **Format:** 32 multiple-choice items across sections that introduce rules, then test them with strictly grammatical distractors that differ by exactly one semantic/morphosyntactic feature (minimal pairs).
-- **Artifacts produced:** `generated_test.json` (canonical test), `test_booklet.txt` (questions only), `answer_key.txt` (answers with explanations). The booklet header includes seed, git SHA, and generation params in italics so any form can be reproduced.
-- **Dependencies:** Python 3 only, no external libraries.
+- **Artifacts produced:** `generated_test.json` (canonical test), `test_booklet.txt` (questions only), `answer_key.txt` (answers with explanations). The booklet header embeds seed, git SHA, and generation params in a compact monospace “Test Version” line so any form can be reproduced.
+- **Dependencies:** Bundled `./python` (Python 3 compiled as a Cosmopolitan/APE fat binary); no external libraries required.
 
 ## Why It Works (Theory & Inspirations)
 
@@ -26,48 +26,22 @@ ALAN is a fully self-contained artificial-language aptitude assessment inspired
 
 ## Quick Start
 ```bash
-make run      # generates JSON, booklet, and key
+./python main.py --out generated_test.json
+./python render_text.py --in generated_test.json --test-out test_booklet.txt --key-out answer_key.txt
+make run      # generate + render (including PDFs) via the bundled ./python
 cat test_booklet.txt   # view the booklet
 cat answer_key.txt     # view the key
 ```
 
-- **PDF output:** Requires `pandoc` **and** `wkhtmltopdf` (only supported engine). Example:
+- Each run is written to `runs/<timestamp>_seed<...>_hardness<...>/generated_test.json` (and copied to `--out` for convenience). Override the base folder with `--run-dir` or the subfolder name with `--run-name`. Rendered outputs are also mirrored into the run directory so each run folder is self-contained.
+- **PDF output:** Requires `pandoc` **and** `pdflatex` (TeX Live). Example:
   ```bash
-  python3 render_text.py --in generated_test.json \
+  ./python render_text.py --in generated_test.json \
     --test-out test_booklet.txt --key-out answer_key.txt \
     --test-pdf test_booklet.pdf --key-pdf answer_key.pdf
   ```
-  If `pandoc` or `wkhtmltopdf` is missing, the script skips PDF generation and reports the issue. The booklet/key are rendered as Markdown, so bullets/headings convert cleanly to PDF when the tools are present.
+  If `pandoc` or `pdflatex` is missing, the script skips PDF generation and reports the issue. The booklet/key are rendered as Markdown, so bullets/headings convert cleanly to PDF when the tools are present.
 
-## Generate Different Hardness Levels
-
-- **Standard (balanced):**
-  ```bash
-  python3 main.py --seed 424242 --out generated_test.json
-  python3 render_text.py --in generated_test.json --test-out test_booklet.txt --key-out answer_key.txt
-  ```
-- **Hard (recommended for programmers):**
-  ```bash
-  python3 main.py \
-    --seed 424242 \
-    --hardness-multiplier 2.0 \
-    --min-feature-load 5 \
-    --min-irregular 12 --min-irregular-contrast 8 \
-    --min-ditransitive 12 --min-plural 16 --min-adjective 16 --min-fem-plural 10 \
-    --out generated_test.json
-  python3 render_text.py --in generated_test.json --test-out test_booklet.txt --key-out answer_key.txt
-  ```
-- **Very Hard (use cautiously; may need retries):**
-  ```bash
-  python3 main.py \
-    --seed 424242 \
-    --hardness-multiplier 2.5 \
-    --min-feature-load 6 \
-    --min-irregular 14 --min-irregular-contrast 10 \
-    --min-ditransitive 14 --min-plural 18 --min-adjective 18 --min-fem-plural 12 \
-    --out generated_test.json
-  python3 render_text.py --in generated_test.json --test-out test_booklet.txt --key-out answer_key.txt
-  ```
 If PDF engines are missing, PDF output is skipped; the Markdown text still renders correctly.
 
 ## Administering ALAN
@@ -114,14 +88,14 @@ Do **not** use the score as a sole hiring gate; treat it as one data point along
   - No prefix/suffix ordering violations.
   - Tense/number/gender surfaces remain distinct (no collapses).
   - JSON output validated against the canonical feature-structure schema (`meta_schema.py`).
-- **Regeneration:** `main.py` will retry seeds until all properties pass; otherwise it fails loudly. Hardness knobs (`--min-irregular`, `--min-irregular-contrast`, `--min-ditransitive`, `--min-plural`, `--min-adjective`, `--min-fem-plural`, `--min-feature-load`) can be scaled in one go with `--hardness-multiplier` (e.g., `2.0` to roughly double thresholds). All chosen params are recorded in `generation_params` in the JSON and printed in the booklet for reproducibility.
+- **Regeneration:** `main.py` synthesizes a test via a deterministic backtracking generator and only writes output if all property tests pass. Seeds are recorded in `meta` for reproducibility. Use `./python main.py --seed 424242 --out generated_test.json`. Set `BACKTRACK_TIMEOUT` (seconds) to bound search time (default 20s). Use `--run-dir` to choose a base dir (default `runs/`), and `--run-name` to override the subdirectory (otherwise timestamp+params).
 
-## Hypothesis-Driven Synthesis (Parallel Path)
+## Backtracking Synthesis
 
-For an alternate “correct-by-construction” path, you can synthesize tests with Hypothesis strategies instead of RNG + filtering:
-- Install dev dep: `pip install -r requirements-dev.txt` (requires `hypothesis`).
-- Use `hypothesis_generator.synthesize_test(spec, blueprint, concepts, rng_seed=0)` to build a test dict that already respects section constraints and uniqueness (leveraging Hypothesis’s `find` to satisfy invariants).
-- This lives alongside the existing generator; no existing code paths were changed.
+Generation is backed by a deterministic backtracking generator:
+- CLI: `./python main.py --seed 424242 --out generated_test.json` (runs backtracking then `property_tests`).
+- Hardness presets (coverage targets): `--hardness easy|medium|hard|extreme` (default: `medium`). Higher settings raise quotas for irregulars, plurals, adjectives, fem-plurals, and ditransitives while still passing property tests, and limit reuse of identical noun/adjective “clues” to reduce redundancy.
+- Library: `generator.generate_test(spec, blueprint, concepts, rng, seed, git_sha, hardness)` to build a test dict that respects section constraints, global uniqueness, and passes `property_tests.validate_data`.
 
 ## Proctoring Guidance
 
@@ -146,15 +120,14 @@ For an alternate “correct-by-construction” path, you can synthesize tests wi
 
 - `language_spec.py` — Grammar, lexicon, canonical renderer, irregulars.
 - `test_blueprint.py` — Section structure and default blueprint.
-- `test_generator.py` — Feature-based item generation and minimal-pair distractors.
+- `generator.py` — Feature-based item generation and minimal-pair distractors via deterministic backtracking search.
 - `property_tests.py` — Gatekeeper checks (grammaticality, uniqueness, quotas).
 - `meta_schema.py` — Lightweight JSON schema validator for the canonical feature-structure format.
 - `render_text.py` — Converts JSON to booklet and answer key.
 - `main.py` — CLI to generate JSON via the deterministic backtracking generator; fails loudly if property tests fail.
 - `Makefile` — `make run` builds everything; `make clean` removes artifacts.
 - `answer_key.txt`, `test_booklet.txt`, `generated_test.json` — outputs from the last generation.
-- `hypothesis_generator.py` — optional Hypothesis-based synthesizer (builds items by construction rather than RNG filtering). See "Hypothesis-driven synthesis" below.
-- `requirements-dev.txt` — dev-only dependencies (Hypothesis).
+- `python` — bundled Python 3 APE binary used by the Makefile and scripts.
 
 ## Taking the Test (Candidate View)
 
diff --git a/generator.py b/generator.py
new file mode 100644
index 0000000..239712a
--- /dev/null
+++ b/generator.py
@@ -0,0 +1,613 @@
+"""Unified item and test generator using deterministic backtracking."""
+from __future__ import annotations
+
+from dataclasses import dataclass, asdict, replace
+from typing import Dict, List, Optional, Set
+import random
+import os
+import time
+
+from language_spec import (
+    LanguageSpec,
+    SentenceFeatures,
+    NPFeature,
+    AGENT,
+    RECIPIENT,
+    THEME,
+    realize_sentence,
+    english_gloss,
+)
+from semantic import meanings_equal, to_meaning, semantic_distance
+from grammar_check import is_grammatical
+from test_blueprint import TestBlueprint, Concept, TRANSLATE_TO_LANG, TRANSLATE_FROM_LANG
+from property_tests import (
+    MIN_IRREG_USE,
+    MIN_DITRANSITIVE,
+    MIN_PLURAL_ITEMS,
+    MIN_ADJECTIVE_ITEMS,
+    MIN_FEM_PLURAL_ITEMS,
+)
+
+# ---------------------------------------------------------------------------
+# Data structures
+
+
+@dataclass
+class Option:
+    label: str
+    text: str
+    is_correct: bool
+    explanation: str
+    features: SentenceFeatures
+
+
+@dataclass
+class Question:
+    id: str
+    item_type: str
+    section_id: str
+    concepts: List[str]
+    stem: str
+    options: List[Option]
+    difficulty_score: float
+
+
+def question_valid(q: Question, spec: LanguageSpec) -> bool:
+    if len(q.options) != 4:
+        return False
+    if sum(opt.is_correct for opt in q.options) != 1:
+        return False
+    meanings = []
+    glosses = []
+    correct_meaning = None
+    for opt in q.options:
+        m = to_meaning(opt.features)
+        g = english_gloss(opt.features)
+        if opt.is_correct:
+            correct_meaning = m
+        for existing in meanings:
+            if meanings_equal(m, existing):
+                return False
+        if g in glosses:
+            return False
+        meanings.append(m)
+        glosses.append(g)
+        if not is_grammatical(asdict(opt), spec):
+            return False
+    if correct_meaning is None:
+        return False
+    for opt in q.options:
+        if opt.is_correct:
+            continue
+        dist = semantic_distance(to_meaning(opt.features), correct_meaning)
+        if dist != 1:
+            return False
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Feature utilities
+
+
+def np_features(
+    noun_id: str,
+    role: str,
+    feminine: Optional[bool] = None,
+    plural: bool = False,
+    adjectives: Optional[List[str]] = None,
+    use_irregular: bool = True,
+) -> NPFeature:
+    feminine_default = noun_id in {"woman", "girl"}
+    fem = feminine_default if feminine is None else feminine
+    if noun_id not in {"woman", "girl"}:
+        fem = False
+    return NPFeature(
+        noun_id=noun_id,
+        feminine=fem,
+        plural=plural,
+        adjectives=adjectives or [],
+        role=role,
+        use_irregular=use_irregular,
+    )
+
+
+def sentence_features(
+    verb_id: str,
+    tense: str,
+    subj: NPFeature,
+    obj1: NPFeature,
+    obj2: Optional[NPFeature] = None,
+    use_irregular_verb: bool = True,
+) -> SentenceFeatures:
+    return SentenceFeatures(subject=subj, obj1=obj1, obj2=obj2, verb_id=verb_id, tense=tense, use_irregular_verb=use_irregular_verb)
+
+
+def section_constraints(unlocked: set[str]):
+    allow_plural = "NOUN_NUMBER_MARKING" in unlocked
+    allow_feminine = "NOUN_GENDER_MARKING" in unlocked
+    allow_past = "VERB_TENSE_MARKING" in unlocked
+    allow_ditransitive = allow_plural or allow_feminine
+    allow_irregulars = allow_past or allow_plural
+    allow_adjectives = "NP_ORDER" in unlocked
+
+    def allowed_people(include_feminine: bool) -> List[str]:
+        base = ["man", "boy"]
+        if include_feminine:
+            base += ["woman", "girl"]
+        return base
+
+    return type(
+        "Cons",
+        (),
+        {
+            "allowed_verbs": ["see", "chase"] + (["give"] if allow_ditransitive else []),
+            "allowed_agent_nouns": allowed_people(allow_feminine),
+            "allowed_recipient_nouns": allowed_people(allow_feminine),
+            "allowed_theme_nouns": ["ball", "house"],
+            "allow_plural": allow_plural,
+            "allow_feminine": allow_feminine,
+            "allow_past": allow_past,
+            "allow_ditransitive": allow_ditransitive,
+            "allow_irregulars": allow_irregulars,
+            "allow_adjectives": allow_adjectives,
+        },
+    )()
+
+
+# ---------------------------------------------------------------------------
+# Distractors and item generation
+
+
+def build_distractors(spec: LanguageSpec, sf: SentenceFeatures, rng: random.Random) -> List[Option]:
+    target_meaning = to_meaning(sf)
+    correct_text = realize_sentence(spec, sf)
+    seen_surfaces = {correct_text}
+    seen_meanings = {target_meaning}
+    distractors: List[Option] = []
+
+    def clone_sf(orig: SentenceFeatures) -> SentenceFeatures:
+        return SentenceFeatures(
+            subject=replace(orig.subject),
+            obj1=replace(orig.obj1),
+            obj2=replace(orig.obj2) if orig.obj2 else None,
+            verb_id=orig.verb_id,
+            tense=orig.tense,
+            use_irregular_verb=orig.use_irregular_verb,
+        )
+
+    def add_if_valid(cand_sf: SentenceFeatures, explanation: str) -> None:
+        nonlocal distractors
+        text = realize_sentence(spec, cand_sf)
+        meaning = to_meaning(cand_sf)
+        dist = semantic_distance(meaning, target_meaning)
+        if text in seen_surfaces:
+            return
+        if any(meanings_equal(meaning, m) for m in seen_meanings):
+            return
+        if dist != 1:
+            return
+        opt = Option(label="", text=text, is_correct=False, explanation=explanation, features=cand_sf)
+        if not is_grammatical(asdict(opt), spec):
+            return
+        seen_surfaces.add(text)
+        seen_meanings.add(meaning)
+        distractors.append(opt)
+
+    flips = []
+    flips.append(("Tense flip.", lambda base: replace(base, tense="PAST" if base.tense == "PRES" else "PRES")))
+
+    def flip_obj1_number(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)
+        new.obj1 = replace(new.obj1, plural=not new.obj1.plural)
+        return new
+
+    flips.append(("Number flip (receiver).", flip_obj1_number))
+
+    def flip_subj_number(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)
+        new.subject = replace(new.subject, plural=not new.subject.plural)
+        return new
+
+    flips.append(("Number flip (doer).", flip_subj_number))
+
+    def flip_obj2_number(base: SentenceFeatures) -> SentenceFeatures:
+        if base.obj2 is None:
+            return base
+        new = clone_sf(base)
+        new.obj2 = replace(new.obj2, plural=not new.obj2.plural)
+        return new
+
+    flips.append(("Number flip (theme).", flip_obj2_number))
+
+    def flip_obj1_gender(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)
+        if new.obj1.noun_id not in {"woman", "girl"}:
+            return new
+        new.obj1 = replace(new.obj1, feminine=not new.obj1.feminine)
+        return new
+
+    flips.append(("Gender flip (receiver).", flip_obj1_gender))
+
+    def flip_obj1_adj(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)
+        if new.obj1.adjectives:
+            new.obj1 = replace(new.obj1, adjectives=[])
+        else:
+            new.obj1 = replace(new.obj1, adjectives=["red"])
+        return new
+
+    flips.append(("Adjective scope change.", flip_obj1_adj))
+
+    def flip_obj1_role(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)
+        new_role = THEME if new.obj1.role == RECIPIENT else RECIPIENT
+        new.obj1 = replace(new.obj1, role=new_role)
+        return new
+
+    flips.append(("Role flip (receiver/theme).", flip_obj1_role))
+
+    def flip_irregular(base: SentenceFeatures) -> SentenceFeatures:
+        new = clone_sf(base)
+        if new.obj1.noun_id == "boy" and new.obj1.plural:
+            new.obj1 = replace(new.obj1, use_irregular=not new.obj1.use_irregular)
+        elif new.verb_id == "chase" and new.tense == "PAST":
+            new = replace(new, use_irregular_verb=not new.use_irregular_verb)
+        return new
+
+    flips.append(("Irregular vs regular.", flip_irregular))
+
+    rng.shuffle(flips)
+    for expl, fn in flips:
+        if len(distractors) >= 3:
+            break
+        add_if_valid(fn(sf), expl)
+    attempts = 0
+    while len(distractors) < 3 and attempts < 20:
+        expl, fn = rng.choice(flips)
+        add_if_valid(fn(sf), expl)
+        attempts += 1
+    return distractors if len(distractors) == 3 else []
+
+
+def _base_features(cons, rng: random.Random) -> SentenceFeatures:
+    verb_id = rng.choice(cons.allowed_verbs)
+    tense = "PAST" if (cons.allow_past and rng.random() < 0.4) else "PRES"
+    subj = np_features(
+        noun_id=rng.choice(cons.allowed_agent_nouns),
+        role=AGENT,
+        plural=cons.allow_plural and rng.random() < 0.4,
+        adjectives=["tall"] if (cons.allow_adjectives and rng.random() < 0.6) else [],
+    )
+    if verb_id == "give":
+        obj1 = np_features(
+            noun_id=rng.choice(cons.allowed_recipient_nouns),
+            role=RECIPIENT,
+            plural=cons.allow_plural and rng.random() < 0.4,
+            adjectives=["fast"] if (cons.allow_adjectives and rng.random() < 0.4) else [],
+        )
+        obj2 = np_features(
+            noun_id=rng.choice(cons.allowed_theme_nouns),
+            role=THEME,
+            plural=cons.allow_plural and rng.random() < 0.5,
+            adjectives=["red"] if (cons.allow_adjectives and rng.random() < 0.6) else [],
+        )
+    else:
+        obj1 = np_features(
+            noun_id=rng.choice(cons.allowed_recipient_nouns),
+            role=RECIPIENT,
+            plural=cons.allow_plural and rng.random() < 0.5,
+            adjectives=["red"] if (cons.allow_adjectives and rng.random() < 0.6) else [],
+        )
+        obj2 = None
+    return sentence_features(verb_id=verb_id, tense=tense, subj=subj, obj1=obj1, obj2=obj2)
+
+
+def generate_item(
+    spec: LanguageSpec,
+    concepts: List[str],
+    section_id: str,
+    item_type: str,
+    rng: random.Random,
+    sf_override: Optional[SentenceFeatures] = None,
+    constraints=None,
+) -> Question:
+    cons = constraints
+    sf = sf_override or _base_features(cons, rng)
+    correct_text = realize_sentence(spec, sf)
+    gloss = english_gloss(sf)
+    distractors = build_distractors(spec, sf, rng)
+    options = [Option(label="", text=correct_text, is_correct=True, explanation="Correct", features=sf)] + distractors
+    texts = set()
+    unique_options = []
+    for opt in options:
+        if opt.text in texts:
+            continue
+        texts.add(opt.text)
+        unique_options.append(opt)
+    options = unique_options[:4]
+    rng.shuffle(options)
+    labels = ["A", "B", "C", "D"]
+    for i, opt in enumerate(options):
+        opt.label = labels[i]
+
+    if item_type == TRANSLATE_TO_LANG:
+        stem = f"Translate into the language: {gloss}"
+    elif item_type == TRANSLATE_FROM_LANG:
+        stem = f"What does this sentence mean? {correct_text}"
+    else:
+        stem = f"Use the rules to choose the correct sentence. Target meaning: {gloss}"
+
+    return Question(
+        id=f"{section_id}_{rng.randrange(10_000)}",
+        item_type=item_type,
+        section_id=section_id,
+        concepts=concepts,
+        stem=stem,
+        options=options,
+        difficulty_score=0.5,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Backtracking synthesis
+
+
+def _question_attrs(q) -> Dict[str, bool]:
+    correct = next(o for o in q.options if o.is_correct)
+    feats = correct.features
+    clue_keys = [
+        ("AGENT", feats.subject.noun_id, tuple(feats.subject.adjectives)),
+        ("RECIPIENT", feats.obj1.noun_id, tuple(feats.obj1.adjectives)),
+    ]
+    if feats.obj2:
+        clue_keys.append(("THEME", feats.obj2.noun_id, tuple(feats.obj2.adjectives)))
+    return {
+        "meaning_key": repr(to_meaning(feats)),
+        "surface": correct.text,
+        "irreg_noun": feats.obj1.noun_id == "boy" and feats.obj1.plural and feats.obj1.use_irregular,
+        "irreg_verb": feats.verb_id == "chase" and feats.tense == "PAST" and feats.use_irregular_verb,
+        "ditransitive": feats.obj2 is not None,
+        "plural": any(np.plural for np in [feats.subject, feats.obj1] + ([feats.obj2] if feats.obj2 else [])),
+        "adjective": any(np.adjectives for np in [feats.subject, feats.obj1] + ([feats.obj2] if feats.obj2 else [])),
+        "fem_plural": feats.obj1.feminine and feats.obj1.plural if hasattr(feats.obj1, "feminine") else False,
+        "clue_keys": clue_keys,
+    }
+
+
+def _generate_candidates_for_section(spec: LanguageSpec, section, unlocked: set[str], rng: random.Random, max_trials: int = 20000) -> List[Dict]:
+    cons = section_constraints(unlocked)
+    candidates: List[Dict] = []
+    seen_meaning: Set[str] = set()
+    seen_surface: Set[str] = set()
+    verbs = cons.allowed_verbs
+    tenses = ["PRES"] + (["PAST"] if cons.allow_past else [])
+    subj_nouns = cons.allowed_agent_nouns
+    obj1_nouns = cons.allowed_recipient_nouns
+    obj2_nouns = cons.allowed_theme_nouns if cons.allow_ditransitive else []
+    subj_pl_flags = [False, True] if cons.allow_plural else [False]
+    obj1_pl_flags = [False, True] if cons.allow_plural else [False]
+    adj_flags = [False, True] if cons.allow_adjectives else [False]
+
+    for verb in verbs:
+        for tense in tenses:
+            for subj_n in subj_nouns:
+                for obj1_n in obj1_nouns:
+                    obj2_loop = obj2_nouns if verb == "give" else [None]
+                    for obj2_n in obj2_loop:
+                        for subj_pl in subj_pl_flags:
+                            for obj1_pl in obj1_pl_flags:
+                                for subj_adj_flag in adj_flags:
+                                    for obj1_adj_flag in adj_flags:
+                                        subj_adj = ["tall"] if subj_adj_flag else []
+                                        obj1_adj = ["red"] if obj1_adj_flag else []
+                                        subj = np_features(subj_n, AGENT, plural=subj_pl, adjectives=subj_adj)
+                                        obj1 = np_features(obj1_n, RECIPIENT, plural=obj1_pl, adjectives=obj1_adj)
+                                        obj2 = None
+                                        if obj2_n:
+                                            obj2 = np_features(obj2_n, THEME, plural=False, adjectives=["red"] if cons.allow_adjectives else [])
+                                        sf = sentence_features(verb, tense, subj, obj1, obj2, use_irregular_verb=True)
+                                        for item_type in section.item_types:
+                                            q = generate_item(
+                                                spec,
+                                                section.focus_concepts,
+                                                section.id,
+                                                item_type,
+                                                rng,
+                                                constraints=cons,
+                                                sf_override=sf,
+                                            )
+                                            if not question_valid(q, spec):
+                                                continue
+                                            attrs = _question_attrs(q)
+                                            if attrs["meaning_key"] in seen_meaning or attrs["surface"] in seen_surface:
+                                                continue
+                                            seen_meaning.add(attrs["meaning_key"])
+                                            seen_surface.add(attrs["surface"])
+                                            candidates.append({"question": q, **attrs})
+                                            if len(candidates) >= max_trials:
+                                                return candidates
+    return candidates
+
+
+def _max_possible(attr: str, section_idx: int, section_slots: List[int], pools: List[List[Dict]], used_meaning: Set[str], used_surface: Set[str], chosen: List[List[Dict]]) -> int:
+    total = 0
+    for s in range(section_idx, len(pools)):
+        remaining_slots = section_slots[s] - len(chosen[s])
+        if remaining_slots <= 0:
+            continue
+        avail = [
+            c
+            for c in pools[s]
+            if c["meaning_key"] not in used_meaning and c["surface"] not in used_surface
+        ]
+        count_attr = sum(1 for c in avail if c[attr])
+        total += min(remaining_slots, count_attr)
+    return total
+
+
+def _backtrack(
+    section_idx: int,
+    section_slots: List[int],
+    pools: List[List[Dict]],
+    used_meaning: Set[str],
+    used_surface: Set[str],
+    clue_counts: Dict[tuple, int],
+    max_clue_reuse: int,
+    quotas: Dict[str, int],
+    chosen: List[List[Dict]],
+    start_time: float,
+    max_seconds: Optional[float],
+) -> bool:
+    if max_seconds is not None and (time.time() - start_time) > max_seconds:
+        return False
+    if section_idx == len(pools):
+        return all(v <= 0 for v in quotas.values())
+    if len(chosen[section_idx]) == section_slots[section_idx]:
+        return _backtrack(section_idx + 1, section_slots, pools, used_meaning, used_surface, clue_counts, max_clue_reuse, quotas, chosen, start_time, max_seconds)
+
+    for attr, remaining in quotas.items():
+        if remaining <= 0:
+            continue
+        max_avail = _max_possible(attr, section_idx, section_slots, pools, used_meaning, used_surface, chosen)
+        if remaining > max_avail:
+            return False
+
+    avail = [
+        c
+        for c in pools[section_idx]
+        if c["meaning_key"] not in used_meaning
+        and c["surface"] not in used_surface
+        and all(clue_counts.get(k, 0) < max_clue_reuse for k in c["clue_keys"])
+    ]
+    if not avail:
+        return False
+    def score(c):
+        return sum(1 for attr, rem in quotas.items() if rem > 0 and c[attr])
+    avail.sort(key=score, reverse=True)
+    avail = avail[:100]
+
+    for cand in avail:
+        new_quotas = quotas.copy()
+        for attr in ["irreg_noun", "irreg_verb", "ditransitive", "plural", "adjective", "fem_plural"]:
+            if cand[attr]:
+                new_quotas[attr] = max(0, new_quotas[attr] - 1)
+        used_meaning.add(cand["meaning_key"])
+        used_surface.add(cand["surface"])
+        for k in cand["clue_keys"]:
+            clue_counts[k] = clue_counts.get(k, 0) + 1
+        chosen[section_idx].append(cand)
+        if _backtrack(section_idx, section_slots, pools, used_meaning, used_surface, clue_counts, max_clue_reuse, new_quotas, chosen, start_time, max_seconds):
+            return True
+        chosen[section_idx].pop()
+        used_meaning.remove(cand["meaning_key"])
+        used_surface.remove(cand["surface"])
+        for k in cand["clue_keys"]:
+            clue_counts[k] = clue_counts.get(k, 0) - 1
+    return False
+
+
+def generate_test(
+    spec: LanguageSpec,
+    blueprint: TestBlueprint,
+    concepts: Dict[str, Concept],
+    rng: random.Random,
+    seed: int | None = None,
+    git_sha: str | None = None,
+    hardness: str = "medium",
+) -> Dict:
+    max_seconds = float(os.environ.get("BACKTRACK_TIMEOUT", "20"))
+    profiles = {
+        "easy": {"irreg": 6, "ditransitive": 8, "plural": 12, "adjective": 12, "fem_plural": 6, "max_clue_reuse": 10},
+        "medium": {"irreg": 6, "ditransitive": 8, "plural": 12, "adjective": 12, "fem_plural": 6, "max_clue_reuse": 10},
+        "hard": {"irreg": 7, "ditransitive": 9, "plural": 14, "adjective": 14, "fem_plural": 7, "max_clue_reuse": 8},
+        "extreme": {"irreg": 8, "ditransitive": 10, "plural": 16, "adjective": 16, "fem_plural": 8, "max_clue_reuse": 6},
+    }
+    if hardness not in profiles:
+        raise SystemExit(f"Unknown hardness '{hardness}'. Choose from {list(profiles.keys())}.")
+    quotas = profiles[hardness].copy()
+    max_clue_reuse = quotas.pop("max_clue_reuse")
+    total_items = sum(section.num_items for section in blueprint.sections)
+    for k, v in quotas.items():
+        quotas[k] = min(v, total_items)
+    quotas["irreg"] = max(quotas["irreg"], MIN_IRREG_USE)
+    quotas["ditransitive"] = max(quotas["ditransitive"], MIN_DITRANSITIVE)
+    quotas["plural"] = max(quotas["plural"], MIN_PLURAL_ITEMS)
+    quotas["adjective"] = max(quotas["adjective"], MIN_ADJECTIVE_ITEMS)
+    quotas["fem_plural"] = max(quotas["fem_plural"], MIN_FEM_PLURAL_ITEMS)
+
+    pools: List[List[Dict]] = []
+    section_slots: List[int] = []
+    unlocked: set[str] = set()
+    for section in blueprint.sections:
+        unlocked |= set(section.introduce_concepts)
+        cand = _generate_candidates_for_section(spec, section, unlocked, rng)
+        if len(cand) < section.num_items:
+            raise SystemExit(f"Insufficient candidates for section {section.id}")
+        rng.shuffle(cand)
+        pools.append(cand)
+        section_slots.append(section.num_items)
+
+    quotas_expanded = {
+        "irreg_noun": quotas["irreg"],
+        "irreg_verb": quotas["irreg"],
+        "ditransitive": quotas["ditransitive"],
+        "plural": quotas["plural"],
+        "adjective": quotas["adjective"],
+        "fem_plural": quotas["fem_plural"],
+    }
+    used_meaning: Set[str] = set()
+    used_surface: Set[str] = set()
+    chosen: List[List[Dict]] = [[] for _ in pools]
+    clue_counts: Dict[tuple, int] = {}
+    start_time = time.time()
+
+    if not _backtrack(0, section_slots, pools, used_meaning, used_surface, clue_counts, max_clue_reuse, quotas_expanded, chosen, start_time, max_seconds):
+        raise SystemExit("Backtracking generator could not satisfy quotas.")
+
+    sections_out = []
+    question_counter = 1
+    unlocked = set()
+    for sec_idx, section in enumerate(blueprint.sections):
+        unlocked |= set(section.introduce_concepts)
+        section_intro = [f"{concepts[cid].description_en}" for cid in section.introduce_concepts]
+        questions = []
+        for cand in chosen[sec_idx]:
+            q = cand["question"]
+            qd = asdict(q)
+            qd["number"] = question_counter
+            question_counter += 1
+            questions.append(qd)
+        sections_out.append(
+            {
+                "id": section.id,
+                "introduce_concepts": section.introduce_concepts,
+                "intro_text": section_intro,
+                "questions": questions,
+            }
+        )
+
+    meta = {
+        "version": "backtrack-0.2",
+        "description": "Alan's Language Aptitude iNstrument (ALAN) synthesized via backtracking search",
+        "seed": seed,
+        "git_sha": git_sha,
+        "generation_params": {"hardness": hardness, "max_clue_reuse": max_clue_reuse},
+        "dictionary": spec.lexicon,
+        "instructions": (
+            "You will see a brief dictionary, a handful of rules, and examples. Words may take small "
+            "prefixes or suffixes to mark who does what or when it happens—copy these patterns from the examples. "
+            "You do not need linguistics training; apply the rules logically. In every question exactly one option (A–D) "
+            "matches the target meaning. Correct answers always follow the stated word order: doer (subject), receiver (object), verb."
+        ),
+        "rules": [
+            "Word order: DOER RECEIVER VERB (SOV). For 'give': doer, recipient, theme, verb.",
+            "Adjectives follow the noun they describe.",
+            "Prefix stacking: na (receiver) + mem (feminine) + leko (plural) + noun; doer adds suffix mur.",
+            "Feminine plural: memleko + noun (e.g., memlekorema).",
+            "Irregulars: verb 'ror' past = 'rontmimu'; plural of 'tul' = 'letul'.",
+            "Receiver marker na- applies to the whole noun phrase (e.g., namemlekorema).",
+            "Past tense: verb takes suffix 'mimu' unless irregular.",
+        ],
+    }
+    test_dict = {"meta": meta, "sections": sections_out}
+    return test_dict
diff --git a/hypothesis_generator.py b/hypothesis_generator.py
deleted file mode 100644
index 6f63aa9..0000000
--- a/hypothesis_generator.py
+++ /dev/null
@@ -1,217 +0,0 @@
-"""Hypothesis-driven synthesis of ALAN tests (parallel to the existing generator).
-
-This module constructs questions/tests by *finding* examples that satisfy the same
-invariants enforced elsewhere, rather than generating and discarding with RNG.
-It leaves the existing generator untouched.
-"""
-from __future__ import annotations
-
-from dataclasses import replace
-from typing import Dict, List, Optional
-import random
-
-from hypothesis import strategies as st
-from hypothesis import find, settings
-
-from language_spec import (
-    LanguageSpec,
-    SentenceFeatures,
-    NPFeature,
-    AGENT,
-    RECIPIENT,
-    THEME,
-    realize_sentence,
-    english_gloss,
-)
-from test_blueprint import TestBlueprint, Concept, SectionBlueprint
-from test_generator import (
-    Option,
-    Question,
-    question_valid,
-    build_distractors,
-    section_constraints,
-    respects_constraints,
-    sentence_features,
-)
-from semantic import to_meaning
-
-ADJ_POOL = ["tall", "red", "big", "fast"]
-
-
-def _np_strategy(cons, role: str) -> st.SearchStrategy[NPFeature]:
-    """Generate NPFeature respecting section constraints."""
-    if role == AGENT:
-        nouns = cons.allowed_agent_nouns
-    elif role == THEME:
-        nouns = cons.allowed_theme_nouns
-    else:
-        nouns = cons.allowed_recipient_nouns
-    adj_strategy = st.just([])
-    if cons.allow_adjectives:
-        adj_strategy = st.one_of(
-            st.just([]),
-            st.lists(st.sampled_from(ADJ_POOL), min_size=1, max_size=1),
-        )
-    return st.builds(
-        NPFeature,
-        noun_id=st.sampled_from(nouns),
-        feminine=st.booleans() if cons.allow_feminine else st.just(False),
-        plural=st.booleans() if cons.allow_plural else st.just(False),
-        adjectives=adj_strategy,
-        role=st.just(role),
-        use_irregular=st.booleans() if cons.allow_irregulars else st.just(False),
-    )
-
-
-def _sentence_strategy(cons) -> st.SearchStrategy[SentenceFeatures]:
-    """SentenceFeatures within section constraints (includes valence/tense)."""
-    verb_strategy = st.sampled_from(cons.allowed_verbs)
-    def _build(verb_id: str, subj: NPFeature, obj1: NPFeature, obj2: Optional[NPFeature], tense: str, use_irregular_verb: bool) -> SentenceFeatures:
-        return sentence_features(
-            verb_id=verb_id,
-            tense=tense,
-            subj=subj,
-            obj1=obj1,
-            obj2=obj2,
-            use_irregular_verb=use_irregular_verb,
-        )
-
-    def _obj2_strategy(verb_id: str) -> st.SearchStrategy[Optional[NPFeature]]:
-        if verb_id != "give" or not cons.allow_ditransitive:
-            return st.just(None)
-        return _np_strategy(cons, THEME)
-
-    def _tense_strategy() -> st.SearchStrategy[str]:
-        if cons.allow_past:
-            return st.sampled_from(["PRES", "PAST"])
-        return st.just("PRES")
-
-    return st.deferred(
-        lambda: st.builds(
-            _build,
-            verb_strategy,
-            _np_strategy(cons, AGENT),
-            _np_strategy(cons, RECIPIENT),
-            st.none(),
-            _tense_strategy(),
-            st.booleans() if cons.allow_irregulars else st.just(False),
-        ).flatmap(
-            # attach obj2 based on verb to keep valence aligned
-            lambda sf: st.builds(
-                _build,
-                st.just(sf.verb_id),
-                st.just(sf.subject),
-                st.just(sf.obj1),
-                _obj2_strategy(sf.verb_id),
-                st.just(sf.tense),
-                st.just(sf.use_irregular_verb),
-            )
-        )
-    )
-
-
-def _question_from_sf(spec: LanguageSpec, sf: SentenceFeatures, section_id: str, concepts: List[str], item_type: str, rng: random.Random, constraints) -> Optional[Question]:
-    """Create a Question from a SentenceFeatures, returning None if invalid."""
-    correct_text = realize_sentence(spec, sf)
-    gloss = english_gloss(sf)
-    distractors = build_distractors(spec, sf, rng, constraints=constraints)
-    options = [Option(label="", text=correct_text, is_correct=True, explanation="Correct", features=sf)] + distractors
-    labels = ["A", "B", "C", "D"]
-    rng.shuffle(options)
-    for i, opt in enumerate(options):
-        opt.label = labels[i]
-    stem = f"Use the rules to choose the correct sentence. Target meaning: {gloss}" if item_type != "TRANSLATE_TO_LANG" else f"Translate into the language: {gloss}"
-    q = Question(
-        id=f"{section_id}_{rng.randrange(10_000)}",
-        item_type=item_type,
-        section_id=section_id,
-        concepts=concepts,
-        stem=stem,
-        options=options,
-        difficulty_score=0.5,
-    )
-    return q if question_valid(q, spec) else None
-
-
-def synthesize_question(
-    spec: LanguageSpec,
-    cons,
-    section: SectionBlueprint,
-    item_type: str,
-    seen_meanings: set,
-    seen_surfaces: set,
-    rng_seed: int = 0,
-) -> Question:
-    """Find a single valid question respecting uniqueness constraints."""
-    sf_strategy = _sentence_strategy(cons)
-
-    def _valid_sf(sf: SentenceFeatures) -> bool:
-        rng = random.Random(rng_seed)
-        if not respects_constraints(sf, cons):
-            return False
-        q = _question_from_sf(spec, sf, section.id, section.focus_concepts, item_type, rng, cons)
-        if q is None or not q.options:
-            return False
-        correct = next(o for o in q.options if o.is_correct)
-        meaning_key = repr(to_meaning(correct.features))
-        surface = correct.text
-        if meaning_key in seen_meanings:
-            return False
-        if surface in seen_surfaces:
-            return False
-        return True
-
-    sf_example = find(sf_strategy, _valid_sf, settings=settings(max_examples=500, database=None))
-    # Recreate question with fresh RNG to align labels
-    rng = random.Random(rng_seed + 1)
-    q = _question_from_sf(spec, sf_example, section.id, section.focus_concepts, item_type, rng, cons)
-    if q is None:
-        raise AssertionError("Hypothesis produced an invalid question unexpectedly.")
-    correct = next(o for o in q.options if o.is_correct)
-    meaning_key = repr(to_meaning(correct.features))
-    surface = correct.text
-    seen_meanings.add(meaning_key)
-    seen_surfaces.add(surface)
-    return q
-
-
-def synthesize_test(
-    spec: LanguageSpec,
-    blueprint: TestBlueprint,
-    concepts: Dict[str, Concept],
-    rng_seed: int = 0,
-) -> Dict:
-    """Build a full test using Hypothesis to find valid items by construction."""
-    sections_out = []
-    question_counter = 1
-    seen_meanings: set = set()
-    seen_surfaces: set = set()
-    unlocked: set[str] = set()
-    for section in blueprint.sections:
-        unlocked |= set(section.introduce_concepts)
-        cons = section_constraints(unlocked)
-        questions: List[Question] = []
-        section_intro = [f"{concepts[cid].description_en}" for cid in section.introduce_concepts]
-        for idx in range(section.num_items):
-            item_type = section.item_types[idx % len(section.item_types)]
-            q = synthesize_question(spec, cons, section, item_type, seen_meanings, seen_surfaces, rng_seed + idx)
-            q_dict = asdict(q)
-            q_dict["number"] = question_counter
-            question_counter += 1
-            questions.append(q_dict)
-        sections_out.append(
-            {
-                "id": section.id,
-                "introduce_concepts": section.introduce_concepts,
-                "intro_text": section_intro,
-                "questions": questions,
-            }
-        )
-    return {
-        "meta": {
-            "version": "hypothesis-0.1",
-            "description": "Alan's Language Aptitude iNstrument (ALAN) synthesized with Hypothesis",
-            "seed": rng_seed,
-        },
-        "sections": sections_out,
-    }
diff --git a/main.py b/main.py
index 360a0c1..13ca49c 100644
--- a/main.py
+++ b/main.py
@@ -1,35 +1,40 @@
-"""CLI entry point for generating a JSON DLAB-style test."""
+"""CLI entry point for generating a JSON DLAB-style test (backtracking generator)."""
 from __future__ import annotations
 
 import argparse
 import json
 import random
 import subprocess
+import os
+from datetime import datetime
 
 from language_spec import generate_language_instance
 from test_blueprint import get_default_concepts, get_default_blueprint
-from test_generator import generate_test
 from property_tests import validate_data
+from generator import generate_test
 
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Generate an artificial language test JSON.")
-    parser.add_argument("--seed", type=int, help="Random seed for reproducibility.")
-    parser.add_argument("--out", dest="out_path", default="generated_test.json", help="Output path.")
-    parser.add_argument("--min-irregular", type=int, default=6, help="Minimum irregular uses.")
+    parser.add_argument("--seed", type=int, help="Random seed for reproducibility.", default=None)
+    parser.add_argument("--out", dest="out_path", default="generated_test.json", help="Output file name (written into the run dir and also copied to this path).")
     parser.add_argument(
-        "--min-irregular-contrast", type=int, default=4, help="Minimum irregular contrast items (correct vs distractor)."
+        "--hardness",
+        choices=["easy", "medium", "hard", "extreme"],
+        default="medium",
+        help="Preset coverage targets (medium = default quotas).",
     )
-    parser.add_argument("--min-ditransitive", type=int, default=8, help="Minimum ditransitive items.")
-    parser.add_argument("--min-plural", type=int, default=12, help="Minimum plural-bearing items.")
-    parser.add_argument("--min-adjective", type=int, default=12, help="Minimum adjective-bearing items.")
-    parser.add_argument("--min-fem-plural", type=int, default=6, help="Minimum feminine-plural items.")
-    parser.add_argument("--min-feature-load", type=int, default=1, help="Minimum feature load per item (hardness).")
     parser.add_argument(
-        "--hardness-multiplier",
-        type=float,
-        default=1.0,
-        help="Scale all coverage/complexity thresholds (e.g., 2.0 ~ twice as hard).",
+        "--run-dir",
+        dest="run_dir",
+        default=None,
+        help="Base directory to create the run subdir under. Defaults to ./runs.",
+    )
+    parser.add_argument(
+        "--run-name",
+        dest="run_name",
+        default=None,
+        help="Optional explicit subdirectory name; otherwise uses timestamp + params.",
     )
     return parser.parse_args()
 
@@ -46,47 +51,31 @@ def main() -> None:
     actual_seed = args.seed if args.seed is not None else random.randint(0, 1_000_000)
     concepts = get_default_concepts()
     blueprint = get_default_blueprint()
-    max_attempts = 200
-    total_items = sum(s.num_items for s in blueprint.sections)
-
-    def scale(base: int) -> int:
-        return min(total_items, int(round(base * args.hardness_multiplier)))
-
-    req_irregular = max(1, scale(args.min_irregular))
-    req_irregular_contrast = max(1, scale(args.min_irregular_contrast))
-    req_ditransitive = max(1, min(scale(args.min_ditransitive), total_items))
-    feasible_irregular = max(1, min(req_irregular, max(0, total_items - req_ditransitive)))
-    max_feature_load = 12  # estimated feasible max given grammar
-    params = {
-        "min_irregular": feasible_irregular,
-        "min_irregular_contrast": min(req_irregular_contrast, feasible_irregular),
-        "min_irregular_distractor": max(3, min(req_irregular_contrast, feasible_irregular)),
-        "min_ditransitive": req_ditransitive,
-        "min_plural": max(1, min(scale(args.min_plural), total_items)),
-        "min_adjective": max(1, min(scale(args.min_adjective), total_items)),
-        "min_fem_plural": max(1, min(scale(args.min_fem_plural), total_items)),
-        "min_feature_load": max(
-            args.min_feature_load,
-            min(max_feature_load, int(round(args.min_feature_load * args.hardness_multiplier))),
-        ),
-        "hardness_multiplier": args.hardness_multiplier,
-    }
 
     git_sha = _git_sha()
-    for attempt in range(max_attempts):
-        rng = random.Random(actual_seed + attempt)
-        spec = generate_language_instance(actual_seed + attempt)
-        test_dict = generate_test(
-            spec, blueprint, concepts, rng, seed=actual_seed + attempt, params=params, git_sha=git_sha
-        )
-        if validate_data(test_dict, spec, overrides=params, quiet=True):
-            with open(args.out_path, "w", encoding="utf-8") as f:
-                json.dump(test_dict, f, indent=2)
-            print(f"Generated test JSON at {args.out_path} (seed {actual_seed + attempt})")
-            # final visible validation for transparency
-            validate_data(test_dict, spec, overrides=params, quiet=False)
-            return
-    raise SystemExit(f"Property tests failed after {max_attempts} attempts; test not written.")
+    spec = generate_language_instance(actual_seed)
+    rng = random.Random(actual_seed)
+    test_dict = generate_test(spec, blueprint, concepts, rng, seed=actual_seed, git_sha=git_sha, hardness=args.hardness)
+    if not validate_data(test_dict, spec, quiet=False):
+        raise SystemExit("Property tests failed; test not written.")
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_dir = args.run_dir or "runs"
+    run_name = args.run_name or f"{timestamp}_seed{actual_seed}_hardness{args.hardness}"
+    run_dir = os.path.join(base_dir, run_name)
+    os.makedirs(run_dir, exist_ok=True)
+    test_dict.setdefault("meta", {})
+    test_dict["meta"]["run_dir"] = os.path.abspath(run_dir)
+    test_dict["meta"]["run_name"] = run_name
+    test_dict["meta"]["run_base_dir"] = os.path.abspath(base_dir)
+    out_json = os.path.join(run_dir, os.path.basename(args.out_path))
+    with open(out_json, "w", encoding="utf-8") as f:
+        json.dump(test_dict, f, indent=2)
+    # also copy to requested path (relative to cwd) for compatibility
+    if args.out_path and os.path.dirname(args.out_path):
+        os.makedirs(os.path.dirname(args.out_path), exist_ok=True)
+    with open(args.out_path, "w", encoding="utf-8") as f:
+        json.dump(test_dict, f, indent=2)
+    print(f"Generated test JSON at {out_json} (seed {actual_seed})")
 
 
 if __name__ == "__main__":
diff --git a/python b/python
new file mode 100755
index 0000000..aa6f44e
Binary files /dev/null and b/python differ
diff --git a/render_text.py b/render_text.py
index da42d6a..51da112 100644
--- a/render_text.py
+++ b/render_text.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 
 import argparse
 import json
+import os
 from typing import Dict, Any
 import subprocess
 import shutil
@@ -106,24 +107,22 @@ def render_key(data: Dict[str, Any]) -> str:
 
 
 def _write_pdf(text: str, pdf_path: str, title: str) -> None:
-    """Render plain text to PDF via pandoc if available."""
+    """Render plain text to PDF via pandoc + pdflatex if available."""
     pandoc = shutil.which("pandoc")
-    wkhtml = shutil.which("wkhtmltopdf")
-    if not pandoc or not wkhtml:
-        print("pandoc + wkhtmltopdf required for PDF generation; skipping.", file=sys.stderr)
+    pdflatex = shutil.which("pdflatex")
+    if not pandoc or not pdflatex:
+        print("pandoc + pdflatex required for PDF generation; skipping.", file=sys.stderr)
         return
     with tempfile.NamedTemporaryFile("w", delete=False, suffix=".txt") as tmp:
         tmp.write(f"{title}\n\n{text}")
         tmp_path = tmp.name
     try:
-        cmd = [pandoc, tmp_path, "-o", pdf_path, "--from", "gfm", "--pdf-engine", "wkhtmltopdf"]
+        cmd = [pandoc, tmp_path, "-o", pdf_path, "--from", "gfm", "--pdf-engine", "pdflatex"]
         subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     except subprocess.CalledProcessError:
-        print("pandoc + wkhtmltopdf failed to produce PDF.", file=sys.stderr)
+        print("pandoc + pdflatex failed to produce PDF.", file=sys.stderr)
     finally:
         try:
-            import os
-
             os.remove(tmp_path)
         except OSError:
             pass
@@ -143,6 +142,18 @@ def main() -> None:
         _write_pdf(booklet_text, args.test_pdf, "ALAN Booklet")
     if args.key_pdf:
         _write_pdf(key_text, args.key_pdf, "ALAN Answer Key")
+    run_dir = data.get("meta", {}).get("run_dir")
+    if run_dir and os.path.isdir(run_dir):
+        for src in [args.test_out, args.key_out, args.test_pdf, args.key_pdf]:
+            if not src:
+                continue
+            if not os.path.exists(src):
+                continue
+            dest = os.path.join(run_dir, os.path.basename(src))
+            try:
+                shutil.copyfile(src, dest)
+            except OSError:
+                pass
 
 
 if __name__ == "__main__":
diff --git a/requirements-dev.txt b/requirements-dev.txt
deleted file mode 100644
index 4897858..0000000
--- a/requirements-dev.txt
+++ /dev/null
@@ -1 +0,0 @@
-hypothesis>=6.88.0
diff --git a/test_generator.py b/test_generator.py
deleted file mode 100644
index e11aade..0000000
--- a/test_generator.py
+++ /dev/null
@@ -1,1021 +0,0 @@
-"""Constraint-driven generator for ALAN using structured features."""
-from __future__ import annotations
-
-from dataclasses import dataclass, asdict, replace
-from typing import List, Dict, Optional
-import random
-
-from language_spec import (
-    LanguageSpec,
-    SentenceFeatures,
-    NPFeature,
-    AGENT,
-    RECIPIENT,
-    THEME,
-    realize_sentence,
-    english_gloss,
-)
-from semantic import meanings_equal, to_meaning, semantic_distance
-from grammar_check import is_grammatical
-from test_blueprint import (
-    Concept,
-    SectionBlueprint,
-    TestBlueprint,
-    EXEMPLAR_COMPREHENSION,
-    TRANSLATE_TO_LANG,
-    TRANSLATE_FROM_LANG,
-    RULE_APPLICATION,
-    STACKED_RULES,
-)
-
-# semantic distance bounds for distractors (kept in sync with property tests)
-DIST_MIN = 1
-DIST_MAX = 1
-
-
-@dataclass(frozen=True)
-class SectionConstraints:
-    allowed_verbs: List[str]
-    allowed_agent_nouns: List[str]
-    allowed_recipient_nouns: List[str]
-    allowed_theme_nouns: List[str]
-    allow_plural: bool
-    allow_feminine: bool
-    allow_past: bool
-    allow_ditransitive: bool
-    allow_irregulars: bool
-    allow_adjectives: bool
-
-
-def render_concept_explanation(concept: Concept, spec: LanguageSpec) -> str:
-    """Simple explanation with one example."""
-    if concept.id == "S_ORDER":
-        sf = sentence_features(
-            "see",
-            "PRES",
-            np_features("man", AGENT, plural=False, adjectives=["tall"]),
-            np_features("girl", RECIPIENT, plural=False, adjectives=[]),
-            None,
-        )
-        return f"{concept.description_en}\nExample: {realize_sentence(spec, sf)} = {english_gloss(sf)}"
-    if concept.id == "NOUN_NUMBER_MARKING":
-        sf = sentence_features(
-            "see",
-            "PRES",
-            np_features("man", AGENT, plural=False, adjectives=[]),
-            np_features("girl", RECIPIENT, plural=True, adjectives=[]),
-            None,
-        )
-        return f"{concept.description_en}\nExample: {realize_sentence(spec, sf)} = {english_gloss(sf)}"
-    if concept.id == "VERB_TENSE_MARKING":
-        sf = sentence_features(
-            "chase",
-            "PAST",
-            np_features("woman", AGENT, plural=False, adjectives=[]),
-            np_features("boy", RECIPIENT, plural=False, adjectives=[]),
-            None,
-        )
-        return f"{concept.description_en}\nExample: {realize_sentence(spec, sf)} = {english_gloss(sf)}"
-    # default example
-    sf = sentence_features(
-        "give",
-        "PRES",
-        np_features("woman", AGENT, plural=False, adjectives=[]),
-        np_features("boy", RECIPIENT, plural=False, adjectives=[]),
-        np_features("ball", THEME, plural=False, adjectives=["red"]),
-    )
-    return f"{concept.description_en}\nExample: {realize_sentence(spec, sf)} = {english_gloss(sf)}"
-
-
-@dataclass
-class Option:
-    label: str
-    text: str
-    is_correct: bool
-    explanation: str
-    features: SentenceFeatures
-
-
-@dataclass
-class Question:
-    id: str
-    item_type: str
-    section_id: str
-    concepts: List[str]
-    stem: str
-    options: List[Option]
-    difficulty_score: float
-
-
-def question_valid(q: Question, spec: LanguageSpec) -> bool:
-    if len(q.options) != 4:
-        return False
-    if sum(opt.is_correct for opt in q.options) != 1:
-        return False
-    meanings = []
-    glosses = []
-    correct_meaning = None
-    for opt in q.options:
-        m = to_meaning(opt.features)
-        g = english_gloss(opt.features)
-        if opt.is_correct:
-            correct_meaning = m
-        for existing in meanings:
-            if meanings_equal(m, existing):
-                return False
-        if g in glosses:
-            return False
-        meanings.append(m)
-        glosses.append(g)
-        if not is_grammatical(asdict(opt), spec):
-            return False
-    if correct_meaning is None:
-        return False
-    for opt in q.options:
-        if opt.is_correct:
-            continue
-        dist = semantic_distance(to_meaning(opt.features), correct_meaning)
-        if dist != 1:
-            return False
-    return True
-
-
-# ---------------------------------------------------------------------------
-# Feature utilities
-
-
-def section_constraints(unlocked: set[str]) -> SectionConstraints:
-    allow_plural = "NOUN_NUMBER_MARKING" in unlocked
-    allow_feminine = "NOUN_GENDER_MARKING" in unlocked
-    allow_past = "VERB_TENSE_MARKING" in unlocked
-    # Keep ditransitives out of the very first section for simplicity; they unlock once plural or gender is known.
-    allow_ditransitive = allow_plural or allow_feminine
-    allow_irregulars = allow_past or allow_plural
-    allow_adjectives = "NP_ORDER" in unlocked
-
-    def allowed_people(include_feminine: bool) -> List[str]:
-        base = ["man", "boy"]
-        if include_feminine:
-            base += ["woman", "girl"]
-        return base
-
-    return SectionConstraints(
-        allowed_verbs=["see", "chase"] + (["give"] if allow_ditransitive else []),
-        allowed_agent_nouns=allowed_people(allow_feminine),
-        allowed_recipient_nouns=allowed_people(allow_feminine),
-        allowed_theme_nouns=["ball", "house"],
-        allow_plural=allow_plural,
-        allow_feminine=allow_feminine,
-        allow_past=allow_past,
-        allow_ditransitive=allow_ditransitive,
-        allow_irregulars=allow_irregulars,
-        allow_adjectives=allow_adjectives,
-    )
-
-
-def np_features(
-    noun_id: str,
-    role: str,
-    feminine: Optional[bool] = None,
-    plural: bool = False,
-    adjectives: Optional[List[str]] = None,
-    use_irregular: bool = True,
-) -> NPFeature:
-    feminine_default = noun_id in {"woman", "girl"}
-    fem = feminine_default if feminine is None else feminine
-    if noun_id not in {"woman", "girl"}:
-        fem = False
-    return NPFeature(
-        noun_id=noun_id,
-        feminine=fem,
-        plural=plural,
-        adjectives=adjectives or [],
-        role=role,
-        use_irregular=use_irregular,
-    )
-
-
-def sentence_features(
-    verb_id: str,
-    tense: str,
-    subj: NPFeature,
-    obj1: NPFeature,
-    obj2: Optional[NPFeature] = None,
-    use_irregular_verb: bool = True,
-) -> SentenceFeatures:
-    return SentenceFeatures(subject=subj, obj1=obj1, obj2=obj2, verb_id=verb_id, tense=tense, use_irregular_verb=use_irregular_verb)
-
-
-def respects_constraints(sf: SentenceFeatures, constraints: Optional[SectionConstraints]) -> bool:
-    if constraints is None:
-        return True
-    if sf.verb_id not in constraints.allowed_verbs:
-        return False
-    if not constraints.allow_past and sf.tense == "PAST":
-        return False
-    if not constraints.allow_ditransitive and sf.obj2 is not None:
-        return False
-    nps = [sf.subject, sf.obj1] + ([sf.obj2] if sf.obj2 else [])
-    for np in nps:
-        if np.role == AGENT and np.noun_id not in constraints.allowed_agent_nouns:
-            return False
-        if np.role != AGENT and np.noun_id not in constraints.allowed_recipient_nouns + constraints.allowed_theme_nouns:
-            return False
-        if not constraints.allow_plural and np.plural:
-            return False
-        if not constraints.allow_feminine and np.feminine:
-            return False
-        if (
-            not constraints.allow_irregulars
-            and ((np.use_irregular and np.plural) or (sf.use_irregular_verb and sf.tense == "PAST"))
-        ):
-            return False
-        if not constraints.allow_adjectives and np.adjectives:
-            return False
-    return True
-
-
-# ---------------------------------------------------------------------------
-# Perturbations for distractors (all grammatical)
-
-
-def perturb_tense(sf: SentenceFeatures) -> SentenceFeatures:
-    return SentenceFeatures(
-        subject=sf.subject,
-        obj1=sf.obj1,
-        obj2=sf.obj2,
-        verb_id=sf.verb_id,
-        tense="PAST" if sf.tense == "PRES" else "PRES",
-    )
-
-
-def perturb_roles(sf: SentenceFeatures) -> SentenceFeatures:
-    new_subj = NPFeature(
-        noun_id=sf.obj1.noun_id,
-        feminine=sf.obj1.feminine,
-        plural=sf.obj1.plural,
-        adjectives=sf.obj1.adjectives,
-        role=AGENT,
-    )
-    new_obj1 = NPFeature(
-        noun_id=sf.subject.noun_id,
-        feminine=sf.subject.feminine,
-        plural=sf.subject.plural,
-        adjectives=sf.subject.adjectives,
-        role=RECIPIENT,
-    )
-    return SentenceFeatures(
-        subject=new_subj,
-        obj1=new_obj1,
-        obj2=sf.obj2,
-        verb_id=sf.verb_id,
-        tense=sf.tense,
-    )
-
-
-def perturb_number_gender(sf: SentenceFeatures) -> SentenceFeatures:
-    target = sf.obj1
-    if target.noun_id in {"woman", "girl"}:
-        flipped_fem = not target.feminine
-    else:
-        flipped_fem = False
-    swapped = NPFeature(
-        noun_id=target.noun_id,
-        feminine=flipped_fem,
-        plural=not target.plural,
-        adjectives=target.adjectives,
-        role=target.role,
-    )
-    return SentenceFeatures(
-        subject=sf.subject,
-        obj1=swapped,
-        obj2=sf.obj2,
-        verb_id=sf.verb_id,
-        tense=sf.tense,
-    )
-
-
-def perturb_adj_scope(sf: SentenceFeatures) -> SentenceFeatures:
-    new_subj = NPFeature(
-        noun_id=sf.subject.noun_id,
-        feminine=sf.subject.feminine,
-        plural=sf.subject.plural,
-        adjectives=sf.obj1.adjectives,
-        role=sf.subject.role,
-    )
-    new_obj1 = NPFeature(
-        noun_id=sf.obj1.noun_id,
-        feminine=sf.obj1.feminine,
-        plural=sf.obj1.plural,
-        adjectives=sf.subject.adjectives,
-        role=sf.obj1.role,
-    )
-    return SentenceFeatures(
-        subject=new_subj,
-        obj1=new_obj1,
-        obj2=sf.obj2,
-        verb_id=sf.verb_id,
-        tense=sf.tense,
-    )
-
-
-def perturb_irregular(sf: SentenceFeatures) -> SentenceFeatures:
-    """Swap irregular to regular (or vice versa) to create a near-miss."""
-    # Regularize irregular noun plural.
-    new_obj1 = sf.obj1
-    if sf.obj1.noun_id == "boy" and sf.obj1.plural:
-        new_obj1 = NPFeature(
-            noun_id="boy",
-            feminine=sf.obj1.feminine,
-            plural=True,
-            adjectives=sf.obj1.adjectives,
-            role=sf.obj1.role,
-        )
-    # Regularize irregular verb past.
-    new_verb = sf.verb_id
-    new_tense = sf.tense
-    if sf.verb_id == "chase" and sf.tense == "PAST":
-        new_tense = "PAST"
-    return SentenceFeatures(
-        subject=sf.subject,
-        obj1=new_obj1,
-        obj2=sf.obj2,
-        verb_id=new_verb,
-        tense=new_tense,
-    )
-
-
-def build_distractors(spec: LanguageSpec, sf: SentenceFeatures, rng: random.Random, constraints: Optional[SectionConstraints] = None) -> List[Option]:
-    """Generate three minimal-pair distractors (exactly one feature flipped)."""
-    from semantic import to_meaning, semantic_distance
-    target_meaning = to_meaning(sf)
-    correct_text = realize_sentence(spec, sf)
-    seen_surfaces = {correct_text}
-    seen_meanings = {target_meaning}
-    distractors: List[Option] = []
-
-    def clone_sf(orig: SentenceFeatures) -> SentenceFeatures:
-        return SentenceFeatures(
-            subject=replace(orig.subject),
-            obj1=replace(orig.obj1),
-            obj2=replace(orig.obj2) if orig.obj2 else None,
-            verb_id=orig.verb_id,
-            tense=orig.tense,
-            use_irregular_verb=orig.use_irregular_verb,
-        )
-
-    def add_if_valid(cand_sf: SentenceFeatures, explanation: str) -> None:
-        nonlocal distractors
-        text = realize_sentence(spec, cand_sf)
-        meaning = to_meaning(cand_sf)
-        dist = semantic_distance(meaning, target_meaning)
-        if text in seen_surfaces:
-            return
-        if any(meanings_equal(meaning, m) for m in seen_meanings):
-            return
-        if dist != 1:
-            return
-        opt = Option(label="", text=text, is_correct=False, explanation=explanation, features=cand_sf)
-        from grammar_check import is_grammatical
-
-        if not is_grammatical(asdict(opt), spec):
-            return
-        seen_surfaces.add(text)
-        seen_meanings.add(meaning)
-        distractors.append(opt)
-
-    # Available single-feature flips
-    flips = []
-    flips.append(("Tense flip.", lambda base: replace(base, tense="PAST" if base.tense == "PRES" else "PRES")))
-
-    # number flip on obj1
-    def flip_obj1_number(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        new.obj1 = replace(new.obj1, plural=not new.obj1.plural)
-        return new
-
-    flips.append(("Number flip (receiver).", flip_obj1_number))
-
-    # subject number flip
-    def flip_subj_number(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        new.subject = replace(new.subject, plural=not new.subject.plural)
-        return new
-
-    flips.append(("Number flip (doer).", flip_subj_number))
-
-    # obj2 number flip when present
-    def flip_obj2_number(base: SentenceFeatures) -> SentenceFeatures:
-        if base.obj2 is None:
-            return base
-        new = clone_sf(base)
-        new.obj2 = replace(new.obj2, plural=not new.obj2.plural)
-        return new
-
-    flips.append(("Number flip (theme).", flip_obj2_number))
-
-    # gender flip on obj1 when allowed
-    def flip_obj1_gender(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        if new.obj1.noun_id not in {"woman", "girl"}:
-            return new
-        new.obj1 = replace(new.obj1, feminine=not new.obj1.feminine)
-        return new
-
-    flips.append(("Gender flip (receiver).", flip_obj1_gender))
-
-    # adjective toggle on obj1
-    def flip_obj1_adj(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        if new.obj1.adjectives:
-            new.obj1 = replace(new.obj1, adjectives=[])
-        else:
-            new.obj1 = replace(new.obj1, adjectives=["red"])
-        return new
-
-    flips.append(("Adjective scope change.", flip_obj1_adj))
-
-    # role flip on obj1 (recipient vs theme) stays grammatical
-    def flip_obj1_role(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        new_role = THEME if new.obj1.role == RECIPIENT else RECIPIENT
-        new.obj1 = replace(new.obj1, role=new_role)
-        return new
-
-    flips.append(("Role flip (receiver/theme).", flip_obj1_role))
-
-    # irregular flip: toggle use of irregular noun plural or verb past
-    def flip_irregular(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        if new.obj1.noun_id == "boy" and new.obj1.plural:
-            new.obj1 = replace(new.obj1, use_irregular=not new.obj1.use_irregular)
-        elif new.verb_id == "chase" and new.tense == "PAST":
-            new = replace(new, use_irregular_verb=not new.use_irregular_verb)
-        return new
-
-    flips.append(("Irregular vs regular.", flip_irregular))
-
-    # verb swap within allowed valence
-    def swap_verb(base: SentenceFeatures) -> SentenceFeatures:
-        allowed = constraints.allowed_verbs if constraints else ["see", "chase", "give"]
-        same_valence = [v for v in allowed if (base.obj2 is not None) == (v == "give")]
-        if len(same_valence) <= 1:
-            return base
-        choices = [v for v in same_valence if v != base.verb_id]
-        new_verb = rng.choice(choices) if choices else base.verb_id
-        return replace(base, verb_id=new_verb)
-
-    flips.append(("Verb swap.", swap_verb))
-
-    # subject noun swap
-    def swap_subj_noun(base: SentenceFeatures) -> SentenceFeatures:
-        allowed = constraints.allowed_agent_nouns if constraints else ["man", "woman", "boy", "girl"]
-        if len(allowed) <= 1:
-            return base
-        new_noun = rng.choice([n for n in allowed if n != base.subject.noun_id] or [base.subject.noun_id])
-        new_subj = np_features(new_noun, AGENT, plural=base.subject.plural, adjectives=base.subject.adjectives)
-        return replace(base, subject=new_subj)
-
-    flips.append(("Doer noun swap.", swap_subj_noun))
-
-    # receiver noun swap
-    def swap_obj1_noun(base: SentenceFeatures) -> SentenceFeatures:
-        allowed = constraints.allowed_recipient_nouns if constraints else ["boy", "girl", "man", "woman"]
-        if len(allowed) <= 1:
-            return base
-        new_noun = rng.choice([n for n in allowed if n != base.obj1.noun_id] or [base.obj1.noun_id])
-        new_obj1 = np_features(new_noun, base.obj1.role, plural=base.obj1.plural, adjectives=base.obj1.adjectives)
-        return replace(base, obj1=new_obj1)
-
-    flips.append(("Receiver noun swap.", swap_obj1_noun))
-
-    # subject adjective toggle
-    def flip_subj_adj(base: SentenceFeatures) -> SentenceFeatures:
-        new = clone_sf(base)
-        if new.subject.adjectives:
-            new.subject = replace(new.subject, adjectives=[])
-        else:
-            new.subject = replace(new.subject, adjectives=["tall"])
-        return new
-
-    flips.append(("Adjective swap (doer).", flip_subj_adj))
-
-    # apply flips in shuffled order to diversify
-    rng.shuffle(flips)
-    for expl, fn in flips:
-        if len(distractors) >= 3:
-            break
-        cand = fn(sf)
-        if constraints and not respects_constraints(cand, constraints):
-            continue
-        add_if_valid(cand, expl)
-    # if still short, retry flips on shuffled order (different seeds) until filled or attempts exhausted
-    attempts = 0
-    while len(distractors) < 3 and attempts < 20:
-        expl, fn = rng.choice(flips)
-        cand = fn(sf)
-        if constraints and not respects_constraints(cand, constraints):
-            attempts += 1
-            continue
-        add_if_valid(cand, expl)
-        attempts += 1
-    return distractors if len(distractors) == 3 else []
-
-
-# ---------------------------------------------------------------------------
-# Item generation
-
-
-def _base_features(
-    spec: LanguageSpec, rng: random.Random, difficulty: str, constraints: Optional[SectionConstraints] = None
-) -> SentenceFeatures:
-    cons = constraints
-    adj_pool = ["tall", "red", "big", "fast"]
-    verbs = cons.allowed_verbs if cons else ["see", "chase", "give"]
-    verb_id = rng.choice(verbs)
-    allow_past = cons.allow_past if cons else True
-    tense = "PAST" if allow_past and (difficulty == "late" or rng.random() < 0.4) else "PRES"
-
-    subj_nouns = cons.allowed_agent_nouns if cons else ["man", "woman"]
-    subj_adj = []
-    if (cons.allow_adjectives if cons else True) and rng.random() < 0.6:
-        subj_adj = [rng.choice(adj_pool)]
-    subj = np_features(
-        noun_id=rng.choice(subj_nouns),
-        role=AGENT,
-        plural=(cons.allow_plural if cons else difficulty != "early") and rng.random() < 0.4,
-        adjectives=subj_adj,
-    )
-
-    if verb_id == "give":
-        rec_nouns = cons.allowed_recipient_nouns if cons else ["boy", "girl"]
-        obj1_adj = []
-        if (cons.allow_adjectives if cons else True) and rng.random() < 0.5:
-            obj1_adj = [rng.choice(adj_pool)]
-        obj1 = np_features(
-            noun_id=rng.choice(rec_nouns),
-            role=RECIPIENT,
-            plural=(cons.allow_plural if cons else difficulty != "early") and rng.random() < 0.4,
-            adjectives=obj1_adj,
-        )
-        theme_nouns = cons.allowed_theme_nouns if cons else ["ball", "house"]
-        obj2_adj = []
-        if (cons.allow_adjectives if cons else True) and rng.random() < 0.6:
-            obj2_adj = [rng.choice(adj_pool)]
-        obj2 = np_features(
-            noun_id=rng.choice(theme_nouns),
-            role=THEME,
-            plural=(cons.allow_plural if cons else difficulty == "late") and rng.random() < 0.5,
-            adjectives=obj2_adj,
-        )
-    else:
-        rec_nouns = cons.allowed_recipient_nouns if cons else ["boy", "girl", "man", "woman"]
-        obj1_adj = []
-        if (cons.allow_adjectives if cons else True) and rng.random() < 0.6:
-            obj1_adj = [rng.choice(adj_pool)]
-        obj1 = np_features(
-            noun_id=rng.choice(rec_nouns),
-            role=RECIPIENT,
-            plural=(cons.allow_plural if cons else difficulty != "early") and rng.random() < 0.5,
-            adjectives=obj1_adj,
-        )
-        obj2 = None
-
-    return sentence_features(
-        verb_id=verb_id,
-        tense=tense,
-        subj=subj,
-        obj1=obj1,
-        obj2=obj2,
-        use_irregular_verb=True,
-    )
-
-
-def _planned_features(
-    spec: LanguageSpec,
-    rng: random.Random,
-    difficulty: str,
-    remaining: Dict[str, int],
-    idx: int,
-    items_left: int,
-    constraints: Optional[SectionConstraints] = None,
-) -> tuple[SentenceFeatures, Dict[str, int]]:
-    """Greedy planner to satisfy coverage quotas deterministically with overlap.
-
-    Returns the planned SentenceFeatures and a delta dict for counters to apply only
-    if the resulting item is accepted.
-    """
-    cons = constraints
-    allow_plural = cons.allow_plural if cons else True
-    allow_ditransitive = cons.allow_ditransitive if cons else True
-    allow_irregulars = cons.allow_irregulars if cons else True
-    allow_past = cons.allow_past if cons else True
-    allow_adjectives = cons.allow_adjectives if cons else True
-    allow_feminine = cons.allow_feminine if cons else True
-    adj_pool = ["tall", "red", "big", "fast"]
-
-    def build_pool(nouns: List[str], role: str) -> List[NPFeature]:
-        pool: List[NPFeature] = []
-        for noun in nouns:
-            adj_choices = adj_pool if allow_adjectives else []
-            if not adj_choices:
-                adj_variants = [[]]
-            else:
-                # include one no-adj variant plus one for each adjective to diversify surfaces
-                adj_variants = [[]] + [[adj] for adj in adj_choices]
-            for adj_list in adj_variants:
-                pool.append(np_features(noun, role, plural=False, adjectives=adj_list))
-                if allow_plural:
-                    pool.append(np_features(noun, role, plural=True, adjectives=adj_list))
-        return pool
-
-    subj_pool = build_pool(cons.allowed_agent_nouns if cons else ["man", "woman", "boy"], AGENT)
-    rec_pool = build_pool(cons.allowed_recipient_nouns if cons else ["boy", "girl", "man", "woman"], RECIPIENT)
-    theme_pool = build_pool(cons.allowed_theme_nouns if cons else ["ball", "house"], THEME)
-
-    subj = rng.choice(subj_pool)
-    obj1 = rng.choice(rec_pool)
-    obj2 = None
-    verb_choices = cons.allowed_verbs if cons else ["see", "chase", "give"]
-    verb_id = rng.choice(verb_choices)
-    tense = "PRES"
-    use_irregular_verb = True
-    delta: Dict[str, int] = {"irregular_verb": 0, "irregular_noun": 0, "ditransitive": 0, "fem_plural": 0, "plural": 0, "adjective": 0}
-
-    # 0) Force ditransitive if quota equals items_left
-    if allow_ditransitive and remaining.get("ditransitive", 0) >= items_left:
-        verb_id = "give"
-        obj2 = rng.choice(theme_pool)
-        delta["ditransitive"] = 1
-    # 1) Irregular verb coverage (monotransitive chase past), forced if needed
-    if (
-        verb_id == "see"
-        and allow_past
-        and allow_irregulars
-        and remaining.get("irregular_verb", 0) >= items_left
-    ):
-        verb_id = "chase"
-        tense = "PAST"
-        use_irregular_verb = True
-        delta["irregular_verb"] = 1
-    elif verb_id == "see" and allow_past and allow_irregulars and remaining.get("irregular_verb", 0) > 0:
-        verb_id = "chase"
-        tense = "PAST"
-        use_irregular_verb = True
-        delta["irregular_verb"] = 1
-
-    # 2) Ditransitive coverage (can overlap with irregular noun)
-    if (
-        allow_ditransitive
-        and verb_id != "chase"
-        and remaining.get("ditransitive", 0) > 0
-        and delta.get("ditransitive", 0) == 0
-    ):
-        verb_id = "give"
-        obj2 = theme_pool[idx % len(theme_pool)]
-        delta["ditransitive"] = 1
-
-    # 3) Irregular noun coverage (can overlap with ditransitive)
-    if allow_plural and allow_irregulars and remaining.get("irregular_noun", 0) > 0:
-        obj1 = np_features("boy", RECIPIENT, plural=True, adjectives=["red"] if allow_adjectives else [], use_irregular=True)
-        delta["irregular_noun"] = 1
-        # if we still need more irregular noun items, also set subject to irregular boy plural
-        if remaining.get("irregular_noun", 0) - delta["irregular_noun"] > 0:
-            subj = np_features("boy", AGENT, plural=True, adjectives=subj.adjectives or ["tall"], use_irregular=True)
-
-    # 4) fem plural receiver (if applicable)
-    must_fem = remaining.get("fem_plural", 0) >= items_left if allow_plural and allow_feminine else False
-    if allow_plural and allow_feminine and (remaining.get("fem_plural", 0) > 0 or must_fem):
-        if obj1.noun_id not in {"woman", "girl"}:
-            obj1 = np_features("woman", RECIPIENT, feminine=True, plural=True, adjectives=obj1.adjectives or ["red"])
-            delta["fem_plural"] = 1
-        elif not (obj1.feminine and obj1.plural):
-            obj1 = np_features(obj1.noun_id, RECIPIENT, feminine=True, plural=True, adjectives=obj1.adjectives)
-            delta["fem_plural"] = 1
-
-    # 5) plural coverage (if still needed)
-    must_plural = remaining.get("plural", 0) >= items_left if allow_plural else False
-    if allow_plural and (remaining.get("plural", 0) > 0 or must_plural) and not (obj1.plural or subj.plural or (obj2 and obj2.plural)):
-        obj1 = np_features(obj1.noun_id, obj1.role, feminine=obj1.feminine, plural=True, adjectives=obj1.adjectives)
-        delta["plural"] = 1
-    # secondary plural if still needed and possible
-    if allow_plural and remaining.get("plural", 0) - delta.get("plural", 0) > 0 and not subj.plural:
-        subj = NPFeature(subj.noun_id, subj.feminine, True, subj.adjectives, subj.role, subj.use_irregular)
-        delta["plural"] = delta.get("plural", 0) + 1
-
-    # 6) adjective coverage
-    must_adj = remaining.get("adjective", 0) >= items_left if allow_adjectives else False
-    if allow_adjectives and (remaining.get("adjective", 0) > 0 or must_adj):
-        if not obj1.adjectives:
-            obj1 = np_features(
-                obj1.noun_id, obj1.role, feminine=obj1.feminine, plural=obj1.plural, adjectives=[rng.choice(adj_pool)]
-            )
-            delta["adjective"] = 1
-        elif not subj.adjectives:
-            subj = np_features(
-                subj.noun_id, subj.role, feminine=subj.feminine, plural=subj.plural, adjectives=[rng.choice(adj_pool)]
-            )
-            delta["adjective"] = 1
-
-    return sentence_features(verb_id, tense, subj, obj1, obj2, use_irregular_verb=use_irregular_verb), delta
-
-
-def _difficulty_score(sf: SentenceFeatures, irregular: bool) -> float:
-    score = 0
-    for np in [sf.subject, sf.obj1] + ([sf.obj2] if sf.obj2 else []):
-        score += 1 if np.plural else 0
-        score += 1 if np.feminine else 0
-        score += len(np.adjectives)
-    if sf.obj2:
-        score += 1
-    if irregular:
-        score += 1
-    if score <= 2:
-        return 0.2
-    if score <= 4:
-        return 0.5
-    return 0.8
-
-
-def _feature_load(sf: SentenceFeatures) -> int:
-    load = 0
-    for np in [sf.subject, sf.obj1] + ([sf.obj2] if sf.obj2 else []):
-        load += 1 if np.plural else 0
-        load += len(np.adjectives)
-        load += 1 if (np.feminine and np.noun_id in {"woman", "girl"}) else 0
-    load += 1 if sf.tense == "PAST" else 0
-    load += 1 if (sf.verb_id == "chase" and sf.tense == "PAST" and sf.use_irregular_verb) else 0
-    if sf.obj1.noun_id == "boy" and sf.obj1.plural and sf.obj1.use_irregular:
-        load += 1
-    return load
-
-
-def _boost_feature_load(sf: SentenceFeatures, target: int, constraints: Optional[SectionConstraints] = None) -> SentenceFeatures:
-    """Increase feature load deterministically without breaking rules."""
-    current = _feature_load(sf)
-    if current >= target:
-        return sf
-    cons = constraints
-    allow_plural = cons.allow_plural if cons else True
-    allow_past = cons.allow_past if cons else True
-    allow_adjectives = cons.allow_adjectives if cons else True
-    adj_pool = ["red", "fast", "big", "tall"]
-    subj = sf.subject
-    obj1 = sf.obj1
-    obj2 = sf.obj2
-    verb_id = sf.verb_id
-    tense = sf.tense
-    use_irregular_verb = sf.use_irregular_verb
-    adj_pool = ["red", "fast", "big", "tall"]
-
-    def add_adj(np: NPFeature) -> NPFeature:
-        for adj in adj_pool:
-            if adj not in np.adjectives:
-                return NPFeature(np.noun_id, np.feminine, np.plural, np.adjectives + [adj], np.role, np.use_irregular)
-        return np
-
-    steps = []
-    if allow_past:
-        steps.append("tense")
-    if allow_adjectives:
-        steps.extend(
-            [
-                "obj1_adj",
-                "subj_adj",
-            ]
-        )
-    if allow_plural:
-        steps.extend(["obj1_plural", "subj_plural"])
-    if allow_adjectives:
-        steps.extend(
-            [
-                "obj2_adj",
-                "obj1_second_adj",
-                "subj_second_adj",
-                "obj2_second_adj",
-            ]
-        )
-    if not steps:
-        return SentenceFeatures(subj, obj1, obj2, verb_id, tense, use_irregular_verb)
-    for i in range(24):
-        current = _feature_load(SentenceFeatures(subj, obj1, obj2, verb_id, tense, use_irregular_verb))
-        if current >= target:
-            break
-        step_name = steps[i % len(steps)]
-        if step_name == "tense" and tense == "PRES":
-            tense = "PAST"
-        elif step_name == "obj1_adj":
-            obj1 = add_adj(obj1)
-        elif step_name == "subj_adj":
-            subj = add_adj(subj)
-        elif step_name == "obj1_plural" and not obj1.plural:
-            obj1 = NPFeature(obj1.noun_id, obj1.feminine, True, obj1.adjectives, obj1.role, obj1.use_irregular)
-        elif step_name == "subj_plural" and not subj.plural:
-            subj = NPFeature(subj.noun_id, subj.feminine, True, subj.adjectives, subj.role, subj.use_irregular)
-        elif step_name == "obj2_adj" and obj2:
-            obj2 = add_adj(obj2)
-        elif step_name == "obj1_second_adj":
-            obj1 = add_adj(obj1)
-        elif step_name == "subj_second_adj":
-            subj = add_adj(subj)
-        elif step_name == "obj2_second_adj" and obj2:
-            obj2 = add_adj(obj2)
-    return SentenceFeatures(subj, obj1, obj2, verb_id, tense, use_irregular_verb)
-
-
-def generate_item(
-    spec: LanguageSpec,
-    concepts: List[str],
-    section_id: str,
-    item_type: str,
-    rng: random.Random,
-    difficulty: str = "mid",
-    sf_override: Optional[SentenceFeatures] = None,
-    min_feature_load: int = 1,
-    constraints: Optional[SectionConstraints] = None,
-) -> Question:
-    sf = sf_override or _base_features(spec, rng, difficulty, constraints=constraints)
-    sf = _boost_feature_load(sf, min_feature_load, constraints=constraints)
-    if constraints and not respects_constraints(sf, constraints):
-        sf = sf_override or _base_features(spec, rng, difficulty, constraints=constraints)
-    correct_text = realize_sentence(spec, sf)
-    gloss = english_gloss(sf)
-    distractors = build_distractors(spec, sf, rng, constraints=constraints)
-    options = [Option(label="", text=correct_text, is_correct=True, explanation="Correct", features=sf)] + distractors
-    # ensure uniqueness
-    texts = set()
-    unique_options = []
-    for opt in options:
-        if opt.text in texts:
-            continue
-        texts.add(opt.text)
-        unique_options.append(opt)
-    options = unique_options[:4]
-    rng.shuffle(options)
-    labels = ["A", "B", "C", "D"]
-    for i, opt in enumerate(options):
-        opt.label = labels[i]
-
-    if item_type == TRANSLATE_TO_LANG:
-        stem = f"Translate into the language: {gloss}"
-    elif item_type == TRANSLATE_FROM_LANG:
-        stem = f"What does this sentence mean? {correct_text}"
-    else:
-        stem = f"Use the rules to choose the correct sentence. Target meaning: {gloss}"
-
-    irregular = (sf.obj1.noun_id == "boy" and sf.obj1.plural) or (sf.verb_id == "chase" and sf.tense == "PAST")
-    difficulty_score = _difficulty_score(sf, irregular=irregular)
-
-    return Question(
-        id=f"{section_id}_{rng.randrange(10_000)}",
-        item_type=item_type,
-        section_id=section_id,
-        concepts=concepts,
-        stem=stem,
-        options=options,
-        difficulty_score=difficulty_score,
-    )
-
-
-# ---------------------------------------------------------------------------
-# Test orchestration
-
-
-def generate_test(
-    spec: LanguageSpec,
-    blueprint: TestBlueprint,
-    concepts: Dict[str, Concept],
-    rng: random.Random,
-    seed: int | None = None,
-    params: Dict[str, int] | None = None,
-    git_sha: str | None = None,
-) -> Dict:
-    sections_out = []
-    question_counter = 1
-    item_map = []
-    total_items = sum(section.num_items for section in blueprint.sections)
-    cfg = params or {}
-
-    def spaced_slots(count: int) -> set[int]:
-        if count <= 0:
-            return set()
-        slots = set()
-        for i in range(count):
-            idx = 1 + int(i * total_items / max(1, count))
-            slots.add(min(total_items, max(1, idx)))
-        return slots
-
-    irregular_target = cfg.get("min_irregular", 6)
-    # counters for coverage
-    remaining = {
-        "irregular_noun": min(irregular_target, total_items),
-        "irregular_verb": min(irregular_target, total_items),
-        "ditransitive": min(cfg.get("min_ditransitive", 8), total_items),
-        "fem_plural": min(cfg.get("min_fem_plural", 4), total_items),
-        "plural": min(cfg.get("min_plural", 12), total_items),
-        "adjective": min(cfg.get("min_adjective", 12), total_items),
-    }
-
-    unlocked: set[str] = set()
-    seen_correct_meanings = set()
-    seen_correct_surfaces = set()
-    for section in blueprint.sections:
-        unlocked |= set(section.introduce_concepts)
-        constraints = section_constraints(unlocked)
-        questions: List[Question] = []
-        section_intro = [render_concept_explanation(concepts[cid], spec) for cid in section.introduce_concepts]
-        section_meanings = set()
-
-        idx = 0
-        while len(questions) < section.num_items:
-            item_type = section.item_types[idx % len(section.item_types)]
-            idx += 1
-            current_number = question_counter + len(questions)
-            difficulty_tag = "early" if current_number <= 8 else "mid" if current_number <= 24 else "late"
-
-            items_left = total_items - current_number + 1
-            sf_override, delta = _planned_features(
-                spec, rng, difficulty_tag, remaining.copy(), current_number, items_left, constraints=constraints
-            )
-            q = generate_item(
-                spec,
-                section.focus_concepts,
-                section.id,
-                item_type,
-                rng,
-                difficulty=difficulty_tag,
-                sf_override=sf_override,
-                min_feature_load=cfg.get("min_feature_load", 1),
-                constraints=constraints,
-            )
-            # enforce invariants: one correct, 4 unique options
-            if not question_valid(q, spec):
-                continue
-            correct_opt = next((o for o in q.options if o.is_correct), None)
-            if correct_opt:
-                meaning = to_meaning(correct_opt.features)
-                surface = correct_opt.text
-                if meaning in section_meanings:
-                    continue
-                if meaning in seen_correct_meanings or surface in seen_correct_surfaces:
-                    continue
-                section_meanings.add(meaning)
-                seen_correct_meanings.add(meaning)
-                seen_correct_surfaces.add(surface)
-            # apply deltas only when item accepted
-            for key, dec in delta.items():
-                remaining[key] = max(0, remaining.get(key, 0) - dec)
-            questions.append(q)
-            item_map.append(
-                {
-                    "number": current_number,
-                    "stem": q.stem,
-                    "difficulty": difficulty_tag,
-                    "constructs": q.concepts,
-                }
-            )
-
-        questions_dicts = []
-        for q in questions:
-            q_dict = asdict(q)
-            q_dict["number"] = question_counter
-            question_counter += 1
-            questions_dicts.append(q_dict)
-
-        sections_out.append(
-            {
-                "id": section.id,
-                "introduce_concepts": section.introduce_concepts,
-                "intro_text": section_intro,
-                "questions": questions_dicts,
-            }
-        )
-
-    meta_params = params or {}
-    return {
-        "meta": {
-            "version": "0.2",
-            "description": "Alan's Language Aptitude iNstrument (ALAN)",
-            "seed": seed,
-            "git_sha": git_sha,
-            "generation_params": meta_params,
-            "dictionary": spec.lexicon,
-            "instructions": (
-                "You will see a brief dictionary, a handful of rules, and examples. Words may take small "
-                "prefixes or suffixes to mark who does what or when it happens—copy these patterns from the examples. "
-                "You do not need linguistics training; apply the rules logically. In every question exactly one option (A–D) "
-                "matches the target meaning. Correct answers always follow the stated word order: doer (subject), receiver (object), verb."
-            ),
-            "rules": [
-                "Word order: DOER RECEIVER VERB (SOV). For 'give': doer, recipient, theme, verb.",
-                "Adjectives follow the noun they describe.",
-                "Prefix stacking: na (receiver) + mem (feminine) + leko (plural) + noun; doer adds suffix mur.",
-                "Feminine plural: memleko + noun (e.g., memlekorema).",
-                "Irregulars: verb 'ror' past = 'rontmimu'; plural of 'tul' = 'letul'.",
-                "Receiver marker na- applies to the whole noun phrase (e.g., namemlekorema).",
-                "Past tense: verb takes suffix 'mimu' unless irregular.",
-            ],
-            "item_map": item_map,
-        },
-        "sections": sections_out,
-    }