git » alan.git » commit dfe0517

Add canonical schema validation and update documentation

author Alan Dipert
2025-12-04 03:58:39 UTC
committer Alan Dipert
2025-12-04 03:58:39 UTC
parent 65e1c79a67f2b6edc52d7f515c567a509d754437

Add canonical schema validation and update documentation

README.md +2 -0
meta_schema.py +81 -0
property_tests.py +4 -0

diff --git a/README.md b/README.md
index bd20f54..6af45ca 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ Do **not** use the score as a sole hiring gate; treat it as one data point along
   - Structural diversity quotas for ditransitives, plurals, adjectives, feminine plurals.
   - No prefix/suffix ordering violations.
   - Tense/number/gender surfaces remain distinct (no collapses).
+  - JSON output validated against the canonical feature-structure schema (`meta_schema.py`).
 - **Regeneration:** `main.py` will retry up to 10 seeds until all properties pass; otherwise it fails loudly.
 
 ## Proctoring Guidance
@@ -81,6 +82,7 @@ Do **not** use the score as a sole hiring gate; treat it as one data point along
 - `test_blueprint.py` — Section structure and default blueprint.
 - `test_generator.py` — Feature-based item generation and minimal-pair distractors.
 - `property_tests.py` — Gatekeeper checks (grammaticality, uniqueness, quotas).
+- `meta_schema.py` — Lightweight JSON schema validator for the canonical feature-structure format.
 - `render_text.py` — Converts JSON to booklet and answer key.
 - `main.py` — CLI to generate JSON; retries until all properties pass.
 - `Makefile` — `make run` builds everything; `make clean` removes artifacts.
diff --git a/meta_schema.py b/meta_schema.py
new file mode 100644
index 0000000..2e4e75f
--- /dev/null
+++ b/meta_schema.py
@@ -0,0 +1,81 @@
+"""Lightweight schema checks for ALAN JSON structures."""
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+
+def _is_bool(x: Any) -> bool:
+    return isinstance(x, bool)
+
+
+def _is_str(x: Any) -> bool:
+    return isinstance(x, str)
+
+
+def _is_list_of_str(x: Any) -> bool:
+    return isinstance(x, list) and all(isinstance(i, str) for i in x)
+
+
+def _check_np(feat: Dict[str, Any]) -> bool:
+    required = {"noun_id", "feminine", "plural", "adjectives", "role", "use_irregular"}
+    if not isinstance(feat, dict) or not required.issubset(feat.keys()):
+        return False
+    return (
+        _is_str(feat["noun_id"])
+        and _is_bool(feat["feminine"])
+        and _is_bool(feat["plural"])
+        and isinstance(feat["adjectives"], list)
+        and _is_str(feat["role"])
+        and _is_bool(feat["use_irregular"])
+    )
+
+
+def _check_features(f: Dict[str, Any]) -> bool:
+    required = {"subject", "obj1", "verb_id", "tense", "use_irregular_verb"}
+    if not isinstance(f, dict) or not required.issubset(f.keys()):
+        return False
+    if not _check_np(f["subject"]) or not _check_np(f["obj1"]):
+        return False
+    if f.get("obj2") is not None and not _check_np(f["obj2"]):
+        return False
+    return _is_str(f["verb_id"]) and _is_str(f["tense"]) and _is_bool(f["use_irregular_verb"])
+
+
+def validate_schema(data: Dict[str, Any]) -> bool:
+    """Shallow structural validation for generated JSON."""
+    meta = data.get("meta", {})
+    if not (_is_str(meta.get("version", "")) and _is_str(meta.get("description", ""))):
+        return False
+    if not isinstance(meta.get("rules", []), list):
+        return False
+    dictionary = meta.get("dictionary", {})
+    for part in ("nouns", "verbs", "adjectives"):
+        if not isinstance(dictionary.get(part, {}), dict):
+            return False
+        if not all(_is_str(k) and _is_str(v) for k, v in dictionary.get(part, {}).items()):
+            return False
+
+    sections: List[Dict[str, Any]] = data.get("sections", [])
+    if not isinstance(sections, list) or not sections:
+        return False
+    for sec in sections:
+        if not _is_str(sec.get("id", "")):
+            return False
+        if not isinstance(sec.get("questions", []), list):
+            return False
+        for q in sec.get("questions", []):
+            if not all(_is_str(q.get(k, "")) for k in ("id", "item_type", "section_id", "stem")):
+                return False
+            if not isinstance(q.get("options", []), list) or len(q["options"]) != 4:
+                return False
+            for opt in q["options"]:
+                if not (_is_str(opt.get("label", "")) and _is_str(opt.get("text", ""))):
+                    return False
+                if not _is_bool(opt.get("is_correct", False)):
+                    return False
+                if not _is_str(opt.get("explanation", "")):
+                    return False
+                if not _check_features(opt.get("features", {})):
+                    return False
+    return True
+
diff --git a/property_tests.py b/property_tests.py
index 27950ef..b97fcad 100644
--- a/property_tests.py
+++ b/property_tests.py
@@ -16,6 +16,7 @@ from language_spec import (
 from grammar_check import is_grammatical
 from semantic import to_meaning, meanings_equal, semantic_distance
 from language_coherence import check_coherence
+from meta_schema import validate_schema
 
 # Property thresholds
 DIST_MIN = 1
@@ -440,6 +441,9 @@ def validate_data(data: Dict, spec=None) -> bool:
     if not check_coherence(data):
         ok = False
         print("FAIL language coherence checks")
+    if not validate_schema(data):
+        ok = False
+        print("FAIL JSON schema validation")
     if ok:
         print("All property tests passed.")
     return ok