| author | Alan Dipert
<alan@dipert.org> 2025-12-04 03:58:39 UTC |
| committer | Alan Dipert
<alan@dipert.org> 2025-12-04 03:58:39 UTC |
| parent | 65e1c79a67f2b6edc52d7f515c567a509d754437 |
| README.md | +2 | -0 |
| meta_schema.py | +81 | -0 |
| property_tests.py | +4 | -0 |
diff --git a/README.md b/README.md index bd20f54..6af45ca 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ Do **not** use the score as a sole hiring gate; treat it as one data point along - Structural diversity quotas for ditransitives, plurals, adjectives, feminine plurals. - No prefix/suffix ordering violations. - Tense/number/gender surfaces remain distinct (no collapses). + - JSON output validated against the canonical feature-structure schema (`meta_schema.py`). - **Regeneration:** `main.py` will retry up to 10 seeds until all properties pass; otherwise it fails loudly. ## Proctoring Guidance @@ -81,6 +82,7 @@ Do **not** use the score as a sole hiring gate; treat it as one data point along - `test_blueprint.py` — Section structure and default blueprint. - `test_generator.py` — Feature-based item generation and minimal-pair distractors. - `property_tests.py` — Gatekeeper checks (grammaticality, uniqueness, quotas). +- `meta_schema.py` — Lightweight JSON schema validator for the canonical feature-structure format. - `render_text.py` — Converts JSON to booklet and answer key. - `main.py` — CLI to generate JSON; retries until all properties pass. - `Makefile` — `make run` builds everything; `make clean` removes artifacts. 
"""Lightweight schema checks for ALAN JSON structures."""
from __future__ import annotations

from typing import Any, Dict, List

# Keys every question must carry as strings (when present).
_QUESTION_STR_KEYS = ("id", "item_type", "section_id", "stem")
# Sub-dictionaries of meta["dictionary"] that must map str -> str.
_DICTIONARY_PARTS = ("nouns", "verbs", "adjectives")
# Every question must offer exactly this many options.
_OPTIONS_PER_QUESTION = 4


def _is_bool(x: Any) -> bool:
    """Return True when *x* is a real ``bool`` (ints such as 0/1 do not count)."""
    return isinstance(x, bool)


def _is_str(x: Any) -> bool:
    """Return True when *x* is a ``str``."""
    return isinstance(x, str)


def _is_list_of_str(x: Any) -> bool:
    """Return True when *x* is a list whose items are all ``str``."""
    return isinstance(x, list) and all(isinstance(i, str) for i in x)


def _check_np(feat: Dict[str, Any]) -> bool:
    """Check one noun-phrase feature dict.

    All of ``noun_id``, ``feminine``, ``plural``, ``adjectives``, ``role`` and
    ``use_irregular`` must be present with the expected primitive types.
    Anything that is not a dict is rejected.
    """
    required = {"noun_id", "feminine", "plural", "adjectives", "role", "use_irregular"}
    if not isinstance(feat, dict) or not required.issubset(feat.keys()):
        return False
    return (
        _is_str(feat["noun_id"])
        and _is_bool(feat["feminine"])
        and _is_bool(feat["plural"])
        # NOTE(review): only list-ness is enforced here; _is_list_of_str exists
        # but is not applied -- confirm the element type before tightening.
        and isinstance(feat["adjectives"], list)
        and _is_str(feat["role"])
        and _is_bool(feat["use_irregular"])
    )


def _check_features(f: Dict[str, Any]) -> bool:
    """Check a sentence-level feature dict.

    ``subject`` and ``obj1`` must be valid noun phrases; ``obj2`` is optional
    and is only checked when it is present and not ``None``.
    """
    required = {"subject", "obj1", "verb_id", "tense", "use_irregular_verb"}
    if not isinstance(f, dict) or not required.issubset(f.keys()):
        return False
    if not _check_np(f["subject"]) or not _check_np(f["obj1"]):
        return False
    if f.get("obj2") is not None and not _check_np(f["obj2"]):
        return False
    return _is_str(f["verb_id"]) and _is_str(f["tense"]) and _is_bool(f["use_irregular_verb"])


def _check_meta(meta: Any) -> bool:
    """Check the ``meta`` object; every key in it is optional but typed."""
    if not isinstance(meta, dict):
        return False
    if not (_is_str(meta.get("version", "")) and _is_str(meta.get("description", ""))):
        return False
    if not isinstance(meta.get("rules", []), list):
        return False
    dictionary = meta.get("dictionary", {})
    if not isinstance(dictionary, dict):
        return False
    for part in _DICTIONARY_PARTS:
        entries = dictionary.get(part, {})
        if not isinstance(entries, dict):
            return False
        if not all(_is_str(k) and _is_str(v) for k, v in entries.items()):
            return False
    return True


def _check_option(opt: Any) -> bool:
    """Check one answer option, including its nested feature structure."""
    if not isinstance(opt, dict):
        return False
    return (
        _is_str(opt.get("label", ""))
        and _is_str(opt.get("text", ""))
        and _is_bool(opt.get("is_correct", False))
        and _is_str(opt.get("explanation", ""))
        # A missing "features" key falls back to {} and is therefore rejected
        # by _check_features (its required keys are absent).
        and _check_features(opt.get("features", {}))
    )


def _check_question(q: Any) -> bool:
    """Check one question: string fields plus exactly four valid options."""
    if not isinstance(q, dict):
        return False
    if not all(_is_str(q.get(k, "")) for k in _QUESTION_STR_KEYS):
        return False
    # Read the options once via .get so a question without an "options" key
    # fails validation instead of raising KeyError.
    options = q.get("options", [])
    if not isinstance(options, list) or len(options) != _OPTIONS_PER_QUESTION:
        return False
    return all(_check_option(opt) for opt in options)


def _check_section(sec: Any) -> bool:
    """Check one section: a string id and a list of valid questions."""
    if not isinstance(sec, dict):
        return False
    if not _is_str(sec.get("id", "")):
        return False
    questions = sec.get("questions", [])
    if not isinstance(questions, list):
        return False
    return all(_check_question(q) for q in questions)


def validate_schema(data: Dict[str, Any]) -> bool:
    """Shallow structural validation for generated JSON.

    Args:
        data: The decoded JSON document, expected to be a dict with an
            optional ``meta`` object and a non-empty ``sections`` list.

    Returns:
        True when the document has the expected shape, False otherwise.
        Malformed input (wrong container types, missing ``options``, ...)
        yields False rather than raising.
    """
    if not isinstance(data, dict):
        return False
    if not _check_meta(data.get("meta", {})):
        return False

    sections: List[Dict[str, Any]] = data.get("sections", [])
    # An absent or empty section list is treated as invalid.
    if not isinstance(sections, list) or not sections:
        return False
    return all(_check_section(sec) for sec in sections)


# Integration note (property_tests.py, same commit): `validate_data` imports
# `validate_schema` from this module, right after the `check_coherence` check;
# when it returns False, `validate_data` marks the run as failed and prints
# "FAIL JSON schema validation".