| author | Alan Dipert <alan@dipert.org> 2025-12-04 04:05:10 UTC |
| committer | Alan Dipert <alan@dipert.org> 2025-12-04 04:05:10 UTC |
| parent | dfe0517d18f0c85fce9370e8bb0bf03c065a255d |
| main.py | +8 | -0 |
| property_tests.py | +25 | -4 |
| render_text.py | +6 | -0 |
| test_generator.py | +8 | -1 |
diff --git a/main.py b/main.py index fb16b0d..87839f9 100644 --- a/main.py +++ b/main.py @@ -15,6 +15,14 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Generate an artificial language test JSON.") parser.add_argument("--seed", type=int, help="Random seed for reproducibility.") parser.add_argument("--out", dest="out_path", default="generated_test.json", help="Output path.") + parser.add_argument("--min-irregular", type=int, default=6, help="Minimum irregular uses.") + parser.add_argument( + "--min-irregular-contrast", type=int, default=4, help="Minimum irregular contrast items (correct vs distractor)." + ) + parser.add_argument("--min-ditransitive", type=int, default=8, help="Minimum ditransitive items.") + parser.add_argument("--min-plural", type=int, default=12, help="Minimum plural-bearing items.") + parser.add_argument("--min-adjective", type=int, default=12, help="Minimum adjective-bearing items.") + parser.add_argument("--min-fem-plural", type=int, default=6, help="Minimum feminine-plural items.") return parser.parse_args() diff --git a/property_tests.py b/property_tests.py index b97fcad..2a6ae3b 100644 --- a/property_tests.py +++ b/property_tests.py @@ -18,7 +18,7 @@ from semantic import to_meaning, meanings_equal, semantic_distance from language_coherence import check_coherence from meta_schema import validate_schema -# Property thresholds +# Property thresholds (can be overridden via params) DIST_MIN = 1 DIST_MAX = 1 MIN_IRREG_USE = 6 @@ -386,9 +386,17 @@ def main() -> None: sys.exit(0 if ok else 1) -def validate_data(data: Dict, spec=None) -> bool: +def validate_data(data: Dict, spec=None, overrides: Dict[str, int] | None = None) -> bool: if spec is None: spec = generate_language_instance() + o = overrides or {} + min_irregular = o.get("min_irregular", MIN_IRREG_USE) + min_irregular_contrast = o.get("min_irregular_contrast", MIN_IRREG_CONTRAST) + min_irregular_distractor = o.get("min_irregular_distractor", 
MIN_IRREG_DISTRACTOR) + min_ditransitive = o.get("min_ditransitive", MIN_DITRANSITIVE) + min_plural = o.get("min_plural", MIN_PLURAL_ITEMS) + min_adjective = o.get("min_adjective", MIN_ADJECTIVE_ITEMS) + min_fem_plural = o.get("min_fem_plural", MIN_FEM_PLURAL_ITEMS) verbs = spec.lexicon["verbs"] ok = True questions = [q for s in data.get("sections", []) for q in s.get("questions", [])] @@ -426,7 +434,13 @@ def validate_data(data: Dict, spec=None) -> bool: if not check_prefix_and_scope(opt, spec): ok = False print(f"FAIL prefix/scope for Q{q.get('number')} option {opt['label']}") - if not check_irregulars(data, spec): + if not check_irregulars( + data, + spec, + min_use=min_irregular, + min_contrast=min_irregular_contrast, + min_distractors=min_irregular_distractor, + ): ok = False print("FAIL irregular coverage (need >=3 letul and >=3 rontmimu in correct answers)") if not check_role_number_uniqueness(spec): @@ -435,7 +449,14 @@ def validate_data(data: Dict, spec=None) -> bool: if not check_tense_uniqueness(spec): ok = False print("FAIL tense uniqueness (present and past forms must differ)") - if not check_structural_diversity(data): + if not check_structural_diversity( + data, + min_irregular=min_irregular, + min_ditransitive=min_ditransitive, + min_plural=min_plural, + min_adjective=min_adjective, + min_fem_plural=min_fem_plural, + ): ok = False print("FAIL structural diversity quotas") if not check_coherence(data): diff --git a/render_text.py b/render_text.py index e8b0949..c1851e5 100644 --- a/render_text.py +++ b/render_text.py @@ -20,6 +20,12 @@ def render_booklet(data: Dict[str, Any]) -> str: if meta.get("instructions"): lines.append(meta["instructions"]) lines.append("") + params = meta.get("generation_params") + if params: + lines.append("Generation parameters:") + for k, v in params.items(): + lines.append(f"- {k}: {v}") + lines.append("") if meta.get("rules"): lines.append("Grammar Cheat Sheet") lines.append("-------------------") diff --git 
a/test_generator.py b/test_generator.py index 84f261a..8061d4c 100644 --- a/test_generator.py +++ b/test_generator.py @@ -504,7 +504,12 @@ def generate_item( def generate_test( - spec: LanguageSpec, blueprint: TestBlueprint, concepts: Dict[str, Concept], rng: random.Random, seed: int | None = None + spec: LanguageSpec, + blueprint: TestBlueprint, + concepts: Dict[str, Concept], + rng: random.Random, + seed: int | None = None, + params: Dict[str, int] | None = None, ) -> Dict: sections_out = [] question_counter = 1 @@ -593,6 +598,7 @@ def generate_test( } ) + meta_params = params or {} return { "meta": { "version": "0.2", @@ -605,6 +611,7 @@ def generate_test( "the examples. You do not need any linguistics background. For each question, pick " "the best option (A-D). All correct answers keep the order: doer, receiver, verb." ), + "generation_params": meta_params, "rules": [ "Word order: DOER RECEIVER VERB (SOV). For 'give': doer, recipient, theme, verb.", "Adjectives follow the noun they describe.",