diff --git a/evals/capture.py b/evals/capture.py
new file mode 100644
index 0000000..bd584f3
--- /dev/null
+++ b/evals/capture.py
@@ -0,0 +1,320 @@
+"""Contract test capture utilities for evaluator notebooks.
+
+Three-step workflow for notebook authors
+----------------------------------------
+1. Wrap every model in your chain with ``capture_llm()``:
+
+       chain = prompt | capture_llm("step_name", my_model) | JsonOutputParser()
+
+   The prefix (``"step_name"``) becomes the step key in ``prompt_steps`` in
+   the TOML output. Use a short, stable name per step (e.g. ``"main"``,
+   ``"bk"``, ``"vocab"``).
+
+2. Immediately after each test-case evaluation, call ``capture_case()`` to save a
+   point-in-time copy of what was captured. Pass the evaluator's input dict
+   and output dict directly — no manual field extraction needed::
+
+       case_input = {"text": my_text, "grade_level": 4}
+       case_output = run_evaluator(**case_input)
+
+       _cap = capture_case(
+           name="my_case",
+           input=case_input,
+           llm_call_captures=["step_name"],  # prefixes, in call order
+           expected_result=case_output,
+           description="…",  # optional human-readable label
+       )
+
+3. Print the TOML block and paste it into the evaluator's ``contracts.toml``:
+
+       print(build_contract_toml(_cap_one, _cap_two))
+
+Resetting between runs
+----------------------
+Call ``reset_captures()`` at the start of each evaluation to avoid stale data from a
+previous run leaking into the next ``capture_case()``::
+
+    reset_captures()
+    output = run_evaluator(text, grade)
+    _cap = capture_case(
+        name="my_case",
+        input={"text": text, "grade_level": grade},
+        llm_call_captures=["main"],
+        expected_result=output,
+    )
+
+Async chains
+------------
+``capture_llm()`` works in both sync (``invoke``) and async (``ainvoke``)
+chains with no extra configuration.
+"""
+
+from __future__ import annotations
+
+import json as _json
+from typing import Any
+
+from langchain_core.runnables import RunnableLambda
+
+
+# ---------------------------------------------------------------------------
+# Internal state
+# ---------------------------------------------------------------------------
+
+# Flat dict populated by capture_llm() on every chain invocation.
+# Keys follow the pattern "{prefix}_{field}" (e.g. "bk_user_prompt").
+_captures: dict[str, Any] = {}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def reset_captures() -> None:
+    """Clear all captured data. Call at the start of each evaluation run."""
+    _captures.clear()
+
+
+def capture_llm(prefix: str, model: Any) -> RunnableLambda:
+    """Return a wrapped model that records the formatted prompt and raw response.
+
+    Drop it into a LangChain chain wherever you have a bare model::
+
+        # single-step evaluator
+        chain = prompt | capture_llm("main", llm) | JsonOutputParser()
+
+        # multi-step evaluator
+        bk_chain = bk_prompt | capture_llm("bk", bk_model)
+        vocab_chain = vocab_prompt | capture_llm("vocab", vocab_model) | JsonOutputParser()
+
+    After the chain runs, ``_captures`` contains:
+
+    * ``"{prefix}_system_prompt"`` — system message content (``""`` if none)
+    * ``"{prefix}_user_prompt"`` — human message content
+    * ``"{prefix}_raw_response"`` — text content of the LLM response.
+                                    Most providers return a plain string.
+                                    Some (e.g.
Google Gemini via + langchain_google_genai) return a list of + content blocks; the first ``"text"`` block + is extracted so the stored value is always + a plain string suitable for JsonOutputParser. + * ``"{prefix}_model"`` — model identifier + * ``"{prefix}_temperature"`` — temperature used + + Supports both ``invoke`` (sync) and ``ainvoke`` (async) chains. + """ + model_name = getattr(model, "model", None) or getattr(model, "model_name", None) or "" + temperature = float(getattr(model, "temperature", 0)) + + def _record(prompt_value: Any, ai_message: Any) -> None: + if isinstance(prompt_value, str): + # Plain string passed directly to the model — treat it as a lone + # human message with no system prompt. This happens when the + # caller formats a prompt template themselves and passes the result + # as a string rather than going through a ChatPromptTemplate chain. + system = "" + human = prompt_value + else: + messages = ( + prompt_value.to_messages() + if hasattr(prompt_value, "to_messages") + else list(prompt_value) + ) + system = next( + (str(m.content) for m in messages if getattr(m, "type", None) == "system"), "" + ) + human = next( + (str(m.content) for m in messages if getattr(m, "type", None) == "human"), "" + ) + _captures[f"{prefix}_system_prompt"] = system + _captures[f"{prefix}_user_prompt"] = human + _captures[f"{prefix}_model"] = model_name + _captures[f"{prefix}_temperature"] = temperature + _captures[f"{prefix}_raw_response"] = _extract_text_content(ai_message.content) + + def _invoke(prompt_value: Any) -> Any: + ai_message = model.invoke(prompt_value) + _record(prompt_value, ai_message) + return ai_message + + async def _ainvoke(prompt_value: Any) -> Any: + ai_message = await model.ainvoke(prompt_value) + _record(prompt_value, ai_message) + return ai_message + + return RunnableLambda(_invoke, afunc=_ainvoke) + + +def capture_case( + *, + name: str, + input: dict[str, Any], + llm_call_captures: list[str], + expected_result: dict[str, Any] | None = None, + description: str | None = None, +) -> dict[str, Any]: + """Return a point-in-time copy of ``_captures`` merged with structured context. + + Args: + name: Case identifier used as the TOML key (e.g. ``"marco_polo_grade3"``). + input: The evaluator's input dict (e.g. ``{"text": ..., "grade_level": 4}``). + Keys are written as-is to the ``[input]`` TOML section. + llm_call_captures: Ordered list of capture prefixes to include as + ``prompt_steps`` in the TOML. Must match the prefixes + passed to ``capture_llm()`` during this run, in call order. + expected_result: The evaluator's output. Pass the whole output — no + need to extract individual fields. Plain dicts, Pydantic + v1/v2 models, and anything dict-like are all accepted; + ``capture_case()`` normalises to a plain dict. Written to + the ``[expected_result]`` TOML section. + description: Optional human-readable label for this test case. + + Example:: + + case_input = {"text": text, "grade_level": 3} + case_output = run_evaluator(**case_input) + + _cap = capture_case( + name="marco_polo_grade3", + input=case_input, + llm_call_captures=["bk", "vocab"], + expected_result=case_output, + description="Marco Polo passage, grade 3", + ) + """ + data: dict[str, Any] = dict(_captures) + data["name"] = name + data["input"] = dict(input) + data["llm_call_captures"] = llm_call_captures + if expected_result is not None: + # Normalise to a plain dict so capture_case() is always fully serializable. 
+        # Handles Pydantic v2 models (.model_dump()), v1 models (.dict()), and
+        # anything else that is already a dict or dict-like.
+        if hasattr(expected_result, "model_dump"):
+            expected_result = expected_result.model_dump()
+        elif hasattr(expected_result, "dict"):
+            expected_result = expected_result.dict()
+        data["expected_result"] = dict(expected_result)
+    if description is not None:
+        data["description"] = description
+    return data
+
+
+def build_contract_toml(*cases: dict[str, Any]) -> str:
+    """Build the contract TOML block for one or more test cases.
+
+    Args:
+        *cases: One or more dicts as returned by :func:`capture_case`.
+
+    Returns:
+        TOML string ready to paste into the evaluator's ``contracts.toml``.
+
+    Example::
+
+        print(build_contract_toml(_cap_grade3, _cap_grade7))
+    """
+    return "\n".join(_build_case(c) for c in cases)
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _extract_text_content(content: Any) -> str:
+    """Extract a plain-text string from an AIMessage content value.
+
+    Most LLM providers via LangChain return a plain ``str``. Some
+    (e.g. Google Gemini via ``langchain_google_genai``) return a list of
+    content blocks such as::
+
+        [{"type": "text", "text": "...the model's reply...", "extras": {...}}]
+
+    This helper normalises both shapes to a plain string so that
+    ``_raw_response`` is always something ``JsonOutputParser`` can parse
+    directly — not a Python-repr of a list.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "text":
+                return str(block["text"])
+    return str(content)
+
+
+def _toml_value(v: Any) -> str:
+    """Serialize *v* to a TOML literal."""
+    if isinstance(v, str):
+        if v == "":
+            return '""'
+        # Prefer multiline LITERAL strings ('''...''') — they are verbatim so
+        # backslashes, double-quotes, and \' sequences are all safe. The only
+        # restriction is the content cannot contain '''.
+        #
+        # IMPORTANT: no "\n" before the closing ''' — that would add a spurious
+        # trailing newline to every parsed value (TOML only trims the *first*
+        # newline after the opening delimiter, not the last one before closing).
+        if "'''" not in v:
+            return "'''\n" + v + "'''"
+        # Fallback: content contains ''' so literal multiline cannot be used.
+        # json.dumps produces a TOML-safe basic string (all backslashes/quotes
+        # escaped); tomllib accepts it as a single-line value.
+        return _json.dumps(v)
+    if isinstance(v, bool):
+        return "true" if v else "false"
+    if isinstance(v, int):
+        return str(v)
+    if isinstance(v, float):
+        return str(v)
+    if isinstance(v, list):
+        return "[" + ", ".join(_json.dumps(item) for item in v) + "]"
+    # Fallback: JSON-encode whatever it is.
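+    # str() first: even values that are not JSON-serializable (Paths, enums,
+    # datetimes) still become a valid single-line TOML basic string.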
+ return _json.dumps(str(v)) + + +def _build_case(c: dict[str, Any]) -> str: + name = c["name"] + lines: list[str] = [] + + # ── optional top-level description ─────────────────────────────────────── + description = c.get("description") + if description: + lines += [ + f"[cases.{name}]", + f"description = {_json.dumps(description)}", + "", + ] + + # ── input section ──────────────────────────────────────────────────────── + lines.append(f"[cases.{name}.input]") + for field, val in c.get("input", {}).items(): + # Do NOT strip text — stripping would make input.text differ from the + # text that capture_llm used when formatting the user_prompt, causing + # the contract test's prompt-fidelity assertion to fail. + lines.append(f"{field} = {_toml_value(val)}") + lines.append("") + + # ── prompt_steps sections ───────────────────────────────────────────────── + prefixes = c["llm_call_captures"] + for prefix in prefixes: + lines += [ + f"[cases.{name}.prompt_steps.{prefix}]", + f"model = {_json.dumps(c.get(prefix + '_model', ''))}", + f"temperature = {c.get(prefix + '_temperature', 0)}", + f"system_prompt = {_toml_value(c.get(prefix + '_system_prompt', ''))}", + f"user_prompt = {_toml_value(c.get(prefix + '_user_prompt', ''))}", + f"llm_response = {_toml_value(c.get(prefix + '_raw_response', ''))}", + "", + ] + + # ── expected_result section ─────────────────────────────────────────────── + expected = c.get("expected_result") + if expected: + lines.append(f"[cases.{name}.expected_result]") + for field, value in expected.items(): + lines.append(f"{field} = {_toml_value(value)}") + lines.append("") + + return "\n".join(lines) diff --git a/evals/conventionality_evaluator.ipynb b/evals/conventionality_evaluator.ipynb index 87b9b99..b74b7c2 100644 --- a/evals/conventionality_evaluator.ipynb +++ b/evals/conventionality_evaluator.ipynb @@ -54,7 +54,8 @@ "from langchain_core.prompts.chat import HumanMessagePromptTemplate\n", "from langchain_google_genai import ChatGoogleGenerativeAI\n", "from pydantic import BaseModel, Field\n", - "from textstat import textstat as ts" + "from textstat import textstat as ts\n", + "from capture import capture_llm, capture_case, reset_captures, build_contract_toml" ] }, { @@ -172,7 +173,7 @@ " },\n", " )\n", "\n", - " chain = prompt | model | JsonOutputParser()\n", + " chain = prompt | capture_llm(\"main\", model) | JsonOutputParser()\n", " return chain.invoke(dataset)" ] }, @@ -201,6 +202,32 @@ "display(result)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbbe4aa9", + "metadata": {}, + "outputs": [], + "source": [ + "reset_captures()\n", + "\n", + "sample_text = \"\"\"\n", + "\"Well, then,\" said the teacher, \"you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition.\" Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. \"Well, I know what that is,\" he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. 
He said, \"Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip.\"\n", + "\"\"\"\n", + "input = {\"text\": sample_text, \"grade\": 4}\n", + "result = predict_text_complexity_level(**input)\n", + "\n", + "capture = capture_case(\n", + " name=\"turnip\",\n", + " description=\"Grade 4 classroom narrative (Henry and the turnip)\",\n", + " input=input,\n", + " llm_call_captures=[\"main\"],\n", + " expected_result=result,\n", + ")\n", + "\n", + "print(build_contract_toml(capture))" + ] + }, { "cell_type": "markdown", "id": "cell-12", @@ -212,13 +239,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv (3.14.4)", "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.10.0" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.4" } }, "nbformat": 4, diff --git a/sdks/python/Makefile b/sdks/python/Makefile index 742abd1..9d7b465 100644 --- a/sdks/python/Makefile +++ b/sdks/python/Makefile @@ -4,32 +4,55 @@ PYTEST := $(VENV)/bin/pytest RUFF := $(VENV)/bin/ruff MYPY := $(VENV)/bin/mypy -# Bundled settings directory inside the package. +# Bundled settings directory inside the package. Two things are maintained here: +# _generated_*_settings.py — generated from settings.toml by `make generate-settings` +# contracts.toml — copied from sdks/settings/ by `make sync-settings` +# +# Tests always read contracts.toml from the bundled copy (via importlib.resources +# fallback in shared_settings_root()). EVALUATORS_SETTINGS_DIR is intentionally NOT +# exported here so that dev and installed-package runs behave identically. 
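+#
+# sdks/settings/ is the canonical source of truth; everything under
+# SETTINGS_DST is derived output and should not be edited by hand.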
SETTINGS_DST := src/learning_commons_evaluators/settings

-.PHONY: help test unit-test generate-settings check-generated \
-	lint format format-check typecheck pip-check verify coverage
+.PHONY: help build check-build test unit-test contract-test \
+	generate-settings check-generated sync-settings check-sync \
+	lint format format-check typecheck pip-check verify coverage

help:
	@echo "Usage: make <target>"
	@echo ""
+	@echo "  build              Prepare the package: generate settings + sync contracts TOML"
+	@echo "  check-build        Verify build artifacts are up to date (use in CI)"
+	@echo ""
	@echo "  lint               Ruff linter (src, tests, settings generator script)"
	@echo "  format             Apply Ruff formatter"
	@echo "  format-check       Fail if Ruff would reformat any file"
	@echo "  typecheck          Mypy on src package + tests"
	@echo "  pip-check          pip dependency consistency check"
-	@echo "  verify             check-generated + lint + format-check + typecheck + pip-check + unit-test"
+	@echo "  verify             check-build + lint + format-check + typecheck + pip-check + unit-test"
	@echo "  coverage           Unit tests with coverage report (requires dev install)"
	@echo ""
-	@echo "  test               Run the full test suite"
-	@echo "  unit-test          Run unit tests only"
+	@echo "  test               Run the full test suite (unit + contract)"
+	@echo "  unit-test          Run unit tests only (skip contract tests)"
+	@echo "  contract-test      Run contract tests only"
	@echo ""
	@echo "  generate-settings  Re-generate _generated_*_settings.py from sdks/settings/"
	@echo "  check-generated    Verify generated .py files match sdks/settings/ (exits 1 if stale)"
+	@echo "  sync-settings      Copy contracts.toml from sdks/settings/ into the package"
+	@echo "  check-sync         Verify bundled contracts.toml matches sdks/settings/"
	@echo ""
	@echo "Typical workflows:"
	@echo "  After editing evaluator settings TOML:  make generate-settings && make test"
-	@echo "  CI: make verify"
+	@echo "  After editing contracts TOML:           make sync-settings && make contract-test"
+	@echo "  Fresh checkout / before release:        make build && make verify"
+	@echo "  CI: make verify && make contract-test"
+
+# ── Build ─────────────────────────────────────────────────────────────────────
+# Prepares all package build artifacts in one step. Run after editing any
+# file under sdks/settings/, or on a fresh checkout before running tests.
+
+build: generate-settings sync-settings
+
+check-build: check-generated check-sync

# ── Static checks (Ruff + Mypy) ───────────────────────────────────────────────

@@ -48,7 +71,7 @@ typecheck:
pip-check:
	$(PYTHON) -m pip check

-verify: check-generated lint format-check typecheck pip-check unit-test
+verify: check-build lint format-check typecheck pip-check unit-test

coverage:
	$(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing

@@ -61,11 +84,26 @@ test:
unit-test:
	$(PYTEST) tests/ -v --ignore=tests/contract_tests

+contract-test:
+	$(PYTEST) tests/contract_tests/ -v
+
# ── Code generation ───────────────────────────────────────────────────────────
# Re-run when an evaluator settings TOML under sdks/settings/ changes.
+# The generator sets EVALUATORS_SETTINGS_DIR itself to point at sdks/settings/.

generate-settings:
	$(PYTHON) ../../scripts/generate_settings.py

check-generated:
	$(PYTHON) ../../scripts/generate_settings.py --check
+
+# ── Contract test bundling ────────────────────────────────────────────────────
+# contracts.toml is bundled so contract tests run consistently whether using
+# a development checkout or a pip-installed package. The sync command is
+# handled by the Python generator script.
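+#
+# Illustrative session (run from sdks/python/):
+#   make sync-settings   # copy sdks/settings/*/contracts.toml into the package
+#   make check-sync      # fails if the bundled copy has drifted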
+ +sync-settings: + $(PYTHON) ../../scripts/generate_settings.py --sync + +check-sync: + $(PYTHON) ../../scripts/generate_settings.py --check-sync diff --git a/sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py b/sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml new file mode 100644 index 0000000..5ff6dd7 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml @@ -0,0 +1,132 @@ +# Contract tests for the Conventionality evaluator. +# +# This file captures the exact LLM interaction for known inputs so that +# tests can verify the SDK sends the same request as the notebook and +# produces the same result from the same response. +# +# HOW TO REGENERATE +# ----------------- +# Run the notebook "evals/Final ship - Conventionality Experimental Evaluator.ipynb" +# from the repo root with GOOGLE_API_KEY set. It loads prompts from the evaluator +# settings TOML (same as the SDK) and prints a TOML block — paste into this file +# and run `make sync-settings` from sdks/python/ to update the bundled copy. +# +# WHAT BELONGS HERE +# ----------------- +# Only data that originates from external sources: +# - input: what the caller provides to the evaluator +# - prompt_steps: what the SDK sends to the LLM (fully formatted messages, +# model, temperature) and the raw response the LLM returned +# - expected_result: the structured output the LLM returned (notebook format) +# +# Deterministic values computed by library code (e.g. FK score from textstat, +# format_instructions from LangChain's JsonOutputParser) are NOT stored as +# standalone fields — they appear inside the fully formatted prompts, where they +# are implicitly verified by the prompt-request assertion in the contract test. + +[cases.turnip] +description = "Grade 4 classroom narrative (Henry and the turnip)" + +[cases.turnip.input] +text = ''' + +"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." +''' +grade = 4 + +[cases.turnip.prompt_steps.main] +model = "gemini-3-flash-preview" +temperature = 0.0 +system_prompt = '''Role +You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level. 
+ +Objective +Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade. + +Complexity Levels +- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level. +- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging. +- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear. +- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices. + +Essential Evaluation Rules +1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex." + +2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group. + +3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two. + +4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex." + +5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing. + +6. Sustained vs. 
Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge. + +7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex. + +8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students. + +9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary. + +Input Format +You will receive: +- text: The passage to evaluate. +- grade: The target student grade level. +- fk_score: The Flesch-Kincaid readability score. + +Output Format +Provide a JSON object containing ONLY the following keys: +- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'. +- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural). +- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text. +- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade. +- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom. + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). +- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. 
+ +Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output): +``` +{"properties": {"conventionality_features": {"description": "List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text.", "items": {"type": "string"}, "title": "Conventionality Features", "type": "array"}, "grade_context": {"description": "How the conventionality demands compare to general expectations for the provided target grade.", "title": "Grade Context", "type": "string"}, "instructional_insights": {"description": "Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom.", "title": "Instructional Insights", "type": "string"}, "complexity_score": {"description": "The conventionality complexity level of the text", "enum": ["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"], "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "A synthesis of why the text fits the chosen rubric level.", "title": "Reasoning", "type": "string"}}, "required": ["conventionality_features", "grade_context", "instructional_insights", "complexity_score", "reasoning"]} +``` +''' +user_prompt = '''Analyze: +Text: +"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." + +Grade: 4 +FK Score: 3.75 +''' +llm_response = ''' +```json +{ + "complexity_score": "slightly_complex", + "reasoning": "The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. 
While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.", + "conventionality_features": [ + "literal narrative: 'Henry took his slate and went out.'", + "concrete actions: 'he wrote the word turnip on his slate.'", + "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", + "straightforward meaning: 'The teacher was surprised and pleased.'" + ], + "grade_context": "For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.", + "instructional_insights": "Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology." +} +```''' + +[cases.turnip.expected_result] +complexity_score = ''' +slightly_complex''' +reasoning = ''' +The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.''' +conventionality_features = ["literal narrative: 'Henry took his slate and went out.'", "concrete actions: 'he wrote the word turnip on his slate.'", "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", "straightforward meaning: 'The teacher was surprised and pleased.'"] +grade_context = ''' +For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.''' +instructional_insights = ''' +Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology.''' diff --git a/sdks/python/tests/contract_tests/__init__.py b/sdks/python/tests/contract_tests/__init__.py new file mode 100644 index 0000000..569ad77 --- /dev/null +++ b/sdks/python/tests/contract_tests/__init__.py @@ -0,0 +1,14 @@ +"""Contract tests: verify SDK fidelity to the evaluation notebooks. 
+ +Each evaluator has a ``contracts.toml`` in its settings folder that records: + - ``input``: the evaluator inputs used for the test case + - ``prompt_steps``: the exact LLM request (formatted messages, model, temperature) + and raw response captured from a real notebook run + - ``expected_result``: the structured LLM output (notebook format) + +Tests load these artifacts, run the evaluator with the LLM mocked to return the +captured response, and assert that: + 1. The SDK sends the same prompt as the notebook (same formatted messages, + model, and temperature). + 2. The SDK produces the same result from that response as the notebook. +""" diff --git a/sdks/python/tests/contract_tests/conventionality.py b/sdks/python/tests/contract_tests/conventionality.py new file mode 100644 index 0000000..b49d11f --- /dev/null +++ b/sdks/python/tests/contract_tests/conventionality.py @@ -0,0 +1,87 @@ +"""Conventionality-specific helpers for contract tests. + +Provides: + - Named case loaders (one function per test case in the TOML). + - ``conventionality_notebook_to_sdk_result``: converts the notebook-format + expected result (raw ``JsonOutputParser`` dict) to the expected + ``TextComplexityResult`` that the SDK should produce. +""" + +from __future__ import annotations + +from learning_commons_evaluators.schemas.evaluator import EvaluationExplanation +from learning_commons_evaluators.schemas.metadata import ( + EvaluationMetadata, + EvaluatorMaturity, + EvaluatorMetadata, + Status, +) +from learning_commons_evaluators.schemas.text_complexity import ( + TextComplexityAnswer, + TextComplexityResult, +) + +from .loader import ContractCase, load_contract_case + +# --------------------------------------------------------------------------- +# Case loaders +# --------------------------------------------------------------------------- + + +def load_conventionality_turnip_case() -> ContractCase: + """Load the 'turnip' contract test case for the conventionality evaluator.""" + return load_contract_case("conventionality", "turnip") + + +# --------------------------------------------------------------------------- +# Result mapper +# --------------------------------------------------------------------------- + + +def conventionality_notebook_to_sdk_result(case: ContractCase) -> TextComplexityResult: + """Convert ``case.expected_result`` (notebook format) to a ``TextComplexityResult``. + + The notebook outputs a plain dict from ``JsonOutputParser``; the SDK wraps + that into ``TextComplexityResult``. This function performs the same + structural mapping the SDK does so tests can assert equality. + + Only ``answer`` and ``explanation`` are compared — ``metadata`` is excluded + because it contains non-deterministic fields (timing, evaluation ID, etc.). + + Args: + case: A loaded :class:`~loader.ContractCase` with a populated + ``expected_result``. + + Returns: + A ``TextComplexityResult`` built from the contract's expected output. + The ``metadata`` field is a minimal placeholder so the object is valid. + """ + r = case.expected_result + answer = TextComplexityAnswer.from_score(r["complexity_score"]) + explanation = EvaluationExplanation( + summary=r["reasoning"], + details={ + "conventionality_features": r["conventionality_features"], + "grade_context": r["grade_context"], + "instructional_insights": r["instructional_insights"], + }, + ) + # A minimal metadata object — only used to satisfy the result model; not + # compared in assertions (use assert_answer / assert_explanation helpers). 
+ placeholder_metadata = EvaluationMetadata( + evaluator_metadata=EvaluatorMetadata( + id="conventionality", + version="0.1", + name="Conventionality", + description="", + maturity=EvaluatorMaturity.beta, + ), + evaluation_settings=None, # type: ignore[arg-type] + input_metadata={}, + status=Status.succeeded, + ) + return TextComplexityResult( + answer=answer, + explanation=explanation, + metadata=placeholder_metadata, + ) diff --git a/sdks/python/tests/contract_tests/harness.py b/sdks/python/tests/contract_tests/harness.py new file mode 100644 index 0000000..84f453d --- /dev/null +++ b/sdks/python/tests/contract_tests/harness.py @@ -0,0 +1,223 @@ +"""Contract test harness: LLM mocking and prompt-request assertions. + +Usage +----- +:: + + case = load_conventionality_turnip_case() + config = create_config_no_telemetry() + evaluator = ConventionalityEvaluator(config) + + inp = ConventionalityEvaluationInput( + text=case.input["text"], + grade=case.input["grade"], + ) + + with ContractTestHarness(case) as harness: + result = evaluator.evaluate(inp) + + harness.assert_prompt_step("main") + +For evaluators with multiple LLM steps the harness automatically queues +responses in ``prompt_steps`` order and captures each call's request, so the +assertions still use the step name:: + + harness.assert_prompt_step("step_1") + harness.assert_prompt_step("step_2") + +The captured data is accessible after the context exits. + +Prompt strings (system and user) are compared after normalizing line endings +(``\\r\\n`` → ``\\n``) and stripping leading/trailing whitespace, so minor TOML +multiline formatting differences do not fail tests. +""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import patch + +from .loader import ContractCase, PromptStepContract + +# The module-level name that ``base.py`` imports ``create_provider`` under. +_CREATE_PROVIDER_TARGET = "learning_commons_evaluators.evaluators.base.create_provider" + + +class ContractTestHarness: + """Sets up a mocked LLM for a contract test and captures what was sent to it. + + The harness patches ``create_provider`` so that no real API calls are made. + Each call to ``create_provider`` pops the next response from the case's + ``prompt_steps`` (in order) and returns a lightweight callable that: + + - Records the formatted messages it receives. + - Returns an ``AIMessage`` with the stored ``llm_response`` content. + + After the ``with`` block, use :meth:`assert_prompt_step` to verify each + captured request matches the stored contract. + """ + + def __init__(self, case: ContractCase) -> None: + self.case = case + # Populated during the evaluation run; indexed by call order. + self._captured: list[_CapturedCall] = [] + self._patch: Any = None + + def __enter__(self) -> ContractTestHarness: + # Build a FIFO list of (step_name, response) pairs in definition order. + self._response_queue: list[tuple[str, str]] = [ + (name, step.llm_response) for name, step in self.case.prompt_steps.items() + ] + self._captured = [] + + self._patch = patch(_CREATE_PROVIDER_TARGET, side_effect=self._make_fake_provider) + self._patch.start() + return self + + def __exit__(self, *args: Any) -> None: + if self._patch is not None: + self._patch.stop() + + # ------------------------------------------------------------------ + # Assertions + # ------------------------------------------------------------------ + + def assert_prompt_step(self, step_name: str) -> None: + """Assert that the request captured for *step_name* matches the contract. 
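+
+        Prompt strings are compared via ``_normalize_prompt_snapshot`` (line
+        endings and edge whitespace normalized); model and temperature must
+        match exactly.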
+ + Raises: + AssertionError: If the captured messages, model, or temperature + differ from the stored contract. + IndexError: If no call was captured for the given step. + """ + step_names = list(self.case.prompt_steps.keys()) + if step_name not in step_names: + raise ValueError(f"Step '{step_name}' not in contract. Available steps: {step_names}") + step_index = step_names.index(step_name) + + if step_index >= len(self._captured): + raise AssertionError( + f"No LLM call captured for step '{step_name}' " + f"(only {len(self._captured)} call(s) were made)." + ) + + captured = self._captured[step_index] + contract = self.case.prompt_steps[step_name] + _assert_prompt_matches(captured, contract, step_name) + + # ------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------ + + def _make_fake_provider(self, prompt_settings: Any, evaluator_config: Any) -> Any: + """Return a callable that records input messages and returns the next mock response.""" + from langchain_core.messages import AIMessage + + if not self._response_queue: + raise RuntimeError( + "ContractTestHarness: more LLM calls were made than there are " + "prompt steps in the contract case." + ) + _step_name, response_content = self._response_queue.pop(0) + captured_list = self._captured + + def _fake_llm(prompt_value: Any) -> AIMessage: + # ``prompt_value`` is a ChatPromptValue produced by the template. + messages = ( + prompt_value.to_messages() + if hasattr(prompt_value, "to_messages") + else list(prompt_value) + ) + system_content = _message_content(messages, "system") + human_content = _message_content(messages, "human") + captured_list.append( + _CapturedCall( + system_prompt=system_content, + user_prompt=human_content, + model=prompt_settings.model, + temperature=prompt_settings.temperature, + ) + ) + return AIMessage(content=response_content) + + return _fake_llm + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +class _CapturedCall: + """One captured LLM invocation.""" + + def __init__( + self, + *, + system_prompt: str, + user_prompt: str, + model: str, + temperature: float, + ) -> None: + self.system_prompt = system_prompt + self.user_prompt = user_prompt + self.model = model + self.temperature = temperature + + +def _message_content(messages: list[Any], message_type: str) -> str: + """Extract content from the first message of the given type.""" + for msg in messages: + if getattr(msg, "type", None) == message_type: + return str(msg.content) + return "" + + +def _normalize_prompt_snapshot(s: str) -> str: + """Normalize prompt text for contract comparison. + + Hand-edited or pasted TOML multiline strings may differ from runtime only by + CRLF vs LF line endings, or a spurious leading newline after an opening + ``'''`` delimiter (TOML trims only the first newline). Normalizing avoids + those false failures while preserving internal content. 
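+
+    Example::
+
+        _normalize_prompt_snapshot("Role\r\nDo X.\n")  # -> "Role\nDo X."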
+    """
+    return s.replace("\r\n", "\n").strip()
+
+
+def _assert_prompt_matches(
+    captured: _CapturedCall,
+    contract: PromptStepContract,
+    step_name: str,
+) -> None:
+    assert captured.model == contract.model, (
+        f"Step '{step_name}': model mismatch.\n"
+        f"  SDK sent: {captured.model!r}\n"
+        f"  Contract: {contract.model!r}"
+    )
+    assert captured.temperature == contract.temperature, (
+        f"Step '{step_name}': temperature mismatch.\n"
+        f"  SDK sent: {captured.temperature}\n"
+        f"  Contract: {contract.temperature}"
+    )
+    sys_sdk = _normalize_prompt_snapshot(captured.system_prompt)
+    sys_contract = _normalize_prompt_snapshot(contract.system_prompt)
+    assert sys_sdk == sys_contract, (
+        f"Step '{step_name}': system_prompt mismatch.\n"
+        f"  First diff at char {_first_diff_index(sys_sdk, sys_contract)}.\n"
+        f"  SDK sent (first 200): {sys_sdk[:200]!r}\n"
+        f"  Contract (first 200): {sys_contract[:200]!r}"
+    )
+    user_sdk = _normalize_prompt_snapshot(captured.user_prompt)
+    user_contract = _normalize_prompt_snapshot(contract.user_prompt)
+    assert user_sdk == user_contract, (
+        f"Step '{step_name}': user_prompt mismatch.\n"
+        f"  SDK sent: {user_sdk!r}\n"
+        f"  Contract: {user_contract!r}"
+    )
+
+
+def _first_diff_index(a: str, b: str) -> int:
+    for i, (ca, cb) in enumerate(zip(a, b, strict=False)):
+        if ca != cb:
+            return i
+    return min(len(a), len(b))
diff --git a/sdks/python/tests/contract_tests/loader.py b/sdks/python/tests/contract_tests/loader.py
new file mode 100644
index 0000000..8331921
--- /dev/null
+++ b/sdks/python/tests/contract_tests/loader.py
@@ -0,0 +1,156 @@
+"""Load contract test cases from evaluator settings folders.
+
+Each evaluator's ``contracts.toml`` lives next to its ``settings.toml`` in
+``sdks/settings/<evaluator>/`` (e.g.
+``settings/conventionality/contracts.toml``). This module provides the data
+models and a loader that reads those files into structured objects usable from
+tests.
+"""
+
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    try:
+        import tomllib
+    except ImportError:
+        import tomli as tomllib  # type: ignore[import-not-found,no-redef]
+
+
+# Sentinel used to detect placeholder values left by the TOML template.
+_PLACEHOLDER_PREFIX = "PLACEHOLDER"
+
+
+@dataclass
+class PromptStepContract:
+    """Contract for one LLM prompt step: the exact request sent and response received.
+
+    ``system_prompt`` and ``user_prompt`` are the *fully formatted* messages —
+    all template placeholders have been substituted (including library-computed
+    values like ``{format_instructions}`` and ``{fk_score}``).
+
+    ``llm_response`` is the raw string content returned by the LLM, as captured
+    from the notebook run. It is used to mock the LLM in the contract test.
+    """
+
+    system_prompt: str
+    user_prompt: str
+    model: str
+    temperature: float
+    llm_response: str
+
+    def is_populated(self) -> bool:
+        """Return False if any field still holds a placeholder value."""
+        return not any(
+            str(v).startswith(_PLACEHOLDER_PREFIX)
+            for v in (self.system_prompt, self.user_prompt, self.llm_response)
+        )
+
+
+@dataclass
+class ContractCase:
+    """One test case for a contract test.
+
+    Attributes:
+        name: Identifier matching the ``[cases.<name>]`` TOML key.
+        description: Human-readable label (optional).
+        input: Raw evaluator input values (e.g. ``{"text": ..., "grade_level": 4}``).
+        prompt_steps: Ordered mapping of step name → :class:`PromptStepContract`.
+            The order matches the order of LLM calls made during evaluation.
+        expected_result: Parsed LLM output in notebook format (i.e. the dict produced by
+            ``JsonOutputParser``, before SDK result mapping). Used to verify
+            the SDK produces the same structured result.
+    """
+
+    name: str
+    description: str
+    input: dict[str, Any]
+    prompt_steps: dict[str, PromptStepContract]
+    expected_result: dict[str, Any]
+
+    def is_populated(self) -> bool:
+        """Return False if any prompt step still holds a placeholder value."""
+        return all(step.is_populated() for step in self.prompt_steps.values())
+
+    def llm_responses_in_order(self) -> list[str]:
+        """Return LLM responses for all steps, in call order."""
+        return [step.llm_response for step in self.prompt_steps.values()]
+
+
+def load_contract_case(evaluator_name: str, case_name: str) -> ContractCase:
+    """Load a named test case from ``settings/<evaluator_name>/contracts.toml``.
+
+    The TOML file is resolved relative to the ``learning_commons_evaluators``
+    package's installed ``settings`` directory.
+
+    Args:
+        evaluator_name: Name of the evaluator (e.g. ``"conventionality"``).
+        case_name: Name of the case within the TOML (e.g. ``"turnip"``).
+
+    Returns:
+        A :class:`ContractCase` loaded from the TOML.
+
+    Raises:
+        FileNotFoundError: If ``contracts.toml`` does not exist.
+        KeyError: If ``case_name`` is not found in the TOML.
+    """
+    toml_path = _settings_path(evaluator_name) / "contracts.toml"
+    if not toml_path.exists():
+        raise FileNotFoundError(
+            f"contracts.toml not found for evaluator '{evaluator_name}' (expected at {toml_path})"
+        )
+
+    with open(toml_path, "rb") as fh:
+        data = tomllib.load(fh)
+
+    cases = data.get("cases", {})
+    if case_name not in cases:
+        available = ", ".join(cases.keys()) or "(none)"
+        raise KeyError(f"Case '{case_name}' not found in {toml_path}. Available cases: {available}")
+
+    return _parse_case(case_name, cases[case_name])
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _settings_path(evaluator_name: str) -> Path:
+    """Resolve the settings directory for an evaluator from the shared settings root."""
+    from learning_commons_evaluators.settings.load_settings import (
+        shared_settings_root,  # noqa: PLC0415
+    )
+
+    return shared_settings_root() / evaluator_name
+
+
+def _parse_case(name: str, raw: dict[str, Any]) -> ContractCase:
+    prompt_steps_raw = raw.get("prompt_steps", {})
+    prompt_steps: dict[str, PromptStepContract] = {
+        step_name: _parse_prompt_step(step_data)
+        for step_name, step_data in prompt_steps_raw.items()
+    }
+    return ContractCase(
+        name=name,
+        description=raw.get("description", ""),
+        input=raw.get("input", {}),
+        prompt_steps=prompt_steps,
+        expected_result=raw.get("expected_result", {}),
+    )
+
+
+def _parse_prompt_step(raw: dict[str, Any]) -> PromptStepContract:
+    return PromptStepContract(
+        system_prompt=str(raw["system_prompt"]),
+        user_prompt=str(raw["user_prompt"]),
+        model=str(raw["model"]),
+        temperature=float(raw["temperature"]),
+        llm_response=str(raw["llm_response"]),
+    )
diff --git a/sdks/python/tests/contract_tests/test_conventionality.py b/sdks/python/tests/contract_tests/test_conventionality.py
new file mode 100644
index 0000000..be9437f
--- /dev/null
+++ b/sdks/python/tests/contract_tests/test_conventionality.py
@@ -0,0 +1,84 @@
+"""Contract test: ConventionalityEvaluator matches the notebook.
+
+This test verifies two things for each contract case:
+  1.
+     The SDK sends the same LLM request as the notebook (same fully-formatted
+     system prompt, user prompt, model, and temperature).
+  2. Given the same LLM response, the SDK produces the same result as the
+     notebook.
+
+HOW TO ADD A NEW CASE
+---------------------
+1. Add a ``[cases.<case_name>]`` entry to
+   ``settings/conventionality/contracts.toml``.
+2. Add a loader function to ``contract_tests/conventionality.py``.
+3. Add a test function here following the pattern below.
+
+HOW TO REFRESH CONTRACT DATA
+----------------------------
+Run the notebook ``evals/Final ship - Conventionality Experimental Evaluator.ipynb``
+from the **repository root** with a valid ``GOOGLE_API_KEY``. The notebook loads
+prompts from the evaluator settings TOML (same as the SDK) and prints a TOML block.
+Paste it into ``sdks/settings/conventionality/contracts.toml`` (the canonical
+copy), then run ``make sync-settings`` from ``sdks/python/`` to update the bundled
+copy.
+"""
+
+from learning_commons_evaluators import (
+    ConventionalityEvaluationInput,
+    ConventionalityEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.metadata import Status
+
+from .conventionality import (
+    conventionality_notebook_to_sdk_result,
+    load_conventionality_turnip_case,
+)
+from .harness import ContractTestHarness
+
+
+class TestConventionalityContract:
+    def test_turnip_grade4(self) -> None:
+        """Turnip classroom narrative, grade 4.
+
+        Verifies:
+          - The fully-formatted system and user prompts match the notebook.
+          - The model and temperature match the notebook.
+          - Given the notebook's LLM response, the SDK returns the same
+            answer and explanation as the notebook.
+        """
+        case = load_conventionality_turnip_case()
+
+        config = create_config_no_telemetry()
+        evaluator = ConventionalityEvaluator(config)
+        inp = ConventionalityEvaluationInput(
+            text=case.input["text"],
+            grade=case.input["grade"],
+        )
+
+        with ContractTestHarness(case) as harness:
+            result = evaluator.evaluate(inp)
+
+        # --- Prompt fidelity ---
+        # Asserts that the SDK sent the same fully-formatted request as the
+        # notebook (model, temperature, and both message contents).
+        harness.assert_prompt_step("main")
+
+        # --- Result fidelity ---
+        # Asserts the SDK maps the LLM response to the same answer and
+        # explanation as the notebook. Metadata (timing, evaluation_id, etc.)
+        # is intentionally excluded from this comparison.
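+        # (expected.metadata is only the mapper's placeholder: see
+        # conventionality_notebook_to_sdk_result.)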
+ expected = conventionality_notebook_to_sdk_result(case) + assert result.metadata.status == Status.succeeded + assert result.answer.score == expected.answer.score, ( + f"answer.score: SDK={result.answer.score!r}, notebook={expected.answer.score!r}" + ) + assert result.answer.label == expected.answer.label, ( + f"answer.label: SDK={result.answer.label!r}, notebook={expected.answer.label!r}" + ) + assert result.explanation.summary == expected.explanation.summary, ( + "explanation.summary (reasoning) differs between SDK and notebook" + ) + assert result.explanation.details == expected.explanation.details, ( + "explanation.details differs between SDK and notebook" + ) diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py index d4f4e91..9d8e2be 100644 --- a/sdks/python/tests/settings/test_load_settings.py +++ b/sdks/python/tests/settings/test_load_settings.py @@ -3,8 +3,11 @@ Covers: load_settings(), _require(), _parse_prompts() (including prompt whitespace normalization), load_evaluator_settings(), and shared_settings_root() — including every conditional branch in each function. +Also contains the settings-sync guard: a test that fails if the bundled package +copies of settings diverge from the canonical sdks/settings/ source of truth. """ +import importlib.resources import os from pathlib import Path from unittest.mock import patch @@ -330,3 +333,79 @@ def test_importlib_fallback_resolves_to_existing_directory(self) -> None: with patch.dict(os.environ, env, clear=True): root = shared_settings_root() assert root.exists(), f"shared_settings_root() resolved to non-existent path: {root}" + + def test_bundled_contract_tests_are_present(self) -> None: + """The bundled package must contain contracts.toml for each evaluator. + + The evaluator settings TOML is intentionally NOT bundled (evaluators use _generated_*_settings.py). + contracts.toml IS bundled so contract tests work against an installed package. + """ + env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"} + with patch.dict(os.environ, env, clear=True): + root = shared_settings_root() + for evaluator in ("conventionality",): + assert (root / evaluator / "contracts.toml").exists(), ( + f"Bundled {evaluator}/contracts.toml not found — " + f"run 'python scripts/generate_settings.py --sync'" + ) + + +# --------------------------------------------------------------------------- +# Settings bundle sync guard +# --------------------------------------------------------------------------- +# +# What is and isn't bundled in the package: +# +# BUNDLED: contracts.toml — needed so contract tests run against an +# installed package without access to sdks/settings/. +# +# NOT BUNDLED: settings.toml — not needed at runtime; evaluators import +# from _generated_*_settings.py (pre-built at generation time). +# The canonical copy lives in sdks/settings/ and is the input to +# `make generate-settings`. 
+# +# If a sync test fails, run from the repo root: +# python scripts/generate_settings.py --sync — copies contracts.toml canonical → bundled +# python scripts/generate_settings.py --check — verifies generated .py files are up to date + + +def _bundled_settings_root() -> Path: + """Return the importlib.resources path for bundled settings, bypassing EVALUATORS_SETTINGS_DIR.""" + pkg = importlib.resources.files("learning_commons_evaluators.settings") + return Path(str(pkg)) + + +def _canonical_settings_root() -> Path | None: + """Return sdks/settings/ relative to this file, or None if not in the monorepo.""" + # This file lives at: sdks/python/tests/settings/test_load_settings.py + # parents[3] = sdks/ + candidate = Path(__file__).parents[3] / "settings" + return candidate if candidate.is_dir() else None + + +@pytest.mark.parametrize("evaluator", ["conventionality"]) +def test_bundled_contract_tests_match_canonical(evaluator: str) -> None: + """Bundled contracts.toml must be byte-for-byte identical to sdks/settings/. + + Skipped when running outside the monorepo (e.g., from an installed package). + If this fails, run ``python scripts/generate_settings.py --sync`` from the repo root. + """ + canonical_root = _canonical_settings_root() + if canonical_root is None: + pytest.skip("sdks/settings/ not found — running outside the monorepo") + + canonical = canonical_root / evaluator / "contracts.toml" + if not canonical.exists(): + pytest.skip(f"Canonical file not found: {canonical}") + + bundled = _bundled_settings_root() / evaluator / "contracts.toml" + assert bundled.exists(), ( + f"Bundled {evaluator}/contracts.toml not found.\n" + f"Run: python scripts/generate_settings.py --sync" + ) + assert canonical.read_bytes() == bundled.read_bytes(), ( + f"{evaluator}/contracts.toml is out of sync.\n" + f" canonical: {canonical}\n" + f" bundled: {bundled}\n" + f"Fix: python scripts/generate_settings.py --sync" + ) diff --git a/sdks/settings/conventionality/contracts.toml b/sdks/settings/conventionality/contracts.toml new file mode 100644 index 0000000..5ff6dd7 --- /dev/null +++ b/sdks/settings/conventionality/contracts.toml @@ -0,0 +1,132 @@ +# Contract tests for the Conventionality evaluator. +# +# This file captures the exact LLM interaction for known inputs so that +# tests can verify the SDK sends the same request as the notebook and +# produces the same result from the same response. +# +# HOW TO REGENERATE +# ----------------- +# Run the notebook "evals/Final ship - Conventionality Experimental Evaluator.ipynb" +# from the repo root with GOOGLE_API_KEY set. It loads prompts from the evaluator +# settings TOML (same as the SDK) and prints a TOML block — paste into this file +# and run `make sync-settings` from sdks/python/ to update the bundled copy. +# +# WHAT BELONGS HERE +# ----------------- +# Only data that originates from external sources: +# - input: what the caller provides to the evaluator +# - prompt_steps: what the SDK sends to the LLM (fully formatted messages, +# model, temperature) and the raw response the LLM returned +# - expected_result: the structured output the LLM returned (notebook format) +# +# Deterministic values computed by library code (e.g. FK score from textstat, +# format_instructions from LangChain's JsonOutputParser) are NOT stored as +# standalone fields — they appear inside the fully formatted prompts, where they +# are implicitly verified by the prompt-request assertion in the contract test. 
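+#
+# CASE SHAPE (informal skeleton of the sections that follow; <name> and <step>
+# are placeholders, filled as "turnip" and "main" in the case below):
+#
+#   [cases.<name>]
+#   description = "..."
+#
+#   [cases.<name>.input]                 # evaluator inputs, written as-is
+#
+#   [cases.<name>.prompt_steps.<step>]   # one table per LLM call
+#   model = "..."
+#   temperature = 0.0
+#   system_prompt = '''...'''
+#   user_prompt = '''...'''
+#   llm_response = '''...'''
+#
+#   [cases.<name>.expected_result]       # notebook-format evaluator output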
+ +[cases.turnip] +description = "Grade 4 classroom narrative (Henry and the turnip)" + +[cases.turnip.input] +text = ''' + +"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." +''' +grade = 4 + +[cases.turnip.prompt_steps.main] +model = "gemini-3-flash-preview" +temperature = 0.0 +system_prompt = '''Role +You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level. + +Objective +Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade. + +Complexity Levels +- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level. +- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging. +- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear. +- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices. + +Essential Evaluation Rules +1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex." + +2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. 
A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group. + +3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two. + +4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex." + +5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing. + +6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge. + +7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex. + +8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students. + +9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary. + +Input Format +You will receive: +- text: The passage to evaluate. +- grade: The target student grade level. +- fk_score: The Flesch-Kincaid readability score. + +Output Format +Provide a JSON object containing ONLY the following keys: +- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'. +- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural). +- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text. +- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade. 
+- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom. + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). +- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. + +Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output): +``` +{"properties": {"conventionality_features": {"description": "List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text.", "items": {"type": "string"}, "title": "Conventionality Features", "type": "array"}, "grade_context": {"description": "How the conventionality demands compare to general expectations for the provided target grade.", "title": "Grade Context", "type": "string"}, "instructional_insights": {"description": "Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom.", "title": "Instructional Insights", "type": "string"}, "complexity_score": {"description": "The conventionality complexity level of the text", "enum": ["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"], "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "A synthesis of why the text fits the chosen rubric level.", "title": "Reasoning", "type": "string"}}, "required": ["conventionality_features", "grade_context", "instructional_insights", "complexity_score", "reasoning"]} +``` +''' +user_prompt = '''Analyze: +Text: +"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." + +Grade: 4 +FK Score: 3.75 +''' +llm_response = ''' +```json +{ + "complexity_score": "slightly_complex", + "reasoning": "The text is a straightforward, literal narrative that is entirely explicit and easy to understand. 
The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.", + "conventionality_features": [ + "literal narrative: 'Henry took his slate and went out.'", + "concrete actions: 'he wrote the word turnip on his slate.'", + "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", + "straightforward meaning: 'The teacher was surprised and pleased.'" + ], + "grade_context": "For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.", + "instructional_insights": "Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology." +} +```''' + +[cases.turnip.expected_result] +complexity_score = ''' +slightly_complex''' +reasoning = ''' +The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.''' +conventionality_features = ["literal narrative: 'Henry took his slate and went out.'", "concrete actions: 'he wrote the word turnip on his slate.'", "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", "straightforward meaning: 'The teacher was surprised and pleased.'"] +grade_context = ''' +For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.''' +instructional_insights = ''' +Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology.'''
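+
+# NOTE: the contract test feeds expected_result through
+# conventionality_notebook_to_sdk_result() before comparing it to the SDK
+# output; per that test's assertion messages, "reasoning" corresponds to
+# explanation.summary, while the remaining fields drive the answer and
+# explanation.details.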