From 3ac83b5affa4be3dcf9773ace79a1f3ddd77312f Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Fri, 1 May 2026 13:46:57 -0400 Subject: [PATCH] fix: default intrinsic adapter types; add tests for granite-switch; fix expected canned input/output with temperature Signed-off-by: Jake LoRocco Assisted-by: CLAUDE:OPUS --- mellea/backends/adapters/catalog.py | 26 +-- test/backends/test_adapters/test_catalog.py | 12 +- test/backends/test_openai_intrinsics.py | 205 +++++++++++++++++- .../answerability_answerable.json | 1 + .../answerability_simple.json | 3 +- .../answerability_unanswerable.json | 1 + .../test_canned_input/context_relevance.json | 3 +- 7 files changed, 220 insertions(+), 31 deletions(-) diff --git a/mellea/backends/adapters/catalog.py b/mellea/backends/adapters/catalog.py index 62763a1ec..9edeea2e4 100644 --- a/mellea/backends/adapters/catalog.py +++ b/mellea/backends/adapters/catalog.py @@ -78,31 +78,15 @@ class IntriniscsCatalogEntry(pydantic.BaseModel): IntriniscsCatalogEntry(name="citations", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="context_relevance", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="hallucination_detection", repo_id=_RAG_REPO), - IntriniscsCatalogEntry( - name="query_clarification", repo_id=_RAG_REPO, adapter_types=(AdapterType.LORA,) - ), + IntriniscsCatalogEntry(name="query_clarification", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="query_rewrite", repo_id=_RAG_REPO), ############################################ # Guardian Intrinsics ############################################ - IntriniscsCatalogEntry( - name="policy-guardrails", - repo_id=_GUARDIAN_REPO, - adapter_types=(AdapterType.LORA,), - ), - IntriniscsCatalogEntry( - name="guardian-core", repo_id=_GUARDIAN_REPO, adapter_types=(AdapterType.LORA,) - ), - IntriniscsCatalogEntry( - name="factuality-detection", - repo_id=_GUARDIAN_REPO, - adapter_types=(AdapterType.LORA,), - ), - IntriniscsCatalogEntry( - name="factuality-correction", - repo_id=_GUARDIAN_REPO, - adapter_types=(AdapterType.LORA,), - ), + IntriniscsCatalogEntry(name="policy-guardrails", repo_id=_GUARDIAN_REPO), + IntriniscsCatalogEntry(name="guardian-core", repo_id=_GUARDIAN_REPO), + IntriniscsCatalogEntry(name="factuality-detection", repo_id=_GUARDIAN_REPO), + IntriniscsCatalogEntry(name="factuality-correction", repo_id=_GUARDIAN_REPO), ] _INTRINSICS_CATALOG = {e.name: e for e in _INTRINSICS_CATALOG_ENTRIES} diff --git a/test/backends/test_adapters/test_catalog.py b/test/backends/test_adapters/test_catalog.py index 03d7c3538..cbbe504ca 100644 --- a/test/backends/test_adapters/test_catalog.py +++ b/test/backends/test_adapters/test_catalog.py @@ -54,6 +54,16 @@ def test_default_adapter_types(): assert AdapterType.ALORA in entry.adapter_types -def test_lora_only_entry(): +def test_lora_only_entry(monkeypatch): + from mellea.backends.adapters import catalog + + fake_entry = catalog.IntriniscsCatalogEntry( + name="query_clarification", + repo_id="ibm-granite/granitelib-rag-r1.0", + adapter_types=(AdapterType.LORA,), + ) + monkeypatch.setattr( + catalog, "_INTRINSICS_CATALOG", {"query_clarification": fake_entry} + ) entry = fetch_intrinsic_metadata("query_clarification") assert entry.adapter_types == (AdapterType.LORA,) diff --git a/test/backends/test_openai_intrinsics.py b/test/backends/test_openai_intrinsics.py index 0e4575add..3d5287fe0 100644 --- a/test/backends/test_openai_intrinsics.py +++ b/test/backends/test_openai_intrinsics.py @@ -43,7 +43,7 @@ from mellea.stdlib import functional as mfuncs from mellea.stdlib.components import Intrinsic, Message from mellea.stdlib.components.docs.document import Document -from mellea.stdlib.components.intrinsic import rag +from mellea.stdlib.components.intrinsic import core as intrinsic_core, guardian, rag from mellea.stdlib.context import ChatContext from test.formatters.granite.test_intrinsics_formatters import ( _YAML_JSON_COMBOS_WITH_MODEL, @@ -355,13 +355,21 @@ def test_call_intrinsic_answerability(call_intrinsic_backend): @pytest.mark.qualitative -def test_call_intrinsic_context_relevance(call_intrinsic_backend): - """call_intrinsic path: check_context_relevance returns a score between 0 and 1.""" - context, question, documents = _read_rag_input("context_relevance.json") - result = rag.check_context_relevance( - question, documents[0], context, call_intrinsic_backend +def test_call_intrinsic_requirement_check(call_intrinsic_backend): + """call_intrinsic path: requirement_check returns a score between 0 and 1.""" + with open(_RAG_TEST_DATA / "requirement_check.json", encoding="utf-8") as f: + data = json.load(f) + + context = ChatContext() + for m in data["messages"]: + context = context.add(Message(m["role"], m["content"])) + + requirement = data["requirement"] + result = intrinsic_core.requirement_check( + context, call_intrinsic_backend, requirement=requirement ) - assert result in ["relevant", "irrelevant", "partially relevant"] + assert isinstance(result, float) + assert 0.0 <= result <= 1.0 # --------------------------------------------------------------------------- @@ -399,3 +407,186 @@ def get_temperature(location: str) -> int: assert len(result.value) > 0 parsed = json.loads(result.value) assert isinstance(parsed, dict) + + +# --------------------------------------------------------------------------- +# Guardian intrinsic tests — exercise the high-level convenience wrappers +# --------------------------------------------------------------------------- + +_GUARDIAN_TEST_DATA = ( + pathlib.Path(__file__).parent.parent + / "stdlib" + / "components" + / "intrinsic" + / "testdata" + / "input_json" +) + + +def _read_guardian_input(file_name: str) -> ChatContext: + """Read guardian test input and convert to a ChatContext.""" + with open(_GUARDIAN_TEST_DATA / file_name, encoding="utf-8") as f: + json_data = json.load(f) + + context = ChatContext() + for m in json_data["messages"]: + role = m["role"] + content = m["content"] + context = context.add(Message(role, content)) + + return context + + +@pytest.mark.qualitative +def test_call_intrinsic_policy_guardrails(call_intrinsic_backend): + """call_intrinsic path: policy_guardrails returns a compliance label.""" + context = _read_guardian_input("policy_guardrails.json") + + policy_text = ( + "hiring managers should steer away from any questions that directly seek " + 'information about protected classes\u2014such as "how old are you," "where are ' + 'you from," "what year did you graduate" or "what are your plans for having kids."' + ) + + result = guardian.policy_guardrails( + context, call_intrinsic_backend, policy_text=policy_text + ) + assert result in ("Yes", "No", "Ambiguous") + + +@pytest.mark.qualitative +def test_call_intrinsic_guardian_check_harm(call_intrinsic_backend): + """call_intrinsic path: guardian_check detects harmful prompts.""" + context = _read_guardian_input("guardian_core.json") + + result = guardian.guardian_check( + context, call_intrinsic_backend, criteria="harm", target_role="user" + ) + assert isinstance(result, float) + assert 0.0 <= result <= 1.0 + + +@pytest.mark.qualitative +def test_call_intrinsic_guardian_check_groundedness(call_intrinsic_backend): + """call_intrinsic path: guardian_check detects ungrounded responses.""" + document = Document( + text=( + "Eat (1964) is a 45-minute underground film created by Andy Warhol. " + "The film was first shown by Jonas Mekas on July 16, 1964, at the " + "Washington Square Gallery." + ), + doc_id="0", + ) + + context = ( + ChatContext() + .add(Message("user", "When was the film Eat first shown?")) + .add( + Message( + "assistant", + "The film Eat was first shown by Jonas Mekas on December 24, " + "1922 at the Washington Square Gallery.", + documents=[document], + ) + ) + ) + + result = guardian.guardian_check( + context, call_intrinsic_backend, criteria="groundedness" + ) + assert isinstance(result, float) + assert 0.0 <= result <= 1.0 + + +@pytest.mark.qualitative +def test_call_intrinsic_guardian_check_function_call(call_intrinsic_backend): + """call_intrinsic path: guardian_check detects function call hallucinations.""" + tools = [ + { + "name": "comment_list", + "description": "Fetches a list of comments for a specified IBM video.", + "parameters": { + "aweme_id": { + "description": "The ID of the IBM video.", + "type": "int", + "default": "7178094165614464282", + }, + "cursor": { + "description": "The cursor for pagination. Defaults to 0.", + "type": "int, optional", + "default": "0", + }, + "count": { + "description": "The number of comments to fetch. Maximum is 30. Defaults to 20.", + "type": "int, optional", + "default": "20", + }, + }, + } + ] + tools_text = "Available tools:\n" + json.dumps(tools, indent=2) + user_text = "Fetch the first 15 comments for the IBM video with ID 456789123." + # Deliberately wrong: uses "video_id" instead of "aweme_id" + response_text = str( + [{"name": "comment_list", "arguments": {"video_id": 456789123, "count": 15}}] + ) + + context = ( + ChatContext() + .add(Message("user", f"{tools_text}\n\n{user_text}")) + .add(Message("assistant", response_text)) + ) + + result = guardian.guardian_check( + context, call_intrinsic_backend, criteria="function_call" + ) + assert isinstance(result, float) + assert 0.0 <= result <= 1.0 + + +@pytest.mark.qualitative +def test_call_intrinsic_factuality_detection(call_intrinsic_backend): + """call_intrinsic path: factuality_detection returns a yes/no label.""" + with open(_GUARDIAN_TEST_DATA / "factuality_detection.json", encoding="utf-8") as f: + data = json.load(f) + + context = ChatContext() + docs = [ + Document(text=d["text"], doc_id=d.get("doc_id")) + for d in data.get("extra_body", {}).get("documents", []) + ] + messages = data["messages"] + for i, m in enumerate(messages): + is_last = i == len(messages) - 1 + if is_last and docs: + context = context.add(Message(m["role"], m["content"], documents=docs)) + else: + context = context.add(Message(m["role"], m["content"])) + + result = guardian.factuality_detection(context, call_intrinsic_backend) + assert result in ("yes", "no") + + +@pytest.mark.qualitative +def test_call_intrinsic_factuality_correction(call_intrinsic_backend): + """call_intrinsic path: factuality_correction returns corrected text or 'none'.""" + with open( + _GUARDIAN_TEST_DATA / "factuality_correction.json", encoding="utf-8" + ) as f: + data = json.load(f) + + context = ChatContext() + docs = [ + Document(text=d["text"], doc_id=d.get("doc_id")) + for d in data.get("extra_body", {}).get("documents", []) + ] + messages = data["messages"] + for i, m in enumerate(messages): + is_last = i == len(messages) - 1 + if is_last and docs: + context = context.add(Message(m["role"], m["content"], documents=docs)) + else: + context = context.add(Message(m["role"], m["content"])) + + result = guardian.factuality_correction(context, call_intrinsic_backend) + assert isinstance(result, str) diff --git a/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json b/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json index c8f127d67..5843e43e5 100644 --- a/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json +++ b/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json @@ -26,5 +26,6 @@ } } }, + "temperature": 0.0, "max_completion_tokens": 6 } \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/answerability_simple.json b/test/formatters/granite/testdata/test_canned_input/answerability_simple.json index 3a42fc67e..d247c43d3 100644 --- a/test/formatters/granite/testdata/test_canned_input/answerability_simple.json +++ b/test/formatters/granite/testdata/test_canned_input/answerability_simple.json @@ -16,5 +16,6 @@ } } }, - "max_completion_tokens": 6 + "max_completion_tokens": 6, + "temperature": 0.0 } \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json b/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json index 8475fd979..4d390b598 100644 --- a/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json +++ b/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json @@ -30,5 +30,6 @@ } } }, + "temperature": 0.0, "max_completion_tokens": 6 } \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/context_relevance.json b/test/formatters/granite/testdata/test_canned_input/context_relevance.json index 05ecc0562..9ce036c51 100644 --- a/test/formatters/granite/testdata/test_canned_input/context_relevance.json +++ b/test/formatters/granite/testdata/test_canned_input/context_relevance.json @@ -30,5 +30,6 @@ ] } } - } + }, + "temperature": 0.0 } \ No newline at end of file