From 415952a5aae2530758d591b62128c841aad2f330 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Tue, 3 Mar 2026 11:00:28 +0000
Subject: [PATCH 01/11] chore: remove code coverage from pytest screen output
 (#565)

<!-- mellea-pr-edited-marker: do not remove this marker -->
# Misc PR

## Type of PR

- [ ] Bug Fix
- [ ] New Feature
- [ ] Documentation
- [x] Other

## Description
- [x] Link to Issue: Fixes #565

<!-- Brief description of the change being made along with an explanation. -->
Removed `--cov-report=term` from the `[tool.pytest.ini_options]` configuration in `pyproject.toml` to prevent test runs from dumping large code coverage tables to the terminal. Coverage is still collected and written to the `htmlcov/` directory and to `coverage.json`.

### Testing
- [ ] Tests added to the respective file if code was changed
- [ ] New code has 100% coverage if code was added
- [ ] Ensure existing tests and GitHub automation pass (a maintainer will kick off the GitHub automation when the rest of the PR is populated)
---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b865ef403..5987d691f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -255,7 +255,6 @@ addopts = [
     # Run qualitative tests by default (use -m "not qualitative" for fast tests)
     "--cov=mellea",
     "--cov=cli",
-    "--cov-report=term",
     "--cov-report=html",
     "--cov-report=json",
     # Set timeout to 15 minutes for full test suite

From e6bd942ffeab9a3f5ac0c56c34f2a329aeb37a9e Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Tue, 3 Mar 2026 10:13:58 +0000
Subject: [PATCH 02/11] test: isolate astream_incremental tests from CI

Fixes #562
---
 test/core/test_astream_incremental.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/core/test_astream_incremental.py b/test/core/test_astream_incremental.py
index 78b0c2a8a..58425200f 100644
--- a/test/core/test_astream_incremental.py
+++ b/test/core/test_astream_incremental.py
@@ -14,6 +14,7 @@
 
 @pytest.mark.ollama
 @pytest.mark.llm
+@pytest.mark.qualitative
 async def test_astream_returns_incremental_chunks():
     """Test that astream() returns only new content, not accumulated content.
 
@@ -68,6 +69,7 @@ async def test_astream_returns_incremental_chunks():
 
 @pytest.mark.ollama
 @pytest.mark.llm
+@pytest.mark.qualitative
 async def test_astream_multiple_calls_accumulate_correctly():
     """Test that multiple astream() calls accumulate to the final value.
 
@@ -120,6 +122,7 @@ async def test_astream_multiple_calls_accumulate_correctly():
 
 @pytest.mark.ollama
 @pytest.mark.llm
+@pytest.mark.qualitative
 async def test_astream_beginning_length_tracking():
     """Test that beginning_length is correctly tracked across astream calls.
 
@@ -150,6 +153,7 @@ async def test_astream_beginning_length_tracking():
 
 @pytest.mark.ollama
 @pytest.mark.llm
+@pytest.mark.qualitative
 async def test_astream_empty_beginning():
     """Test astream when _underlying_value starts as None."""
     session = start_session()
@@ -174,6 +178,7 @@ async def test_astream_empty_beginning():
 
 @pytest.mark.ollama
 @pytest.mark.llm
+@pytest.mark.qualitative
 async def test_astream_computed_returns_full_value():
     """Test that astream returns full value when already computed."""
     # Create a pre-computed thunk
@@ -188,6 +193,7 @@ async def test_astream_computed_returns_full_value():
 
 @pytest.mark.ollama
 @pytest.mark.llm
+@pytest.mark.qualitative
 async def test_astream_final_call_returns_full_value():
     """Test that the final astream call returns the full value when computed.
 

From 69a5889782d4006fdfde13871dda9bbd924d37b8 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Tue, 3 Mar 2026 10:38:18 +0000
Subject: [PATCH 03/11] test: add deterministic mock tests for astream
 incremental logic

Introduces `test_astream_mock.py`, which tests the incremental streaming logic of `ModelOutputThunk`'s async queue deterministically, without relying on highly variable LLM backends.
---
 test/core/test_astream_mock.py | 162 +++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 test/core/test_astream_mock.py

diff --git a/test/core/test_astream_mock.py b/test/core/test_astream_mock.py
new file mode 100644
index 000000000..e3c58c697
--- /dev/null
+++ b/test/core/test_astream_mock.py
@@ -0,0 +1,162 @@
+"""Deterministic Mock Tests for ModelOutputThunk.astream() incremental return behavior.
+
+Tests that astream() returns only new content added since the beginning of
+each astream() call, not the entire accumulated value. Uses manual queue
+injection to bypass LLM calls and network operations, guaranteeing determinism.
+"""
+
+import asyncio
+from typing import Any
+
+import pytest
+
+from mellea.core.base import CBlock, GenerateType, ModelOutputThunk
+
+
+async def mock_process(mot: ModelOutputThunk, chunk: Any) -> None:
+    """Mock process function that simply appends the chunk to the underlying value."""
+    if mot._underlying_value is None:
+        mot._underlying_value = ""
+    if chunk is not None:
+        mot._underlying_value += chunk
+
+
+async def mock_post_process(mot: ModelOutputThunk) -> None:
+    """Mock post-process function (does nothing)."""
+
+
+def create_manual_mock_thunk() -> ModelOutputThunk:
+    """Helper to create a mock ModelOutputThunk where we manually populate the queue."""
+    mot = ModelOutputThunk(value=None)
+    mot._action = CBlock("mock_action")
+    mot._generate_type = GenerateType.ASYNC
+    mot._process = mock_process
+    mot._post_process = mock_post_process
+    mot._chunk_size = 0  # Read exactly what is available
+    return mot
+
+
+@pytest.mark.asyncio
+async def test_astream_returns_incremental_chunks():
+    """Test that astream() returns only new content, not accumulated content."""
+    mot = create_manual_mock_thunk()
+
+    # Drop the first chunk and pull it
+    mot._async_queue.put_nowait("chunk1 ")
+    chunk1 = await mot.astream()
+    assert chunk1 == "chunk1 "
+
+    # Drop the second chunk and pull it
+    mot._async_queue.put_nowait("chunk2 ")
+    chunk2 = await mot.astream()
+    assert chunk2 == "chunk2 "
+
+    # Drop the third chunk and pull it
+    mot._async_queue.put_nowait("chunk3 ")
+    chunk3 = await mot.astream()
+    assert chunk3 == "chunk3 "
+
+    # Send completion sentinel
+    mot._async_queue.put_nowait(None)
+
+    # Wait until fully consumed
+    while not mot.is_computed():
+        await mot.astream()
+
+    final_val = await mot.avalue()
+    assert final_val == "chunk1 chunk2 chunk3 "
+
+
+@pytest.mark.asyncio
+async def test_astream_multiple_calls_accumulate_correctly():
+    """Test that multiple astream() calls accumulate to the final value."""
+    # Simulating a scenario where queue chunks outpace the reading loop
+    mot = create_manual_mock_thunk()
+
+    # Drop multiple items at once to simulate fast network
+    mot._async_queue.put_nowait("c")
+    mot._async_queue.put_nowait("h")
+    mot._async_queue.put_nowait("u")
+
+    # Calling astream should drain all currently queued items ("chu")
+    chunk1 = await mot.astream()
+    assert chunk1 == "chu"
+
+    mot._async_queue.put_nowait("n")
+    mot._async_queue.put_nowait("k")
+    mot._async_queue.put_nowait(None)
+
+    chunk2 = await mot.astream()
+    assert chunk2 == "chunk"
+
+    final_val = await mot.avalue()
+
+    assert mot.is_computed()
+    assert final_val == "chunk"
+
+
+@pytest.mark.asyncio
+async def test_astream_beginning_length_tracking():
+    """Test that beginning_length is correctly tracked across astream calls."""
+    mot = create_manual_mock_thunk()
+
+    mot._async_queue.put_nowait("AAA")
+    chunk1 = await mot.astream()
+    assert chunk1 == "AAA"
+
+    mot._async_queue.put_nowait("BBB")
+    chunk2 = await mot.astream()
+    # verify incremental length tracking works
+    assert not chunk2.startswith(chunk1)
+    assert chunk2 == "BBB"
+
+
+@pytest.mark.asyncio
+async def test_astream_empty_beginning():
+    """Test astream when _underlying_value starts as None."""
+    mot = create_manual_mock_thunk()
+
+    mot._async_queue.put_nowait("First")
+    # At the start, _underlying_value is None, beginning_length is 0
+    chunk = await mot.astream()
+
+    # Because beginning length was 0, astream returns the full chunk
+    assert chunk == "First"
+    assert mot._underlying_value == "First"
+
+
+@pytest.mark.asyncio
+async def test_astream_computed_returns_full_value():
+    """Test that astream returns full value when already computed."""
+    # Precomputed thunk skips queue checking completely
+    mot = ModelOutputThunk(value="Hello, world!")
+
+    # For a precomputed thunk, astream directly returns value
+    result = await mot.astream()
+    assert result == "Hello, world!"
+
+
+@pytest.mark.asyncio
+async def test_astream_final_call_returns_full_value():
+    """Test that the final astream call returns the full value when computed."""
+    mot = create_manual_mock_thunk()
+
+    mot._async_queue.put_nowait("part1")
+    chunk1 = await mot.astream()
+    assert chunk1 == "part1"
+
+    mot._async_queue.put_nowait("part2")
+    chunk2 = await mot.astream()
+    assert chunk2 == "part2"
+
+    mot._async_queue.put_nowait("part3")
+    mot._async_queue.put_nowait(None)
+
+    # Calling astream here processes "part3" and `None`, flagging it as done
+    chunk3 = await mot.astream()
+
+    final_val = await mot.avalue()
+
+    # The final chunk logically completes the thunk, returning the full value instead of a slice.
+    assert chunk3 == "part1part2part3"
+    assert chunk3 == final_val

From bc206243daeed7009f1f85a6a75895eb8c21773d Mon Sep 17 00:00:00 2001
From: 0xCUB3 <skula@mit.edu>
Date: Mon, 16 Feb 2026 09:13:57 -0500
Subject: [PATCH 04/11] fix: prevent exception chunk from being passed to
 _process in astream

Pop exception from chunks list (like we do for the None sentinel) so
_process doesn't receive it. Guard chat_response access in ollama
post_processing with .get() for when no valid chunks arrived.

Signed-off-by: 0xCUB3 <skula@mit.edu>
---
 mellea/backends/ollama.py | 2 +-
 mellea/core/base.py       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index e848b9004..04f75751b 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -596,7 +596,7 @@ async def post_processing(
         generate_log.backend = f"ollama::{self._get_ollama_model_id()}"
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
-        generate_log.model_output = mot._meta["chat_response"]
+        generate_log.model_output = mot._meta.get("chat_response")
         generate_log.extra = {
             "format": _format,
             "thinking": mot._model_options.get(ModelOption.THINKING, None),
diff --git a/mellea/core/base.py b/mellea/core/base.py
index e7b40c7cd..a6520f730 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -325,8 +325,8 @@ async def astream(self) -> str:
             elif isinstance(chunks[-1], Exception):
                 # Mark as computed so post_process runs in finally block
                 self._computed = True
-                # Store exception to re-raise after cleanup
-                exception_to_raise = chunks[-1]
+                # Remove the exception from chunks so _process doesn't receive it
+                exception_to_raise = chunks.pop()
 
             for chunk in chunks:
                 assert self._process is not None

From 16a34e90e15789bacec6970daded41cb61007bbe Mon Sep 17 00:00:00 2001
From: 0xCUB3 <skula@mit.edu>
Date: Thu, 19 Feb 2026 09:29:42 -0500
Subject: [PATCH 05/11] fix: safe _meta access in post_processing for all
 backends

Signed-off-by: 0xCUB3 <skula@mit.edu>
---
 mellea/backends/huggingface.py |  5 +++--
 mellea/backends/litellm.py     |  9 ++++++---
 mellea/backends/openai.py      | 23 ++++++++++++++---------
 mellea/backends/vllm.py        |  3 ++-
 mellea/backends/watsonx.py     |  9 +++++++--
 5 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index 3dad5ed71..62ace6b68 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -1015,7 +1015,8 @@ async def post_processing(
                 mot._meta["hf_output"] = full_output
 
         # The ModelOutputThunk must be computed by this point.
-        assert mot.value is not None
+        if mot.value is None:
+            return
 
         # Store KV cache in LRU separately (not in mot._meta) to enable proper cleanup on eviction.
         # This prevents GPU memory from being held by ModelOutputThunk references.
@@ -1078,7 +1079,7 @@ async def post_processing(
         ):
             import gc
 
-            hf_out = mot._meta["hf_output"]
+            hf_out = mot._meta.get("hf_output")
             if hasattr(hf_out, "sequences") and hf_out.sequences is not None:
                 del hf_out.sequences
             if hasattr(hf_out, "scores") and hf_out.scores is not None:
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
index dc06db3a3..a624a31d6 100644
--- a/mellea/backends/litellm.py
+++ b/mellea/backends/litellm.py
@@ -441,8 +441,11 @@ async def post_processing(
         # OpenAI-like streamed responses potentially give you chunks of tool calls.
         # As a result, we have to store data between calls and only then
         # check for complete tool calls in the post_processing step.
-        tool_chunk = extract_model_tool_requests(
-            tools, mot._meta["litellm_chat_response"]
+        litellm_response = mot._meta.get("litellm_chat_response")
+        tool_chunk = (
+            extract_model_tool_requests(tools, litellm_response)
+            if litellm_response is not None
+            else None
         )
         if tool_chunk is not None:
             if mot.tool_calls is None:
@@ -457,7 +460,7 @@ async def post_processing(
         generate_log.backend = f"litellm::{self.model_id!s}"
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
-        generate_log.model_output = mot._meta["litellm_chat_response"]
+        generate_log.model_output = mot._meta.get("litellm_chat_response")
         generate_log.extra = {
             "format": _format,
             "tools_available": tools,
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 0b520bd29..8d12f2c15 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -575,9 +575,13 @@ async def post_processing(
         # check for complete tool calls in the post_processing step.
         # Use the choice format for tool extraction (backward compatibility)
         choice_response = mot._meta.get(
-            "oai_chat_response_choice", mot._meta["oai_chat_response"]
+            "oai_chat_response_choice", mot._meta.get("oai_chat_response")
+        )
+        tool_chunk = (
+            extract_model_tool_requests(tools, choice_response)
+            if choice_response is not None
+            else None
         )
-        tool_chunk = extract_model_tool_requests(tools, choice_response)
         if tool_chunk is not None:
             if mot.tool_calls is None:
                 mot.tool_calls = {}
@@ -592,7 +596,7 @@ async def post_processing(
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
         # Store the full response (includes usage info)
-        generate_log.model_output = mot._meta["oai_chat_response"]
+        generate_log.model_output = mot._meta.get("oai_chat_response")
         generate_log.extra = {
             "format": _format,
             "thinking": thinking,
@@ -613,12 +617,13 @@ async def post_processing(
                 record_token_usage,
             )
 
-            response = mot._meta["oai_chat_response"]
-            # response is a dict from model_dump(), extract usage if present
-            usage = response.get("usage") if isinstance(response, dict) else None
-            if usage:
-                record_token_usage(span, usage)
-            record_response_metadata(span, response)
+            response = mot._meta.get("oai_chat_response")
+            if response is not None:
+                # response is a dict from model_dump(), extract usage if present
+                usage = response.get("usage") if isinstance(response, dict) else None
+                if usage:
+                    record_token_usage(span, usage)
+                record_response_metadata(span, response)
             # Close the span now that async operation is complete
             end_backend_span(span)
             # Clean up the span reference
diff --git a/mellea/backends/vllm.py b/mellea/backends/vllm.py
index 85ce66308..5f3d666d7 100644
--- a/mellea/backends/vllm.py
+++ b/mellea/backends/vllm.py
@@ -380,7 +380,8 @@ async def post_processing(
     ):
         """Called when generation is done."""
         # The ModelOutputThunk must be computed by this point.
-        assert mot.value is not None
+        if mot.value is None:
+            return
 
         # Only scan for tools if we are not doing structured output and tool calls were provided to the model.
         if _format is None and tool_calls:
diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py
index 51cd84c2c..7d820d7e7 100644
--- a/mellea/backends/watsonx.py
+++ b/mellea/backends/watsonx.py
@@ -469,7 +469,12 @@ async def post_processing(
         # OpenAI streamed responses give you chunks of tool calls.
         # As a result, we have to store data between calls and only then
         # check for complete tool calls in the post_processing step.
-        tool_chunk = extract_model_tool_requests(tools, mot._meta["oai_chat_response"])
+        oai_response = mot._meta.get("oai_chat_response")
+        tool_chunk = (
+            extract_model_tool_requests(tools, oai_response)
+            if oai_response is not None
+            else None
+        )
         if tool_chunk is not None:
             if mot.tool_calls is None:
                 mot.tool_calls = {}
@@ -509,7 +514,7 @@ async def post_processing(
         generate_log.backend = f"watsonx::{self.model_id!s}"
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
-        generate_log.model_output = mot._meta["oai_chat_response"]
+        generate_log.model_output = mot._meta.get("oai_chat_response")
         generate_log.extra = {
             "format": _format,
             "tools_available": tools,

From 837a25a952bb5964f6ec61a1c1a2c841033cce68 Mon Sep 17 00:00:00 2001
From: 0xCUB3 <skula@mit.edu>
Date: Thu, 19 Feb 2026 09:29:58 -0500
Subject: [PATCH 06/11] fix: skip Exception and None chunks in astream before
 _process

Signed-off-by: 0xCUB3 <skula@mit.edu>
---
 mellea/core/base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mellea/core/base.py b/mellea/core/base.py
index a6520f730..7da08d460 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -329,6 +329,10 @@ async def astream(self) -> str:
                 exception_to_raise = chunks.pop()
 
             for chunk in chunks:
+                # Belt-and-suspenders: skip non-chunk objects that should
+                # have been removed above (exceptions, sentinel None).
+                if chunk is None or isinstance(chunk, Exception):
+                    continue
                 assert self._process is not None
                 await self._process(self, chunk)
 

From e9d26443c6e0cb359d784f6a43bdd60692aaeb43 Mon Sep 17 00:00:00 2001
From: 0xCUB3 <skula@mit.edu>
Date: Thu, 19 Feb 2026 09:35:11 -0500
Subject: [PATCH 07/11] fix: revert hf_output to bracket access where
 isinstance guard proves key exists

Signed-off-by: 0xCUB3 <skula@mit.edu>
---
 mellea/backends/huggingface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index 62ace6b68..dfbf3e4c5 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -1079,7 +1079,7 @@ async def post_processing(
         ):
             import gc
 
-            hf_out = mot._meta.get("hf_output")
+            hf_out = mot._meta["hf_output"]
             if hasattr(hf_out, "sequences") and hf_out.sequences is not None:
                 del hf_out.sequences
             if hasattr(hf_out, "scores") and hf_out.scores is not None:

From 00eb95b3dc853c1fb2adcde90e9edaee4107c85b Mon Sep 17 00:00:00 2001
From: 0xCUB3 <skula@mit.edu>
Date: Wed, 25 Feb 2026 07:54:12 -0500
Subject: [PATCH 08/11] test: add regression tests for astream exception
 handling

Unit tests that verify exceptions in the async queue are cleanly
propagated without reaching _process, and that _post_process still
runs for telemetry cleanup.
---
 test/core/test_astream_exception_handling.py | 101 +++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 test/core/test_astream_exception_handling.py

diff --git a/test/core/test_astream_exception_handling.py b/test/core/test_astream_exception_handling.py
new file mode 100644
index 000000000..303fb842e
--- /dev/null
+++ b/test/core/test_astream_exception_handling.py
@@ -0,0 +1,101 @@
+"""Regression tests for astream() exception handling.
+
+When a backend error occurs during streaming, the Exception object lands in the
+async queue.  Before the fix, astream() would either pass it to _process (crash)
+or post_processing would hit a KeyError on _meta keys that were never set.
+
+These tests verify that astream() cleanly propagates the original exception
+after running _post_process for telemetry cleanup.
+"""
+
+import asyncio
+
+import pytest
+
+from mellea.core.base import GenerateType, ModelOutputThunk
+
+
+def _make_streaming_mot():
+    """Create a ModelOutputThunk wired up for streaming with stub callbacks."""
+    mot = ModelOutputThunk(value=None)
+    mot._generate_type = GenerateType.ASYNC
+    mot._chunk_size = 1
+
+    process_calls: list = []
+
+    async def _process(mot, chunk):
+        process_calls.append(chunk)
+        text = chunk if isinstance(chunk, str) else str(chunk)
+        if mot._underlying_value is None:
+            mot._underlying_value = text
+        else:
+            mot._underlying_value += text
+
+    post_process_called = asyncio.Event()
+
+    async def _post_process(mot):
+        post_process_called.set()
+
+    mot._process = _process
+    mot._post_process = _post_process
+
+    return mot, process_calls, post_process_called
+
+
+@pytest.mark.asyncio
+async def test_astream_propagates_exception_from_queue():
+    """Exception in the queue is re-raised after cleanup, not passed to _process."""
+    mot, process_calls, post_process_called = _make_streaming_mot()
+
+    original_error = RuntimeError("backend connection lost")
+    await mot._async_queue.put(original_error)
+
+    with pytest.raises(RuntimeError, match="backend connection lost"):
+        await mot.astream()
+
+    # _process must never have seen the Exception object
+    assert original_error not in process_calls
+    # _post_process ran for telemetry cleanup
+    assert post_process_called.is_set()
+
+
+@pytest.mark.asyncio
+async def test_astream_propagates_exception_after_valid_chunks():
+    """Valid chunks before the exception are processed; exception still raised."""
+    mot, process_calls, post_process_called = _make_streaming_mot()
+
+    await mot._async_queue.put("hello ")
+    await mot._async_queue.put("world")
+    await mot._async_queue.put(ValueError("mid-stream failure"))
+
+    with pytest.raises(ValueError, match="mid-stream failure"):
+        await mot.astream()
+
+    # Valid chunks were processed
+    assert process_calls == ["hello ", "world"]
+    # Accumulated value reflects only the valid chunks
+    assert mot._underlying_value == "hello world"
+    # Cleanup still ran
+    assert post_process_called.is_set()
+
+
+@pytest.mark.asyncio
+async def test_astream_skips_none_and_exception_in_chunk_loop():
+    """Belt-and-suspenders: stray None/Exception objects in the middle of the
+    chunk list are skipped rather than passed to _process."""
+    mot, process_calls, _ = _make_streaming_mot()
+
+    # Simulate a queue where a None sentinel terminates the stream normally
+    await mot._async_queue.put("good chunk")
+    await mot._async_queue.put(None)
+
+    # Need _action for the finally-block parse path
+    from mellea.core.base import CBlock
+
+    mot._action = CBlock("test")
+
+    result = await mot.astream()
+
+    assert process_calls == ["good chunk"]
+    assert mot.is_computed()
+    assert result is not None

From feadd253266afd856758b53e51ffe0b43e8e8f2a Mon Sep 17 00:00:00 2001
From: 0xCUB3 <skula@mit.edu>
Date: Wed, 25 Feb 2026 07:56:10 -0500
Subject: [PATCH 09/11] test: clean up asyncio markers and inline import

---
 test/core/test_astream_exception_handling.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/test/core/test_astream_exception_handling.py b/test/core/test_astream_exception_handling.py
index 303fb842e..4a2d1d232 100644
--- a/test/core/test_astream_exception_handling.py
+++ b/test/core/test_astream_exception_handling.py
@@ -12,7 +12,7 @@
 
 import pytest
 
-from mellea.core.base import GenerateType, ModelOutputThunk
+from mellea.core.base import CBlock, GenerateType, ModelOutputThunk
 
 
 def _make_streaming_mot():
@@ -42,7 +42,6 @@ async def _post_process(mot):
     return mot, process_calls, post_process_called
 
 
-@pytest.mark.asyncio
 async def test_astream_propagates_exception_from_queue():
     """Exception in the queue is re-raised after cleanup, not passed to _process."""
     mot, process_calls, post_process_called = _make_streaming_mot()
@@ -59,7 +58,6 @@ async def test_astream_propagates_exception_from_queue():
     assert post_process_called.is_set()
 
 
-@pytest.mark.asyncio
 async def test_astream_propagates_exception_after_valid_chunks():
     """Valid chunks before the exception are processed; exception still raised."""
     mot, process_calls, post_process_called = _make_streaming_mot()
@@ -79,19 +77,14 @@ async def test_astream_propagates_exception_after_valid_chunks():
     assert post_process_called.is_set()
 
 
-@pytest.mark.asyncio
 async def test_astream_skips_none_and_exception_in_chunk_loop():
     """Belt-and-suspenders: stray None/Exception objects in the middle of the
     chunk list are skipped rather than passed to _process."""
     mot, process_calls, _ = _make_streaming_mot()
 
-    # Simulate a queue where a None sentinel terminates the stream normally
     await mot._async_queue.put("good chunk")
     await mot._async_queue.put(None)
 
-    # Need _action for the finally-block parse path
-    from mellea.core.base import CBlock
-
     mot._action = CBlock("test")
 
     result = await mot.astream()

From 3b59bbac3348fd81f5048db79affc985b1e95721 Mon Sep 17 00:00:00 2001
From: Jake LoRocco <jake.lorocco@ibm.com>
Date: Tue, 3 Mar 2026 12:04:52 -0500
Subject: [PATCH 10/11] fix: issues with tests (alora example, rag intrinsics,
 mistral tool use)

---
 docs/examples/aLora/101_example.py            |  3 +--
 docs/examples/aLora/102_example.py            |  3 +++
 pyproject.toml                                |  1 +
 test/backends/test_openai_vllm.py             | 21 ++++++++++++-------
 .../input_json/hallucination_detection.json   |  1 +
 .../testdata/output_json/citations.json       |  2 +-
 .../output_json/hallucination_detection.json  |  4 ++--
 uv.lock                                       |  8 +++++++
 8 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/docs/examples/aLora/101_example.py b/docs/examples/aLora/101_example.py
index 3abc2554a..243d9225b 100644
--- a/docs/examples/aLora/101_example.py
+++ b/docs/examples/aLora/101_example.py
@@ -1,5 +1,4 @@
-# pytest: skip, huggingface, requires_heavy_ram, llm
-# SKIP REASON: Example broken since intrinsics refactor - see issue #385
+# pytest: huggingface, requires_heavy_ram, llm
 
 import time
 
diff --git a/docs/examples/aLora/102_example.py b/docs/examples/aLora/102_example.py
index 542407ce9..c2bf86a3a 100644
--- a/docs/examples/aLora/102_example.py
+++ b/docs/examples/aLora/102_example.py
@@ -1,3 +1,6 @@
+# pytest: skip, huggingface, requires_heavy_ram, llm
+# SKIP REASON: Requires user input; tests same functionality as 101_example.py.
+
 from stembolts_intrinsic import (
     async_stembolt_failure_analysis,
     stembolt_failure_analysis,
diff --git a/pyproject.toml b/pyproject.toml
index 5987d691f..31018730b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -119,6 +119,7 @@ dev = [
     "python-semantic-release~=7.32",
     "nbmake>=1.5.5",
     "langchain-core>=1.2.7", # Necessary for mypy and some tool tests
+    "sentencepiece==0.2.1", # Necessary for test_huggingface_tools test because of Mistral model
 ]
 
 notebook = [
diff --git a/test/backends/test_openai_vllm.py b/test/backends/test_openai_vllm.py
index 685465799..f4e0e6466 100644
--- a/test/backends/test_openai_vllm.py
+++ b/test/backends/test_openai_vllm.py
@@ -79,17 +79,24 @@ def vllm_process():
 
         yield process
 
+    except Exception as e:
+        pytest.skip(
+            f"vLLM process not available: {e}. May need to install with: pip install mellea[vllm]",
+            allow_module_level=True,
+        )
+
     # --- Teardown (always runs) ---
     finally:
-        try:
-            os.killpg(process.pid, signal.SIGTERM)  # kill the session group
-            process.wait(timeout=30)
-        except Exception:
+        if process is not None:
             try:
-                os.killpg(process.pid, signal.SIGKILL)
+                os.killpg(process.pid, signal.SIGTERM)  # kill the session group
+                process.wait(timeout=30)
             except Exception:
-                pass
-            process.wait()
+                try:
+                    os.killpg(process.pid, signal.SIGKILL)
+                except Exception:
+                    pass
+                process.wait()
 
 
 @pytest.fixture(scope="module")
diff --git a/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json
index f224ed20a..69a3e75bc 100644
--- a/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json
+++ b/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json
@@ -13,6 +13,7 @@
       "content": "Purple bumble fish are yellow. Green bumble fish are also yellow."
     }
   ],
+  "temperature": 0.0,
   "extra_body": {
     "documents": [
       {
diff --git a/test/stdlib/components/intrinsic/testdata/output_json/citations.json b/test/stdlib/components/intrinsic/testdata/output_json/citations.json
index 804f64f43..67dc2bb51 100644
--- a/test/stdlib/components/intrinsic/testdata/output_json/citations.json
+++ b/test/stdlib/components/intrinsic/testdata/output_json/citations.json
@@ -3,7 +3,7 @@
         {
             "index": 0,
             "message": {
-                "content": "[{\"response_begin\": 0, \"response_end\": 96, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 2468, \"citation_end\": 3533, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 96, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 4792, \"citation_end\": 6183, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]",
+                "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]",
                 "role": "assistant"
             }
         }
diff --git a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json
index 06e80be5f..eb518a4da 100644
--- a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json
+++ b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json
@@ -3,9 +3,9 @@
         {
             "index": 0,
             "message": {
-                "content": "[{\"response_begin\": 0, \"response_end\": 36, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.7280598165124975, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 36, \"response_end\": 70, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.08656033472953338, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]",
+                "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.7280580899614958, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.09613224257737445, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]",
                 "role": "assistant"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/uv.lock b/uv.lock
index 8f5fa0871..9cb0759b9 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2216,6 +2216,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fe/65/5b235b40581ad75ab97dcd8b4218022ae8e3ab77c13c919f1a1dfe9171fd/greenlet-3.3.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:04bee4775f40ecefcdaa9d115ab44736cd4b9c5fba733575bfe9379419582e13", size = 273723, upload-time = "2026-01-23T15:30:37.521Z" },
     { url = "https://files.pythonhosted.org/packages/ce/ad/eb4729b85cba2d29499e0a04ca6fbdd8f540afd7be142fd571eea43d712f/greenlet-3.3.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50e1457f4fed12a50e427988a07f0f9df53cf0ee8da23fab16e6732c2ec909d4", size = 574874, upload-time = "2026-01-23T16:00:54.551Z" },
     { url = "https://files.pythonhosted.org/packages/87/32/57cad7fe4c8b82fdaa098c89498ef85ad92dfbb09d5eb713adedfc2ae1f5/greenlet-3.3.1-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:070472cd156f0656f86f92e954591644e158fd65aa415ffbe2d44ca77656a8f5", size = 586309, upload-time = "2026-01-23T16:05:25.18Z" },
+    { url = "https://files.pythonhosted.org/packages/66/66/f041005cb87055e62b0d68680e88ec1a57f4688523d5e2fb305841bc8307/greenlet-3.3.1-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1108b61b06b5224656121c3c8ee8876161c491cbe74e5c519e0634c837cf93d5", size = 597461, upload-time = "2026-01-23T16:15:51.943Z" },
     { url = "https://files.pythonhosted.org/packages/87/eb/8a1ec2da4d55824f160594a75a9d8354a5fe0a300fb1c48e7944265217e1/greenlet-3.3.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a300354f27dd86bae5fbf7002e6dd2b3255cd372e9242c933faf5e859b703fe", size = 586985, upload-time = "2026-01-23T15:32:47.968Z" },
     { url = "https://files.pythonhosted.org/packages/15/1c/0621dd4321dd8c351372ee8f9308136acb628600658a49be1b7504208738/greenlet-3.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e84b51cbebf9ae573b5fbd15df88887815e3253fc000a7d0ff95170e8f7e9729", size = 1547271, upload-time = "2026-01-23T16:04:18.977Z" },
     { url = "https://files.pythonhosted.org/packages/9d/53/24047f8924c83bea7a59c8678d9571209c6bfe5f4c17c94a78c06024e9f2/greenlet-3.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0093bd1a06d899892427217f0ff2a3c8f306182b8c754336d32e2d587c131b4", size = 1613427, upload-time = "2026-01-23T15:33:44.428Z" },
@@ -2223,6 +2224,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/e8/2e1462c8fdbe0f210feb5ac7ad2d9029af8be3bf45bd9fa39765f821642f/greenlet-3.3.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:5fd23b9bc6d37b563211c6abbb1b3cab27db385a4449af5c32e932f93017080c", size = 274974, upload-time = "2026-01-23T15:31:02.891Z" },
     { url = "https://files.pythonhosted.org/packages/7e/a8/530a401419a6b302af59f67aaf0b9ba1015855ea7e56c036b5928793c5bd/greenlet-3.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f51496a0bfbaa9d74d36a52d2580d1ef5ed4fdfcff0a73730abfbbbe1403dd", size = 577175, upload-time = "2026-01-23T16:00:56.213Z" },
     { url = "https://files.pythonhosted.org/packages/8e/89/7e812bb9c05e1aaef9b597ac1d0962b9021d2c6269354966451e885c4e6b/greenlet-3.3.1-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb0feb07fe6e6a74615ee62a880007d976cf739b6669cce95daa7373d4fc69c5", size = 590401, upload-time = "2026-01-23T16:05:26.365Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ae/e2d5f0e59b94a2269b68a629173263fa40b63da32f5c231307c349315871/greenlet-3.3.1-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:67ea3fc73c8cd92f42467a72b75e8f05ed51a0e9b1d15398c913416f2dafd49f", size = 601161, upload-time = "2026-01-23T16:15:53.456Z" },
     { url = "https://files.pythonhosted.org/packages/5c/ae/8d472e1f5ac5efe55c563f3eabb38c98a44b832602e12910750a7c025802/greenlet-3.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:39eda9ba259cc9801da05351eaa8576e9aa83eb9411e8f0c299e05d712a210f2", size = 590272, upload-time = "2026-01-23T15:32:49.411Z" },
     { url = "https://files.pythonhosted.org/packages/a8/51/0fde34bebfcadc833550717eade64e35ec8738e6b097d5d248274a01258b/greenlet-3.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e2e7e882f83149f0a71ac822ebf156d902e7a5d22c9045e3e0d1daf59cee2cc9", size = 1550729, upload-time = "2026-01-23T16:04:20.867Z" },
     { url = "https://files.pythonhosted.org/packages/16/c9/2fb47bee83b25b119d5a35d580807bb8b92480a54b68fef009a02945629f/greenlet-3.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80aa4d79eb5564f2e0a6144fcc744b5a37c56c4a92d60920720e99210d88db0f", size = 1615552, upload-time = "2026-01-23T15:33:45.743Z" },
@@ -2231,6 +2233,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f9/c8/9d76a66421d1ae24340dfae7e79c313957f6e3195c144d2c73333b5bfe34/greenlet-3.3.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7e806ca53acf6d15a888405880766ec84721aa4181261cd11a457dfe9a7a4975", size = 276443, upload-time = "2026-01-23T15:30:10.066Z" },
     { url = "https://files.pythonhosted.org/packages/81/99/401ff34bb3c032d1f10477d199724f5e5f6fbfb59816ad1455c79c1eb8e7/greenlet-3.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d842c94b9155f1c9b3058036c24ffb8ff78b428414a19792b2380be9cecf4f36", size = 597359, upload-time = "2026-01-23T16:00:57.394Z" },
     { url = "https://files.pythonhosted.org/packages/2b/bc/4dcc0871ed557792d304f50be0f7487a14e017952ec689effe2180a6ff35/greenlet-3.3.1-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20fedaadd422fa02695f82093f9a98bad3dab5fcda793c658b945fcde2ab27ba", size = 607805, upload-time = "2026-01-23T16:05:28.068Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/cd/7a7ca57588dac3389e97f7c9521cb6641fd8b6602faf1eaa4188384757df/greenlet-3.3.1-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c620051669fd04ac6b60ebc70478210119c56e2d5d5df848baec4312e260e4ca", size = 622363, upload-time = "2026-01-23T16:15:54.754Z" },
     { url = "https://files.pythonhosted.org/packages/cf/05/821587cf19e2ce1f2b24945d890b164401e5085f9d09cbd969b0c193cd20/greenlet-3.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14194f5f4305800ff329cbf02c5fcc88f01886cadd29941b807668a45f0d2336", size = 609947, upload-time = "2026-01-23T15:32:51.004Z" },
     { url = "https://files.pythonhosted.org/packages/a4/52/ee8c46ed9f8babaa93a19e577f26e3d28a519feac6350ed6f25f1afee7e9/greenlet-3.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7b2fe4150a0cf59f847a67db8c155ac36aed89080a6a639e9f16df5d6c6096f1", size = 1567487, upload-time = "2026-01-23T16:04:22.125Z" },
     { url = "https://files.pythonhosted.org/packages/8f/7c/456a74f07029597626f3a6db71b273a3632aecb9afafeeca452cfa633197/greenlet-3.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:49f4ad195d45f4a66a0eb9c1ba4832bb380570d361912fa3554746830d332149", size = 1636087, upload-time = "2026-01-23T15:33:47.486Z" },
@@ -2239,6 +2242,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/ab/d26750f2b7242c2b90ea2ad71de70cfcd73a948a49513188a0fc0d6fc15a/greenlet-3.3.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:7ab327905cabb0622adca5971e488064e35115430cec2c35a50fd36e72a315b3", size = 275205, upload-time = "2026-01-23T15:30:24.556Z" },
     { url = "https://files.pythonhosted.org/packages/10/d3/be7d19e8fad7c5a78eeefb2d896a08cd4643e1e90c605c4be3b46264998f/greenlet-3.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65be2f026ca6a176f88fb935ee23c18333ccea97048076aef4db1ef5bc0713ac", size = 599284, upload-time = "2026-01-23T16:00:58.584Z" },
     { url = "https://files.pythonhosted.org/packages/ae/21/fe703aaa056fdb0f17e5afd4b5c80195bbdab701208918938bd15b00d39b/greenlet-3.3.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7a3ae05b3d225b4155bda56b072ceb09d05e974bc74be6c3fc15463cf69f33fd", size = 610274, upload-time = "2026-01-23T16:05:29.312Z" },
+    { url = "https://files.pythonhosted.org/packages/06/00/95df0b6a935103c0452dad2203f5be8377e551b8466a29650c4c5a5af6cc/greenlet-3.3.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:12184c61e5d64268a160226fb4818af4df02cfead8379d7f8b99a56c3a54ff3e", size = 624375, upload-time = "2026-01-23T16:15:55.915Z" },
     { url = "https://files.pythonhosted.org/packages/cb/86/5c6ab23bb3c28c21ed6bebad006515cfe08b04613eb105ca0041fecca852/greenlet-3.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6423481193bbbe871313de5fd06a082f2649e7ce6e08015d2a76c1e9186ca5b3", size = 612904, upload-time = "2026-01-23T15:32:52.317Z" },
     { url = "https://files.pythonhosted.org/packages/c2/f3/7949994264e22639e40718c2daf6f6df5169bf48fb038c008a489ec53a50/greenlet-3.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:33a956fe78bbbda82bfc95e128d61129b32d66bcf0a20a1f0c08aa4839ffa951", size = 1567316, upload-time = "2026-01-23T16:04:23.316Z" },
     { url = "https://files.pythonhosted.org/packages/8d/6e/d73c94d13b6465e9f7cd6231c68abde838bb22408596c05d9059830b7872/greenlet-3.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b065d3284be43728dd280f6f9a13990b56470b81be20375a207cdc814a983f2", size = 1636549, upload-time = "2026-01-23T15:33:48.643Z" },
@@ -2247,6 +2251,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ae/fb/011c7c717213182caf78084a9bea51c8590b0afda98001f69d9f853a495b/greenlet-3.3.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:bd59acd8529b372775cd0fcbc5f420ae20681c5b045ce25bd453ed8455ab99b5", size = 275737, upload-time = "2026-01-23T15:32:16.889Z" },
     { url = "https://files.pythonhosted.org/packages/41/2e/a3a417d620363fdbb08a48b1dd582956a46a61bf8fd27ee8164f9dfe87c2/greenlet-3.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b31c05dd84ef6871dd47120386aed35323c944d86c3d91a17c4b8d23df62f15b", size = 646422, upload-time = "2026-01-23T16:01:00.354Z" },
     { url = "https://files.pythonhosted.org/packages/b4/09/c6c4a0db47defafd2d6bab8ddfe47ad19963b4e30f5bed84d75328059f8c/greenlet-3.3.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:02925a0bfffc41e542c70aa14c7eda3593e4d7e274bfcccca1827e6c0875902e", size = 658219, upload-time = "2026-01-23T16:05:30.956Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/89/b95f2ddcc5f3c2bc09c8ee8d77be312df7f9e7175703ab780f2014a0e781/greenlet-3.3.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3e0f3878ca3a3ff63ab4ea478585942b53df66ddde327b59ecb191b19dbbd62d", size = 671455, upload-time = "2026-01-23T16:15:57.232Z" },
     { url = "https://files.pythonhosted.org/packages/80/38/9d42d60dffb04b45f03dbab9430898352dba277758640751dc5cc316c521/greenlet-3.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34a729e2e4e4ffe9ae2408d5ecaf12f944853f40ad724929b7585bca808a9d6f", size = 660237, upload-time = "2026-01-23T15:32:53.967Z" },
     { url = "https://files.pythonhosted.org/packages/96/61/373c30b7197f9e756e4c81ae90a8d55dc3598c17673f91f4d31c3c689c3f/greenlet-3.3.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aec9ab04e82918e623415947921dea15851b152b822661cce3f8e4393c3df683", size = 1615261, upload-time = "2026-01-23T16:04:25.066Z" },
     { url = "https://files.pythonhosted.org/packages/fd/d3/ca534310343f5945316f9451e953dcd89b36fe7a19de652a1dc5a0eeef3f/greenlet-3.3.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:71c767cf281a80d02b6c1bdc41c9468e1f5a494fb11bc8688c360524e273d7b1", size = 1683719, upload-time = "2026-01-23T15:33:50.61Z" },
@@ -2255,6 +2260,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/28/24/cbbec49bacdcc9ec652a81d3efef7b59f326697e7edf6ed775a5e08e54c2/greenlet-3.3.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:3e63252943c921b90abb035ebe9de832c436401d9c45f262d80e2d06cc659242", size = 282706, upload-time = "2026-01-23T15:33:05.525Z" },
     { url = "https://files.pythonhosted.org/packages/86/2e/4f2b9323c144c4fe8842a4e0d92121465485c3c2c5b9e9b30a52e80f523f/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76e39058e68eb125de10c92524573924e827927df5d3891fbc97bd55764a8774", size = 651209, upload-time = "2026-01-23T16:01:01.517Z" },
     { url = "https://files.pythonhosted.org/packages/d9/87/50ca60e515f5bb55a2fbc5f0c9b5b156de7d2fc51a0a69abc9d23914a237/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c9f9d5e7a9310b7a2f416dd13d2e3fd8b42d803968ea580b7c0f322ccb389b97", size = 654300, upload-time = "2026-01-23T16:05:32.199Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/25/c51a63f3f463171e09cb586eb64db0861eb06667ab01a7968371a24c4f3b/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b9721549a95db96689458a1e0ae32412ca18776ed004463df3a9299c1b257ab", size = 662574, upload-time = "2026-01-23T16:15:58.364Z" },
     { url = "https://files.pythonhosted.org/packages/1d/94/74310866dfa2b73dd08659a3d18762f83985ad3281901ba0ee9a815194fb/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92497c78adf3ac703b57f1e3813c2d874f27f71a178f9ea5887855da413cd6d2", size = 653842, upload-time = "2026-01-23T15:32:55.671Z" },
     { url = "https://files.pythonhosted.org/packages/97/43/8bf0ffa3d498eeee4c58c212a3905dd6146c01c8dc0b0a046481ca29b18c/greenlet-3.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ed6b402bc74d6557a705e197d47f9063733091ed6357b3de33619d8a8d93ac53", size = 1614917, upload-time = "2026-01-23T16:04:26.276Z" },
     { url = "https://files.pythonhosted.org/packages/89/90/a3be7a5f378fc6e84abe4dcfb2ba32b07786861172e502388b4c90000d1b/greenlet-3.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:59913f1e5ada20fde795ba906916aea25d442abcc0593fba7e26c92b7ad76249", size = 1676092, upload-time = "2026-01-23T15:33:52.176Z" },
@@ -4123,6 +4129,7 @@ dev = [
     { name = "pytest-timeout" },
     { name = "python-semantic-release" },
     { name = "ruff" },
+    { name = "sentencepiece" },
 ]
 docs = [
     { name = "sphinx-autodoc-typehints", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -4199,6 +4206,7 @@ dev = [
     { name = "pytest-timeout" },
     { name = "python-semantic-release", specifier = "~=7.32" },
     { name = "ruff", specifier = ">=0.11.6" },
+    { name = "sentencepiece", specifier = "==0.2.1" },
 ]
 docs = [
     { name = "sphinx-autodoc-typehints" },

From 79092d2098a0da367a6134ad8cbdccf8d2abd83f Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Fri, 6 Mar 2026 10:10:00 +0000
Subject: [PATCH 11/11] test: fix flaky ollama tests, remove stale xfails, add
 diagnostic logging

- Remove xfail from test_generate_from_raw_with_format (consistently passing)
- Remove xfail from test_multiple_async_funcs (watsonx litellm bug resolved)
- Add CONTEXT_WINDOW: 2048 and stronger assertions to generate_from_raw tests
- Add pytest.mark.timeout(150) to test_generate_from_raw
- Increase MAX_NEW_TOKENS to 2**10 in format tests
- Log a FancyLogger warning when a generate_from_raw request returns an exception
- Mark researcher example as slow; add markers to query_clarification
- Update slow marker description in pyproject.toml
---
 .../intrinsics/query_clarification.py         |  1 +
 docs/examples/mini_researcher/researcher.py   |  2 +-
 mellea/backends/ollama.py                     |  4 +++
 pyproject.toml                                |  2 +-
 test/backends/test_litellm_watsonx.py         |  3 --
 test/backends/test_ollama.py                  | 30 ++++++++++++-------
 test/backends/test_openai_ollama.py           |  2 +-
 7 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/docs/examples/intrinsics/query_clarification.py b/docs/examples/intrinsics/query_clarification.py
index 9f465b871..6d660e1af 100644
--- a/docs/examples/intrinsics/query_clarification.py
+++ b/docs/examples/intrinsics/query_clarification.py
@@ -1,3 +1,4 @@
+# pytest: huggingface, requires_heavy_ram, llm
 """
 Example usage of the query clarification intrinsic for RAG applications.
 
diff --git a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py
index 87cdeda0c..db4532c00 100644
--- a/docs/examples/mini_researcher/researcher.py
+++ b/docs/examples/mini_researcher/researcher.py
@@ -1,4 +1,4 @@
-# pytest: ollama, qualitative, llm
+# pytest: ollama, qualitative, llm, slow
 
 from collections.abc import Callable
 from functools import cache
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index 04f75751b..144155cd7 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -470,6 +470,10 @@ async def generate_from_raw(
             result = None
             error = None
             if isinstance(response, BaseException):
+                FancyLogger.get_logger().warning(
+                    f"generate_from_raw: request {i} failed with "
+                    f"{type(response).__name__}: {response}"
+                )
                 result = ModelOutputThunk(value="")
                 error = response
             else:
diff --git a/pyproject.toml b/pyproject.toml
index 31018730b..957761043 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -244,7 +244,7 @@ markers = [
     "requires_gpu: Tests requiring GPU",
     "requires_heavy_ram: Tests requiring 48GB+ RAM",
     "qualitative: Non-deterministic quality tests",
-    "slow: Tests taking >5 minutes (e.g., dataset loading)",
+    "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)",
 
     # Composite markers
     "llm: Tests that make LLM calls (needs at least Ollama)",
diff --git a/test/backends/test_litellm_watsonx.py b/test/backends/test_litellm_watsonx.py
index 80f65b096..9edb43ffb 100644
--- a/test/backends/test_litellm_watsonx.py
+++ b/test/backends/test_litellm_watsonx.py
@@ -62,9 +62,6 @@ async def test_generate_from_raw(session) -> None:
 
 
 @pytest.mark.qualitative
-@pytest.mark.xfail(
-    reason="litellm has a bug with watsonx; once that is fixed, this should pass."
-)
 async def test_multiple_async_funcs(session) -> None:
     """If this test passes, remove the _has_potential_event_loop_errors func from litellm."""
     session.chat(
diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py
index fcca7fcdc..dee25bfb8 100644
--- a/test/backends/test_ollama.py
+++ b/test/backends/test_ollama.py
@@ -87,7 +87,7 @@ class Email(pydantic.BaseModel):
     output = session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
-        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
     )
     print("Formatted output:")
     email = Email.model_validate_json(
@@ -102,18 +102,22 @@ class Email(pydantic.BaseModel):
 
 
 @pytest.mark.qualitative
+@pytest.mark.timeout(150)
 async def test_generate_from_raw(session) -> None:
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
 
     results = await session.backend.generate_from_raw(
-        actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx
+        actions=[CBlock(value=prompt) for prompt in prompts],
+        ctx=session.ctx,
+        model_options={ModelOption.CONTEXT_WINDOW: 2048},
     )
 
     assert len(results) == len(prompts)
-    assert results[0].value is not None
+    assert all(r.value for r in results), (
+        f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
+    )
 
 
-@pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs")
 async def test_generate_from_raw_with_format(session) -> None:
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
 
@@ -125,17 +129,21 @@ class Answer(pydantic.BaseModel):
         actions=[CBlock(value=prompt) for prompt in prompts],
         ctx=session.ctx,
         format=Answer,
+        model_options={ModelOption.CONTEXT_WINDOW: 2048},
     )
 
     assert len(results) == len(prompts)
+    assert all(r.value for r in results), (
+        f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
+    )
 
-    random_result = results[0]
-    try:
-        Answer.model_validate_json(random_result.value)
-    except pydantic.ValidationError as e:
-        assert False, (
-            f"formatting directive failed for {random_result.value}: {e.json()}"
-        )
+    for result in results:
+        try:
+            Answer.model_validate_json(result.value)
+        except pydantic.ValidationError as e:
+            assert False, (
+                f"formatting directive failed for {result.value}: {e.json()}"
+            )
 
 
 async def test_async_parallel_requests(session) -> None:
diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py
index c40dcd869..142d07819 100644
--- a/test/backends/test_openai_ollama.py
+++ b/test/backends/test_openai_ollama.py
@@ -104,7 +104,7 @@ class Email(pydantic.BaseModel):
     output = m_session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
-        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
     )
     print("Formatted output:")
     email = Email.model_validate_json(