generative-computing · planetf1 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026 · Feb 16, 2026
@@ -1,5 +1,4 @@
-# pytest: skip, huggingface, requires_heavy_ram, llm
-# SKIP REASON: Example broken since intrinsics refactor - see issue #385
+# pytest: huggingface, requires_heavy_ram, llm
 
 import time
 

@@ -1,3 +1,6 @@
+# pytest: skip, huggingface, requires_heavy_ram, llm
+# SKIP REASON: Requires user input; tests same functionality as 101_example.py.
+
 from stembolts_intrinsic import (
     async_stembolt_failure_analysis,
     stembolt_failure_analysis,

@@ -1,3 +1,4 @@
+# pytest: huggingface, requires_heavy_ram, llm
 """
 Example usage of the query clarification intrinsic for RAG applications.
 

@@ -1,4 +1,4 @@
-# pytest: ollama, qualitative, llm
+# pytest: ollama, qualitative, llm, slow
 
 from collections.abc import Callable
 from functools import cache

@@ -1015,7 +1015,8 @@ async def post_processing(
                 mot._meta["hf_output"] = full_output
 
         # The ModelOutputThunk must be computed by this point.
-        assert mot.value is not None
+        if mot.value is None:
+            return
 
         # Store KV cache in LRU separately (not in mot._meta) to enable proper cleanup on eviction.
         # This prevents GPU memory from being held by ModelOutputThunk references.

@@ -441,8 +441,11 @@ async def post_processing(
         # OpenAI-like streamed responses potentially give you chunks of tool calls.
         # As a result, we have to store data between calls and only then
         # check for complete tool calls in the post_processing step.
-        tool_chunk = extract_model_tool_requests(
-            tools, mot._meta["litellm_chat_response"]
+        litellm_response = mot._meta.get("litellm_chat_response")
+        tool_chunk = (
+            extract_model_tool_requests(tools, litellm_response)
+            if litellm_response is not None
+            else None
         )
         if tool_chunk is not None:
             if mot.tool_calls is None:
@@ -457,7 +460,7 @@ async def post_processing(
         generate_log.backend = f"litellm::{self.model_id!s}"
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
-        generate_log.model_output = mot._meta["litellm_chat_response"]
+        generate_log.model_output = mot._meta.get("litellm_chat_response")
         generate_log.extra = {
             "format": _format,
             "tools_available": tools,

@@ -470,6 +470,10 @@ async def generate_from_raw(
             result = None
             error = None
             if isinstance(response, BaseException):
+                FancyLogger.get_logger().warning(
+                    f"generate_from_raw: request {i} failed with "
+                    f"{type(response).__name__}: {response}"
+                )
                 result = ModelOutputThunk(value="")
                 error = response
             else:
@@ -596,7 +600,7 @@ async def post_processing(
         generate_log.backend = f"ollama::{self._get_ollama_model_id()}"
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
-        generate_log.model_output = mot._meta["chat_response"]
+        generate_log.model_output = mot._meta.get("chat_response")
         generate_log.extra = {
             "format": _format,
             "thinking": mot._model_options.get(ModelOption.THINKING, None),

@@ -575,9 +575,13 @@ async def post_processing(
         # check for complete tool calls in the post_processing step.
         # Use the choice format for tool extraction (backward compatibility)
         choice_response = mot._meta.get(
-            "oai_chat_response_choice", mot._meta["oai_chat_response"]
+            "oai_chat_response_choice", mot._meta.get("oai_chat_response")
+        )
+        tool_chunk = (
+            extract_model_tool_requests(tools, choice_response)
+            if choice_response is not None
+            else None
         )
-        tool_chunk = extract_model_tool_requests(tools, choice_response)
         if tool_chunk is not None:
             if mot.tool_calls is None:
                 mot.tool_calls = {}
@@ -592,7 +596,7 @@ async def post_processing(
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
         # Store the full response (includes usage info)
-        generate_log.model_output = mot._meta["oai_chat_response"]
+        generate_log.model_output = mot._meta.get("oai_chat_response")
         generate_log.extra = {
             "format": _format,
             "thinking": thinking,
@@ -613,12 +617,13 @@ async def post_processing(
                 record_token_usage,
             )
 
-            response = mot._meta["oai_chat_response"]
-            # response is a dict from model_dump(), extract usage if present
-            usage = response.get("usage") if isinstance(response, dict) else None
-            if usage:
-                record_token_usage(span, usage)
-            record_response_metadata(span, response)
+            response = mot._meta.get("oai_chat_response")
+            if response is not None:
+                # response is a dict from model_dump(), extract usage if present
+                usage = response.get("usage") if isinstance(response, dict) else None
+                if usage:
+                    record_token_usage(span, usage)
+                record_response_metadata(span, response)
             # Close the span now that async operation is complete
             end_backend_span(span)
             # Clean up the span reference

@@ -380,7 +380,8 @@ async def post_processing(
     ):
         """Called when generation is done."""
         # The ModelOutputThunk must be computed by this point.
-        assert mot.value is not None
+        if mot.value is None:
+            return
 
         # Only scan for tools if we are not doing structured output and tool calls were provided to the model.
         if _format is None and tool_calls:

@@ -469,7 +469,12 @@ async def post_processing(
         # OpenAI streamed responses give you chunks of tool calls.
         # As a result, we have to store data between calls and only then
         # check for complete tool calls in the post_processing step.
-        tool_chunk = extract_model_tool_requests(tools, mot._meta["oai_chat_response"])
+        oai_response = mot._meta.get("oai_chat_response")
+        tool_chunk = (
+            extract_model_tool_requests(tools, oai_response)
+            if oai_response is not None
+            else None
+        )
         if tool_chunk is not None:
             if mot.tool_calls is None:
                 mot.tool_calls = {}
@@ -509,7 +514,7 @@ async def post_processing(
         generate_log.backend = f"watsonx::{self.model_id!s}"
         generate_log.model_options = mot._model_options
         generate_log.date = datetime.datetime.now()
-        generate_log.model_output = mot._meta["oai_chat_response"]
+        generate_log.model_output = mot._meta.get("oai_chat_response")
         generate_log.extra = {
             "format": _format,
             "tools_available": tools,

@@ -325,10 +325,14 @@ async def astream(self) -> str:
             elif isinstance(chunks[-1], Exception):
                 # Mark as computed so post_process runs in finally block
                 self._computed = True
-                # Store exception to re-raise after cleanup
-                exception_to_raise = chunks[-1]
+                # Remove the exception from chunks so _process doesn't receive it
+                exception_to_raise = chunks.pop()
 
             for chunk in chunks:
+                # Defensive guard: skip non-chunk objects that should already
+                # have been removed above (exceptions, the None sentinel).
+                if chunk is None or isinstance(chunk, Exception):
+                    continue
                 assert self._process is not None
                 await self._process(self, chunk)
 

@@ -119,6 +119,7 @@ dev = [
     "python-semantic-release~=7.32",
     "nbmake>=1.5.5",
     "langchain-core>=1.2.7", # Necessary for mypy and some tool tests
+    "sentencepiece==0.2.1", # Necessary for the test_huggingface_tools test because of the Mistral model
 ]
 
 notebook = [
@@ -243,7 +244,7 @@ markers = [
     "requires_gpu: Tests requiring GPU",
     "requires_heavy_ram: Tests requiring 48GB+ RAM",
     "qualitative: Non-deterministic quality tests",
-    "slow: Tests taking >5 minutes (e.g., dataset loading)",
+    "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)",
 
     # Composite markers
     "llm: Tests that make LLM calls (needs at least Ollama)",
@@ -255,7 +256,6 @@ addopts = [
     # Run qualitative tests by default (use -m "not qualitative" for fast tests)
     "--cov=mellea",
     "--cov=cli",
-    "--cov-report=term",
     "--cov-report=html",
     "--cov-report=json",
     # Set timeout to 15 minutes for full test suite

@@ -62,9 +62,6 @@ async def test_generate_from_raw(session) -> None:
 
 
 @pytest.mark.qualitative
-@pytest.mark.xfail(
-    reason="litellm has a bug with watsonx; once that is fixed, this should pass."
-)
 async def test_multiple_async_funcs(session) -> None:
     """If this test passes, remove the _has_potential_event_loop_errors func from litellm."""
     session.chat(

@@ -87,7 +87,7 @@ class Email(pydantic.BaseModel):
     output = session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
-        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
     )
     print("Formatted output:")
     email = Email.model_validate_json(
@@ -102,18 +102,22 @@ class Email(pydantic.BaseModel):
 
 
 @pytest.mark.qualitative
+@pytest.mark.timeout(150)
 async def test_generate_from_raw(session) -> None:
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
 
     results = await session.backend.generate_from_raw(
-        actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx
+        actions=[CBlock(value=prompt) for prompt in prompts],
+        ctx=session.ctx,
+        model_options={ModelOption.CONTEXT_WINDOW: 2048},
     )
 
     assert len(results) == len(prompts)
-    assert results[0].value is not None
+    assert all(r.value for r in results), (
+        f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
+    )
 
 
-@pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs")
 async def test_generate_from_raw_with_format(session) -> None:
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
 
@@ -125,17 +129,21 @@ class Answer(pydantic.BaseModel):
         actions=[CBlock(value=prompt) for prompt in prompts],
         ctx=session.ctx,
         format=Answer,
+        model_options={ModelOption.CONTEXT_WINDOW: 2048},
     )
 
     assert len(results) == len(prompts)
+    assert all(r.value for r in results), (
+        f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
+    )
 
-    random_result = results[0]
-    try:
-        Answer.model_validate_json(random_result.value)
-    except pydantic.ValidationError as e:
-        assert False, (
-            f"formatting directive failed for {random_result.value}: {e.json()}"
-        )
+    for result in results:
+        try:
+            Answer.model_validate_json(result.value)
+        except pydantic.ValidationError as e:
+            assert False, (
+                f"formatting directive failed for {result.value}: {e.json()}"
+            )
 
 
 async def test_async_parallel_requests(session) -> None:

@@ -104,7 +104,7 @@ class Email(pydantic.BaseModel):
     output = m_session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
-        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
     )
     print("Formatted output:")
     email = Email.model_validate_json(

@@ -79,17 +79,24 @@ def vllm_process():
 
         yield process
 
+    except Exception as e:
+        pytest.skip(
+            f"vLLM process not available: {e}. May need to install with: pip install mellea[vllm]",
+            allow_module_level=True,
+        )
+
     # --- Teardown (always runs) ---
     finally:
-        try:
-            os.killpg(process.pid, signal.SIGTERM)  # kill the session group
-            process.wait(timeout=30)
-        except Exception:
+        if process is not None:
             try:
-                os.killpg(process.pid, signal.SIGKILL)
+                os.killpg(process.pid, signal.SIGTERM)  # kill the session group
+                process.wait(timeout=30)
             except Exception:
-                pass
-            process.wait()
+                try:
+                    os.killpg(process.pid, signal.SIGKILL)
+                except Exception:
+                    pass
+                process.wait()
 
 
 @pytest.fixture(scope="module")