diff --git a/docs/examples/intrinsics/query_clarification.py b/docs/examples/intrinsics/query_clarification.py
index 9f465b871..6d660e1af 100644
--- a/docs/examples/intrinsics/query_clarification.py
+++ b/docs/examples/intrinsics/query_clarification.py
@@ -1,3 +1,4 @@
+# pytest: huggingface, requires_heavy_ram, llm
 """
 Example usage of the query clarification intrinsic for RAG applications.
 
diff --git a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py
index 87cdeda0c..db4532c00 100644
--- a/docs/examples/mini_researcher/researcher.py
+++ b/docs/examples/mini_researcher/researcher.py
@@ -1,4 +1,4 @@
-# pytest: ollama, qualitative, llm
+# pytest: ollama, qualitative, llm, slow
 
 from collections.abc import Callable
 from functools import cache
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index e848b9004..d257c7e6d 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -470,6 +470,10 @@ async def generate_from_raw(
             result = None
             error = None
             if isinstance(response, BaseException):
+                FancyLogger.get_logger().warning(
+                    f"generate_from_raw: request {i} failed with "
+                    f"{type(response).__name__}: {response}"
+                )
                 result = ModelOutputThunk(value="")
                 error = response
             else:
diff --git a/pyproject.toml b/pyproject.toml
index e3d7127c5..cc11f0f48 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -251,7 +251,7 @@ markers = [
     "requires_gpu: Tests requiring GPU",
     "requires_heavy_ram: Tests requiring 48GB+ RAM",
     "qualitative: Non-deterministic quality tests",
-    "slow: Tests taking >5 minutes (e.g., dataset loading)",
+    "slow: Tests taking >1 minute (e.g., multi-step pipelines such as the mini_researcher example)",
 
     # Composite markers
     "llm: Tests that make LLM calls (needs at least Ollama)",
diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py
index fcca7fcdc..c8b77180e 100644
--- a/test/backends/test_ollama.py
+++ b/test/backends/test_ollama.py
@@ -87,7 +87,7 @@ class Email(pydantic.BaseModel):
     output = session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
-        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
     )
     print("Formatted output:")
     email = Email.model_validate_json(
@@ -102,15 +102,20 @@ class Email(pydantic.BaseModel):
 
 
 @pytest.mark.qualitative
+@pytest.mark.timeout(150)
 async def test_generate_from_raw(session) -> None:
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
 
     results = await session.backend.generate_from_raw(
-        actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx
+        actions=[CBlock(value=prompt) for prompt in prompts],
+        ctx=session.ctx,
+        model_options={ModelOption.CONTEXT_WINDOW: 2048},
     )
 
     assert len(results) == len(prompts)
-    assert results[0].value is not None
+    assert all(r.value for r in results), (
+        f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
+    )
 
 
 @pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs")
@@ -125,17 +130,19 @@ class Answer(pydantic.BaseModel):
         actions=[CBlock(value=prompt) for prompt in prompts],
         ctx=session.ctx,
         format=Answer,
+        model_options={ModelOption.CONTEXT_WINDOW: 2048},
     )
 
     assert len(results) == len(prompts)
+    assert all(r.value for r in results), (
+        f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
+    )
 
-    random_result = results[0]
-    try:
-        Answer.model_validate_json(random_result.value)
-    except pydantic.ValidationError as e:
-        assert False, (
-            f"formatting directive failed for {random_result.value}: {e.json()}"
-        )
+    for result in results:
+        try:
+            Answer.model_validate_json(result.value)
+        except pydantic.ValidationError as e:
+            assert False, f"formatting directive failed for {result.value}: {e.json()}"
 
 
 async def test_async_parallel_requests(session) -> None:
diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py
index c40dcd869..142d07819 100644
--- a/test/backends/test_openai_ollama.py
+++ b/test/backends/test_openai_ollama.py
@@ -104,7 +104,7 @@ class Email(pydantic.BaseModel):
     output = m_session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
-        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
     )
     print("Formatted output:")
     email = Email.model_validate_json(