diff --git a/docs/examples/intrinsics/query_clarification.py b/docs/examples/intrinsics/query_clarification.py index 9f465b871..6d660e1af 100644 --- a/docs/examples/intrinsics/query_clarification.py +++ b/docs/examples/intrinsics/query_clarification.py @@ -1,3 +1,4 @@ +# pytest: huggingface, requires_heavy_ram, llm """ Example usage of the query clarification intrinsic for RAG applications. diff --git a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py index 87cdeda0c..db4532c00 100644 --- a/docs/examples/mini_researcher/researcher.py +++ b/docs/examples/mini_researcher/researcher.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm +# pytest: ollama, qualitative, llm, slow from collections.abc import Callable from functools import cache diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index e848b9004..d257c7e6d 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -470,6 +470,10 @@ async def generate_from_raw( result = None error = None if isinstance(response, BaseException): + FancyLogger.get_logger().warning( + f"generate_from_raw: request {i} failed with " + f"{type(response).__name__}: {response}" + ) result = ModelOutputThunk(value="") error = response else: diff --git a/pyproject.toml b/pyproject.toml index e3d7127c5..cc11f0f48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -251,7 +251,7 @@ markers = [ "requires_gpu: Tests requiring GPU", "requires_heavy_ram: Tests requiring 48GB+ RAM", "qualitative: Non-deterministic quality tests", - "slow: Tests taking >5 minutes (e.g., dataset loading)", + "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)", # Composite markers "llm: Tests that make LLM calls (needs at least Ollama)", diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index fcca7fcdc..c8b77180e 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -87,7 +87,7 @@ class Email(pydantic.BaseModel): output = session.instruct( "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ", format=Email, - model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, + model_options={ModelOption.MAX_NEW_TOKENS: 2**10}, ) print("Formatted output:") email = Email.model_validate_json( @@ -102,15 +102,20 @@ class Email(pydantic.BaseModel): @pytest.mark.qualitative +@pytest.mark.timeout(150) async def test_generate_from_raw(session) -> None: prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] results = await session.backend.generate_from_raw( - actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx + actions=[CBlock(value=prompt) for prompt in prompts], + ctx=session.ctx, + model_options={ModelOption.CONTEXT_WINDOW: 2048}, ) assert len(results) == len(prompts) - assert results[0].value is not None + assert all(r.value for r in results), ( + f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}" + ) @pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs") @@ -125,17 +130,19 @@ class Answer(pydantic.BaseModel): actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx, format=Answer, + model_options={ModelOption.CONTEXT_WINDOW: 2048}, ) assert len(results) == len(prompts) + assert all(r.value for r in results), ( + f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}" + ) - random_result = results[0] - try: - Answer.model_validate_json(random_result.value) - except pydantic.ValidationError as e: - assert False, ( - f"formatting directive failed for {random_result.value}: {e.json()}" - ) + for result in results: + try: + Answer.model_validate_json(result.value) + except pydantic.ValidationError as e: + assert False, f"formatting directive failed for {result.value}: {e.json()}" async def test_async_parallel_requests(session) -> None: diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index c40dcd869..142d07819 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -104,7 +104,7 @@ class Email(pydantic.BaseModel): output = m_session.instruct( "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ", format=Email, - model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, + model_options={ModelOption.MAX_NEW_TOKENS: 2**10}, ) print("Formatted output:") email = Email.model_validate_json(