diff --git a/docs/examples/aLora/101_example.py b/docs/examples/aLora/101_example.py index 3abc2554a..243d9225b 100644 --- a/docs/examples/aLora/101_example.py +++ b/docs/examples/aLora/101_example.py @@ -1,5 +1,4 @@ -# pytest: skip, huggingface, requires_heavy_ram, llm -# SKIP REASON: Example broken since intrinsics refactor - see issue #385 +# pytest: huggingface, requires_heavy_ram, llm import time diff --git a/docs/examples/aLora/102_example.py b/docs/examples/aLora/102_example.py index 542407ce9..c2bf86a3a 100644 --- a/docs/examples/aLora/102_example.py +++ b/docs/examples/aLora/102_example.py @@ -1,3 +1,6 @@ +# pytest: skip, huggingface, requires_heavy_ram, llm +# SKIP REASON: Requires user input; tests same functionality as 101_example.py. + from stembolts_intrinsic import ( async_stembolt_failure_analysis, stembolt_failure_analysis, diff --git a/docs/examples/intrinsics/query_clarification.py b/docs/examples/intrinsics/query_clarification.py index 9f465b871..6d660e1af 100644 --- a/docs/examples/intrinsics/query_clarification.py +++ b/docs/examples/intrinsics/query_clarification.py @@ -1,3 +1,4 @@ +# pytest: huggingface, requires_heavy_ram, llm """ Example usage of the query clarification intrinsic for RAG applications. diff --git a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py index 87cdeda0c..db4532c00 100644 --- a/docs/examples/mini_researcher/researcher.py +++ b/docs/examples/mini_researcher/researcher.py @@ -1,4 +1,4 @@ -# pytest: ollama, qualitative, llm +# pytest: ollama, qualitative, llm, slow from collections.abc import Callable from functools import cache diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index 3dad5ed71..dfbf3e4c5 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -1015,7 +1015,8 @@ async def post_processing( mot._meta["hf_output"] = full_output # The ModelOutputThunk must be computed by this point. - assert mot.value is not None + if mot.value is None: + return # Store KV cache in LRU separately (not in mot._meta) to enable proper cleanup on eviction. # This prevents GPU memory from being held by ModelOutputThunk references. diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index dc06db3a3..a624a31d6 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -441,8 +441,11 @@ async def post_processing( # OpenAI-like streamed responses potentially give you chunks of tool calls. # As a result, we have to store data between calls and only then # check for complete tool calls in the post_processing step. - tool_chunk = extract_model_tool_requests( - tools, mot._meta["litellm_chat_response"] + litellm_response = mot._meta.get("litellm_chat_response") + tool_chunk = ( + extract_model_tool_requests(tools, litellm_response) + if litellm_response is not None + else None ) if tool_chunk is not None: if mot.tool_calls is None: @@ -457,7 +460,7 @@ async def post_processing( generate_log.backend = f"litellm::{self.model_id!s}" generate_log.model_options = mot._model_options generate_log.date = datetime.datetime.now() - generate_log.model_output = mot._meta["litellm_chat_response"] + generate_log.model_output = mot._meta.get("litellm_chat_response") generate_log.extra = { "format": _format, "tools_available": tools, diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index e848b9004..144155cd7 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -470,6 +470,10 @@ async def generate_from_raw( result = None error = None if isinstance(response, BaseException): + FancyLogger.get_logger().warning( + f"generate_from_raw: request {i} failed with " + f"{type(response).__name__}: {response}" + ) result = ModelOutputThunk(value="") error = response else: @@ -596,7 +600,7 @@ async def post_processing( generate_log.backend = f"ollama::{self._get_ollama_model_id()}" generate_log.model_options = mot._model_options generate_log.date = datetime.datetime.now() - generate_log.model_output = mot._meta["chat_response"] + generate_log.model_output = mot._meta.get("chat_response") generate_log.extra = { "format": _format, "thinking": mot._model_options.get(ModelOption.THINKING, None), diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 0b520bd29..8d12f2c15 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -575,9 +575,13 @@ async def post_processing( # check for complete tool calls in the post_processing step. # Use the choice format for tool extraction (backward compatibility) choice_response = mot._meta.get( - "oai_chat_response_choice", mot._meta["oai_chat_response"] + "oai_chat_response_choice", mot._meta.get("oai_chat_response") + ) + tool_chunk = ( + extract_model_tool_requests(tools, choice_response) + if choice_response is not None + else None ) - tool_chunk = extract_model_tool_requests(tools, choice_response) if tool_chunk is not None: if mot.tool_calls is None: mot.tool_calls = {} @@ -592,7 +596,7 @@ async def post_processing( generate_log.model_options = mot._model_options generate_log.date = datetime.datetime.now() # Store the full response (includes usage info) - generate_log.model_output = mot._meta["oai_chat_response"] + generate_log.model_output = mot._meta.get("oai_chat_response") generate_log.extra = { "format": _format, "thinking": thinking, @@ -613,12 +617,13 @@ async def post_processing( record_token_usage, ) - response = mot._meta["oai_chat_response"] - # response is a dict from model_dump(), extract usage if present - usage = response.get("usage") if isinstance(response, dict) else None - if usage: - record_token_usage(span, usage) - record_response_metadata(span, response) + response = mot._meta.get("oai_chat_response") + if response is not None: + # response is a dict from model_dump(), extract usage if present + usage = response.get("usage") if isinstance(response, dict) else None + if usage: + record_token_usage(span, usage) + record_response_metadata(span, response) # Close the span now that async operation is complete end_backend_span(span) # Clean up the span reference diff --git a/mellea/backends/vllm.py b/mellea/backends/vllm.py index 85ce66308..5f3d666d7 100644 --- a/mellea/backends/vllm.py +++ b/mellea/backends/vllm.py @@ -380,7 +380,8 @@ async def post_processing( ): """Called when generation is done.""" # The ModelOutputThunk must be computed by this point. - assert mot.value is not None + if mot.value is None: + return # Only scan for tools if we are not doing structured output and tool calls were provided to the model. if _format is None and tool_calls: diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index 51cd84c2c..7d820d7e7 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -469,7 +469,12 @@ async def post_processing( # OpenAI streamed responses give you chunks of tool calls. # As a result, we have to store data between calls and only then # check for complete tool calls in the post_processing step. - tool_chunk = extract_model_tool_requests(tools, mot._meta["oai_chat_response"]) + oai_response = mot._meta.get("oai_chat_response") + tool_chunk = ( + extract_model_tool_requests(tools, oai_response) + if oai_response is not None + else None + ) if tool_chunk is not None: if mot.tool_calls is None: mot.tool_calls = {} @@ -509,7 +514,7 @@ async def post_processing( generate_log.backend = f"watsonx::{self.model_id!s}" generate_log.model_options = mot._model_options generate_log.date = datetime.datetime.now() - generate_log.model_output = mot._meta["oai_chat_response"] + generate_log.model_output = mot._meta.get("oai_chat_response") generate_log.extra = { "format": _format, "tools_available": tools, diff --git a/mellea/core/base.py b/mellea/core/base.py index e7b40c7cd..7da08d460 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -325,10 +325,14 @@ async def astream(self) -> str: elif isinstance(chunks[-1], Exception): # Mark as computed so post_process runs in finally block self._computed = True - # Store exception to re-raise after cleanup - exception_to_raise = chunks[-1] + # Remove the exception from chunks so _process doesn't receive it + exception_to_raise = chunks.pop() for chunk in chunks: + # Belt-and-suspenders: skip non-chunk objects that should + # have been removed above (exceptions, sentinel None). + if chunk is None or isinstance(chunk, Exception): + continue assert self._process is not None await self._process(self, chunk) diff --git a/pyproject.toml b/pyproject.toml index b865ef403..957761043 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,6 +119,7 @@ dev = [ "python-semantic-release~=7.32", "nbmake>=1.5.5", "langchain-core>=1.2.7", # Necessary for mypy and some tool tests + "sentencepiece==0.2.1", # Necessary for test_huggingface_tools test because of Mistral model ] notebook = [ @@ -243,7 +244,7 @@ markers = [ "requires_gpu: Tests requiring GPU", "requires_heavy_ram: Tests requiring 48GB+ RAM", "qualitative: Non-deterministic quality tests", - "slow: Tests taking >5 minutes (e.g., dataset loading)", + "slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)", # Composite markers "llm: Tests that make LLM calls (needs at least Ollama)", @@ -255,7 +256,6 @@ addopts = [ # Run qualitative tests by default (use -m "not qualitative" for fast tests) "--cov=mellea", "--cov=cli", - "--cov-report=term", "--cov-report=html", "--cov-report=json", # Set timeout to 15 minutes for full test suite diff --git a/test/backends/test_litellm_watsonx.py b/test/backends/test_litellm_watsonx.py index 80f65b096..9edb43ffb 100644 --- a/test/backends/test_litellm_watsonx.py +++ b/test/backends/test_litellm_watsonx.py @@ -62,9 +62,6 @@ async def test_generate_from_raw(session) -> None: @pytest.mark.qualitative -@pytest.mark.xfail( - reason="litellm has a bug with watsonx; once that is fixed, this should pass." -) async def test_multiple_async_funcs(session) -> None: """If this test passes, remove the _has_potential_event_loop_errors func from litellm.""" session.chat( diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index fcca7fcdc..dee25bfb8 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -87,7 +87,7 @@ class Email(pydantic.BaseModel): output = session.instruct( "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ", format=Email, - model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, + model_options={ModelOption.MAX_NEW_TOKENS: 2**10}, ) print("Formatted output:") email = Email.model_validate_json( @@ -102,18 +102,22 @@ class Email(pydantic.BaseModel): @pytest.mark.qualitative +@pytest.mark.timeout(150) async def test_generate_from_raw(session) -> None: prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] results = await session.backend.generate_from_raw( - actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx + actions=[CBlock(value=prompt) for prompt in prompts], + ctx=session.ctx, + model_options={ModelOption.CONTEXT_WINDOW: 2048}, ) assert len(results) == len(prompts) - assert results[0].value is not None + assert all(r.value for r in results), ( + f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}" + ) -@pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs") async def test_generate_from_raw_with_format(session) -> None: prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -125,17 +129,21 @@ class Answer(pydantic.BaseModel): actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx, format=Answer, + model_options={ModelOption.CONTEXT_WINDOW: 2048}, ) assert len(results) == len(prompts) + assert all(r.value for r in results), ( + f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}" + ) - random_result = results[0] - try: - Answer.model_validate_json(random_result.value) - except pydantic.ValidationError as e: - assert False, ( - f"formatting directive failed for {random_result.value}: {e.json()}" - ) + for result in results: + try: + Answer.model_validate_json(result.value) + except pydantic.ValidationError as e: + assert False, ( + f"formatting directive failed for {result.value}: {e.json()}" + ) async def test_async_parallel_requests(session) -> None: diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index c40dcd869..142d07819 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -104,7 +104,7 @@ class Email(pydantic.BaseModel): output = m_session.instruct( "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ", format=Email, - model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, + model_options={ModelOption.MAX_NEW_TOKENS: 2**10}, ) print("Formatted output:") email = Email.model_validate_json( diff --git a/test/backends/test_openai_vllm.py b/test/backends/test_openai_vllm.py index 685465799..f4e0e6466 100644 --- a/test/backends/test_openai_vllm.py +++ b/test/backends/test_openai_vllm.py @@ -79,17 +79,24 @@ def vllm_process(): yield process + except Exception as e: + pytest.skip( + f"vLLM process not available: {e}. May need to install with: pip install mellea[vllm]", + allow_module_level=True, + ) + # --- Teardown (always runs) --- finally: - try: - os.killpg(process.pid, signal.SIGTERM) # kill the session group - process.wait(timeout=30) - except Exception: + if process is not None: try: - os.killpg(process.pid, signal.SIGKILL) + os.killpg(process.pid, signal.SIGTERM) # kill the session group + process.wait(timeout=30) except Exception: - pass - process.wait() + try: + os.killpg(process.pid, signal.SIGKILL) + except Exception: + pass + process.wait() @pytest.fixture(scope="module") diff --git a/test/core/test_astream_exception_handling.py b/test/core/test_astream_exception_handling.py new file mode 100644 index 000000000..4a2d1d232 --- /dev/null +++ b/test/core/test_astream_exception_handling.py @@ -0,0 +1,94 @@ +"""Regression tests for astream() exception handling. + +When a backend error occurs during streaming, the Exception object lands in the +async queue. Before the fix, astream() would either pass it to _process (crash) +or post_processing would hit a KeyError on _meta keys that were never set. + +These tests verify that astream() cleanly propagates the original exception +after running _post_process for telemetry cleanup. +""" + +import asyncio + +import pytest + +from mellea.core.base import CBlock, GenerateType, ModelOutputThunk + + +def _make_streaming_mot(): + """Create a ModelOutputThunk wired up for streaming with stub callbacks.""" + mot = ModelOutputThunk(value=None) + mot._generate_type = GenerateType.ASYNC + mot._chunk_size = 1 + + process_calls: list = [] + + async def _process(mot, chunk): + process_calls.append(chunk) + text = chunk if isinstance(chunk, str) else str(chunk) + if mot._underlying_value is None: + mot._underlying_value = text + else: + mot._underlying_value += text + + post_process_called = asyncio.Event() + + async def _post_process(mot): + post_process_called.set() + + mot._process = _process + mot._post_process = _post_process + + return mot, process_calls, post_process_called + + +async def test_astream_propagates_exception_from_queue(): + """Exception in the queue is re-raised after cleanup, not passed to _process.""" + mot, process_calls, post_process_called = _make_streaming_mot() + + original_error = RuntimeError("backend connection lost") + await mot._async_queue.put(original_error) + + with pytest.raises(RuntimeError, match="backend connection lost"): + await mot.astream() + + # _process must never have seen the Exception object + assert original_error not in process_calls + # _post_process ran for telemetry cleanup + assert post_process_called.is_set() + + +async def test_astream_propagates_exception_after_valid_chunks(): + """Valid chunks before the exception are processed; exception still raised.""" + mot, process_calls, post_process_called = _make_streaming_mot() + + await mot._async_queue.put("hello ") + await mot._async_queue.put("world") + await mot._async_queue.put(ValueError("mid-stream failure")) + + with pytest.raises(ValueError, match="mid-stream failure"): + await mot.astream() + + # Valid chunks were processed + assert process_calls == ["hello ", "world"] + # Accumulated value reflects only the valid chunks + assert mot._underlying_value == "hello world" + # Cleanup still ran + assert post_process_called.is_set() + + +async def test_astream_skips_none_and_exception_in_chunk_loop(): + """Belt-and-suspenders: stray None/Exception objects in the middle of the + chunk list are skipped rather than passed to _process.""" + mot, process_calls, _ = _make_streaming_mot() + + await mot._async_queue.put("good chunk") + await mot._async_queue.put(None) + + mot._action = CBlock("test") + + result = await mot.astream() + + assert process_calls == ["good chunk"] + assert mot.is_computed() + assert result is not None diff --git a/test/core/test_astream_incremental.py b/test/core/test_astream_incremental.py index 78b0c2a8a..58425200f 100644 --- a/test/core/test_astream_incremental.py +++ b/test/core/test_astream_incremental.py @@ -14,6 +14,7 @@ @pytest.mark.ollama @pytest.mark.llm +@pytest.mark.qualitative async def test_astream_returns_incremental_chunks(): """Test that astream() returns only new content, not accumulated content. @@ -68,6 +69,7 @@ async def test_astream_returns_incremental_chunks(): @pytest.mark.ollama @pytest.mark.llm +@pytest.mark.qualitative async def test_astream_multiple_calls_accumulate_correctly(): """Test that multiple astream() calls accumulate to the final value. @@ -120,6 +122,7 @@ async def test_astream_multiple_calls_accumulate_correctly(): @pytest.mark.ollama @pytest.mark.llm +@pytest.mark.qualitative async def test_astream_beginning_length_tracking(): """Test that beginning_length is correctly tracked across astream calls. @@ -150,6 +153,7 @@ async def test_astream_beginning_length_tracking(): @pytest.mark.ollama @pytest.mark.llm +@pytest.mark.qualitative async def test_astream_empty_beginning(): """Test astream when _underlying_value starts as None.""" session = start_session() @@ -174,6 +178,7 @@ async def test_astream_empty_beginning(): @pytest.mark.ollama @pytest.mark.llm +@pytest.mark.qualitative async def test_astream_computed_returns_full_value(): """Test that astream returns full value when already computed.""" # Create a pre-computed thunk @@ -188,6 +193,7 @@ async def test_astream_computed_returns_full_value(): @pytest.mark.ollama @pytest.mark.llm +@pytest.mark.qualitative async def test_astream_final_call_returns_full_value(): """Test that the final astream call returns the full value when computed. diff --git a/test/core/test_astream_mock.py b/test/core/test_astream_mock.py new file mode 100644 index 000000000..e3c58c697 --- /dev/null +++ b/test/core/test_astream_mock.py @@ -0,0 +1,162 @@ +"""Deterministic Mock Tests for ModelOutputThunk.astream() incremental return behavior. + +Tests that astream() returns only new content added since the beginning of +each astream() call, not the entire accumulated value. Uses manual queue +injection to bypass LLM calls and network operations, guaranteeing determinism. +""" + +import asyncio +from typing import Any + +import pytest + +from mellea.core.base import CBlock, GenerateType, ModelOutputThunk + + +async def mock_process(mot: ModelOutputThunk, chunk: Any) -> None: + """Mock process function that simply appends the chunk to the underlying value.""" + if mot._underlying_value is None: + mot._underlying_value = "" + if chunk is not None: + mot._underlying_value += chunk + + +async def mock_post_process(mot: ModelOutputThunk) -> None: + """Mock post-process function (does nothing).""" + + +def create_manual_mock_thunk() -> ModelOutputThunk: + """Helper to create a mock ModelOutputThunk where we manually populate the queue.""" + mot = ModelOutputThunk(value=None) + mot._action = CBlock("mock_action") + mot._generate_type = GenerateType.ASYNC + mot._process = mock_process + mot._post_process = mock_post_process + mot._chunk_size = 0 # Read exactly what is available + return mot + + +@pytest.mark.asyncio +async def test_astream_returns_incremental_chunks(): + """Test that astream() returns only new content, not accumulated content.""" + mot = create_manual_mock_thunk() + + # Drop the first chunk and pull it + mot._async_queue.put_nowait("chunk1 ") + chunk1 = await mot.astream() + assert chunk1 == "chunk1 " + + # Drop the second chunk and pull it + mot._async_queue.put_nowait("chunk2 ") + chunk2 = await mot.astream() + assert chunk2 == "chunk2 " + + # Drop the third chunk and pull it + mot._async_queue.put_nowait("chunk3 ") + chunk3 = await mot.astream() + assert chunk3 == "chunk3 " + + # Send completion sentinel + mot._async_queue.put_nowait(None) + + # Wait until fully consumed + while not mot.is_computed(): + await mot.astream() + + final_val = await mot.avalue() + assert final_val == "chunk1 chunk2 chunk3 " + + +@pytest.mark.asyncio +async def test_astream_multiple_calls_accumulate_correctly(): + """Test that multiple astream() calls accumulate to the final value.""" + # Simulating a scenario where queue chunks outpace the reading loop + mot = create_manual_mock_thunk() + + # Drop multiple items at once to simulate fast network + mot._async_queue.put_nowait("c") + mot._async_queue.put_nowait("h") + mot._async_queue.put_nowait("u") + + # Calling astream should drain all currently queued items ("chu") + chunk1 = await mot.astream() + assert chunk1 == "chu" + + mot._async_queue.put_nowait("n") + mot._async_queue.put_nowait("k") + mot._async_queue.put_nowait(None) + + chunk2 = await mot.astream() + assert chunk2 == "chunk" + + final_val = await mot.avalue() + + assert mot.is_computed() + assert final_val == "chunk" + + +@pytest.mark.asyncio +async def test_astream_beginning_length_tracking(): + """Test that beginning_length is correctly tracked across astream calls.""" + mot = create_manual_mock_thunk() + + mot._async_queue.put_nowait("AAA") + chunk1 = await mot.astream() + assert chunk1 == "AAA" + + mot._async_queue.put_nowait("BBB") + chunk2 = await mot.astream() + # verify incremental length tracking works + assert not chunk2.startswith(chunk1) + assert chunk2 == "BBB" + + +@pytest.mark.asyncio +async def test_astream_empty_beginning(): + """Test astream when _underlying_value starts as None.""" + mot = create_manual_mock_thunk() + + mot._async_queue.put_nowait("First") + # At the start, _underlying_value is None, beginning_length is 0 + chunk = await mot.astream() + + # Because beginning length was 0, astream returns the full chunk + assert chunk == "First" + assert mot._underlying_value == "First" + + +@pytest.mark.asyncio +async def test_astream_computed_returns_full_value(): + """Test that astream returns full value when already computed.""" + # Precomputed thunk skips queue checking completely + mot = ModelOutputThunk(value="Hello, world!") + + # For a precomputed thunk, astream directly returns value + result = await mot.astream() + assert result == "Hello, world!" + + +@pytest.mark.asyncio +async def test_astream_final_call_returns_full_value(): + """Test that the final astream call returns the full value when computed.""" + mot = create_manual_mock_thunk() + + mot._async_queue.put_nowait("part1") + chunk1 = await mot.astream() + assert chunk1 == "part1" + + mot._async_queue.put_nowait("part2") + chunk2 = await mot.astream() + assert chunk2 == "part2" + + mot._async_queue.put_nowait("part3") + mot._async_queue.put_nowait(None) + + # Calling astream here processes "part3" and `None`, flagging it as done + chunk3 = await mot.astream() + + final_val = await mot.avalue() + + # The final chunk logically completes the thunk, returning the full value instead of a slice. + assert chunk3 == "part1part2part3" + assert chunk3 == final_val diff --git a/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json index f224ed20a..69a3e75bc 100644 --- a/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json +++ b/test/stdlib/components/intrinsic/testdata/input_json/hallucination_detection.json @@ -13,6 +13,7 @@ "content": "Purple bumble fish are yellow. Green bumble fish are also yellow." } ], + "temperature": 0.0, "extra_body": { "documents": [ { diff --git a/test/stdlib/components/intrinsic/testdata/output_json/citations.json b/test/stdlib/components/intrinsic/testdata/output_json/citations.json index 804f64f43..67dc2bb51 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/citations.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/citations.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 96, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 2468, \"citation_end\": 3533, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 96, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 4792, \"citation_end\": 6183, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]", "role": "assistant" } } diff --git a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json index 06e80be5f..eb518a4da 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json @@ -3,9 +3,9 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 36, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.7280598165124975, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 36, \"response_end\": 70, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.08656033472953338, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.7280580899614958, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.09613224257737445, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", "role": "assistant" } } ] -} \ No newline at end of file +} diff --git a/uv.lock b/uv.lock index 8f5fa0871..9cb0759b9 100644 --- a/uv.lock +++ b/uv.lock @@ -2216,6 +2216,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/65/5b235b40581ad75ab97dcd8b4218022ae8e3ab77c13c919f1a1dfe9171fd/greenlet-3.3.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:04bee4775f40ecefcdaa9d115ab44736cd4b9c5fba733575bfe9379419582e13", size = 273723, upload-time = "2026-01-23T15:30:37.521Z" }, { url = "https://files.pythonhosted.org/packages/ce/ad/eb4729b85cba2d29499e0a04ca6fbdd8f540afd7be142fd571eea43d712f/greenlet-3.3.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50e1457f4fed12a50e427988a07f0f9df53cf0ee8da23fab16e6732c2ec909d4", size = 574874, upload-time = "2026-01-23T16:00:54.551Z" }, { url = "https://files.pythonhosted.org/packages/87/32/57cad7fe4c8b82fdaa098c89498ef85ad92dfbb09d5eb713adedfc2ae1f5/greenlet-3.3.1-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:070472cd156f0656f86f92e954591644e158fd65aa415ffbe2d44ca77656a8f5", size = 586309, upload-time = "2026-01-23T16:05:25.18Z" }, + { url = "https://files.pythonhosted.org/packages/66/66/f041005cb87055e62b0d68680e88ec1a57f4688523d5e2fb305841bc8307/greenlet-3.3.1-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1108b61b06b5224656121c3c8ee8876161c491cbe74e5c519e0634c837cf93d5", size = 597461, upload-time = "2026-01-23T16:15:51.943Z" }, { url = "https://files.pythonhosted.org/packages/87/eb/8a1ec2da4d55824f160594a75a9d8354a5fe0a300fb1c48e7944265217e1/greenlet-3.3.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a300354f27dd86bae5fbf7002e6dd2b3255cd372e9242c933faf5e859b703fe", size = 586985, upload-time = "2026-01-23T15:32:47.968Z" }, { url = "https://files.pythonhosted.org/packages/15/1c/0621dd4321dd8c351372ee8f9308136acb628600658a49be1b7504208738/greenlet-3.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e84b51cbebf9ae573b5fbd15df88887815e3253fc000a7d0ff95170e8f7e9729", size = 1547271, upload-time = "2026-01-23T16:04:18.977Z" }, { url = "https://files.pythonhosted.org/packages/9d/53/24047f8924c83bea7a59c8678d9571209c6bfe5f4c17c94a78c06024e9f2/greenlet-3.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0093bd1a06d899892427217f0ff2a3c8f306182b8c754336d32e2d587c131b4", size = 1613427, upload-time = "2026-01-23T15:33:44.428Z" }, @@ -2223,6 +2224,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/e8/2e1462c8fdbe0f210feb5ac7ad2d9029af8be3bf45bd9fa39765f821642f/greenlet-3.3.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:5fd23b9bc6d37b563211c6abbb1b3cab27db385a4449af5c32e932f93017080c", size = 274974, upload-time = "2026-01-23T15:31:02.891Z" }, { url = "https://files.pythonhosted.org/packages/7e/a8/530a401419a6b302af59f67aaf0b9ba1015855ea7e56c036b5928793c5bd/greenlet-3.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f51496a0bfbaa9d74d36a52d2580d1ef5ed4fdfcff0a73730abfbbbe1403dd", size = 577175, upload-time = "2026-01-23T16:00:56.213Z" }, { url = "https://files.pythonhosted.org/packages/8e/89/7e812bb9c05e1aaef9b597ac1d0962b9021d2c6269354966451e885c4e6b/greenlet-3.3.1-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb0feb07fe6e6a74615ee62a880007d976cf739b6669cce95daa7373d4fc69c5", size = 590401, upload-time = "2026-01-23T16:05:26.365Z" }, + { url = "https://files.pythonhosted.org/packages/70/ae/e2d5f0e59b94a2269b68a629173263fa40b63da32f5c231307c349315871/greenlet-3.3.1-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:67ea3fc73c8cd92f42467a72b75e8f05ed51a0e9b1d15398c913416f2dafd49f", size = 601161, upload-time = "2026-01-23T16:15:53.456Z" }, { url = "https://files.pythonhosted.org/packages/5c/ae/8d472e1f5ac5efe55c563f3eabb38c98a44b832602e12910750a7c025802/greenlet-3.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:39eda9ba259cc9801da05351eaa8576e9aa83eb9411e8f0c299e05d712a210f2", size = 590272, upload-time = "2026-01-23T15:32:49.411Z" }, { url = "https://files.pythonhosted.org/packages/a8/51/0fde34bebfcadc833550717eade64e35ec8738e6b097d5d248274a01258b/greenlet-3.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e2e7e882f83149f0a71ac822ebf156d902e7a5d22c9045e3e0d1daf59cee2cc9", size = 1550729, upload-time = "2026-01-23T16:04:20.867Z" }, { url = "https://files.pythonhosted.org/packages/16/c9/2fb47bee83b25b119d5a35d580807bb8b92480a54b68fef009a02945629f/greenlet-3.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80aa4d79eb5564f2e0a6144fcc744b5a37c56c4a92d60920720e99210d88db0f", size = 1615552, upload-time = "2026-01-23T15:33:45.743Z" }, @@ -2231,6 +2233,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/c8/9d76a66421d1ae24340dfae7e79c313957f6e3195c144d2c73333b5bfe34/greenlet-3.3.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:7e806ca53acf6d15a888405880766ec84721aa4181261cd11a457dfe9a7a4975", size = 276443, upload-time = "2026-01-23T15:30:10.066Z" }, { url = "https://files.pythonhosted.org/packages/81/99/401ff34bb3c032d1f10477d199724f5e5f6fbfb59816ad1455c79c1eb8e7/greenlet-3.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d842c94b9155f1c9b3058036c24ffb8ff78b428414a19792b2380be9cecf4f36", size = 597359, upload-time = "2026-01-23T16:00:57.394Z" }, { url = "https://files.pythonhosted.org/packages/2b/bc/4dcc0871ed557792d304f50be0f7487a14e017952ec689effe2180a6ff35/greenlet-3.3.1-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20fedaadd422fa02695f82093f9a98bad3dab5fcda793c658b945fcde2ab27ba", size = 607805, upload-time = "2026-01-23T16:05:28.068Z" }, + { url = "https://files.pythonhosted.org/packages/3b/cd/7a7ca57588dac3389e97f7c9521cb6641fd8b6602faf1eaa4188384757df/greenlet-3.3.1-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c620051669fd04ac6b60ebc70478210119c56e2d5d5df848baec4312e260e4ca", size = 622363, upload-time = "2026-01-23T16:15:54.754Z" }, { url = "https://files.pythonhosted.org/packages/cf/05/821587cf19e2ce1f2b24945d890b164401e5085f9d09cbd969b0c193cd20/greenlet-3.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14194f5f4305800ff329cbf02c5fcc88f01886cadd29941b807668a45f0d2336", size = 609947, upload-time = "2026-01-23T15:32:51.004Z" }, { url = "https://files.pythonhosted.org/packages/a4/52/ee8c46ed9f8babaa93a19e577f26e3d28a519feac6350ed6f25f1afee7e9/greenlet-3.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7b2fe4150a0cf59f847a67db8c155ac36aed89080a6a639e9f16df5d6c6096f1", size = 1567487, upload-time = "2026-01-23T16:04:22.125Z" }, { url = "https://files.pythonhosted.org/packages/8f/7c/456a74f07029597626f3a6db71b273a3632aecb9afafeeca452cfa633197/greenlet-3.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:49f4ad195d45f4a66a0eb9c1ba4832bb380570d361912fa3554746830d332149", size = 1636087, upload-time = "2026-01-23T15:33:47.486Z" }, @@ -2239,6 +2242,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/ab/d26750f2b7242c2b90ea2ad71de70cfcd73a948a49513188a0fc0d6fc15a/greenlet-3.3.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:7ab327905cabb0622adca5971e488064e35115430cec2c35a50fd36e72a315b3", size = 275205, upload-time = "2026-01-23T15:30:24.556Z" }, { url = "https://files.pythonhosted.org/packages/10/d3/be7d19e8fad7c5a78eeefb2d896a08cd4643e1e90c605c4be3b46264998f/greenlet-3.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65be2f026ca6a176f88fb935ee23c18333ccea97048076aef4db1ef5bc0713ac", size = 599284, upload-time = "2026-01-23T16:00:58.584Z" }, { url = "https://files.pythonhosted.org/packages/ae/21/fe703aaa056fdb0f17e5afd4b5c80195bbdab701208918938bd15b00d39b/greenlet-3.3.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7a3ae05b3d225b4155bda56b072ceb09d05e974bc74be6c3fc15463cf69f33fd", size = 610274, upload-time = "2026-01-23T16:05:29.312Z" }, + { url = "https://files.pythonhosted.org/packages/06/00/95df0b6a935103c0452dad2203f5be8377e551b8466a29650c4c5a5af6cc/greenlet-3.3.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:12184c61e5d64268a160226fb4818af4df02cfead8379d7f8b99a56c3a54ff3e", size = 624375, upload-time = "2026-01-23T16:15:55.915Z" }, { url = "https://files.pythonhosted.org/packages/cb/86/5c6ab23bb3c28c21ed6bebad006515cfe08b04613eb105ca0041fecca852/greenlet-3.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6423481193bbbe871313de5fd06a082f2649e7ce6e08015d2a76c1e9186ca5b3", size = 612904, upload-time = "2026-01-23T15:32:52.317Z" }, { url = "https://files.pythonhosted.org/packages/c2/f3/7949994264e22639e40718c2daf6f6df5169bf48fb038c008a489ec53a50/greenlet-3.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:33a956fe78bbbda82bfc95e128d61129b32d66bcf0a20a1f0c08aa4839ffa951", size = 1567316, upload-time = "2026-01-23T16:04:23.316Z" }, { url = "https://files.pythonhosted.org/packages/8d/6e/d73c94d13b6465e9f7cd6231c68abde838bb22408596c05d9059830b7872/greenlet-3.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b065d3284be43728dd280f6f9a13990b56470b81be20375a207cdc814a983f2", size = 1636549, upload-time = "2026-01-23T15:33:48.643Z" }, @@ -2247,6 +2251,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/fb/011c7c717213182caf78084a9bea51c8590b0afda98001f69d9f853a495b/greenlet-3.3.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:bd59acd8529b372775cd0fcbc5f420ae20681c5b045ce25bd453ed8455ab99b5", size = 275737, upload-time = "2026-01-23T15:32:16.889Z" }, { url = "https://files.pythonhosted.org/packages/41/2e/a3a417d620363fdbb08a48b1dd582956a46a61bf8fd27ee8164f9dfe87c2/greenlet-3.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b31c05dd84ef6871dd47120386aed35323c944d86c3d91a17c4b8d23df62f15b", size = 646422, upload-time = "2026-01-23T16:01:00.354Z" }, { url = "https://files.pythonhosted.org/packages/b4/09/c6c4a0db47defafd2d6bab8ddfe47ad19963b4e30f5bed84d75328059f8c/greenlet-3.3.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:02925a0bfffc41e542c70aa14c7eda3593e4d7e274bfcccca1827e6c0875902e", size = 658219, upload-time = "2026-01-23T16:05:30.956Z" }, + { url = "https://files.pythonhosted.org/packages/e2/89/b95f2ddcc5f3c2bc09c8ee8d77be312df7f9e7175703ab780f2014a0e781/greenlet-3.3.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3e0f3878ca3a3ff63ab4ea478585942b53df66ddde327b59ecb191b19dbbd62d", size = 671455, upload-time = "2026-01-23T16:15:57.232Z" }, { url = "https://files.pythonhosted.org/packages/80/38/9d42d60dffb04b45f03dbab9430898352dba277758640751dc5cc316c521/greenlet-3.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34a729e2e4e4ffe9ae2408d5ecaf12f944853f40ad724929b7585bca808a9d6f", size = 660237, upload-time = "2026-01-23T15:32:53.967Z" }, { url = "https://files.pythonhosted.org/packages/96/61/373c30b7197f9e756e4c81ae90a8d55dc3598c17673f91f4d31c3c689c3f/greenlet-3.3.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aec9ab04e82918e623415947921dea15851b152b822661cce3f8e4393c3df683", size = 1615261, upload-time = "2026-01-23T16:04:25.066Z" }, { url = "https://files.pythonhosted.org/packages/fd/d3/ca534310343f5945316f9451e953dcd89b36fe7a19de652a1dc5a0eeef3f/greenlet-3.3.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:71c767cf281a80d02b6c1bdc41c9468e1f5a494fb11bc8688c360524e273d7b1", size = 1683719, upload-time = "2026-01-23T15:33:50.61Z" }, @@ -2255,6 +2260,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/28/24/cbbec49bacdcc9ec652a81d3efef7b59f326697e7edf6ed775a5e08e54c2/greenlet-3.3.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:3e63252943c921b90abb035ebe9de832c436401d9c45f262d80e2d06cc659242", size = 282706, upload-time = "2026-01-23T15:33:05.525Z" }, { url = "https://files.pythonhosted.org/packages/86/2e/4f2b9323c144c4fe8842a4e0d92121465485c3c2c5b9e9b30a52e80f523f/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76e39058e68eb125de10c92524573924e827927df5d3891fbc97bd55764a8774", size = 651209, upload-time = "2026-01-23T16:01:01.517Z" }, { url = "https://files.pythonhosted.org/packages/d9/87/50ca60e515f5bb55a2fbc5f0c9b5b156de7d2fc51a0a69abc9d23914a237/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c9f9d5e7a9310b7a2f416dd13d2e3fd8b42d803968ea580b7c0f322ccb389b97", size = 654300, upload-time = "2026-01-23T16:05:32.199Z" }, + { url = "https://files.pythonhosted.org/packages/7c/25/c51a63f3f463171e09cb586eb64db0861eb06667ab01a7968371a24c4f3b/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b9721549a95db96689458a1e0ae32412ca18776ed004463df3a9299c1b257ab", size = 662574, upload-time = "2026-01-23T16:15:58.364Z" }, { url = "https://files.pythonhosted.org/packages/1d/94/74310866dfa2b73dd08659a3d18762f83985ad3281901ba0ee9a815194fb/greenlet-3.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92497c78adf3ac703b57f1e3813c2d874f27f71a178f9ea5887855da413cd6d2", size = 653842, upload-time = "2026-01-23T15:32:55.671Z" }, { url = "https://files.pythonhosted.org/packages/97/43/8bf0ffa3d498eeee4c58c212a3905dd6146c01c8dc0b0a046481ca29b18c/greenlet-3.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ed6b402bc74d6557a705e197d47f9063733091ed6357b3de33619d8a8d93ac53", size = 1614917, upload-time = "2026-01-23T16:04:26.276Z" }, { url = "https://files.pythonhosted.org/packages/89/90/a3be7a5f378fc6e84abe4dcfb2ba32b07786861172e502388b4c90000d1b/greenlet-3.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:59913f1e5ada20fde795ba906916aea25d442abcc0593fba7e26c92b7ad76249", size = 1676092, upload-time = "2026-01-23T15:33:52.176Z" }, @@ -4123,6 +4129,7 @@ dev = [ { name = "pytest-timeout" }, { name = "python-semantic-release" }, { name = "ruff" }, + { name = "sentencepiece" }, ] docs = [ { name = "sphinx-autodoc-typehints", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -4199,6 +4206,7 @@ dev = [ { name = "pytest-timeout" }, { name = "python-semantic-release", specifier = "~=7.32" }, { name = "ruff", specifier = ">=0.11.6" }, + { name = "sentencepiece", specifier = "==0.2.1" }, ] docs = [ { name = "sphinx-autodoc-typehints" },