From 4b256f3c45fc35b4b09d13c2aadfedc487f50083 Mon Sep 17 00:00:00 2001 From: Zoltan Szabo Date: Wed, 23 Jul 2025 09:07:07 +0200 Subject: [PATCH] MGMT-21204: add categories to feedback API --- docs/openapi.json | 55 +++++++++- docs/openapi.md | 33 +++++- src/models/requests.py | 86 +++++++++++++-- tests/unit/app/endpoints/test_feedback.py | 75 ++++++++++---- tests/unit/models/test_requests.py | 121 +++++++++++++++++++++- 5 files changed, 334 insertions(+), 36 deletions(-) diff --git a/docs/openapi.json b/docs/openapi.json index c2b448444..6760ff063 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -935,6 +935,27 @@ "title": "DataCollectorConfiguration", "description": "Data collector configuration for sending data to ingress server." }, + "FeedbackCategory": { + "type": "string", + "enum": [ + "incorrect", + "not_relevant", + "incomplete", + "outdated_information", + "unsafe", + "other" + ], + "x-enum-varnames": [ + "INCORRECT", + "NOT_RELEVANT", + "INCOMPLETE", + "OUTDATED_INFORMATION", + "UNSAFE", + "OTHER" + ], + "title": "FeedbackCategory", + "description": "Enum representing predefined feedback categories for AI responses.\n\nThese categories help provide structured feedback about AI inference quality when users provide negative feedback (thumbs down). Multiple categories can be selected to provide comprehensive feedback about response issues." + }, "FeedbackRequest": { "properties": { "conversation_id": { @@ -975,6 +996,27 @@ "examples": [ "I'm not satisfied with the response because it is too vague." ] + }, + "categories": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/FeedbackCategory" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Categories", + "description": "List of feedback categories that apply to the LLM response.", + "examples": [ + [ + "incorrect", + "incomplete" + ] + ] } }, "type": "object", @@ -984,7 +1026,7 @@ "llm_response" ], "title": "FeedbackRequest", - "description": "Model representing a feedback request.\n\nAttributes:\n conversation_id: The required conversation ID (UUID).\n user_question: The required user question.\n llm_response: The required LLM response.\n sentiment: The optional sentiment.\n user_feedback: The optional user feedback.\n\nExample:\n ```python\n feedback_request = FeedbackRequest(\n conversation_id=\"12345678-abcd-0000-0123-456789abcdef\",\n user_question=\"what are you doing?\",\n user_feedback=\"Great service!\",\n llm_response=\"I don't know\",\n sentiment=-1,\n )\n ```", + "description": "Model representing a feedback request.\n\nAttributes:\n conversation_id: The required conversation ID (UUID).\n user_question: The required user question.\n llm_response: The required LLM response.\n sentiment: The optional sentiment.\n user_feedback: The optional user feedback.\n categories: The optional list of feedback categories (multi-select).\n\nExample:\n ```python\n feedback_request = FeedbackRequest(\n conversation_id=\"12345678-abcd-0000-0123-456789abcdef\",\n user_question=\"what are you doing?\",\n user_feedback=\"Great service!\",\n llm_response=\"I don't know\",\n sentiment=-1,\n categories=[FeedbackCategory.INCOMPLETE, FeedbackCategory.UNSAFE]\n )\n ```", "examples": [ { "conversation_id": "12345678-abcd-0000-0123-456789abcdef", @@ -992,6 +1034,17 @@ "sentiment": 1, "user_feedback": "Great service!", "user_question": "foo" + }, + { + "categories": [ + "incomplete", + "not_relevant" + ], + "conversation_id": "12345678-abcd-0000-0123-456789abcdef", + "llm_response": "You need to use Docker and Kubernetes for everything.", + "sentiment": -1, + "user_feedback": "This response is too general and doesn't provide specific steps.", + "user_question": "How do I deploy a web app?" } ] }, diff --git a/docs/openapi.md b/docs/openapi.md index e361ebb87..e78035076 100644 --- a/docs/openapi.md +++ b/docs/openapi.md @@ -457,6 +457,24 @@ Data collector configuration for sending data to ingress server. | connection_timeout | integer | | +## FeedbackCategory + + +Enum representing predefined feedback categories for AI responses. + +These categories help provide structured feedback about AI inference quality. +Multiple categories can be selected to provide comprehensive feedback. + +| Value | Description | +|-------|-------------| +| incorrect | The answer provided is completely wrong | +| not_relevant | This answer doesn't address my question at all | +| incomplete | The answer only covers part of what I asked about | +| outdated_information | This information is from several years ago and no longer accurate | +| unsafe | This response could be harmful or dangerous if followed | +| other | The response has issues not covered by other categories | + + ## FeedbackRequest @@ -468,15 +486,27 @@ Attributes: llm_response: The required LLM response. sentiment: The optional sentiment. user_feedback: The optional user feedback. + categories: The optional list of feedback categories (multi-select). -Example: +Examples: ```python + # Basic feedback feedback_request = FeedbackRequest( conversation_id="12345678-abcd-0000-0123-456789abcdef", user_question="what are you doing?", user_feedback="Great service!", llm_response="I don't know", + sentiment=1 + ) + + # Feedback with categories + feedback_request = FeedbackRequest( + conversation_id="12345678-abcd-0000-0123-456789abcdef", + user_question="How do I deploy a web app?", + llm_response="You need to use Docker and Kubernetes for everything.", + user_feedback="This response is too general and doesn't provide specific steps.", sentiment=-1, + categories=["incomplete", "not_relevant"] ) ``` @@ -488,6 +518,7 @@ Example: | llm_response | string | | | sentiment | | | | user_feedback | | Feedback on the LLM response. | +| categories | array[FeedbackCategory] | List of feedback categories that apply to the LLM response. | ## FeedbackResponse diff --git a/src/models/requests.py b/src/models/requests.py index e8363563a..e21ad01ce 100644 --- a/src/models/requests.py +++ b/src/models/requests.py @@ -1,6 +1,7 @@ """Model for service requests.""" from typing import Optional, Self +from enum import Enum from pydantic import BaseModel, model_validator, field_validator, Field from llama_stack_client.types.agents.turn_create_params import Document @@ -146,6 +147,22 @@ def validate_media_type(self) -> Self: return self +class FeedbackCategory(str, Enum): + """Enum representing predefined feedback categories for AI responses. + + These categories help provide structured feedback about AI inference quality + when users provide negative feedback (thumbs down). Multiple categories can + be selected to provide comprehensive feedback about response issues. + """ + + INCORRECT = "incorrect" # "The answer provided is completely wrong" + NOT_RELEVANT = "not_relevant" # "This answer doesn't address my question at all" + INCOMPLETE = "incomplete" # "The answer only covers part of what I asked about" + OUTDATED_INFORMATION = "outdated_information" # "This information is from several years ago and no longer accurate" # pylint: disable=line-too-long + UNSAFE = "unsafe" # "This response could be harmful or dangerous if followed" + OTHER = "other" # "The response has issues not covered by other categories" + + class FeedbackRequest(BaseModel): """Model representing a feedback request. @@ -155,15 +172,17 @@ class FeedbackRequest(BaseModel): llm_response: The required LLM response. sentiment: The optional sentiment. user_feedback: The optional user feedback. + categories: The optional list of feedback categories (multi-select for negative feedback). Example: ```python feedback_request = FeedbackRequest( conversation_id="12345678-abcd-0000-0123-456789abcdef", user_question="what are you doing?", - user_feedback="Great service!", + user_feedback="This response is not helpful", llm_response="I don't know", sentiment=-1, + categories=[FeedbackCategory.INCORRECT, FeedbackCategory.INCOMPLETE] ) ``` """ @@ -179,6 +198,15 @@ class FeedbackRequest(BaseModel): description="Feedback on the LLM response.", examples=["I'm not satisfied with the response because it is too vague."], ) + # Optional list of predefined feedback categories for negative feedback + categories: Optional[list[FeedbackCategory]] = Field( + default=None, + description=( + "List of feedback categories that describe issues with the LLM response " + "(for negative feedback)." + ), + examples=[["incorrect", "incomplete"]], + ) # provides examples for /docs endpoint model_config = { @@ -188,9 +216,26 @@ class FeedbackRequest(BaseModel): "conversation_id": "12345678-abcd-0000-0123-456789abcdef", "user_question": "foo", "llm_response": "bar", - "user_feedback": "Great service!", - "sentiment": 1, - } + "user_feedback": "Not satisfied with the response quality.", + "sentiment": -1, + }, + { + "conversation_id": "12345678-abcd-0000-0123-456789abcdef", + "user_question": "What is the capital of France?", + "llm_response": "The capital of France is Berlin.", + "sentiment": -1, + "categories": ["incorrect"], + }, + { + "conversation_id": "12345678-abcd-0000-0123-456789abcdef", + "user_question": "How do I deploy a web app?", + "llm_response": "Use Docker.", + "user_feedback": ( + "This response is too general and doesn't provide specific steps." + ), + "sentiment": -1, + "categories": ["incomplete", "not_relevant"], + }, ] } } @@ -213,9 +258,34 @@ def check_sentiment(cls, value: Optional[int]) -> Optional[int]: ) return value + @field_validator("categories") + @classmethod + def validate_categories( + cls, value: Optional[list[FeedbackCategory]] + ) -> Optional[list[FeedbackCategory]]: + """Validate feedback categories list.""" + if value is None: + return value + + if not isinstance(value, list): + raise ValueError("Categories must be a list") + + if len(value) == 0: + return None # Convert empty list to None for consistency + + unique_categories = list(set(value)) + return unique_categories + @model_validator(mode="after") - def check_sentiment_or_user_feedback_set(self) -> Self: - """Ensure that either 'sentiment' or 'user_feedback' is set.""" - if self.sentiment is None and self.user_feedback is None: - raise ValueError("Either 'sentiment' or 'user_feedback' must be set") + def check_feedback_provided(self) -> Self: + """Ensure that at least one form of feedback is provided.""" + if ( + self.sentiment is None + and self.user_feedback is None + and self.categories is None + ): + raise ValueError( + "At least one form of feedback must be provided: " + "'sentiment', 'user_feedback', or 'categories'" + ) return self diff --git a/tests/unit/app/endpoints/test_feedback.py b/tests/unit/app/endpoints/test_feedback.py index 945d7a88f..783f9c19c 100644 --- a/tests/unit/app/endpoints/test_feedback.py +++ b/tests/unit/app/endpoints/test_feedback.py @@ -48,15 +48,31 @@ async def test_assert_feedback_enabled(mocker): await assert_feedback_enabled(mocker.Mock()) -def test_feedback_endpoint_handler(mocker): - """Test that feedback_endpoint_handler processes feedback correctly.""" +@pytest.mark.parametrize( + "feedback_request_data", + [ + {}, + { + "conversation_id": "12345678-abcd-0000-0123-456789abcdef", + "user_question": "What is Kubernetes?", + "llm_response": "It's some computer thing.", + "sentiment": -1, + "categories": ["incorrect", "incomplete"], + }, + ], + ids=["no_categories", "with_negative_categories"], +) +def test_feedback_endpoint_handler(mocker, feedback_request_data): + """Test that feedback_endpoint_handler processes feedback for different payloads.""" + # Mock the dependencies mocker.patch("app.endpoints.feedback.assert_feedback_enabled", return_value=None) mocker.patch("utils.common.retrieve_user_id", return_value="test_user_id") mocker.patch("app.endpoints.feedback.store_feedback", return_value=None) - # Mock the feedback request + # Prepare the feedback request mock feedback_request = mocker.Mock() + feedback_request.model_dump.return_value = feedback_request_data # Call the endpoint handler result = feedback_endpoint_handler( @@ -65,7 +81,7 @@ def test_feedback_endpoint_handler(mocker): _ensure_feedback_enabled=assert_feedback_enabled, ) - # Assert that the response is as expected + # Assert that the expected response is returned assert result.response == "feedback received" @@ -94,35 +110,50 @@ def test_feedback_endpoint_handler_error(mocker): assert exc_info.value.detail["response"] == "Error storing user feedback" -def test_store_feedback(mocker): - """Test that store_feedback calls the correct storage function.""" +@pytest.mark.parametrize( + "feedback_request_data", + [ + { + "conversation_id": "12345678-abcd-0000-0123-456789abcdef", + "user_question": "What is OpenStack?", + "llm_response": "It's some cloud thing.", + "user_feedback": "This response is not helpful!", + "sentiment": -1, + }, + { + "conversation_id": "12345678-abcd-0000-0123-456789abcdef", + "user_question": "What is Kubernetes?", + "llm_response": "K8s.", + "sentiment": -1, + "categories": ["incorrect", "not_relevant", "incomplete"], + }, + ], + ids=["negative_text_feedback", "negative_feedback_with_categories"], +) +def test_store_feedback(mocker, feedback_request_data): + """Test that store_feedback correctly stores various feedback payloads.""" + configuration.user_data_collection_configuration.feedback_storage = "fake-path" + # Patch filesystem and helpers mocker.patch("builtins.open", mocker.mock_open()) mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) mocker.patch("app.endpoints.feedback.get_suid", return_value="fake-uuid") - # Mock the JSON to assert the data is stored correctly + # Patch json to inspect stored data mock_json = mocker.patch("app.endpoints.feedback.json") user_id = "test_user_id" - feedback_request = { - "conversation_id": "12345678-abcd-0000-0123-456789abcdef", - "user_question": "What is OpenStack?", - "llm_response": "OpenStack is a cloud computing platform.", - "user_feedback": "Great service!", - "sentiment": 1, + + store_feedback(user_id, feedback_request_data) + + expected_data = { + "user_id": user_id, + "timestamp": mocker.ANY, + **feedback_request_data, } - store_feedback(user_id, feedback_request) - mock_json.dump.assert_called_once_with( - { - "user_id": user_id, - "timestamp": mocker.ANY, - **feedback_request, - }, - mocker.ANY, - ) + mock_json.dump.assert_called_once_with(expected_data, mocker.ANY) def test_feedback_status(): diff --git a/tests/unit/models/test_requests.py b/tests/unit/models/test_requests.py index 3f3218de8..c2fe57104 100644 --- a/tests/unit/models/test_requests.py +++ b/tests/unit/models/test_requests.py @@ -5,7 +5,7 @@ import pytest from pydantic import ValidationError -from models.requests import QueryRequest, Attachment, FeedbackRequest +from models.requests import QueryRequest, Attachment, FeedbackRequest, FeedbackCategory class TestAttachment: @@ -208,10 +208,10 @@ def test_check_sentiment(self) -> None: sentiment=99, # Invalid sentiment ) - def test_check_sentiment_or_user_feedback(self) -> None: - """Test that at least one of sentiment or user_feedback is provided.""" + def test_check_feedback_provided(self) -> None: + """Test that at least one form of feedback is provided.""" with pytest.raises( - ValueError, match="Either 'sentiment' or 'user_feedback' must be set" + ValueError, match="At least one form of feedback must be provided" ): FeedbackRequest( conversation_id="123e4567-e89b-12d3-a456-426614174000", @@ -219,6 +219,7 @@ def test_check_sentiment_or_user_feedback(self) -> None: llm_response="OpenStack is a cloud computing platform.", sentiment=None, user_feedback=None, + categories=None, ) def test_feedback_too_long(self) -> None: @@ -232,3 +233,115 @@ def test_feedback_too_long(self) -> None: llm_response="Some response", user_feedback="a" * 4097, ) + + def test_with_categories(self) -> None: + """Test FeedbackRequest with categories for negative feedback.""" + fr = FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="What is Kubernetes?", + llm_response="It's just some software thing.", + categories=[FeedbackCategory.INCORRECT, FeedbackCategory.INCOMPLETE], + ) + assert fr.conversation_id == "123e4567-e89b-12d3-a456-426614174000" + assert fr.user_question == "What is Kubernetes?" + assert fr.llm_response == "It's just some software thing." + assert set(fr.categories) == { + FeedbackCategory.INCORRECT, + FeedbackCategory.INCOMPLETE, + } + assert fr.sentiment is None + assert fr.user_feedback is None + + def test_with_single_category(self) -> None: + """Test FeedbackRequest with single category for negative feedback.""" + fr = FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="What is Docker?", + llm_response="Docker is a database system.", + categories=[FeedbackCategory.INCORRECT], + ) + assert fr.categories == [FeedbackCategory.INCORRECT] + + def test_categories_with_duplicates(self) -> None: + """Test that duplicate categories are removed.""" + fr = FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="What is API?", + llm_response="API is some computer thing.", + categories=[ + FeedbackCategory.INCORRECT, + FeedbackCategory.INCOMPLETE, + FeedbackCategory.INCORRECT, # Duplicate + ], + ) + assert len(fr.categories) == 2 + assert set(fr.categories) == { + FeedbackCategory.INCORRECT, + FeedbackCategory.INCOMPLETE, + } + + def test_empty_categories_converted_to_none(self) -> None: + """Test that empty categories list is converted to None.""" + with pytest.raises( + ValueError, match="At least one form of feedback must be provided" + ): + FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="What is testing?", + llm_response="Testing is a verification process.", + categories=[], # Empty list should be converted to None + ) + + def test_categories_only_feedback(self) -> None: + """Test that categories alone are sufficient for negative feedback.""" + fr = FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="Explain machine learning", + llm_response="It's AI stuff.", + categories=[FeedbackCategory.NOT_RELEVANT, FeedbackCategory.INCOMPLETE], + ) + assert fr.sentiment is None + assert fr.user_feedback is None + assert len(fr.categories) == 2 + + def test_mixed_feedback_types(self) -> None: + """Test FeedbackRequest with categories, sentiment, and user feedback for negative feedback.""" # pylint: disable=line-too-long + fr = FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="What is cloud computing?", + llm_response="It's computing in the sky.", + sentiment=-1, + user_feedback="This response is not informative and lacks detail", + categories=[FeedbackCategory.OTHER, FeedbackCategory.INCOMPLETE], + ) + assert fr.sentiment == -1 + assert fr.user_feedback == "This response is not informative and lacks detail" + assert len(fr.categories) == 2 + assert set(fr.categories) == { + FeedbackCategory.OTHER, + FeedbackCategory.INCOMPLETE, + } + + def test_all_feedback_categories(self) -> None: + """Test that all defined feedback categories are valid.""" + all_categories = list(FeedbackCategory) + + fr = FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="Test question", + llm_response="Test response", + categories=all_categories, + ) + assert len(fr.categories) == len(all_categories) + for category in all_categories: + assert category in fr.categories + + def test_categories_invalid_type(self) -> None: + """Test validation error for invalid categories type.""" + with pytest.raises(ValidationError): + FeedbackRequest( + conversation_id="123e4567-e89b-12d3-a456-426614174000", + user_question="Test question", + llm_response="Test response", + categories="invalid_type", # Should be list, not string + )