From 710fb9ebd12b8b991b0876765ea48eff23e99408 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Fri, 20 Oct 2023 02:14:19 +0000 Subject: [PATCH 01/13] ensuring invalid image tag errors are surfaced to users clearly --- .../model_engine_server/api/llms_v1.py | 6 ++ .../model_engine_server/domain/exceptions.py | 6 ++ .../use_cases/llm_model_endpoint_use_cases.py | 100 +++++++++++------- 3 files changed, 74 insertions(+), 38 deletions(-) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 7e73ef707..72d502856 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -48,6 +48,7 @@ EndpointResourceInvalidRequestException, EndpointUnsupportedInferenceTypeException, ExistingEndpointOperationInProgressException, + InvalidInferenceFrameworkImageTagException, InvalidRequestException, LLMFineTuningMethodNotImplementedException, LLMFineTuningQuotaReached, @@ -150,6 +151,11 @@ async def create_model_endpoint( status_code=400, detail=str(exc), ) from exc + except InvalidInferenceFrameworkImageTagException as exc: + raise HTTPException( + status_code=400, + detail="The specified inference framework image tag doesn't exist for the specified inference framework.", + ) from exc except ObjectNotApprovedException as exc: raise HTTPException( status_code=403, diff --git a/model-engine/model_engine_server/domain/exceptions.py b/model-engine/model_engine_server/domain/exceptions.py index 934a5e215..69d588485 100644 --- a/model-engine/model_engine_server/domain/exceptions.py +++ b/model-engine/model_engine_server/domain/exceptions.py @@ -170,3 +170,9 @@ class TriggerNameAlreadyExistsException(DomainException): """ Thrown if the requested name already exists in the trigger repository """ + + +class InvalidInferenceFrameworkImageTagException(DomainException): + """ + Thrown if the image tag passed in doesn't exist for the provided inference framework + """ diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 78218843c..476a68b94 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -49,6 +49,7 @@ from model_engine_server.domain.exceptions import ( EndpointLabelsException, EndpointUnsupportedInferenceTypeException, + InvalidInferenceFrameworkImageTagException, InvalidRequestException, ObjectHasInvalidValueException, ObjectNotAuthorizedException, @@ -74,6 +75,51 @@ logger = make_logger(logger_name()) +_VALID_FRAMEWORK_IMAGE_TAGS = { + # setting this to empty for now since no one uses deepspeed + LLMInferenceFramework.DEEPSPEED: [], + LLMInferenceFramework.TEXT_GENERATION_INFERENCE: [ + "0.9.4.1", + "0.9.4", + "0.9.3-launch_s3", + "0.9.3", + "0.9.1-launch_s3", + "0.9.1", + "ipv6", + "ipv6-0", + "0.8", + ], + LLMInferenceFramework.VLLM: [ + "0.2.1", + "0.2.0", + "0.1.7-awq", + "0.1.5", + "0.1.7", + "0.1.3.10", + "0.1.3.9", + "0.1.3.8", + "0.1.3.7", + "0.1.3.6", + "0.1.3.5", + "0.1.3.4", + "0.1.3.3", + "0.1.3.2", + "0.1.3.1", + "0.1.3", + ], + LLMInferenceFramework.LIGHTLLM: [ + "0.0.9", + "0.0.8", + "0.0.7", + "0.0.6", + "0.0.5", + "0.0.4", + "0.0.3", + "0.0.2", + "0.0.1", + ], +} + _SUPPORTED_MODEL_NAMES = { LLMInferenceFramework.DEEPSPEED: { "mpt-7b": "mosaicml/mpt-7b", @@ -227,21 +273,8 @@ def validate_num_shards( raise ObjectHasInvalidValueException("DeepSpeed requires more than 1 GPU.") if num_shards != gpus: raise ObjectHasInvalidValueException( - f"Num shard {num_shards} must be the same as number of GPUs {gpus} for DeepSpeed." + f"DeepSpeed requires num shard {num_shards} to be the same as number of GPUs {gpus}." ) - if num_shards > gpus: - raise ObjectHasInvalidValueException( - f"Num shard {num_shards} must be less than or equal to the number of GPUs {gpus}." - ) - - -def validate_quantization( - quantize: Optional[Quantization], inference_framework: LLMInferenceFramework -) -> None: - if quantize is not None and quantize not in _SUPPORTED_QUANTIZATIONS[inference_framework]: - raise ObjectHasInvalidValueException( - f"Quantization {quantize} is not supported for inference framework {inference_framework}. Supported quantization types are {_SUPPORTED_QUANTIZATIONS[inference_framework]}." - ) class CreateLLMModelEndpointV1UseCase: @@ -272,6 +305,10 @@ async def create_model_bundle( checkpoint_path: Optional[str], ) -> ModelBundle: if source == LLMSource.HUGGING_FACE: + # validate the image tag / framework pair + if framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]: # type: ignore + raise InvalidInferenceFrameworkImageTagException + if framework == LLMInferenceFramework.DEEPSPEED: bundle_id = await self.create_deepspeed_bundle( user, @@ -710,12 +747,10 @@ async def execute( validate_post_inference_hooks(user, request.post_inference_hooks) validate_model_name(request.model_name, request.inference_framework) validate_num_shards(request.num_shards, request.inference_framework, request.gpus) - validate_quantization(request.quantize, request.inference_framework) if request.inference_framework in [ LLMInferenceFramework.TEXT_GENERATION_INFERENCE, LLMInferenceFramework.VLLM, - LLMInferenceFramework.LIGHTLLM, ]: if request.endpoint_type != ModelEndpointType.STREAMING: raise ObjectHasInvalidValueException( @@ -954,10 +989,7 @@ def validate_and_update_completion_params( if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: request.top_k = None if request.top_k == -1 else request.top_k request.top_p = None if request.top_p == 1.0 else request.top_p - if inference_framework in [ - LLMInferenceFramework.VLLM, - LLMInferenceFramework.LIGHTLLM, - ]: + if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: request.top_k = -1 if request.top_k is None else request.top_k request.top_p = 1.0 if request.top_p is None else request.top_p else: @@ -967,10 +999,7 @@ def validate_and_update_completion_params( ) # presence_penalty, frequency_penalty - if inference_framework in [ - LLMInferenceFramework.VLLM, - LLMInferenceFramework.LIGHTLLM, - ]: + if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: request.presence_penalty = ( 0.0 if request.presence_penalty is None else request.presence_penalty ) @@ -1038,17 +1067,14 @@ def model_output_to_completion_output( raise InvalidRequestException(model_output.get("error")) # trigger a 400 else: raise UpstreamServiceError( - status_code=500, content=bytes(model_output["error"], "utf-8") + status_code=500, content=bytes(model_output["error"]) ) elif model_content.inference_framework == LLMInferenceFramework.VLLM: tokens = None if with_token_probs: tokens = [ - TokenOutput( - token=model_output["tokens"][index], - log_prob=list(t.values())[0], - ) + TokenOutput(token=model_output["tokens"][index], log_prob=list(t.values())[0]) for index, t in enumerate(model_output["log_probs"]) ] return CompletionOutput( @@ -1057,6 +1083,7 @@ def model_output_to_completion_output( tokens=tokens, ) elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: + print(model_output) tokens = None if with_token_probs: tokens = [ @@ -1162,8 +1189,7 @@ async def execute( timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS, ) predict_result = await inference_gateway.predict( - topic=model_endpoint.record.destination, - predict_request=inference_request, + topic=model_endpoint.record.destination, predict_request=inference_request ) if predict_result.status == TaskStatus.SUCCESS and predict_result.result is not None: @@ -1206,8 +1232,7 @@ async def execute( timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS, ) predict_result = await inference_gateway.predict( - topic=model_endpoint.record.destination, - predict_request=inference_request, + topic=model_endpoint.record.destination, predict_request=inference_request ) if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None: @@ -1246,8 +1271,7 @@ async def execute( timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS, ) predict_result = await inference_gateway.predict( - topic=model_endpoint.record.destination, - predict_request=inference_request, + topic=model_endpoint.record.destination, predict_request=inference_request ) if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None: @@ -1289,8 +1313,7 @@ async def execute( timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS, ) predict_result = await inference_gateway.predict( - topic=model_endpoint.record.destination, - predict_request=inference_request, + topic=model_endpoint.record.destination, predict_request=inference_request ) if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None: @@ -1353,7 +1376,7 @@ async def execute( ) if len(model_endpoints) == 0: - raise ObjectNotFoundException(f"Model endpoint {model_endpoint_name} not found.") + raise ObjectNotFoundException if len(model_endpoints) > 1: raise ObjectHasInvalidValueException( @@ -1574,6 +1597,7 @@ async def execute( ) elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: if res.status == TaskStatus.SUCCESS and result is not None: + print(result) token = None num_completion_tokens += 1 if request.return_token_log_probs: From 088a36552e1fddd93b264fab4ef105fd37aa3388 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Fri, 20 Oct 2023 16:33:23 +0000 Subject: [PATCH 02/13] adding new vllm version --- .../domain/use_cases/llm_model_endpoint_use_cases.py | 1 + 1 file changed, 1 insertion(+) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 476a68b94..cc4e7da9b 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -90,6 +90,7 @@ "0.8", ], LLMInferenceFramework.VLLM: [ + "0.2.1.post1", "0.2.1", "0.2.0", "0.1.7-awq", From a1fab6347fa587ff392577f88afdfa7bf64df086 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Fri, 27 Oct 2023 05:32:13 +0000 Subject: [PATCH 03/13] update error message, handling for deepspeed --- .../domain/use_cases/llm_model_endpoint_use_cases.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index cc4e7da9b..65b67d6a6 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -307,8 +307,10 @@ async def create_model_bundle( ) -> ModelBundle: if source == LLMSource.HUGGING_FACE: # validate the image tag / framework pair - if framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]: # type: ignore - raise InvalidInferenceFrameworkImageTagException + if framework != LLMInferenceFramework.DEEPSPEED and framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]: # type: ignore + raise InvalidInferenceFrameworkImageTagException( + f"Valid image tags for framework {framework} are {_VALID_FRAMEWORK_IMAGE_TAGS[framework]}" + ) if framework == LLMInferenceFramework.DEEPSPEED: bundle_id = await self.create_deepspeed_bundle( From 5111997f0ce11fe785fa99f77da5ccb26e034d4a Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Fri, 27 Oct 2023 05:39:06 +0000 Subject: [PATCH 04/13] update conftest --- model-engine/tests/unit/domain/conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model-engine/tests/unit/domain/conftest.py b/model-engine/tests/unit/domain/conftest.py index 6a958ed43..64d11de13 100644 --- a/model-engine/tests/unit/domain/conftest.py +++ b/model-engine/tests/unit/domain/conftest.py @@ -201,8 +201,8 @@ def create_llm_model_endpoint_request_async() -> CreateLLMModelEndpointV1Request name="test_llm_endpoint_name_async", model_name="mpt-7b", source="hugging_face", - inference_framework="deepspeed", - inference_framework_image_tag="test_tag", + inference_framework="vllm", + inference_framework_image_tag="0.2.0", num_shards=2, endpoint_type=ModelEndpointType.ASYNC, metadata={}, @@ -254,7 +254,7 @@ def create_llm_model_endpoint_request_llama_2() -> CreateLLMModelEndpointV1Reque model_name="llama-2-7b", source="hugging_face", inference_framework="text_generation_inference", - inference_framework_image_tag="test_tag", + inference_framework_image_tag="0.9.4", num_shards=2, endpoint_type=ModelEndpointType.STREAMING, metadata={}, @@ -310,7 +310,7 @@ def create_llm_model_endpoint_text_generation_inference_request_async() -> ( model_name="mpt-7b", source="hugging_face", inference_framework="text_generation_inference", - inference_framework_image_tag="test_tag", + inference_framework_image_tag="0.9.4", num_shards=2, quantize=Quantization.BITSANDBYTES, endpoint_type=ModelEndpointType.ASYNC, From f4e8f9b0c4021e85e0be2e444a862dd32b64eed9 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Fri, 27 Oct 2023 16:07:52 +0000 Subject: [PATCH 05/13] update conftest --- model-engine/tests/unit/domain/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model-engine/tests/unit/domain/conftest.py b/model-engine/tests/unit/domain/conftest.py index 64d11de13..c27aaa52a 100644 --- a/model-engine/tests/unit/domain/conftest.py +++ b/model-engine/tests/unit/domain/conftest.py @@ -201,8 +201,8 @@ def create_llm_model_endpoint_request_async() -> CreateLLMModelEndpointV1Request name="test_llm_endpoint_name_async", model_name="mpt-7b", source="hugging_face", - inference_framework="vllm", - inference_framework_image_tag="0.2.0", + inference_framework="deepspeed", + inference_framework_image_tag="test_tag", num_shards=2, endpoint_type=ModelEndpointType.ASYNC, metadata={}, From a0080d2585b232247405e6dc00f44379105cb8c5 Mon Sep 17 00:00:00 2001 From: Ian Macleod Date: Mon, 30 Oct 2023 15:47:41 +0000 Subject: [PATCH 06/13] more fixes to tags --- model-engine/tests/unit/conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model-engine/tests/unit/conftest.py b/model-engine/tests/unit/conftest.py index b784e5c45..8459947ba 100644 --- a/model-engine/tests/unit/conftest.py +++ b/model-engine/tests/unit/conftest.py @@ -3688,7 +3688,7 @@ def llm_model_endpoint_sync_tgi( "model_name": "llama-7b", "source": "hugging_face", "inference_framework": "text_generation_inference", - "inference_framework_image_tag": "123", + "inference_framework_image_tag": "0.9.4", "num_shards": 4, } }, @@ -3750,7 +3750,7 @@ def llm_model_endpoint_sync_tgi( "source": "hugging_face", "status": "READY", "inference_framework": "text_generation_inference", - "inference_framework_image_tag": "123", + "inference_framework_image_tag": "0.9.4", "num_shards": 4, "spec": { "id": "test_llm_model_endpoint_id_2", @@ -3763,7 +3763,7 @@ def llm_model_endpoint_sync_tgi( "model_name": "llama-7b", "source": "hugging_face", "inference_framework": "text_generation_inference", - "inference_framework_image_tag": "123", + "inference_framework_image_tag": "0.9.4", "num_shards": 4, } }, @@ -3885,7 +3885,7 @@ def llm_model_endpoint_text_generation_inference( "model_name": "llama-7b", "source": "hugging_face", "inference_framework": "text_generation_inference", - "inference_framework_image_tag": "123", + "inference_framework_image_tag": "0.9.4", "num_shards": 4, } }, From bc151050194d6813e7b78b4389e155074b59612d Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Wed, 1 Nov 2023 20:04:45 +0000 Subject: [PATCH 07/13] add unit test --- .../use_cases/llm_model_endpoint_use_cases.py | 2 - .../tests/unit/domain/test_llm_use_cases.py | 54 ++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 65b67d6a6..188df8202 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -1086,7 +1086,6 @@ def model_output_to_completion_output( tokens=tokens, ) elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: - print(model_output) tokens = None if with_token_probs: tokens = [ @@ -1600,7 +1599,6 @@ async def execute( ) elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: if res.status == TaskStatus.SUCCESS and result is not None: - print(result) token = None num_completion_tokens += 1 if request.return_token_log_probs: diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py index c71995eac..ce317dc59 100644 --- a/model-engine/tests/unit/domain/test_llm_use_cases.py +++ b/model-engine/tests/unit/domain/test_llm_use_cases.py @@ -14,9 +14,14 @@ ) from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Response, TaskStatus from model_engine_server.core.auth.authentication_repository import User -from model_engine_server.domain.entities import ModelEndpoint, ModelEndpointType +from model_engine_server.domain.entities import ( + LLMInferenceFramework, + ModelEndpoint, + ModelEndpointType, +) from model_engine_server.domain.exceptions import ( EndpointUnsupportedInferenceTypeException, + InvalidInferenceFrameworkImageTagException, InvalidRequestException, LLMFineTuningQuotaReached, ObjectHasInvalidValueException, @@ -147,6 +152,53 @@ async def test_create_model_endpoint_use_case_success( assert "--max-total-tokens" in bundle.flavor.command[-1] and "4096" in bundle.flavor.command[-1] +@pytest.mark.asyncio +@pytest.mark.parametrize( + "valid, inference_framework, inference_framework_image_tag", + [ + (False, LLMInferenceFramework.TEXT_GENERATION_INFERENCE, "0.9.2"), + (True, LLMInferenceFramework.TEXT_GENERATION_INFERENCE, "0.9.3"), + (False, LLMInferenceFramework.VLLM, "0.1.6"), + (True, LLMInferenceFramework.VLLM, "0.1.3.6"), + ], +) +async def test_create_model_bundle_inference_framework_image_tag_validation( + test_api_key: str, + fake_model_bundle_repository, + fake_model_endpoint_service, + fake_docker_repository_image_always_exists, + fake_model_primitive_gateway, + fake_llm_artifact_gateway, + create_llm_model_endpoint_text_generation_inference_request_streaming: CreateLLMModelEndpointV1Request, + valid, + inference_framework, + inference_framework_image_tag, +): + fake_model_endpoint_service.model_bundle_repository = fake_model_bundle_repository + bundle_use_case = CreateModelBundleV2UseCase( + model_bundle_repository=fake_model_bundle_repository, + docker_repository=fake_docker_repository_image_always_exists, + model_primitive_gateway=fake_model_primitive_gateway, + ) + + use_case = CreateLLMModelEndpointV1UseCase( + create_model_bundle_use_case=bundle_use_case, + model_bundle_repository=fake_model_bundle_repository, + model_endpoint_service=fake_model_endpoint_service, + llm_artifact_gateway=fake_llm_artifact_gateway, + ) + + request = create_llm_model_endpoint_text_generation_inference_request_streaming.copy() + request.inference_framework = inference_framework + request.inference_framework_image_tag = inference_framework_image_tag + user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True) + if valid: + await use_case.execute(user=user, request=request) + else: + with pytest.raises(InvalidInferenceFrameworkImageTagException): + await use_case.execute(user=user, request=request) + + @pytest.mark.asyncio async def test_create_model_endpoint_text_generation_inference_use_case_success( test_api_key: str, From e00e878cb100b18c7ea86dbf4d2a7a28b8932aab Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Wed, 1 Nov 2023 21:33:55 +0000 Subject: [PATCH 08/13] fix --- .../domain/use_cases/llm_model_endpoint_use_cases.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 188df8202..7367c7b66 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -1039,7 +1039,6 @@ def model_output_to_completion_output( with_token_probs: Optional[bool], ) -> CompletionOutput: model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED: completion_token_count = len(model_output["token_probs"]["tokens"]) tokens = None @@ -1070,7 +1069,7 @@ def model_output_to_completion_output( raise InvalidRequestException(model_output.get("error")) # trigger a 400 else: raise UpstreamServiceError( - status_code=500, content=bytes(model_output["error"]) + status_code=500, content=bytes(model_output["error"], "utf-8") ) elif model_content.inference_framework == LLMInferenceFramework.VLLM: From 660e8230fc47218823148ecac0a4241b6ac74f67 Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Thu, 2 Nov 2023 23:30:40 +0000 Subject: [PATCH 09/13] check ecr image --- .../model_engine_server/api/llms_v1.py | 7 +- .../model_engine_server/domain/exceptions.py | 6 -- .../use_cases/llm_model_endpoint_use_cases.py | 89 ++++++++----------- .../tests/unit/domain/test_llm_use_cases.py | 7 +- 4 files changed, 42 insertions(+), 67 deletions(-) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index 72d502856..fe5601b0c 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -48,7 +48,6 @@ EndpointResourceInvalidRequestException, EndpointUnsupportedInferenceTypeException, ExistingEndpointOperationInProgressException, - InvalidInferenceFrameworkImageTagException, InvalidRequestException, LLMFineTuningMethodNotImplementedException, LLMFineTuningQuotaReached, @@ -132,6 +131,7 @@ async def create_model_endpoint( model_bundle_repository=external_interfaces.model_bundle_repository, model_endpoint_service=external_interfaces.model_endpoint_service, llm_artifact_gateway=external_interfaces.llm_artifact_gateway, + docker_repository=external_interfaces.docker_repository, ) return await use_case.execute(user=auth, request=request) except ObjectAlreadyExistsException as exc: @@ -151,11 +151,6 @@ async def create_model_endpoint( status_code=400, detail=str(exc), ) from exc - except InvalidInferenceFrameworkImageTagException as exc: - raise HTTPException( - status_code=400, - detail="The specified inference framework image tag doesn't exist for the specified inference framework.", - ) from exc except ObjectNotApprovedException as exc: raise HTTPException( status_code=403, diff --git a/model-engine/model_engine_server/domain/exceptions.py b/model-engine/model_engine_server/domain/exceptions.py index 69d588485..934a5e215 100644 --- a/model-engine/model_engine_server/domain/exceptions.py +++ b/model-engine/model_engine_server/domain/exceptions.py @@ -170,9 +170,3 @@ class TriggerNameAlreadyExistsException(DomainException): """ Thrown if the requested name already exists in the trigger repository """ - - -class InvalidInferenceFrameworkImageTagException(DomainException): - """ - Thrown if the image tag passed in doesn't exist for the provided inference framework - """ diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 7367c7b66..c4d49b85a 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -47,9 +47,9 @@ StreamingEnhancedRunnableImageFlavor, ) from model_engine_server.domain.exceptions import ( + DockerImageNotFoundException, EndpointLabelsException, EndpointUnsupportedInferenceTypeException, - InvalidInferenceFrameworkImageTagException, InvalidRequestException, ObjectHasInvalidValueException, ObjectNotAuthorizedException, @@ -58,6 +58,7 @@ ) from model_engine_server.domain.gateways.llm_artifact_gateway import LLMArtifactGateway from model_engine_server.domain.repositories import ModelBundleRepository +from model_engine_server.domain.repositories.docker_repository import DockerRepository from model_engine_server.domain.services import LLMModelEndpointService, ModelEndpointService from model_engine_server.infra.gateways.filesystem_gateway import FilesystemGateway @@ -75,52 +76,6 @@ logger = make_logger(logger_name()) -_VALID_FRAMEWORK_IMAGE_TAGS = { - # setting this to empty for now since no one uses deepspeed - LLMInferenceFramework.DEEPSPEED: [], - LLMInferenceFramework.TEXT_GENERATION_INFERENCE: [ - "0.9.4.1", - "0.9.4", - "0.9.3-launch_s3", - "0.9.3", - "0.9.1-launch_s3", - "0.9.1", - "ipv6", - "ipv6-0", - "0.8", - ], - LLMInferenceFramework.VLLM: [ - "0.2.1.post1", - "0.2.1", - "0.2.0", - "0.1.7-awq", - "0.1.5", - "0.1.7", - "0.1.3.10", - "0.1.3.9", - "0.1.3.8", - "0.1.3.7", - "0.1.3.6", - "0.1.3.5", - "0.1.3.4", - "0.1.3.3", - "0.1.3.2", - "0.1.3.1", - "0.1.3", - ], - LLMInferenceFramework.LIGHTLLM: [ - "0.0.9", - "0.0.8", - "0.0.7", - "0.0.6", - "0.0.5", - "0.0.4", - "0.0.3", - "0.0.2", - "0.0.1", - ], -} - _SUPPORTED_MODEL_NAMES = { LLMInferenceFramework.DEEPSPEED: { "mpt-7b": "mosaicml/mpt-7b", @@ -285,12 +240,14 @@ def __init__( model_bundle_repository: ModelBundleRepository, model_endpoint_service: ModelEndpointService, llm_artifact_gateway: LLMArtifactGateway, + docker_repository: DockerRepository, ): self.authz_module = LiveAuthorizationModule() self.create_model_bundle_use_case = create_model_bundle_use_case self.model_bundle_repository = model_bundle_repository self.model_endpoint_service = model_endpoint_service self.llm_artifact_gateway = llm_artifact_gateway + self.docker_repository = docker_repository async def create_model_bundle( self, @@ -306,13 +263,15 @@ async def create_model_bundle( checkpoint_path: Optional[str], ) -> ModelBundle: if source == LLMSource.HUGGING_FACE: - # validate the image tag / framework pair - if framework != LLMInferenceFramework.DEEPSPEED and framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]: # type: ignore - raise InvalidInferenceFrameworkImageTagException( - f"Valid image tags for framework {framework} are {_VALID_FRAMEWORK_IMAGE_TAGS[framework]}" - ) - if framework == LLMInferenceFramework.DEEPSPEED: + if not self.docker_repository.image_exists( + image_tag=framework_image_tag, + repository_name="instant-llm", + ): + raise DockerImageNotFoundException( + repository="instant-llm", + tag=framework_image_tag, + ) bundle_id = await self.create_deepspeed_bundle( user, model_name, @@ -321,6 +280,14 @@ async def create_model_bundle( endpoint_name, ) elif framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + if not self.docker_repository.image_exists( + image_tag=framework_image_tag, + repository_name=hmi_config.tgi_repository, + ): + raise DockerImageNotFoundException( + repository=hmi_config.tgi_repository, + tag=framework_image_tag, + ) bundle_id = await self.create_text_generation_inference_bundle( user, model_name, @@ -331,6 +298,14 @@ async def create_model_bundle( checkpoint_path, ) elif framework == LLMInferenceFramework.VLLM: + if not self.docker_repository.image_exists( + image_tag=framework_image_tag, + repository_name=hmi_config.vllm_repository, + ): + raise DockerImageNotFoundException( + repository=hmi_config.vllm_repository, + tag=framework_image_tag, + ) bundle_id = await self.create_vllm_bundle( user, model_name, @@ -349,6 +324,14 @@ async def create_model_bundle( num_shards, checkpoint_path, ) + if not self.docker_repository.image_exists( + image_tag=framework_image_tag, + repository_name=hmi_config.lightllm_repository, + ): + raise DockerImageNotFoundException( + repository=hmi_config.lightllm_repository, + tag=framework_image_tag, + ) else: raise ObjectHasInvalidValueException( f"Framework {framework} is not supported for source {source}." diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py index ce317dc59..29bca2170 100644 --- a/model-engine/tests/unit/domain/test_llm_use_cases.py +++ b/model-engine/tests/unit/domain/test_llm_use_cases.py @@ -20,8 +20,8 @@ ModelEndpointType, ) from model_engine_server.domain.exceptions import ( + DockerImageNotFoundException, EndpointUnsupportedInferenceTypeException, - InvalidInferenceFrameworkImageTagException, InvalidRequestException, LLMFineTuningQuotaReached, ObjectHasInvalidValueException, @@ -167,6 +167,7 @@ async def test_create_model_bundle_inference_framework_image_tag_validation( fake_model_bundle_repository, fake_model_endpoint_service, fake_docker_repository_image_always_exists, + fake_docker_repository_image_never_exists, fake_model_primitive_gateway, fake_llm_artifact_gateway, create_llm_model_endpoint_text_generation_inference_request_streaming: CreateLLMModelEndpointV1Request, @@ -186,6 +187,7 @@ async def test_create_model_bundle_inference_framework_image_tag_validation( model_bundle_repository=fake_model_bundle_repository, model_endpoint_service=fake_model_endpoint_service, llm_artifact_gateway=fake_llm_artifact_gateway, + docker_repository=fake_docker_repository_image_always_exists, ) request = create_llm_model_endpoint_text_generation_inference_request_streaming.copy() @@ -195,7 +197,8 @@ async def test_create_model_bundle_inference_framework_image_tag_validation( if valid: await use_case.execute(user=user, request=request) else: - with pytest.raises(InvalidInferenceFrameworkImageTagException): + use_case.docker_repository = fake_docker_repository_image_never_exists + with pytest.raises(DockerImageNotFoundException): await use_case.execute(user=user, request=request) From bc0a915614ab7eeb9ff0b3794260281613408683 Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Fri, 3 Nov 2023 04:21:27 +0000 Subject: [PATCH 10/13] catch docker image exception --- model-engine/model_engine_server/api/llms_v1.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py index fe5601b0c..a74b78561 100644 --- a/model-engine/model_engine_server/api/llms_v1.py +++ b/model-engine/model_engine_server/api/llms_v1.py @@ -43,6 +43,7 @@ make_logger, ) from model_engine_server.domain.exceptions import ( + DockerImageNotFoundException, EndpointDeleteFailedException, EndpointLabelsException, EndpointResourceInvalidRequestException, @@ -161,6 +162,11 @@ async def create_model_endpoint( status_code=404, detail="The specified model bundle could not be found.", ) from exc + except DockerImageNotFoundException as exc: + raise HTTPException( + status_code=404, + detail="The specified docker image could not be found.", + ) from exc @llm_router_v1.get("/model-endpoints", response_model=ListLLMModelEndpointsV1Response) From 6ba1f02e947705bdadea986d53bdc74a1ecc6c40 Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Fri, 3 Nov 2023 04:29:40 +0000 Subject: [PATCH 11/13] fix --- model-engine/tests/unit/domain/test_llm_use_cases.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py index 29bca2170..bc8d39497 100644 --- a/model-engine/tests/unit/domain/test_llm_use_cases.py +++ b/model-engine/tests/unit/domain/test_llm_use_cases.py @@ -71,6 +71,7 @@ async def test_create_model_endpoint_use_case_success( model_bundle_repository=fake_model_bundle_repository, model_endpoint_service=fake_model_endpoint_service, llm_artifact_gateway=fake_llm_artifact_gateway, + docker_repository=fake_docker_repository_image_always_exists, ) user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True) response_1 = await use_case.execute(user=user, request=create_llm_model_endpoint_request_async) @@ -224,6 +225,7 @@ async def test_create_model_endpoint_text_generation_inference_use_case_success( model_bundle_repository=fake_model_bundle_repository, model_endpoint_service=fake_model_endpoint_service, llm_artifact_gateway=fake_llm_artifact_gateway, + docker_repository=fake_docker_repository_image_always_exists, ) user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True) response_1 = await use_case.execute( @@ -279,6 +281,7 @@ async def test_create_llm_model_endpoint_use_case_raises_invalid_value_exception model_bundle_repository=fake_model_bundle_repository, model_endpoint_service=fake_model_endpoint_service, llm_artifact_gateway=fake_llm_artifact_gateway, + docker_repository=fake_docker_repository_image_always_exists, ) user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True) with pytest.raises(ObjectHasInvalidValueException): @@ -308,6 +311,7 @@ async def test_create_llm_model_endpoint_use_case_quantization_exception( model_bundle_repository=fake_model_bundle_repository, model_endpoint_service=fake_model_endpoint_service, llm_artifact_gateway=fake_llm_artifact_gateway, + docker_repository=fake_docker_repository_image_always_exists, ) user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True) with pytest.raises(ObjectHasInvalidValueException): From 1b8f3aa1efc80bbcaebfa6e86966cc6f457a7077 Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Wed, 8 Nov 2023 06:14:44 +0000 Subject: [PATCH 12/13] revert removal commit --- .../use_cases/llm_model_endpoint_use_cases.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 1cdfcd2f0..182554483 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -232,8 +232,21 @@ def validate_num_shards( raise ObjectHasInvalidValueException("DeepSpeed requires more than 1 GPU.") if num_shards != gpus: raise ObjectHasInvalidValueException( - f"DeepSpeed requires num shard {num_shards} to be the same as number of GPUs {gpus}." + f"Num shard {num_shards} must be the same as number of GPUs {gpus} for DeepSpeed." ) + if num_shards > gpus: + raise ObjectHasInvalidValueException( + f"Num shard {num_shards} must be less than or equal to the number of GPUs {gpus}." + ) + + +def validate_quantization( + quantize: Optional[Quantization], inference_framework: LLMInferenceFramework +) -> None: + if quantize is not None and quantize not in _SUPPORTED_QUANTIZATIONS[inference_framework]: + raise ObjectHasInvalidValueException( + f"Quantization {quantize} is not supported for inference framework {inference_framework}. Supported quantization types are {_SUPPORTED_QUANTIZATIONS[inference_framework]}." + ) class CreateLLMModelEndpointV1UseCase: @@ -731,6 +744,7 @@ async def execute( validate_post_inference_hooks(user, request.post_inference_hooks) validate_model_name(request.model_name, request.inference_framework) validate_num_shards(request.num_shards, request.inference_framework, request.gpus) + validate_quantization(request.quantize, request.inference_framework) if request.inference_framework in [ LLMInferenceFramework.TEXT_GENERATION_INFERENCE, From 6873c127bc16c2d50697793d148bb0973ca4cc32 Mon Sep 17 00:00:00 2001 From: Tiffany Zhao Date: Thu, 9 Nov 2023 02:07:53 +0000 Subject: [PATCH 13/13] fix + refactor --- .../use_cases/llm_model_endpoint_use_cases.py | 56 ++++++++----------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 182554483..829f4801b 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -265,6 +265,18 @@ def __init__( self.llm_artifact_gateway = llm_artifact_gateway self.docker_repository = docker_repository + def check_docker_image_exists_for_image_tag( + self, framework_image_tag: str, repository_name: str + ): + if not self.docker_repository.image_exists( + image_tag=framework_image_tag, + repository_name=repository_name, + ): + raise DockerImageNotFoundException( + repository=repository_name, + tag=framework_image_tag, + ) + async def create_model_bundle( self, user: User, @@ -280,14 +292,7 @@ async def create_model_bundle( ) -> ModelBundle: if source == LLMSource.HUGGING_FACE: if framework == LLMInferenceFramework.DEEPSPEED: - if not self.docker_repository.image_exists( - image_tag=framework_image_tag, - repository_name="instant-llm", - ): - raise DockerImageNotFoundException( - repository="instant-llm", - tag=framework_image_tag, - ) + self.check_docker_image_exists_for_image_tag(framework_image_tag, "instant-llm") bundle_id = await self.create_deepspeed_bundle( user, model_name, @@ -296,14 +301,9 @@ async def create_model_bundle( endpoint_name, ) elif framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: - if not self.docker_repository.image_exists( - image_tag=framework_image_tag, - repository_name=hmi_config.tgi_repository, - ): - raise DockerImageNotFoundException( - repository=hmi_config.tgi_repository, - tag=framework_image_tag, - ) + self.check_docker_image_exists_for_image_tag( + framework_image_tag, hmi_config.tgi_repository + ) bundle_id = await self.create_text_generation_inference_bundle( user, model_name, @@ -314,14 +314,9 @@ async def create_model_bundle( checkpoint_path, ) elif framework == LLMInferenceFramework.VLLM: - if not self.docker_repository.image_exists( - image_tag=framework_image_tag, - repository_name=hmi_config.vllm_repository, - ): - raise DockerImageNotFoundException( - repository=hmi_config.vllm_repository, - tag=framework_image_tag, - ) + self.check_docker_image_exists_for_image_tag( + framework_image_tag, hmi_config.vllm_repository + ) bundle_id = await self.create_vllm_bundle( user, model_name, @@ -332,6 +327,9 @@ async def create_model_bundle( checkpoint_path, ) elif framework == LLMInferenceFramework.LIGHTLLM: + self.check_docker_image_exists_for_image_tag( + framework_image_tag, hmi_config.lightllm_repository + ) bundle_id = await self.create_lightllm_bundle( user, model_name, @@ -340,14 +338,6 @@ async def create_model_bundle( num_shards, checkpoint_path, ) - if not self.docker_repository.image_exists( - image_tag=framework_image_tag, - repository_name=hmi_config.lightllm_repository, - ): - raise DockerImageNotFoundException( - repository=hmi_config.lightllm_repository, - tag=framework_image_tag, - ) else: raise ObjectHasInvalidValueException( f"Framework {framework} is not supported for source {source}." @@ -1372,7 +1362,7 @@ async def execute( ) if len(model_endpoints) == 0: - raise ObjectNotFoundException + raise ObjectNotFoundException(f"Model endpoint {model_endpoint_name} not found.") if len(model_endpoints) > 1: raise ObjectHasInvalidValueException(