From 710fb9ebd12b8b991b0876765ea48eff23e99408 Mon Sep 17 00:00:00 2001
From: Ian Macleod <ian.macleod@scale.com>
Date: Fri, 20 Oct 2023 02:14:19 +0000
Subject: [PATCH 01/13] ensuring invalid image tag errors are surfaced to users
 clearly

---
 .../model_engine_server/api/llms_v1.py        |   6 ++
 .../model_engine_server/domain/exceptions.py  |   6 ++
 .../use_cases/llm_model_endpoint_use_cases.py | 100 +++++++++++-------
 3 files changed, 74 insertions(+), 38 deletions(-)

diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py
index 7e73ef707..72d502856 100644
--- a/model-engine/model_engine_server/api/llms_v1.py
+++ b/model-engine/model_engine_server/api/llms_v1.py
@@ -48,6 +48,7 @@
     EndpointResourceInvalidRequestException,
     EndpointUnsupportedInferenceTypeException,
     ExistingEndpointOperationInProgressException,
+    InvalidInferenceFrameworkImageTagException,
     InvalidRequestException,
     LLMFineTuningMethodNotImplementedException,
     LLMFineTuningQuotaReached,
@@ -150,6 +151,11 @@ async def create_model_endpoint(
             status_code=400,
             detail=str(exc),
         ) from exc
+    except InvalidInferenceFrameworkImageTagException as exc:
+        raise HTTPException(
+            status_code=400,
+            detail="The specified inference framework image tag doesn't exist for the specified inference framework.",
+        ) from exc
     except ObjectNotApprovedException as exc:
         raise HTTPException(
             status_code=403,
diff --git a/model-engine/model_engine_server/domain/exceptions.py b/model-engine/model_engine_server/domain/exceptions.py
index 934a5e215..69d588485 100644
--- a/model-engine/model_engine_server/domain/exceptions.py
+++ b/model-engine/model_engine_server/domain/exceptions.py
@@ -170,3 +170,9 @@ class TriggerNameAlreadyExistsException(DomainException):
     """
     Thrown if the requested name already exists in the trigger repository
     """
+
+
+class InvalidInferenceFrameworkImageTagException(DomainException):
+    """
+    Thrown if the image tag passed in doesn't exist for the provided inference framework
+    """
diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 78218843c..476a68b94 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -49,6 +49,7 @@
 from model_engine_server.domain.exceptions import (
     EndpointLabelsException,
     EndpointUnsupportedInferenceTypeException,
+    InvalidInferenceFrameworkImageTagException,
     InvalidRequestException,
     ObjectHasInvalidValueException,
     ObjectNotAuthorizedException,
@@ -74,6 +75,51 @@
 
 logger = make_logger(logger_name())
 
+_VALID_FRAMEWORK_IMAGE_TAGS = {
+    # setting this to empty for now since no one uses deepspeed
+    LLMInferenceFramework.DEEPSPEED: [],
+    LLMInferenceFramework.TEXT_GENERATION_INFERENCE: [
+        "0.9.4.1",
+        "0.9.4",
+        "0.9.3-launch_s3",
+        "0.9.3",
+        "0.9.1-launch_s3",
+        "0.9.1",
+        "ipv6",
+        "ipv6-0",
+        "0.8",
+    ],
+    LLMInferenceFramework.VLLM: [
+        "0.2.1",
+        "0.2.0",
+        "0.1.7-awq",
+        "0.1.5",
+        "0.1.7",
+        "0.1.3.10",
+        "0.1.3.9",
+        "0.1.3.8",
+        "0.1.3.7",
+        "0.1.3.6",
+        "0.1.3.5",
+        "0.1.3.4",
+        "0.1.3.3",
+        "0.1.3.2",
+        "0.1.3.1",
+        "0.1.3",
+    ],
+    LLMInferenceFramework.LIGHTLLM: [
+        "0.0.9",
+        "0.0.8",
+        "0.0.7",
+        "0.0.6",
+        "0.0.5",
+        "0.0.4",
+        "0.0.3",
+        "0.0.2",
+        "0.0.1",
+    ],
+}
+
 _SUPPORTED_MODEL_NAMES = {
     LLMInferenceFramework.DEEPSPEED: {
         "mpt-7b": "mosaicml/mpt-7b",
@@ -227,21 +273,8 @@ def validate_num_shards(
             raise ObjectHasInvalidValueException("DeepSpeed requires more than 1 GPU.")
         if num_shards != gpus:
             raise ObjectHasInvalidValueException(
-                f"Num shard {num_shards} must be the same as number of GPUs {gpus} for DeepSpeed."
+                f"DeepSpeed requires num shard {num_shards} to be the same as number of GPUs {gpus}."
             )
-    if num_shards > gpus:
-        raise ObjectHasInvalidValueException(
-            f"Num shard {num_shards} must be less than or equal to the number of GPUs {gpus}."
-        )
-
-
-def validate_quantization(
-    quantize: Optional[Quantization], inference_framework: LLMInferenceFramework
-) -> None:
-    if quantize is not None and quantize not in _SUPPORTED_QUANTIZATIONS[inference_framework]:
-        raise ObjectHasInvalidValueException(
-            f"Quantization {quantize} is not supported for inference framework {inference_framework}. Supported quantization types are {_SUPPORTED_QUANTIZATIONS[inference_framework]}."
-        )
 
 
 class CreateLLMModelEndpointV1UseCase:
@@ -272,6 +305,10 @@ async def create_model_bundle(
         checkpoint_path: Optional[str],
     ) -> ModelBundle:
         if source == LLMSource.HUGGING_FACE:
+            # validate the image tag / framework pair
+            if framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]:  # type: ignore
+                raise InvalidInferenceFrameworkImageTagException
+
             if framework == LLMInferenceFramework.DEEPSPEED:
                 bundle_id = await self.create_deepspeed_bundle(
                     user,
@@ -710,12 +747,10 @@ async def execute(
         validate_post_inference_hooks(user, request.post_inference_hooks)
         validate_model_name(request.model_name, request.inference_framework)
         validate_num_shards(request.num_shards, request.inference_framework, request.gpus)
-        validate_quantization(request.quantize, request.inference_framework)
 
         if request.inference_framework in [
             LLMInferenceFramework.TEXT_GENERATION_INFERENCE,
             LLMInferenceFramework.VLLM,
-            LLMInferenceFramework.LIGHTLLM,
         ]:
             if request.endpoint_type != ModelEndpointType.STREAMING:
                 raise ObjectHasInvalidValueException(
@@ -954,10 +989,7 @@ def validate_and_update_completion_params(
         if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE:
             request.top_k = None if request.top_k == -1 else request.top_k
             request.top_p = None if request.top_p == 1.0 else request.top_p
-        if inference_framework in [
-            LLMInferenceFramework.VLLM,
-            LLMInferenceFramework.LIGHTLLM,
-        ]:
+        if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]:
             request.top_k = -1 if request.top_k is None else request.top_k
             request.top_p = 1.0 if request.top_p is None else request.top_p
     else:
@@ -967,10 +999,7 @@ def validate_and_update_completion_params(
             )
 
     # presence_penalty, frequency_penalty
-    if inference_framework in [
-        LLMInferenceFramework.VLLM,
-        LLMInferenceFramework.LIGHTLLM,
-    ]:
+    if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]:
         request.presence_penalty = (
             0.0 if request.presence_penalty is None else request.presence_penalty
         )
@@ -1038,17 +1067,14 @@ def model_output_to_completion_output(
                     raise InvalidRequestException(model_output.get("error"))  # trigger a 400
                 else:
                     raise UpstreamServiceError(
-                        status_code=500, content=bytes(model_output["error"], "utf-8")
+                        status_code=500, content=bytes(model_output["error"])
                     )
 
         elif model_content.inference_framework == LLMInferenceFramework.VLLM:
             tokens = None
             if with_token_probs:
                 tokens = [
-                    TokenOutput(
-                        token=model_output["tokens"][index],
-                        log_prob=list(t.values())[0],
-                    )
+                    TokenOutput(token=model_output["tokens"][index], log_prob=list(t.values())[0])
                     for index, t in enumerate(model_output["log_probs"])
                 ]
             return CompletionOutput(
@@ -1057,6 +1083,7 @@ def model_output_to_completion_output(
                 tokens=tokens,
             )
         elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
+            print(model_output)
             tokens = None
             if with_token_probs:
                 tokens = [
@@ -1162,8 +1189,7 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination,
-                predict_request=inference_request,
+                topic=model_endpoint.record.destination, predict_request=inference_request
             )
 
             if predict_result.status == TaskStatus.SUCCESS and predict_result.result is not None:
@@ -1206,8 +1232,7 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination,
-                predict_request=inference_request,
+                topic=model_endpoint.record.destination, predict_request=inference_request
             )
 
             if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None:
@@ -1246,8 +1271,7 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination,
-                predict_request=inference_request,
+                topic=model_endpoint.record.destination, predict_request=inference_request
             )
 
             if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None:
@@ -1289,8 +1313,7 @@ async def execute(
                 timeout_seconds=DOWNSTREAM_REQUEST_TIMEOUT_SECONDS,
             )
             predict_result = await inference_gateway.predict(
-                topic=model_endpoint.record.destination,
-                predict_request=inference_request,
+                topic=model_endpoint.record.destination, predict_request=inference_request
             )
 
             if predict_result.status != TaskStatus.SUCCESS or predict_result.result is None:
@@ -1353,7 +1376,7 @@ async def execute(
         )
 
         if len(model_endpoints) == 0:
-            raise ObjectNotFoundException(f"Model endpoint {model_endpoint_name} not found.")
+            raise ObjectNotFoundException
 
         if len(model_endpoints) > 1:
             raise ObjectHasInvalidValueException(
@@ -1574,6 +1597,7 @@ async def execute(
                     )
             elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
                 if res.status == TaskStatus.SUCCESS and result is not None:
+                    print(result)
                     token = None
                     num_completion_tokens += 1
                     if request.return_token_log_probs:

From 088a36552e1fddd93b264fab4ef105fd37aa3388 Mon Sep 17 00:00:00 2001
From: Ian Macleod <ian.macleod@scale.com>
Date: Fri, 20 Oct 2023 16:33:23 +0000
Subject: [PATCH 02/13] adding new vllm version

---
 .../domain/use_cases/llm_model_endpoint_use_cases.py             | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 476a68b94..cc4e7da9b 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -90,6 +90,7 @@
         "0.8",
     ],
     LLMInferenceFramework.VLLM: [
+        "0.2.1.post1",
         "0.2.1",
         "0.2.0",
         "0.1.7-awq",

From a1fab6347fa587ff392577f88afdfa7bf64df086 Mon Sep 17 00:00:00 2001
From: Ian Macleod <ian.macleod@scale.com>
Date: Fri, 27 Oct 2023 05:32:13 +0000
Subject: [PATCH 03/13] update error message, handling for deepspeed

---
 .../domain/use_cases/llm_model_endpoint_use_cases.py        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index cc4e7da9b..65b67d6a6 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -307,8 +307,10 @@ async def create_model_bundle(
     ) -> ModelBundle:
         if source == LLMSource.HUGGING_FACE:
             # validate the image tag / framework pair
-            if framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]:  # type: ignore
-                raise InvalidInferenceFrameworkImageTagException
+            if framework != LLMInferenceFramework.DEEPSPEED and framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]:  # type: ignore
+                raise InvalidInferenceFrameworkImageTagException(
+                    f"Valid image tags for framework {framework} are {_VALID_FRAMEWORK_IMAGE_TAGS[framework]}"
+                )
 
             if framework == LLMInferenceFramework.DEEPSPEED:
                 bundle_id = await self.create_deepspeed_bundle(

From 5111997f0ce11fe785fa99f77da5ccb26e034d4a Mon Sep 17 00:00:00 2001
From: Ian Macleod <ian.macleod@scale.com>
Date: Fri, 27 Oct 2023 05:39:06 +0000
Subject: [PATCH 04/13] update domain conftest fixtures to use valid image tags

---
 model-engine/tests/unit/domain/conftest.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/model-engine/tests/unit/domain/conftest.py b/model-engine/tests/unit/domain/conftest.py
index 6a958ed43..64d11de13 100644
--- a/model-engine/tests/unit/domain/conftest.py
+++ b/model-engine/tests/unit/domain/conftest.py
@@ -201,8 +201,8 @@ def create_llm_model_endpoint_request_async() -> CreateLLMModelEndpointV1Request
         name="test_llm_endpoint_name_async",
         model_name="mpt-7b",
         source="hugging_face",
-        inference_framework="deepspeed",
-        inference_framework_image_tag="test_tag",
+        inference_framework="vllm",
+        inference_framework_image_tag="0.2.0",
         num_shards=2,
         endpoint_type=ModelEndpointType.ASYNC,
         metadata={},
@@ -254,7 +254,7 @@ def create_llm_model_endpoint_request_llama_2() -> CreateLLMModelEndpointV1Reque
         model_name="llama-2-7b",
         source="hugging_face",
         inference_framework="text_generation_inference",
-        inference_framework_image_tag="test_tag",
+        inference_framework_image_tag="0.9.4",
         num_shards=2,
         endpoint_type=ModelEndpointType.STREAMING,
         metadata={},
@@ -310,7 +310,7 @@ def create_llm_model_endpoint_text_generation_inference_request_async() -> (
         model_name="mpt-7b",
         source="hugging_face",
         inference_framework="text_generation_inference",
-        inference_framework_image_tag="test_tag",
+        inference_framework_image_tag="0.9.4",
         num_shards=2,
         quantize=Quantization.BITSANDBYTES,
         endpoint_type=ModelEndpointType.ASYNC,

From f4e8f9b0c4021e85e0be2e444a862dd32b64eed9 Mon Sep 17 00:00:00 2001
From: Ian Macleod <ian.macleod@scale.com>
Date: Fri, 27 Oct 2023 16:07:52 +0000
Subject: [PATCH 05/13] revert async conftest fixture to deepspeed with test_tag

---
 model-engine/tests/unit/domain/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model-engine/tests/unit/domain/conftest.py b/model-engine/tests/unit/domain/conftest.py
index 64d11de13..c27aaa52a 100644
--- a/model-engine/tests/unit/domain/conftest.py
+++ b/model-engine/tests/unit/domain/conftest.py
@@ -201,8 +201,8 @@ def create_llm_model_endpoint_request_async() -> CreateLLMModelEndpointV1Request
         name="test_llm_endpoint_name_async",
         model_name="mpt-7b",
         source="hugging_face",
-        inference_framework="vllm",
-        inference_framework_image_tag="0.2.0",
+        inference_framework="deepspeed",
+        inference_framework_image_tag="test_tag",
         num_shards=2,
         endpoint_type=ModelEndpointType.ASYNC,
         metadata={},

From a0080d2585b232247405e6dc00f44379105cb8c5 Mon Sep 17 00:00:00 2001
From: Ian Macleod <ian.macleod@scale.com>
Date: Mon, 30 Oct 2023 15:47:41 +0000
Subject: [PATCH 06/13] use valid TGI image tag 0.9.4 in unit conftest fixtures

---
 model-engine/tests/unit/conftest.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/model-engine/tests/unit/conftest.py b/model-engine/tests/unit/conftest.py
index b784e5c45..8459947ba 100644
--- a/model-engine/tests/unit/conftest.py
+++ b/model-engine/tests/unit/conftest.py
@@ -3688,7 +3688,7 @@ def llm_model_endpoint_sync_tgi(
                     "model_name": "llama-7b",
                     "source": "hugging_face",
                     "inference_framework": "text_generation_inference",
-                    "inference_framework_image_tag": "123",
+                    "inference_framework_image_tag": "0.9.4",
                     "num_shards": 4,
                 }
             },
@@ -3750,7 +3750,7 @@ def llm_model_endpoint_sync_tgi(
         "source": "hugging_face",
         "status": "READY",
         "inference_framework": "text_generation_inference",
-        "inference_framework_image_tag": "123",
+        "inference_framework_image_tag": "0.9.4",
         "num_shards": 4,
         "spec": {
             "id": "test_llm_model_endpoint_id_2",
@@ -3763,7 +3763,7 @@ def llm_model_endpoint_sync_tgi(
                     "model_name": "llama-7b",
                     "source": "hugging_face",
                     "inference_framework": "text_generation_inference",
-                    "inference_framework_image_tag": "123",
+                    "inference_framework_image_tag": "0.9.4",
                     "num_shards": 4,
                 }
             },
@@ -3885,7 +3885,7 @@ def llm_model_endpoint_text_generation_inference(
                     "model_name": "llama-7b",
                     "source": "hugging_face",
                     "inference_framework": "text_generation_inference",
-                    "inference_framework_image_tag": "123",
+                    "inference_framework_image_tag": "0.9.4",
                     "num_shards": 4,
                 }
             },

From bc151050194d6813e7b78b4389e155074b59612d Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Wed, 1 Nov 2023 20:04:45 +0000
Subject: [PATCH 07/13] add image tag validation unit test, remove debug prints

---
 .../use_cases/llm_model_endpoint_use_cases.py |  2 -
 .../tests/unit/domain/test_llm_use_cases.py   | 54 ++++++++++++++++++-
 2 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 65b67d6a6..188df8202 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -1086,7 +1086,6 @@ def model_output_to_completion_output(
                 tokens=tokens,
             )
         elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
-            print(model_output)
             tokens = None
             if with_token_probs:
                 tokens = [
@@ -1600,7 +1599,6 @@ async def execute(
                     )
             elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
                 if res.status == TaskStatus.SUCCESS and result is not None:
-                    print(result)
                     token = None
                     num_completion_tokens += 1
                     if request.return_token_log_probs:
diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py
index c71995eac..ce317dc59 100644
--- a/model-engine/tests/unit/domain/test_llm_use_cases.py
+++ b/model-engine/tests/unit/domain/test_llm_use_cases.py
@@ -14,9 +14,14 @@
 )
 from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Response, TaskStatus
 from model_engine_server.core.auth.authentication_repository import User
-from model_engine_server.domain.entities import ModelEndpoint, ModelEndpointType
+from model_engine_server.domain.entities import (
+    LLMInferenceFramework,
+    ModelEndpoint,
+    ModelEndpointType,
+)
 from model_engine_server.domain.exceptions import (
     EndpointUnsupportedInferenceTypeException,
+    InvalidInferenceFrameworkImageTagException,
     InvalidRequestException,
     LLMFineTuningQuotaReached,
     ObjectHasInvalidValueException,
@@ -147,6 +152,53 @@ async def test_create_model_endpoint_use_case_success(
     assert "--max-total-tokens" in bundle.flavor.command[-1] and "4096" in bundle.flavor.command[-1]
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "valid, inference_framework, inference_framework_image_tag",
+    [
+        (False, LLMInferenceFramework.TEXT_GENERATION_INFERENCE, "0.9.2"),
+        (True, LLMInferenceFramework.TEXT_GENERATION_INFERENCE, "0.9.3"),
+        (False, LLMInferenceFramework.VLLM, "0.1.6"),
+        (True, LLMInferenceFramework.VLLM, "0.1.3.6"),
+    ],
+)
+async def test_create_model_bundle_inference_framework_image_tag_validation(
+    test_api_key: str,
+    fake_model_bundle_repository,
+    fake_model_endpoint_service,
+    fake_docker_repository_image_always_exists,
+    fake_model_primitive_gateway,
+    fake_llm_artifact_gateway,
+    create_llm_model_endpoint_text_generation_inference_request_streaming: CreateLLMModelEndpointV1Request,
+    valid,
+    inference_framework,
+    inference_framework_image_tag,
+):
+    fake_model_endpoint_service.model_bundle_repository = fake_model_bundle_repository
+    bundle_use_case = CreateModelBundleV2UseCase(
+        model_bundle_repository=fake_model_bundle_repository,
+        docker_repository=fake_docker_repository_image_always_exists,
+        model_primitive_gateway=fake_model_primitive_gateway,
+    )
+
+    use_case = CreateLLMModelEndpointV1UseCase(
+        create_model_bundle_use_case=bundle_use_case,
+        model_bundle_repository=fake_model_bundle_repository,
+        model_endpoint_service=fake_model_endpoint_service,
+        llm_artifact_gateway=fake_llm_artifact_gateway,
+    )
+
+    request = create_llm_model_endpoint_text_generation_inference_request_streaming.copy()
+    request.inference_framework = inference_framework
+    request.inference_framework_image_tag = inference_framework_image_tag
+    user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
+    if valid:
+        await use_case.execute(user=user, request=request)
+    else:
+        with pytest.raises(InvalidInferenceFrameworkImageTagException):
+            await use_case.execute(user=user, request=request)
+
+
 @pytest.mark.asyncio
 async def test_create_model_endpoint_text_generation_inference_use_case_success(
     test_api_key: str,

From e00e878cb100b18c7ea86dbf4d2a7a28b8932aab Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Wed, 1 Nov 2023 21:33:55 +0000
Subject: [PATCH 08/13] fix: restore utf-8 encoding for upstream error content

---
 .../domain/use_cases/llm_model_endpoint_use_cases.py           | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 188df8202..7367c7b66 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -1039,7 +1039,6 @@ def model_output_to_completion_output(
         with_token_probs: Optional[bool],
     ) -> CompletionOutput:
         model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint)
-
         if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED:
             completion_token_count = len(model_output["token_probs"]["tokens"])
             tokens = None
@@ -1070,7 +1069,7 @@ def model_output_to_completion_output(
                     raise InvalidRequestException(model_output.get("error"))  # trigger a 400
                 else:
                     raise UpstreamServiceError(
-                        status_code=500, content=bytes(model_output["error"])
+                        status_code=500, content=bytes(model_output["error"], "utf-8")
                     )
 
         elif model_content.inference_framework == LLMInferenceFramework.VLLM:

From 660e8230fc47218823148ecac0a4241b6ac74f67 Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Thu, 2 Nov 2023 23:30:40 +0000
Subject: [PATCH 09/13] check ecr image

---
 .../model_engine_server/api/llms_v1.py        |  7 +-
 .../model_engine_server/domain/exceptions.py  |  6 --
 .../use_cases/llm_model_endpoint_use_cases.py | 89 ++++++++-----------
 .../tests/unit/domain/test_llm_use_cases.py   |  7 +-
 4 files changed, 42 insertions(+), 67 deletions(-)

diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py
index 72d502856..fe5601b0c 100644
--- a/model-engine/model_engine_server/api/llms_v1.py
+++ b/model-engine/model_engine_server/api/llms_v1.py
@@ -48,7 +48,6 @@
     EndpointResourceInvalidRequestException,
     EndpointUnsupportedInferenceTypeException,
     ExistingEndpointOperationInProgressException,
-    InvalidInferenceFrameworkImageTagException,
     InvalidRequestException,
     LLMFineTuningMethodNotImplementedException,
     LLMFineTuningQuotaReached,
@@ -132,6 +131,7 @@ async def create_model_endpoint(
             model_bundle_repository=external_interfaces.model_bundle_repository,
             model_endpoint_service=external_interfaces.model_endpoint_service,
             llm_artifact_gateway=external_interfaces.llm_artifact_gateway,
+            docker_repository=external_interfaces.docker_repository,
         )
         return await use_case.execute(user=auth, request=request)
     except ObjectAlreadyExistsException as exc:
@@ -151,11 +151,6 @@ async def create_model_endpoint(
             status_code=400,
             detail=str(exc),
         ) from exc
-    except InvalidInferenceFrameworkImageTagException as exc:
-        raise HTTPException(
-            status_code=400,
-            detail="The specified inference framework image tag doesn't exist for the specified inference framework.",
-        ) from exc
     except ObjectNotApprovedException as exc:
         raise HTTPException(
             status_code=403,
diff --git a/model-engine/model_engine_server/domain/exceptions.py b/model-engine/model_engine_server/domain/exceptions.py
index 69d588485..934a5e215 100644
--- a/model-engine/model_engine_server/domain/exceptions.py
+++ b/model-engine/model_engine_server/domain/exceptions.py
@@ -170,9 +170,3 @@ class TriggerNameAlreadyExistsException(DomainException):
     """
     Thrown if the requested name already exists in the trigger repository
     """
-
-
-class InvalidInferenceFrameworkImageTagException(DomainException):
-    """
-    Thrown if the image tag passed in doesn't exist for the provided inference framework
-    """
diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 7367c7b66..c4d49b85a 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -47,9 +47,9 @@
     StreamingEnhancedRunnableImageFlavor,
 )
 from model_engine_server.domain.exceptions import (
+    DockerImageNotFoundException,
     EndpointLabelsException,
     EndpointUnsupportedInferenceTypeException,
-    InvalidInferenceFrameworkImageTagException,
     InvalidRequestException,
     ObjectHasInvalidValueException,
     ObjectNotAuthorizedException,
@@ -58,6 +58,7 @@
 )
 from model_engine_server.domain.gateways.llm_artifact_gateway import LLMArtifactGateway
 from model_engine_server.domain.repositories import ModelBundleRepository
+from model_engine_server.domain.repositories.docker_repository import DockerRepository
 from model_engine_server.domain.services import LLMModelEndpointService, ModelEndpointService
 from model_engine_server.infra.gateways.filesystem_gateway import FilesystemGateway
 
@@ -75,52 +76,6 @@
 
 logger = make_logger(logger_name())
 
-_VALID_FRAMEWORK_IMAGE_TAGS = {
-    # setting this to empty for now since no one uses deepspeed
-    LLMInferenceFramework.DEEPSPEED: [],
-    LLMInferenceFramework.TEXT_GENERATION_INFERENCE: [
-        "0.9.4.1",
-        "0.9.4",
-        "0.9.3-launch_s3",
-        "0.9.3",
-        "0.9.1-launch_s3",
-        "0.9.1",
-        "ipv6",
-        "ipv6-0",
-        "0.8",
-    ],
-    LLMInferenceFramework.VLLM: [
-        "0.2.1.post1",
-        "0.2.1",
-        "0.2.0",
-        "0.1.7-awq",
-        "0.1.5",
-        "0.1.7",
-        "0.1.3.10",
-        "0.1.3.9",
-        "0.1.3.8",
-        "0.1.3.7",
-        "0.1.3.6",
-        "0.1.3.5",
-        "0.1.3.4",
-        "0.1.3.3",
-        "0.1.3.2",
-        "0.1.3.1",
-        "0.1.3",
-    ],
-    LLMInferenceFramework.LIGHTLLM: [
-        "0.0.9",
-        "0.0.8",
-        "0.0.7",
-        "0.0.6",
-        "0.0.5",
-        "0.0.4",
-        "0.0.3",
-        "0.0.2",
-        "0.0.1",
-    ],
-}
-
 _SUPPORTED_MODEL_NAMES = {
     LLMInferenceFramework.DEEPSPEED: {
         "mpt-7b": "mosaicml/mpt-7b",
@@ -285,12 +240,14 @@ def __init__(
         model_bundle_repository: ModelBundleRepository,
         model_endpoint_service: ModelEndpointService,
         llm_artifact_gateway: LLMArtifactGateway,
+        docker_repository: DockerRepository,
     ):
         self.authz_module = LiveAuthorizationModule()
         self.create_model_bundle_use_case = create_model_bundle_use_case
         self.model_bundle_repository = model_bundle_repository
         self.model_endpoint_service = model_endpoint_service
         self.llm_artifact_gateway = llm_artifact_gateway
+        self.docker_repository = docker_repository
 
     async def create_model_bundle(
         self,
@@ -306,13 +263,15 @@ async def create_model_bundle(
         checkpoint_path: Optional[str],
     ) -> ModelBundle:
         if source == LLMSource.HUGGING_FACE:
-            # validate the image tag / framework pair
-            if framework != LLMInferenceFramework.DEEPSPEED and framework_image_tag not in _VALID_FRAMEWORK_IMAGE_TAGS[framework]:  # type: ignore
-                raise InvalidInferenceFrameworkImageTagException(
-                    f"Valid image tags for framework {framework} are {_VALID_FRAMEWORK_IMAGE_TAGS[framework]}"
-                )
-
             if framework == LLMInferenceFramework.DEEPSPEED:
+                if not self.docker_repository.image_exists(
+                    image_tag=framework_image_tag,
+                    repository_name="instant-llm",
+                ):
+                    raise DockerImageNotFoundException(
+                        repository="instant-llm",
+                        tag=framework_image_tag,
+                    )
                 bundle_id = await self.create_deepspeed_bundle(
                     user,
                     model_name,
@@ -321,6 +280,14 @@ async def create_model_bundle(
                     endpoint_name,
                 )
             elif framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE:
+                if not self.docker_repository.image_exists(
+                    image_tag=framework_image_tag,
+                    repository_name=hmi_config.tgi_repository,
+                ):
+                    raise DockerImageNotFoundException(
+                        repository=hmi_config.tgi_repository,
+                        tag=framework_image_tag,
+                    )
                 bundle_id = await self.create_text_generation_inference_bundle(
                     user,
                     model_name,
@@ -331,6 +298,14 @@ async def create_model_bundle(
                     checkpoint_path,
                 )
             elif framework == LLMInferenceFramework.VLLM:
+                if not self.docker_repository.image_exists(
+                    image_tag=framework_image_tag,
+                    repository_name=hmi_config.vllm_repository,
+                ):
+                    raise DockerImageNotFoundException(
+                        repository=hmi_config.vllm_repository,
+                        tag=framework_image_tag,
+                    )
                 bundle_id = await self.create_vllm_bundle(
                     user,
                     model_name,
@@ -349,6 +324,14 @@ async def create_model_bundle(
                     num_shards,
                     checkpoint_path,
                 )
+                if not self.docker_repository.image_exists(
+                    image_tag=framework_image_tag,
+                    repository_name=hmi_config.lightllm_repository,
+                ):
+                    raise DockerImageNotFoundException(
+                        repository=hmi_config.lightllm_repository,
+                        tag=framework_image_tag,
+                    )
             else:
                 raise ObjectHasInvalidValueException(
                     f"Framework {framework} is not supported for source {source}."
diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py
index ce317dc59..29bca2170 100644
--- a/model-engine/tests/unit/domain/test_llm_use_cases.py
+++ b/model-engine/tests/unit/domain/test_llm_use_cases.py
@@ -20,8 +20,8 @@
     ModelEndpointType,
 )
 from model_engine_server.domain.exceptions import (
+    DockerImageNotFoundException,
     EndpointUnsupportedInferenceTypeException,
-    InvalidInferenceFrameworkImageTagException,
     InvalidRequestException,
     LLMFineTuningQuotaReached,
     ObjectHasInvalidValueException,
@@ -167,6 +167,7 @@ async def test_create_model_bundle_inference_framework_image_tag_validation(
     fake_model_bundle_repository,
     fake_model_endpoint_service,
     fake_docker_repository_image_always_exists,
+    fake_docker_repository_image_never_exists,
     fake_model_primitive_gateway,
     fake_llm_artifact_gateway,
     create_llm_model_endpoint_text_generation_inference_request_streaming: CreateLLMModelEndpointV1Request,
@@ -186,6 +187,7 @@ async def test_create_model_bundle_inference_framework_image_tag_validation(
         model_bundle_repository=fake_model_bundle_repository,
         model_endpoint_service=fake_model_endpoint_service,
         llm_artifact_gateway=fake_llm_artifact_gateway,
+        docker_repository=fake_docker_repository_image_always_exists,
     )
 
     request = create_llm_model_endpoint_text_generation_inference_request_streaming.copy()
@@ -195,7 +197,8 @@ async def test_create_model_bundle_inference_framework_image_tag_validation(
     if valid:
         await use_case.execute(user=user, request=request)
     else:
-        with pytest.raises(InvalidInferenceFrameworkImageTagException):
+        use_case.docker_repository = fake_docker_repository_image_never_exists
+        with pytest.raises(DockerImageNotFoundException):
             await use_case.execute(user=user, request=request)
 
 

From bc0a915614ab7eeb9ff0b3794260281613408683 Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Fri, 3 Nov 2023 04:21:27 +0000
Subject: [PATCH 10/13] catch docker image exception

---
 model-engine/model_engine_server/api/llms_v1.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/model-engine/model_engine_server/api/llms_v1.py b/model-engine/model_engine_server/api/llms_v1.py
index fe5601b0c..a74b78561 100644
--- a/model-engine/model_engine_server/api/llms_v1.py
+++ b/model-engine/model_engine_server/api/llms_v1.py
@@ -43,6 +43,7 @@
     make_logger,
 )
 from model_engine_server.domain.exceptions import (
+    DockerImageNotFoundException,
     EndpointDeleteFailedException,
     EndpointLabelsException,
     EndpointResourceInvalidRequestException,
@@ -161,6 +162,11 @@ async def create_model_endpoint(
             status_code=404,
             detail="The specified model bundle could not be found.",
         ) from exc
+    except DockerImageNotFoundException as exc:
+        raise HTTPException(
+            status_code=404,
+            detail="The specified docker image could not be found.",
+        ) from exc
 
 
 @llm_router_v1.get("/model-endpoints", response_model=ListLLMModelEndpointsV1Response)

From 6ba1f02e947705bdadea986d53bdc74a1ecc6c40 Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Fri, 3 Nov 2023 04:29:40 +0000
Subject: [PATCH 11/13] fix: pass docker_repository to use cases in LLM tests

---
 model-engine/tests/unit/domain/test_llm_use_cases.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/model-engine/tests/unit/domain/test_llm_use_cases.py b/model-engine/tests/unit/domain/test_llm_use_cases.py
index 29bca2170..bc8d39497 100644
--- a/model-engine/tests/unit/domain/test_llm_use_cases.py
+++ b/model-engine/tests/unit/domain/test_llm_use_cases.py
@@ -71,6 +71,7 @@ async def test_create_model_endpoint_use_case_success(
         model_bundle_repository=fake_model_bundle_repository,
         model_endpoint_service=fake_model_endpoint_service,
         llm_artifact_gateway=fake_llm_artifact_gateway,
+        docker_repository=fake_docker_repository_image_always_exists,
     )
     user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
     response_1 = await use_case.execute(user=user, request=create_llm_model_endpoint_request_async)
@@ -224,6 +225,7 @@ async def test_create_model_endpoint_text_generation_inference_use_case_success(
         model_bundle_repository=fake_model_bundle_repository,
         model_endpoint_service=fake_model_endpoint_service,
         llm_artifact_gateway=fake_llm_artifact_gateway,
+        docker_repository=fake_docker_repository_image_always_exists,
     )
     user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
     response_1 = await use_case.execute(
@@ -279,6 +281,7 @@ async def test_create_llm_model_endpoint_use_case_raises_invalid_value_exception
         model_bundle_repository=fake_model_bundle_repository,
         model_endpoint_service=fake_model_endpoint_service,
         llm_artifact_gateway=fake_llm_artifact_gateway,
+        docker_repository=fake_docker_repository_image_always_exists,
     )
     user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
     with pytest.raises(ObjectHasInvalidValueException):
@@ -308,6 +311,7 @@ async def test_create_llm_model_endpoint_use_case_quantization_exception(
         model_bundle_repository=fake_model_bundle_repository,
         model_endpoint_service=fake_model_endpoint_service,
         llm_artifact_gateway=fake_llm_artifact_gateway,
+        docker_repository=fake_docker_repository_image_always_exists,
     )
     user = User(user_id=test_api_key, team_id=test_api_key, is_privileged_user=True)
     with pytest.raises(ObjectHasInvalidValueException):

From 1b8f3aa1efc80bbcaebfa6e86966cc6f457a7077 Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Wed, 8 Nov 2023 06:14:44 +0000
Subject: [PATCH 12/13] revert removal commit: restore num_shards and quantization validation

---
 .../use_cases/llm_model_endpoint_use_cases.py    | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 1cdfcd2f0..182554483 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -232,8 +232,21 @@ def validate_num_shards(
             raise ObjectHasInvalidValueException("DeepSpeed requires more than 1 GPU.")
         if num_shards != gpus:
             raise ObjectHasInvalidValueException(
-                f"DeepSpeed requires num shard {num_shards} to be the same as number of GPUs {gpus}."
+                f"Num shard {num_shards} must be the same as number of GPUs {gpus} for DeepSpeed."
             )
+    if num_shards > gpus:
+        raise ObjectHasInvalidValueException(
+            f"Num shard {num_shards} must be less than or equal to the number of GPUs {gpus}."
+        )
+
+
+def validate_quantization(
+    quantize: Optional[Quantization], inference_framework: LLMInferenceFramework
+) -> None:
+    if quantize is not None and quantize not in _SUPPORTED_QUANTIZATIONS[inference_framework]:
+        raise ObjectHasInvalidValueException(
+            f"Quantization {quantize} is not supported for inference framework {inference_framework}. Supported quantization types are {_SUPPORTED_QUANTIZATIONS[inference_framework]}."
+        )
 
 
 class CreateLLMModelEndpointV1UseCase:
@@ -731,6 +744,7 @@ async def execute(
         validate_post_inference_hooks(user, request.post_inference_hooks)
         validate_model_name(request.model_name, request.inference_framework)
         validate_num_shards(request.num_shards, request.inference_framework, request.gpus)
+        validate_quantization(request.quantize, request.inference_framework)
 
         if request.inference_framework in [
             LLMInferenceFramework.TEXT_GENERATION_INFERENCE,

From 6873c127bc16c2d50697793d148bb0973ca4cc32 Mon Sep 17 00:00:00 2001
From: Tiffany Zhao <tiffany.zhao@scale.com>
Date: Thu, 9 Nov 2023 02:07:53 +0000
Subject: [PATCH 13/13] fix + refactor: check lightllm image before bundling, extract image-exists helper

---
 .../use_cases/llm_model_endpoint_use_cases.py | 56 ++++++++-----------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 182554483..829f4801b 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -265,6 +265,18 @@ def __init__(
         self.llm_artifact_gateway = llm_artifact_gateway
         self.docker_repository = docker_repository
 
+    def check_docker_image_exists_for_image_tag(
+        self, framework_image_tag: str, repository_name: str
+    ):
+        if not self.docker_repository.image_exists(
+            image_tag=framework_image_tag,
+            repository_name=repository_name,
+        ):
+            raise DockerImageNotFoundException(
+                repository=repository_name,
+                tag=framework_image_tag,
+            )
+
     async def create_model_bundle(
         self,
         user: User,
@@ -280,14 +292,7 @@ async def create_model_bundle(
     ) -> ModelBundle:
         if source == LLMSource.HUGGING_FACE:
             if framework == LLMInferenceFramework.DEEPSPEED:
-                if not self.docker_repository.image_exists(
-                    image_tag=framework_image_tag,
-                    repository_name="instant-llm",
-                ):
-                    raise DockerImageNotFoundException(
-                        repository="instant-llm",
-                        tag=framework_image_tag,
-                    )
+                self.check_docker_image_exists_for_image_tag(framework_image_tag, "instant-llm")
                 bundle_id = await self.create_deepspeed_bundle(
                     user,
                     model_name,
@@ -296,14 +301,9 @@ async def create_model_bundle(
                     endpoint_name,
                 )
             elif framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE:
-                if not self.docker_repository.image_exists(
-                    image_tag=framework_image_tag,
-                    repository_name=hmi_config.tgi_repository,
-                ):
-                    raise DockerImageNotFoundException(
-                        repository=hmi_config.tgi_repository,
-                        tag=framework_image_tag,
-                    )
+                self.check_docker_image_exists_for_image_tag(
+                    framework_image_tag, hmi_config.tgi_repository
+                )
                 bundle_id = await self.create_text_generation_inference_bundle(
                     user,
                     model_name,
@@ -314,14 +314,9 @@ async def create_model_bundle(
                     checkpoint_path,
                 )
             elif framework == LLMInferenceFramework.VLLM:
-                if not self.docker_repository.image_exists(
-                    image_tag=framework_image_tag,
-                    repository_name=hmi_config.vllm_repository,
-                ):
-                    raise DockerImageNotFoundException(
-                        repository=hmi_config.vllm_repository,
-                        tag=framework_image_tag,
-                    )
+                self.check_docker_image_exists_for_image_tag(
+                    framework_image_tag, hmi_config.vllm_repository
+                )
                 bundle_id = await self.create_vllm_bundle(
                     user,
                     model_name,
@@ -332,6 +327,9 @@ async def create_model_bundle(
                     checkpoint_path,
                 )
             elif framework == LLMInferenceFramework.LIGHTLLM:
+                self.check_docker_image_exists_for_image_tag(
+                    framework_image_tag, hmi_config.lightllm_repository
+                )
                 bundle_id = await self.create_lightllm_bundle(
                     user,
                     model_name,
@@ -340,14 +338,6 @@ async def create_model_bundle(
                     num_shards,
                     checkpoint_path,
                 )
-                if not self.docker_repository.image_exists(
-                    image_tag=framework_image_tag,
-                    repository_name=hmi_config.lightllm_repository,
-                ):
-                    raise DockerImageNotFoundException(
-                        repository=hmi_config.lightllm_repository,
-                        tag=framework_image_tag,
-                    )
             else:
                 raise ObjectHasInvalidValueException(
                     f"Framework {framework} is not supported for source {source}."
@@ -1372,7 +1362,7 @@ async def execute(
         )
 
         if len(model_endpoints) == 0:
-            raise ObjectNotFoundException
+            raise ObjectNotFoundException(f"Model endpoint {model_endpoint_name} not found.")
 
         if len(model_endpoints) > 1:
             raise ObjectHasInvalidValueException(