8 changes: 8 additions & 0 deletions model-engine/model_engine_server/common/dtos/llms.py
@@ -180,6 +180,10 @@ class CompletionSyncV1Request(BaseModel):
    """
    Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens.
    """
    include_stop_str_in_output: Optional[bool] = None
    """
    Whether to include the stop string(s) in the output text.
    """


class TokenOutput(BaseModel):
@@ -240,6 +244,10 @@ class CompletionStreamV1Request(BaseModel):
    """
    Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens.
    """
    include_stop_str_in_output: Optional[bool] = None
    """
    Whether to include the stop string(s) in the output text.
    """


class CompletionStreamOutput(BaseModel):
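
For reference, a minimal self-contained sketch (not the actual DTO; field names other than include_stop_str_in_output are illustrative) of how an Optional[bool] = None field behaves: when a caller omits it, it stays None, which is what lets the downstream "is not None" checks skip forwarding the key entirely.

from typing import Optional
from pydantic import BaseModel

class ExampleCompletionRequest(BaseModel):
    prompt: str
    top_p: Optional[float] = None
    include_stop_str_in_output: Optional[bool] = None  # new field, unset by default

req = ExampleCompletionRequest(prompt="Hello")
assert req.include_stop_str_in_output is None   # omitted -> stays None, nothing is forwarded

req = ExampleCompletionRequest(prompt="Hello", include_stop_str_in_output=True)
assert req.include_stop_str_in_output is True   # explicitly set -> passed through to the backend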
@@ -1349,6 +1349,15 @@ def validate_and_update_completion_params(
                "return_token_log_probs is only supported in deepspeed, text-generation-inference, vllm, lightllm."
            )

    # include_stop_str_in_output
    if inference_framework == LLMInferenceFramework.VLLM:
        pass
    else:
        if request.include_stop_str_in_output is not None:
            raise ObjectHasInvalidValueException(
                "include_stop_str_in_output is only supported in vllm."
            )

    return request
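
A self-contained sketch of the guard above, using hypothetical stand-in names (Framework, InvalidValueError) rather than the real model_engine_server types: any framework other than vLLM rejects a request that sets the field, while leaving it unset is always allowed.

from enum import Enum
from typing import Optional

class Framework(str, Enum):
    VLLM = "vllm"
    TEXT_GENERATION_INFERENCE = "text_generation_inference"

class InvalidValueError(ValueError):
    """Stand-in for ObjectHasInvalidValueException."""

def check_include_stop_str(framework: Framework, include_stop_str_in_output: Optional[bool]) -> None:
    # Mirrors the validation block above: only vLLM may see this flag.
    if framework != Framework.VLLM and include_stop_str_in_output is not None:
        raise InvalidValueError("include_stop_str_in_output is only supported in vllm.")

check_include_stop_str(Framework.VLLM, True)                        # accepted
check_include_stop_str(Framework.TEXT_GENERATION_INFERENCE, None)   # accepted: flag not set
try:
    check_include_stop_str(Framework.TEXT_GENERATION_INFERENCE, True)
except InvalidValueError as exc:
    print(exc)                                                       # rejected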


@@ -1634,6 +1643,8 @@ async def execute(
                vllm_args["top_p"] = request.top_p
            if request.return_token_log_probs:
                vllm_args["logprobs"] = 1
            if request.include_stop_str_in_output is not None:
                vllm_args["include_stop_str_in_output"] = request.include_stop_str_in_output

            inference_request = SyncEndpointPredictV1Request(
                args=vllm_args,
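
An illustration of the sync path's payload construction; keys other than include_stop_str_in_output are assumptions, not copied from the file. The flag is only added to vllm_args when the request sets it explicitly, so unset requests fall back to whatever default the vLLM server applies (vLLM's SamplingParams exposes a matching include_stop_str_in_output option on the inference side).

# Illustrative payload only; keys other than include_stop_str_in_output are assumptions.
vllm_args = {
    "prompt": "List three colors, then write END.",
    "max_tokens": 64,
    "top_p": 0.9,
}

include_stop_str_in_output = True  # stand-in for request.include_stop_str_in_output
if include_stop_str_in_output is not None:
    vllm_args["include_stop_str_in_output"] = include_stop_str_in_output

print(vllm_args)
# {'prompt': 'List three colors, then write END.', 'max_tokens': 64, 'top_p': 0.9,
#  'include_stop_str_in_output': True}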
@@ -1888,6 +1899,8 @@ async def execute(
                args["top_p"] = request.top_p
            if request.return_token_log_probs:
                args["logprobs"] = 1
            if request.include_stop_str_in_output is not None:
                args["include_stop_str_in_output"] = request.include_stop_str_in_output
            args["stream"] = True
        elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM:
            args = {
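
To show what the flag actually changes in the returned text, here is a small standalone illustration (not library code) of stop-string truncation with and without include_stop_str_in_output.

def truncate_at_stop(text: str, stop: str, include_stop_str_in_output: bool) -> str:
    # Cut generation at the first occurrence of the stop string; optionally keep it.
    idx = text.find(stop)
    if idx == -1:
        return text
    return text[: idx + len(stop)] if include_stop_str_in_output else text[:idx]

raw = "1. red\n2. green\n3. blue\nEND"
print(repr(truncate_at_stop(raw, "END", include_stop_str_in_output=False)))  # '1. red\n2. green\n3. blue\n'
print(repr(truncate_at_stop(raw, "END", include_stop_str_in_output=True)))   # '1. red\n2. green\n3. blue\nEND'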