@@ -1,5 +1,6 @@
 ray==2.6.3
-git+https://github.com/vllm-project/vllm.git@4b61c6b669e368c6850531815940d9a542b9f223#egg=vllm
+#git+https://github.com/vllm-project/vllm.git@4b61c6b669e368c6850531815940d9a542b9f223#egg=vllm
+vllm==0.2.5
 pydantic==1.10.13
 boto3==1.34.15
 smart-open==6.4.0
@@ -13,6 +13,7 @@
     CreateBatchCompletionsRequestContent,
     TokenOutput,
 )
+from tqdm import tqdm
 
 CONFIG_FILE = os.getenv("CONFIG_FILE")
 AWS_REGION = os.getenv("AWS_REGION", "us-west-2")
@@ -123,11 +124,16 @@ async def batch_inference():

     results_generators = await generate_with_vllm(request, content, model, job_index)
 
+    bar = tqdm(total=len(content.prompts), desc="Processed prompts")
+
     outputs = []
     for generator in results_generators:
         last_output_text = ""
         tokens = []
         async for request_output in generator:
+            if request_output.finished:
+                bar.update(1)
+
             token_text = request_output.outputs[-1].text[len(last_output_text) :]
             log_probs = (
                 request_output.outputs[0].logprobs[-1] if content.return_token_log_probs else None
@@ -155,6 +161,8 @@ async def batch_inference():

         outputs.append(output.dict())
 
+    bar.close()
+
     if request.data_parallelism == 1:
         with smart_open.open(request.output_data_path, "w") as f:
             f.write(json.dumps(outputs))
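
For context, here is a minimal, self-contained sketch (not the PR's code) of the progress-bar pattern these hunks add: one tqdm bar sized to the number of prompts, ticked once per finished output. The name fake_results is a hypothetical stand-in for the per-prompt async result generators returned by generate_with_vllm.

import asyncio
from tqdm import tqdm

async def fake_results(prompt: str):
    # Hypothetical stand-in for a vLLM result generator: yields partial
    # outputs and marks only the last one as finished.
    for i in range(3):
        await asyncio.sleep(0)
        yield {"text": prompt[: i + 1], "finished": i == 2}

async def run(prompts):
    bar = tqdm(total=len(prompts), desc="Processed prompts")
    outputs = []
    for prompt in prompts:
        async for out in fake_results(prompt):
            if out["finished"]:
                bar.update(1)  # one tick per completed prompt
                outputs.append(out["text"])
    bar.close()
    return outputs

print(asyncio.run(run(["a", "bb", "ccc"])))
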
@@ -178,6 +186,7 @@ async def generate_with_vllm(request, content, model, job_index):
         quantization=request.model_config.quantize,
         tensor_parallel_size=request.model_config.num_shards,
         seed=request.model_config.seed or 0,
+        disable_log_requests=True,
     )
 
     llm = AsyncLLMEngine.from_engine_args(engine_args)
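
The last hunk only adds disable_log_requests=True to the engine arguments. Below is a hedged sketch, assuming vLLM ~0.2.x, of how that flag fits into engine construction; the model name, parallelism, and seed are placeholder values rather than the repository's actual config (which comes from request.model_config).

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Placeholder configuration: in the PR these values come from
# request.model_config instead of literals.
engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",   # placeholder model
    tensor_parallel_size=1,      # num_shards in the PR
    seed=0,
    disable_log_requests=True,   # suppress per-request logging, as this diff does
)

llm = AsyncLLMEngine.from_engine_args(engine_args)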