diff --git a/model-engine/model_engine_server/inference/batch_inference/requirements.txt b/model-engine/model_engine_server/inference/batch_inference/requirements.txt index 5b7cf76a1..9e8d1188c 100644 --- a/model-engine/model_engine_server/inference/batch_inference/requirements.txt +++ b/model-engine/model_engine_server/inference/batch_inference/requirements.txt @@ -1,5 +1,6 @@ ray==2.6.3 -git+https://github.com/vllm-project/vllm.git@4b61c6b669e368c6850531815940d9a542b9f223#egg=vllm +#git+https://github.com/vllm-project/vllm.git@4b61c6b669e368c6850531815940d9a542b9f223#egg=vllm +vllm==0.2.5 pydantic==1.10.13 boto3==1.34.15 smart-open==6.4.0 diff --git a/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py b/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py index 6c0c76dbc..20e0459dc 100644 --- a/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py +++ b/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py @@ -13,6 +13,7 @@ CreateBatchCompletionsRequestContent, TokenOutput, ) +from tqdm import tqdm CONFIG_FILE = os.getenv("CONFIG_FILE") AWS_REGION = os.getenv("AWS_REGION", "us-west-2") @@ -123,11 +124,16 @@ async def batch_inference(): results_generators = await generate_with_vllm(request, content, model, job_index) + bar = tqdm(total=len(content.prompts), desc="Processed prompts") + outputs = [] for generator in results_generators: last_output_text = "" tokens = [] async for request_output in generator: + if request_output.finished: + bar.update(1) + token_text = request_output.outputs[-1].text[len(last_output_text) :] log_probs = ( request_output.outputs[0].logprobs[-1] if content.return_token_log_probs else None @@ -155,6 +161,8 @@ async def batch_inference(): outputs.append(output.dict()) + bar.close() + if request.data_parallelism == 1: with smart_open.open(request.output_data_path, "w") as f: f.write(json.dumps(outputs)) @@ -178,6 +186,7 @@ async def generate_with_vllm(request, content, model, job_index): quantization=request.model_config.quantize, tensor_parallel_size=request.model_config.num_shards, seed=request.model_config.seed or 0, + disable_log_requests=True, ) llm = AsyncLLMEngine.from_engine_args(engine_args)