Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions modules/src/vllm_module/vllm_module.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"metadata": {},
"outputs": [],
"source": [
"import mlrun\n",
"import mlrun\n"
]
},
{
Expand Down Expand Up @@ -170,34 +170,31 @@
"body = {\n",
" \"model\": vllm_module.model,\n",
" \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n",
" \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n",
" \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n",
"}\n",
"\n",
"resp = app.invoke(path=\"/v1/chat/completions\", body=body)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 22,
"id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Raw response keys: dict_keys(['id', 'object', 'created', 'model', 'choices', 'service_tier', 'system_fingerprint', 'usage', 'prompt_logprobs', 'prompt_token_ids', 'kv_transfer_params'])\n",
"\n",
"assistant:\n",
"\n",
"As of July 2023, the three countries with the most scientists in articles that explain or discuss GPU contributions to AI are the United States, China, and India.\n",
"The number of scientists is not the best measure of the number of GPUs. According to Practical Deep Learning forמות, China has 6,307 GPU-equipped tens of thousands of compute servers, the number of GPUs in the top 100 supercomputers is 6,492 (19th largest: 10,363), and the per capita number of GPUs invested by research institutions is high. From the total value perspective, the annual procurement increase of GPUs in China is estimated to be more than $40 billion. Similarly, the United States and India have significantly higher prices than China purely due to price controls.\n",
"In summary, there is limited data to support the claim that GPU prices vary significantly between the three countries. However, China has a significant number of GPUs in use, and its computational resources are some of the largest in the world.\n"
"As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n"
]
}
],
"source": [
"data = resp.json()\n",
"data = resp\n",
"assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n",
"\n",
"print(\"\\nassistant:\\n\")\n",
Expand All @@ -207,7 +204,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1de85b32-9c91-4609-9a63-0b38ed4fde65",
"id": "957b5d21-7ade-4131-9100-878652c477fc",
"metadata": {},
"outputs": [],
"source": []
Expand Down
13 changes: 1 addition & 12 deletions modules/src/vllm_module/vllm_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,12 @@ def __init__(
mem: str = "10G",
port: int = 8000,
dtype: str = "auto",
tensor_parallel_size: Optional[int] = None,
uvicorn_log_level: str = "info",
max_tokens: int = 500,
):
if gpus < 1:
raise ValueError("gpus must be >= 1")

if tensor_parallel_size is not None:
if tensor_parallel_size < 1:
raise ValueError("tensor_parallel_size must be >= 1")
if tensor_parallel_size > gpus:
raise ValueError(
f"tensor_parallel_size ({tensor_parallel_size}) cannot be greater than gpus ({gpus})"
)



if node_selector is None:
Expand All @@ -87,7 +78,6 @@ def __init__(
self.node_selector = node_selector
self.port = port
self.dtype = dtype
self.tensor_parallel_size = tensor_parallel_size
self.uvicorn_log_level = uvicorn_log_level
self.max_tokens = max_tokens

Expand Down Expand Up @@ -117,8 +107,7 @@ def __init__(
args += ["--uvicorn-log-level", self.uvicorn_log_level]

if self.gpus > 1:
tps = self.tensor_parallel_size or self.gpus
args += ["--tensor-parallel-size", str(tps)]
args += ["--tensor-parallel-size", str(gpus)]

# When using more than one GPU, create a shared volume for the GPUs to use
self.vllm_app.spec.volumes = [{"name": "dshm", "emptyDir": {"medium": "Memory"}}]
Expand Down