mlrun · Eyal-Danieli · Dec 28, 2025 · Dec 14, 2025 · Dec 14, 2025 · Dec 25, 2025
diff --git a/modules/src/vllm_module/vllm_module.ipynb b/modules/src/vllm_module/vllm_module.ipynb
@@ -17,7 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import mlrun\n",
+    "import mlrun\n"
    ]
   },
   {
@@ -170,34 +170,31 @@
     "body = {\n",
     "    \"model\": vllm_module.model,\n",
     "    \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n",
-    "    \"max_tokens\": vllm_module.max_tokens,   # start smaller for testing\n",
+    "    \"max_tokens\": vllm_module.max_tokens,     # start smaller for testing\n",
     "}\n",
     "\n",
     "resp = app.invoke(path=\"/v1/chat/completions\", body=body)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 22,
    "id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Raw response keys: dict_keys(['id', 'object', 'created', 'model', 'choices', 'service_tier', 'system_fingerprint', 'usage', 'prompt_logprobs', 'prompt_token_ids', 'kv_transfer_params'])\n",
       "\n",
       "assistant:\n",
       "\n",
-      "As of July 2023, the three countries with the most scientists in articles that explain or discuss GPU contributions to AI are the United States, China, and India.\n",
-      "The number of scientists is not the best measure of the number of GPUs. According to Practical Deep Learning forמות, China has 6,307 GPU-equipped tens of thousands of compute servers, the number of GPUs in the top 100 supercomputers is 6,492 (19th largest: 10,363), and the per capita number of GPUs invested by research institutions is high. From the total value perspective, the annual procurement increase of GPUs in China is estimated to be more than $40 billion. Similarly, the United States and India have significantly higher prices than China purely due to price controls.\n",
-      "In summary, there is limited data to support the claim that GPU prices vary significantly between the three countries. However, China has a significant number of GPUs in use, and its computational resources are some of the largest in the world.\n"
+      "As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n"
      ]
     }
    ],
    "source": [
-    "data = resp.json()\n",
+    "data = resp\n",
     "assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n",
     "\n",
     "print(\"\\nassistant:\\n\")\n",
@@ -207,7 +204,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1de85b32-9c91-4609-9a63-0b38ed4fde65",
+   "id": "957b5d21-7ade-4131-9100-878652c477fc",
    "metadata": {},
    "outputs": [],
    "source": []

diff --git a/modules/src/vllm_module/vllm_module.py b/modules/src/vllm_module/vllm_module.py
@@ -52,21 +52,12 @@ def __init__(
             mem: str = "10G",
             port: int = 8000,
             dtype: str = "auto",
-            tensor_parallel_size: Optional[int] = None,
             uvicorn_log_level: str = "info",
             max_tokens: int = 500,
     ):
         if gpus < 1:
             raise ValueError("gpus must be >= 1")
 
-        if tensor_parallel_size is not None:
-            if tensor_parallel_size < 1:
-                raise ValueError("tensor_parallel_size must be >= 1")
-            if tensor_parallel_size > gpus:
-                raise ValueError(
-                    f"tensor_parallel_size ({tensor_parallel_size}) cannot be greater than gpus ({gpus})"
-                )
-
 
 
         if node_selector is None:
@@ -87,7 +78,6 @@ def __init__(
         self.node_selector = node_selector
         self.port = port
         self.dtype = dtype
-        self.tensor_parallel_size = tensor_parallel_size
         self.uvicorn_log_level = uvicorn_log_level
         self.max_tokens = max_tokens
 
@@ -117,8 +107,7 @@ def __init__(
             args += ["--uvicorn-log-level", self.uvicorn_log_level]
 
         if self.gpus > 1:
-            tps = self.tensor_parallel_size or self.gpus
-            args += ["--tensor-parallel-size", str(tps)]
+            args += ["--tensor-parallel-size", str(gpus)]
 
             # With more than one GPU, create a shared-memory volume for the GPU workers to share
             self.vllm_app.spec.volumes = [{"name": "dshm", "emptyDir": {"medium": "Memory"}}]