Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions charts/model-engine/templates/balloon_h100_deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{{- if not .Values.serviceIdentifier }}
# Low-priority "balloon" Deployment that keeps warm H100 GPU nodes in the
# cluster. Each pod requests one GPU but only runs an idle sleep loop, so the
# low priority class plus terminationGracePeriodSeconds: 0 lets any real GPU
# workload preempt it instantly while the node stays provisioned and warm.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Chart.Name }}-balloon-h100
  labels:
    team: infra
    product: common-warm-nodes
spec:
  # Number of warm H100 nodes to hold open; configured per environment
  # via replicaCount.balloonH100 in the values files.
  replicas: {{ .Values.replicaCount.balloonH100 }}
  selector:
    matchLabels:
      app: {{ .Chart.Name }}-balloon-h100
      version: v1
  template:
    metadata:
      labels:
        app: {{ .Chart.Name }}-balloon-h100
        product: common-warm-nodes
        team: infra
        env: {{ .Values.context }}
        version: v1
      annotations:
        # Balloon pods do no networking; skip the Istio sidecar.
        sidecar.istio.io/inject: "false"
    spec:
      nodeSelector:
        # NOTE(review): the label value says "ampere" but H100 is a Hopper
        # GPU, and the imageCache entries added in this same change use
        # "nvidia-hopper-h100". One of the two spellings cannot match the
        # actual node-group label — confirm which label the H100 nodes carry.
        k8s.amazonaws.com/accelerator: nvidia-ampere-h100
        {{- with .Values.balloonNodeSelector }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      tolerations:
        # Allow scheduling onto GPU-tainted nodes.
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - image: public.ecr.aws/ubuntu/ubuntu:latest
          imagePullPolicy: IfNotPresent
          name: main
          resources:
            # Reserve one GPU plus CPU/memory headroom so the balloon pod
            # occupies a realistic slice of the node it is keeping warm.
            limits:
              memory: 28Gi
              nvidia.com/gpu: 1
              cpu: 4
          # Idle forever; the pod exists only to hold the node.
          command:
            - /bin/bash
            - -c
            - "while true; do sleep 30; done"
      # Nothing to clean up — terminate immediately when preempted.
      terminationGracePeriodSeconds: 0
      priorityClassName: {{ .Chart.Name }}-low-priority
{{- end }}
1 change: 1 addition & 0 deletions charts/model-engine/values_circleci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ replicaCount:
balloonA100: 0
balloonCpu: 0
balloonT4: 0
balloonH100: 0

# tag needs to be set dynamically every time. Usually it is set to the SHA1 hash of the git
# commit from which the image was built.
Expand Down
23 changes: 23 additions & 0 deletions charts/model-engine/values_sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ replicaCount:
balloonCpu: 0
# balloonT4 is a low priority pod deployment for T4 GPU nodes
balloonT4: 0
# balloonH100 is a low priority pod deployment for H100 GPU nodes
balloonH100: 0

# autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g gateway, cache, and builder deployments)
autoscaling:
Expand Down Expand Up @@ -254,6 +256,27 @@ imageCache:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
# Full (non-MIG) H100 nodes.
- name: h100
  nodeSelector:
    k8s.amazonaws.com/accelerator: nvidia-hopper-h100
  tolerations:
    # Allow the image-cache daemonset onto GPU-tainted nodes.
    - key: "nvidia.com/gpu"
      operator: "Exists"
      effect: "NoSchedule"
# H100 nodes partitioned into MIG 1g.20gb slices.
- name: h100-mig-1g-20gb
  nodeSelector:
    k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-1g-20gb
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Exists"
      effect: "NoSchedule"
# H100 nodes partitioned into MIG 3g.40gb slices.
- name: h100-mig-3g-40gb
  nodeSelector:
    k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-3g-40gb
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Exists"
      effect: "NoSchedule"

# celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
celeryBrokerType: sqs
Expand Down
2 changes: 1 addition & 1 deletion clients/python/llmengine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.0b32"
__version__ = "0.0.0b33"

import os
from typing import Sequence
Expand Down
4 changes: 4 additions & 0 deletions clients/python/llmengine/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def create(
- ``nvidia-ampere-a100``
- ``nvidia-ampere-a100e``
- ``nvidia-hopper-h100``
- ``nvidia-hopper-h100-1g20gb``
- ``nvidia-hopper-h100-3g40gb``

high_priority (`Optional[bool]`):
Either ``True`` or ``False``. Enabling this will allow the created
Expand Down Expand Up @@ -533,6 +535,8 @@ def update(
- ``nvidia-ampere-a100``
- ``nvidia-ampere-a100e``
- ``nvidia-hopper-h100``
- ``nvidia-hopper-h100-1g20gb``
- ``nvidia-hopper-h100-3g40gb``

high_priority (`Optional[bool]`):
Either ``True`` or ``False``. Enabling this will allow the created
Expand Down
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scale-llm-engine"
version = "0.0.0.beta32"
version = "0.0.0.beta33"
description = "Scale LLM Engine Python client"
license = "Apache-2.0"
authors = ["Phil Chen <phil.chen@scale.com>"]
Expand Down
2 changes: 1 addition & 1 deletion clients/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name="scale-llm-engine",
python_requires=">=3.7",
version="0.0.0.beta32",
version="0.0.0.beta33",
packages=find_packages(),
package_data={"llmengine": ["py.typed"]},
)
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,10 @@ async def execute(
max_workers=request.max_workers,
endpoint_type=request.endpoint_type,
)
if request.gpu_type == GpuType.NVIDIA_AMPERE_A100E: # pragma: no cover
raise ObjectHasInvalidValueException(
"We have migrated A100 usage to H100. Please request for H100 instead!"
)
if request.labels is None:
raise EndpointLabelsException("Endpoint labels cannot be None!")
validate_labels(request.labels)
Expand Down