From 0ce8198b4605dcf8de18e9711efd5afbaf5af537 Mon Sep 17 00:00:00 2001 From: Yunfeng Bai Date: Wed, 24 Apr 2024 10:25:42 -0700 Subject: [PATCH 1/2] Add Llama 3 models --- docs/model_zoo.md | 4 ++++ .../domain/use_cases/llm_model_endpoint_use_cases.py | 5 +++++ .../infra/repositories/live_tokenizer_repository.py | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/docs/model_zoo.md b/docs/model_zoo.md index dd8342193..13262baff 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -11,6 +11,10 @@ Scale hosts the following models in the LLM Engine Model Zoo: | `llama-2-13b-chat` | ✅ | | text-generation-inference, vllm | 4096 | | `llama-2-70b` | ✅ | ✅ | text-generation-inference, vllm | 4096 | | `llama-2-70b-chat` | ✅ | | text-generation-inference, vllm | 4096 | +| `llama-3-8b` | ✅ | ✅ | vllm | 8192 | +| `llama-3-8b-instruct` | ✅ | | vllm | 8192 | +| `llama-3-70b` | ✅ | ✅ | vllm | 8192 | +| `llama-3-70b-instruct` | ✅ | | vllm | 8192 | | `falcon-7b` | ✅ | | text-generation-inference, vllm | 2048 | | `falcon-7b-instruct` | ✅ | | text-generation-inference, vllm | 2048 | | `falcon-40b` | ✅ | | text-generation-inference, vllm | 2048 | diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 924d3391a..40e47716a 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -162,6 +162,10 @@ "llama-2-13b-chat", "llama-2-70b", "llama-2-70b-chat", + "llama-3-8b", + "llama-3-8b-instruct", + "llama-3-70b", + "llama-3-70b-instruct", "falcon-7b", "falcon-7b-instruct", "falcon-40b", @@ -231,6 +235,7 @@ # Can also see 13B, 34B there too "gemma": {"max_model_len": 8192, "max_num_batched_tokens": 8192}, "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096}, + "llama-3": {"max_model_len": None, "max_num_batched_tokens": 8192}, "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000}, "mixtral-8x7b": {"max_model_len": 32768, "max_num_batched_tokens": 32768}, "mixtral-8x22b": {"max_model_len": 65536, "max_num_batched_tokens": 65536}, diff --git a/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py b/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py index 180bea31b..ea7b93d90 100644 --- a/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py +++ b/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py @@ -40,6 +40,10 @@ def get_default_supported_models_info() -> Dict[str, ModelInfo]: "llama-2-13b-chat": ModelInfo("meta-llama/Llama-2-13b-chat-hf", None), "llama-2-70b": ModelInfo("meta-llama/Llama-2-70b-hf", None), "llama-2-70b-chat": ModelInfo("meta-llama/Llama-2-70b-chat-hf", None), + "llama-3-8b": ModelInfo("meta-llama/Meta-Llama-3-8B", None), + "llama-3-8b-instruct": ModelInfo("meta-llama/Meta-Llama-3-8B-Instruct", None), + "llama-3-70b": ModelInfo("meta-llama/Meta-Llama-3-70B", None), + "llama-3-70b-instruct": ModelInfo("meta-llama/Meta-Llama-3-70B-Instruct", None), "falcon-7b": ModelInfo("tiiuae/falcon-7b", None), "falcon-7b-instruct": ModelInfo("tiiuae/falcon-7b-instruct", None), "falcon-40b": ModelInfo("tiiuae/falcon-40b", None), From 30b4d802b2f010e694631f7bb1f611893ea0c9b1 Mon Sep 17 00:00:00 2001 From: Yunfeng Bai Date: Wed, 24 Apr 2024 10:29:21 -0700 Subject: [PATCH 2/2] fix --- docs/model_zoo.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/model_zoo.md b/docs/model_zoo.md index 13262baff..a8f4ae63d 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -11,9 +11,9 @@ Scale hosts the following models in the LLM Engine Model Zoo: | `llama-2-13b-chat` | ✅ | | text-generation-inference, vllm | 4096 | | `llama-2-70b` | ✅ | ✅ | text-generation-inference, vllm | 4096 | | `llama-2-70b-chat` | ✅ | | text-generation-inference, vllm | 4096 | -| `llama-3-8b` | ✅ | ✅ | vllm | 8192 | +| `llama-3-8b` | ✅ | | vllm | 8192 | | `llama-3-8b-instruct` | ✅ | | vllm | 8192 | -| `llama-3-70b` | ✅ | ✅ | vllm | 8192 | +| `llama-3-70b` | ✅ | | vllm | 8192 | | `llama-3-70b-instruct` | ✅ | | vllm | 8192 | | `falcon-7b` | ✅ | | text-generation-inference, vllm | 2048 | | `falcon-7b-instruct` | ✅ | | text-generation-inference, vllm | 2048 |