diff --git a/docs/model_zoo.md b/docs/model_zoo.md
index 50326b6f1..18610abbb 100644
--- a/docs/model_zoo.md
+++ b/docs/model_zoo.md
@@ -2,32 +2,34 @@
 
 Scale hosts the following models in the LLM Engine Model Zoo:
 
-| Model Name | Inference APIs Available | Fine-tuning APIs Available | Inference Frameworks Available | Inference max total tokens (prompt + response) |
-| --------------------- | ------------------------ | -------------------------- | ------------------------------ | ------------------------------ |
-| `llama-7b` | ✅ | ✅ | deepspeed, text-generation-inference | 2048 |
-| `llama-2-7b` | ✅ | ✅ | text-generation-inference, vllm | 4096|
-| `llama-2-7b-chat` | ✅ | | text-generation-inference, vllm | 4096|
-| `llama-2-13b` | ✅ | | text-generation-inference, vllm | 4096|
-| `llama-2-13b-chat` | ✅ | | text-generation-inference, vllm | 4096|
-| `llama-2-70b` | ✅ | ✅ | text-generation-inference, vllm | 4096|
-| `llama-2-70b-chat` | ✅ | | text-generation-inference, vllm | 4096|
-| `falcon-7b` | ✅ | | text-generation-inference, vllm | 2048 |
-| `falcon-7b-instruct` | ✅ | | text-generation-inference, vllm | 2048 |
-| `falcon-40b` | ✅ | | text-generation-inference, vllm | 2048 |
-| `falcon-40b-instruct` | ✅ | | text-generation-inference, vllm | 2048 |
-| `mpt-7b` | ✅ | | deepspeed, text-generation-inference, vllm | 2048 |
-| `mpt-7b-instruct` | ✅ | ✅ | deepspeed, text-generation-inference, vllm | 2048 |
-| `flan-t5-xxl` | ✅ | | deepspeed, text-generation-inference | 2048 |
-| `mistral-7b` | ✅ | ✅ | vllm | 8000 |
-| `mistral-7b-instruct` | ✅ | ✅ | vllm | 8000 |
-| `codellama-7b` | ✅ | ✅ | text-generation-inference, vllm | 16384 |
-| `codellama-7b-instruct` | ✅ | ✅ | text-generation-inference, vllm | 16384 |
-| `codellama-13b` | ✅ | ✅ | text-generation-inference, vllm | 16384 |
-| `codellama-13b-instruct` | ✅ | ✅ | text-generation-inference, vllm | 16384 |
-| `codellama-34b` | ✅ | ✅ | text-generation-inference, vllm | 16384 |
-| `codellama-34b-instruct` | ✅ | ✅ | text-generation-inference, vllm | 16384 |
-| `zephyr-7b-alpha` | ✅ | | text-generation-inference, vllm | 32768 |
-| `zephyr-7b-beta` | ✅ | | text-generation-inference, vllm | 32768 |
+| Model Name               | Inference APIs Available | Fine-tuning APIs Available | Inference Frameworks Available             | Inference max total tokens (prompt + response) |
+| ------------------------ | ------------------------ | -------------------------- | ------------------------------------------ | ---------------------------------------------- |
+| `llama-7b`               | ✅                       | ✅                         | deepspeed, text-generation-inference       | 2048                                           |
+| `llama-2-7b`             | ✅                       | ✅                         | text-generation-inference, vllm            | 4096                                           |
+| `llama-2-7b-chat`        | ✅                       |                            | text-generation-inference, vllm            | 4096                                           |
+| `llama-2-13b`            | ✅                       |                            | text-generation-inference, vllm            | 4096                                           |
+| `llama-2-13b-chat`       | ✅                       |                            | text-generation-inference, vllm            | 4096                                           |
+| `llama-2-70b`            | ✅                       | ✅                         | text-generation-inference, vllm            | 4096                                           |
+| `llama-2-70b-chat`       | ✅                       |                            | text-generation-inference, vllm            | 4096                                           |
+| `falcon-7b`              | ✅                       |                            | text-generation-inference, vllm            | 2048                                           |
+| `falcon-7b-instruct`     | ✅                       |                            | text-generation-inference, vllm            | 2048                                           |
+| `falcon-40b`             | ✅                       |                            | text-generation-inference, vllm            | 2048                                           |
+| `falcon-40b-instruct`    | ✅                       |                            | text-generation-inference, vllm            | 2048                                           |
+| `mpt-7b`                 | ✅                       |                            | deepspeed, text-generation-inference, vllm | 2048                                           |
+| `mpt-7b-instruct`        | ✅                       | ✅                         | deepspeed, text-generation-inference, vllm | 2048                                           |
+| `flan-t5-xxl`            | ✅                       |                            | deepspeed, text-generation-inference       | 2048                                           |
+| `mistral-7b`             | ✅                       | ✅                         | vllm                                       | 8000                                           |
+| `mistral-7b-instruct`    | ✅                       | ✅                         | vllm                                       | 8000                                           |
+| `mixtral-8x7b`           | ✅                       |                            | vllm                                       | 32768                                          |
+| `mixtral-8x7b-instruct`  | ✅                       |                            | vllm                                       | 32768                                          |
+| `codellama-7b`           | ✅                       | ✅                         | text-generation-inference, vllm            | 16384                                          |
+| `codellama-7b-instruct`  | ✅                       | ✅                         | text-generation-inference, vllm            | 16384                                          |
+| `codellama-13b`          | ✅                       | ✅                         | text-generation-inference, vllm            | 16384                                          |
+| `codellama-13b-instruct` | ✅                       | ✅                         | text-generation-inference, vllm            | 16384                                          |
+| `codellama-34b`          | ✅                       | ✅                         | text-generation-inference, vllm            | 16384                                          |
+| `codellama-34b-instruct` | ✅                       | ✅                         | text-generation-inference, vllm            | 16384                                          |
+| `zephyr-7b-alpha`        | ✅                       |                            | text-generation-inference, vllm            | 32768                                          |
+| `zephyr-7b-beta`         | ✅                       |                            | text-generation-inference, vllm            | 32768                                          |
 
 ## Usage
 
diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 31ebfd35f..35689a225 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -165,6 +165,8 @@
         "codellama-34b-instruct",
         "mistral-7b",
         "mistral-7b-instruct",
+        "mixtral-8x7b",
+        "mixtral-8x7b-instruct",
         "mammoth-coder-llama-2-7b",
         "mammoth-coder-llama-2-13b",
         "mammoth-coder-llama-2-34b",
@@ -210,6 +212,7 @@
     # Can also see 13B, 34B there too
     "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096},
     "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
+    "mixtral": {"max_model_len": 32768, "max_num_batched_tokens": 32768},
     "zephyr": {"max_model_len": 32768, "max_num_batched_tokens": 32768},
 }
 
diff --git a/model-engine/model_engine_server/inference/vllm/Dockerfile b/model-engine/model_engine_server/inference/vllm/Dockerfile
index d03a2c032..6f1a00c57 100644
--- a/model-engine/model_engine_server/inference/vllm/Dockerfile
+++ b/model-engine/model_engine_server/inference/vllm/Dockerfile
@@ -1,8 +1,13 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+FROM nvcr.io/nvidia/pytorch:23.09-py3
 
 RUN pip uninstall torch -y
 COPY requirements.txt /workspace/requirements.txt
 RUN pip install -r requirements.txt
+
+# install special version of megablocks
+RUN pip install git+https://github.com/stanford-futuredata/megablocks.git@5897cd6f254b7b3edf7a708a3a3314ecb54b6f78#egg=megablocks
+
 RUN wget https://github.com/peak/s5cmd/releases/download/v2.2.1/s5cmd_2.2.1_Linux-64bit.tar.gz
 RUN tar -xvzf s5cmd_2.2.1_Linux-64bit.tar.gz
+
 COPY vllm_server.py /workspace/vllm_server.py
diff --git a/model-engine/model_engine_server/inference/vllm/requirements.txt b/model-engine/model_engine_server/inference/vllm/requirements.txt
index b5407ab91..e2c3aa08c 100644
--- a/model-engine/model_engine_server/inference/vllm/requirements.txt
+++ b/model-engine/model_engine_server/inference/vllm/requirements.txt
@@ -1,3 +1,3 @@
 ray==2.6.3
-vllm==0.2.0
-pydantic==1.10.12
+vllm==0.2.5
+pydantic==1.10.13
diff --git a/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py b/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py
index 873f2e65f..b586bc9c5 100644
--- a/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py
+++ b/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py
@@ -58,6 +58,8 @@ def get_default_supported_models_info() -> Dict[str, ModelInfo]:
         ),
         "mistral-7b": ModelInfo("mistralai/Mistral-7B-v0.1", None),
         "mistral-7b-instruct": ModelInfo("mistralai/Mistral-7B-Instruct-v0.1", None),
+        "mixtral-8x7b": ModelInfo("mistralai/Mixtral-8x7B-v0.1", None),
+        "mixtral-8x7b-instruct": ModelInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", None),
         "mammoth-coder-llama-2-7b": ModelInfo("TIGER-Lab/MAmmoTH-Coder-7B", None),
         "mammoth-coder-llama-2-13b": ModelInfo("TIGER-Lab/MAmmoTH-Coder-13B", None),
         "mammoth-coder-llama-2-34b": ModelInfo("TIGER-Lab/MAmmoTH-Coder-34B", None),