diff --git a/docs/model_zoo.md b/docs/model_zoo.md index 18610abbb..8805418cd 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -28,8 +28,14 @@ Scale hosts the following models in the LLM Engine Model Zoo: | `codellama-13b-instruct` | ✅ | ✅ | text-generation-inference, vllm | 16384 | | `codellama-34b` | ✅ | ✅ | text-generation-inference, vllm | 16384 | | `codellama-34b-instruct` | ✅ | ✅ | text-generation-inference, vllm | 16384 | +| `codellama-70b` | ✅ | | vllm | 16384 | +| `codellama-70b-instruct` | ✅ | | vllm | 4096 | | `zephyr-7b-alpha` | ✅ | | text-generation-inference, vllm | 32768 | | `zephyr-7b-beta` | ✅ | | text-generation-inference, vllm | 32768 | +| `gemma-2b` | ✅ | | vllm | 8192 | +| `gemma-2b-instruct` | ✅ | | vllm | 8192 | +| `gemma-7b` | ✅ | | vllm | 8192 | +| `gemma-7b-instruct` | ✅ | | vllm | 8192 | ## Usage diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index be56c7c54..ce2bf2657 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -185,6 +185,10 @@ "mammoth-coder-llama-2-34b", "zephyr-7b-alpha", "zephyr-7b-beta", + "gemma-2b", + "gemma-2b-instruct", + "gemma-7b", + "gemma-7b-instruct", ] ), LLMInferenceFramework.LIGHTLLM: set( @@ -223,6 +227,7 @@ }, # setting both for backwards compatibility, will phase code-llama out in a future pr # Based on config here: https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json#L12 # Can also see 13B, 34B there too + "gemma": {"max_model_len": 8192, "max_num_batched_tokens": 8192}, "llama-2": {"max_model_len": None, "max_num_batched_tokens": 4096}, "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000}, "mixtral": {"max_model_len": 32768, "max_num_batched_tokens": 32768}, diff --git a/model-engine/model_engine_server/inference/vllm/requirements.txt b/model-engine/model_engine_server/inference/vllm/requirements.txt index 4cc6239a4..78e033bbe 100644 --- a/model-engine/model_engine_server/inference/vllm/requirements.txt +++ b/model-engine/model_engine_server/inference/vllm/requirements.txt @@ -1,3 +1,3 @@ -ray==2.6.3 -git+https://github.com/vllm-project/vllm.git@4b61c6b669e368c6850531815940d9a542b9f223#egg=vllm -pydantic==1.10.13 +ray>=2.9 +vllm==0.3.2 +pydantic>=2.0 diff --git a/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py b/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py index 1140686fb..41356aefc 100644 --- a/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py +++ b/model-engine/model_engine_server/infra/repositories/live_tokenizer_repository.py @@ -73,6 +73,10 @@ def get_default_supported_models_info() -> Dict[str, ModelInfo]: "vicuna-13b": ModelInfo("eachadea/vicuna-13b-1.1", None), "zephyr-7b-alpha": ModelInfo("HuggingFaceH4/zephyr-7b-alpha", None), "zephyr-7b-beta": ModelInfo("HuggingFaceH4/zephyr-7b-beta", None), + "gemma-2b": ModelInfo("google/gemma-2b", None), + "gemma-2b-instruct": ModelInfo("google/gemma-2b-it", None), + "gemma-7b": ModelInfo("google/gemma-7b", None), + "gemma-7b-instruct": ModelInfo("google/gemma-7b-it", None), } diff --git a/model-engine/requirements.in b/model-engine/requirements.in index eaa46f550..380f7ec97 100644 --- a/model-engine/requirements.in +++ b/model-engine/requirements.in @@ -51,9 +51,9 @@ sseclient-py==1.7.2 stringcase==1.2.0 tenacity>=6.0.0,<=6.2.0 testing-postgresql==1.3.0 -transformers==4.34.1 +tokenizers~=0.15.2 tqdm~=4.64 -transformers==4.34.1 +transformers==4.38.0 twine==3.7.1 uvicorn==0.17.6 uvloop==0.17.0 diff --git a/model-engine/requirements.txt b/model-engine/requirements.txt index 56acbaf9d..d4e6cd11c 100644 --- a/model-engine/requirements.txt +++ b/model-engine/requirements.txt @@ -190,7 +190,7 @@ hpack==4.0.0 # via h2 httptools==0.5.0 # via -r model-engine/requirements.in -huggingface-hub==0.17.3 +huggingface-hub==0.20.3 # via # tokenizers # transformers @@ -418,7 +418,7 @@ rsa==4.9 # via google-auth s3transfer==0.6.1 # via boto3 -safetensors==0.4.0 +safetensors==0.4.2 # via transformers scramp==1.4.4 # via pg8000 @@ -474,8 +474,10 @@ testing-common-database==2.0.3 # via testing-postgresql testing-postgresql==1.3.0 # via -r model-engine/requirements.in -tokenizers==0.14.1 - # via transformers +tokenizers==0.15.2 + # via + # -r model-engine/requirements.in + # transformers tomli==2.0.1 # via # build @@ -487,7 +489,7 @@ tqdm==4.65.0 # huggingface-hub # transformers # twine -transformers==4.34.1 +transformers==4.38.0 # via -r model-engine/requirements.in twine==3.7.1 # via -r model-engine/requirements.in