@@ -124,6 +124,12 @@
"mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
"falcon-180b": "tiiuae/falcon-180B",
"falcon-180b-chat": "tiiuae/falcon-180B-chat",
+"code-llama-7b": "codellama/CodeLlama-7b-hf",
+"code-llama-13b": "codellama/CodeLlama-13b-hf",
+"code-llama-34b": "codellama/CodeLlama-34b-hf",
+"mammoth-coder-llama-2-7b": "TIGER-Lab/MAmmoTH-Coder-7B",
+"mammoth-coder-llama-2-13b": "TIGER-Lab/MAmmoTH-Coder-13B",
+"mammoth-coder-llama-2-34b": "TIGER-Lab/MAmmoTH-Coder-34B",
},
LLMInferenceFramework.LIGHTLLM: {
"llama-7b": "decapoda-research/llama-7b-hf",
@@ -143,6 +149,20 @@
LLMInferenceFramework.LIGHTLLM: [],
}

+# Model-specific overrides for vLLM length settings, keyed on substrings of the model name.
+# NOTE: These are in *descending* order of priority, e.g. a name that matches 'mammoth-coder'
+# uses that override and ignores the 'llama-2' override.
+_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, int]] = {
+    "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
+    # Based on the config here: https://huggingface.co/TIGER-Lab/MAmmoTH-Coder-7B/blob/main/config.json#L12
+    # The 13B and 34B configs can be checked there as well
+    "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
+    # Based on the config here: https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json#L12
+    # The 13B and 34B configs can be checked there as well
+    "llama-2": {"max_model_len": 4096, "max_num_batched_tokens": 4096},
+    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
+}


NUM_DOWNSTREAM_REQUEST_RETRIES = 80 # has to be high enough so that the retries take the 5 minutes
DOWNSTREAM_REQUEST_TIMEOUT_SECONDS = 5 * 60 # 5 minutes
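The table above is consumed by a first-match-wins substring lookup (shown in the next hunk), which relies on Python dicts preserving insertion order. Below is a minimal sketch of how a model name containing several keys would resolve; the resolve_length_overrides helper is illustrative only and not part of this PR.

from typing import Dict, Optional, Tuple

_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, int]] = {
    "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "llama-2": {"max_model_len": 4096, "max_num_batched_tokens": 4096},
    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
}


def resolve_length_overrides(model_name: str) -> Tuple[Optional[int], int]:
    """Return (max_model_len, max_num_batched_tokens); the first matching key wins."""
    max_model_len: Optional[int] = None
    max_num_batched_tokens = 2560  # vLLM's default when no override matches
    for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
        if key in model_name:
            max_model_len = value["max_model_len"]
            max_num_batched_tokens = value["max_num_batched_tokens"]
            break
    return max_model_len, max_num_batched_tokens


# "mammoth-coder-llama-2-13b" contains both "mammoth-coder" and "llama-2";
# "mammoth-coder" is listed first, so its 16384 override wins over 4096.
assert resolve_length_overrides("mammoth-coder-llama-2-13b") == (16384, 16384)
assert resolve_length_overrides("llama-2-13b") == (4096, 4096)
assert resolve_length_overrides("falcon-40b") == (None, 2560)
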
@@ -514,13 +534,14 @@ async def create_vllm_bundle(
):
command = []

-max_num_batched_tokens = 2560 # vLLM's default
-max_model_len = None
-if "llama-2" in model_name:
-    max_num_batched_tokens = 4096 # Need to be bigger than model's context window
-if "mistral" in model_name:
-    max_num_batched_tokens = 8000
-    max_model_len = 8000
+max_num_batched_tokens: int = 2560 # vLLM's default
+max_model_len: Optional[int] = None
+
+for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
+    if key in model_name:
+        max_model_len = value["max_model_len"]
+        max_num_batched_tokens = value["max_num_batched_tokens"]
+        break

subcommands = []
if checkpoint_path is not None:
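Downstream of this hunk, the resolved values presumably become flags on the vLLM server command assembled into subcommands. Here is a rough sketch under that assumption; the entrypoint, port, and placeholder values (final_weights_folder, num_shards) are illustrative, and the actual command construction in create_vllm_bundle is not shown in this diff.

from typing import Optional

# Illustrative placeholders; the real values come from the bundle request and checkpoint handling.
final_weights_folder = "model_files"
num_shards = 1
max_model_len: Optional[int] = 16384      # e.g. resolved for a code-llama model
max_num_batched_tokens = 16384

subcommands = []
vllm_cmd = (
    "python -m vllm.entrypoints.api_server "
    f"--model {final_weights_folder} "
    f"--tensor-parallel-size {num_shards} "
    f"--max-num-batched-tokens {max_num_batched_tokens} "
    "--port 5005"
)
if max_model_len is not None:
    # Only pass --max-model-len when an override applies; otherwise vLLM derives the
    # context length from the model's config.
    vllm_cmd += f" --max-model-len {max_model_len}"
subcommands.append(vllm_cmd)
command = ["/bin/bash", "-c", ";".join(subcommands)]
print(command)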