@@ -124,6 +124,12 @@
"mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
"falcon-180b": "tiiuae/falcon-180B",
"falcon-180b-chat": "tiiuae/falcon-180B-chat",
+"code-llama-7b": "codellama/CodeLlama-7b-hf",
+"code-llama-13b": "codellama/CodeLlama-13b-hf",
+"code-llama-34b": "codellama/CodeLlama-34b-hf",
+"mammoth-coder-llama-2-7b": "TIGER-Lab/MAmmoTH-Coder-7B",
+"mammoth-coder-llama-2-13b": "TIGER-Lab/MAmmoTH-Coder-13B",
+"mammoth-coder-llama-2-34b": "TIGER-Lab/MAmmoTH-Coder-34B",
},
LLMInferenceFramework.LIGHTLLM: {
"llama-7b": "decapoda-research/llama-7b-hf",
@@ -143,6 +149,20 @@
LLMInferenceFramework.LIGHTLLM: [],
}

+# Model-specific overrides for vLLM length settings, keyed on substrings of the model name.
+# NOTE: These are in *descending* order of priority, e.g. a name that matches 'mammoth-coder'
+# uses that override and ignores the 'llama-2' override.
+_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, int]] = {
+    "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
+    # Based on the config here: https://huggingface.co/TIGER-Lab/MAmmoTH-Coder-7B/blob/main/config.json#L12
+    # The 13B and 34B configs can be checked there as well
+    "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
+    # Based on the config here: https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json#L12
+    # The 13B and 34B configs can be checked there as well
+    "llama-2": {"max_model_len": 4096, "max_num_batched_tokens": 4096},
+    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
+}


NUM_DOWNSTREAM_REQUEST_RETRIES = 80 # has to be high enough so that the retries take the 5 minutes
DOWNSTREAM_REQUEST_TIMEOUT_SECONDS = 5 * 60 # 5 minutes
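The table above is consumed by a first-match-wins substring lookup (shown in the next hunk), which relies on Python dicts preserving insertion order. Below is a minimal sketch of how a model name containing several keys would resolve; the resolve_length_overrides helper is illustrative only and not part of this PR.

from typing import Dict, Optional, Tuple

_VLLM_MODEL_LENGTH_OVERRIDES: Dict[str, Dict[str, int]] = {
    "mammoth-coder": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "code-llama": {"max_model_len": 16384, "max_num_batched_tokens": 16384},
    "llama-2": {"max_model_len": 4096, "max_num_batched_tokens": 4096},
    "mistral": {"max_model_len": 8000, "max_num_batched_tokens": 8000},
}


def resolve_length_overrides(model_name: str) -> Tuple[Optional[int], int]:
    """Return (max_model_len, max_num_batched_tokens); the first matching key wins."""
    max_model_len: Optional[int] = None
    max_num_batched_tokens = 2560  # vLLM's default when no override matches
    for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
        if key in model_name:
            max_model_len = value["max_model_len"]
            max_num_batched_tokens = value["max_num_batched_tokens"]
            break
    return max_model_len, max_num_batched_tokens


# "mammoth-coder-llama-2-13b" contains both "mammoth-coder" and "llama-2";
# "mammoth-coder" is listed first, so its 16384 override wins over 4096.
assert resolve_length_overrides("mammoth-coder-llama-2-13b") == (16384, 16384)
assert resolve_length_overrides("llama-2-13b") == (4096, 4096)
assert resolve_length_overrides("falcon-40b") == (None, 2560)
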
@@ -514,13 +534,14 @@ async def create_vllm_bundle(
):
command = []

-max_num_batched_tokens = 2560 # vLLM's default
-max_model_len = None
-if "llama-2" in model_name:
-    max_num_batched_tokens = 4096 # Need to be bigger than model's context window
-if "mistral" in model_name:
-    max_num_batched_tokens = 8000
-    max_model_len = 8000
+max_num_batched_tokens: int = 2560 # vLLM's default
+max_model_len: Optional[int] = None
+
+for key, value in _VLLM_MODEL_LENGTH_OVERRIDES.items():
+    if key in model_name:
+        max_model_len = value["max_model_len"]
+        max_num_batched_tokens = value["max_num_batched_tokens"]
+        break

subcommands = []
if checkpoint_path is not None:
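Downstream of this hunk, the resolved values presumably become flags on the vLLM server command assembled into subcommands. Here is a rough sketch under that assumption; the entrypoint, port, and placeholder values (final_weights_folder, num_shards) are illustrative, and the actual command construction in create_vllm_bundle is not shown in this diff.

from typing import Optional

# Illustrative placeholders; the real values come from the bundle request and checkpoint handling.
final_weights_folder = "model_files"
num_shards = 1
max_model_len: Optional[int] = 16384      # e.g. resolved for a code-llama model
max_num_batched_tokens = 16384

subcommands = []
vllm_cmd = (
    "python -m vllm.entrypoints.api_server "
    f"--model {final_weights_folder} "
    f"--tensor-parallel-size {num_shards} "
    f"--max-num-batched-tokens {max_num_batched_tokens} "
    "--port 5005"
)
if max_model_len is not None:
    # Only pass --max-model-len when an override applies; otherwise vLLM derives the
    # context length from the model's config.
    vllm_cmd += f" --max-model-len {max_model_len}"
subcommands.append(vllm_cmd)
command = ["/bin/bash", "-c", ";".join(subcommands)]
print(command)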