From 3db212fe189a39338d4efbe280501f121677098c Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Tue, 20 Feb 2024 17:51:10 -0800 Subject: [PATCH 01/14] prepare allowing a csv input --- scripts/throughput_benchmarks.py | 36 ++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index c689d8cc5..f7be6dc35 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -3,6 +3,7 @@ import os import queue import random +import re import threading import time import traceback @@ -206,11 +207,26 @@ def send_requests( return results -def generate_prompt(num, hf_model): - random.seed(1) - text = lorem.words(num // 2) # Roughly 2 tokens per lorem word - tokenizer = AutoTokenizer.from_pretrained(hf_model) - return tokenizer.decode(tokenizer.encode(text)[: num - 2]) +# TODO test this +def read_input_file(input_file: str) -> List[str]: + # Only supports csvs for now + if re.match(r".*\.csv$", input_file): + with open(input_file, "r", newline="") as file: + reader = csv.reader(file) + # May have to ignore first line + return [row[0] for row in reader] + raise ValueError(f"Unsupported file type for input file {input_file}") + + +def generate_prompt(num, hf_model, inputs: Optional[List]): + # TODO handle inputs + if inputs is not None: + raise NotImplementedError + else: + random.seed(1) + text = lorem.words(num // 2) # Roughly 2 tokens per lorem word + tokenizer = AutoTokenizer.from_pretrained(hf_model) + return tokenizer.decode(tokenizer.encode(text)[: num - 2]) def generate_output_token_counts(mean, std, num, input_token_count): @@ -231,8 +247,14 @@ def run_benchmark( concurrency: int, verbose: bool, local_port: int, + input_file: Optional[str] = None, ): - prompt = generate_prompt(config.input_token_count, hf_model) + + inputs = None + + if input_file is not None: + inputs = read_input_file(input_file) + prompt = 
generate_prompt(config.input_token_count, hf_model, inputs) prompt_num_tokens = config.input_token_count @@ -346,6 +368,7 @@ def run_benchmarks( input_token_count: int, output_token_count_mean: int, num_trials: int = 50, + input_file: Optional[str] = None, output_file: Optional[str] = None, use_localhost: bool = False, concurrency: int = 1, @@ -396,6 +419,7 @@ def run_benchmarks_concurrency_range( input_token_count: int, output_token_count_mean: int, num_trials_per_concurrency: int = 5, + input_file: Optional[str] = None, output_file: Optional[str] = None, use_localhost: bool = False, concurrency_min: int = 1, From 140e6a6c059bba0691d50a39598ced1fa9fdc209 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Tue, 20 Feb 2024 17:53:58 -0800 Subject: [PATCH 02/14] randomly select input --- scripts/throughput_benchmarks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index f7be6dc35..d218ebfe3 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -219,9 +219,8 @@ def read_input_file(input_file: str) -> List[str]: def generate_prompt(num, hf_model, inputs: Optional[List]): - # TODO handle inputs if inputs is not None: - raise NotImplementedError + return random.choice(inputs) else: random.seed(1) text = lorem.words(num // 2) # Roughly 2 tokens per lorem word From e5b45df812af2172b699c96146b1c568b53daf99 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 12:41:35 -0800 Subject: [PATCH 03/14] pass some args through --- scripts/throughput_benchmarks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index d218ebfe3..3b5736b61 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -397,6 +397,7 @@ def run_benchmarks( concurrency, verbose, local_port, + input_file, ) all_statistics.append(statistics) except Exception: @@ -439,6 +440,7 @@ def 
run_benchmarks_concurrency_range( input_token_count, output_token_count_mean, num_trials_per_concurrency * concurrency, + input_file, output_file, use_localhost, concurrency, From 37fbb77f478ec88ea3aae39bcc78d0bd40931bd7 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 12:45:51 -0800 Subject: [PATCH 04/14] log output token count percentiles --- scripts/throughput_benchmarks.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 3b5736b61..4c4d16697 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -278,7 +278,8 @@ def run_benchmark( elapsed = end - start results = [result for result in results if result is not None] - num_sampled_tokens = sum([result["num_completion_tokens"] for result in results]) + sampled_token_counts = [result["num_completion_tokens"] for result in results] + num_sampled_tokens = sum(sampled_token_counts) num_prompt_tokens = prompt_num_tokens * len(results) n = len(results) time_to_process_prompt = [] @@ -314,6 +315,10 @@ def run_benchmark( p90_time_to_first_token = np.percentile(time_to_first_token, 90) p95_time_to_first_token = np.percentile(time_to_first_token, 95) p99_time_to_first_token = np.percentile(time_to_first_token, 99) + p50_sampled_token_counts = np.percentile(sampled_token_counts, 50) + p90_sampled_token_counts = np.percentile(sampled_token_counts, 90) + p95_sampled_token_counts = np.percentile(sampled_token_counts, 95) + p99_sampled_token_counts = np.percentile(sampled_token_counts, 99) statistics = { "concurrency": concurrency, @@ -351,6 +356,14 @@ def run_benchmark( "total_num_tokens": total_num_tokens, "total_num_sampled_tokens": num_sampled_tokens, } + if input_file is not None: + sampled_token_counts_statistics = { + "p50_sampled_token_counts": p50_sampled_token_counts, + "p90_sampled_token_counts": p90_sampled_token_counts, + "p95_sampled_token_counts": 
p95_sampled_token_counts, + "p99_sampled_token_counts": p99_sampled_token_counts, + } + statistics.update(sampled_token_counts_statistics) if verbose: print(f"Statistics: {statistics}") From 3222ffec9519a5e724b5b2cc6d7ce651a3804de9 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 12:55:58 -0800 Subject: [PATCH 05/14] debug + ignore first line in file --- scripts/throughput_benchmarks.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 4c4d16697..23a7aab71 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -19,6 +19,7 @@ AUTH_USER_ID = os.getenv("AUTH_USER_ID") GATEWAY_URL = os.getenv("GATEWAY_URL") +DEBUG = os.getenv("DEBUG") app = typer.Typer(name="throughput-benchmarks", add_completion=False) MAX_CONTEXT_WINDOW = 4096 @@ -88,6 +89,8 @@ def send_request(url, request, user=None): if payload.startswith("data:"): payload_data = payload.lstrip("data:").rstrip("/n") payload_json = json.loads(payload_data) + if DEBUG: + print(payload_json) return { "payload": payload_json, @@ -214,13 +217,16 @@ def read_input_file(input_file: str) -> List[str]: with open(input_file, "r", newline="") as file: reader = csv.reader(file) # May have to ignore first line - return [row[0] for row in reader] + return [row[0] for row in reader][1:] raise ValueError(f"Unsupported file type for input file {input_file}") def generate_prompt(num, hf_model, inputs: Optional[List]): if inputs is not None: - return random.choice(inputs) + choice = random.choice(inputs) + if DEBUG: + print(f"Using input {choice}") + return choice else: random.seed(1) text = lorem.words(num // 2) # Roughly 2 tokens per lorem word From ebe948841694b38c0b848e94bc071e8bcd22051d Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 13:03:13 -0800 Subject: [PATCH 06/14] lazy try except lmao --- scripts/throughput_benchmarks.py | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 23a7aab71..43f2eee0d 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -90,7 +90,10 @@ def send_request(url, request, user=None): payload_data = payload.lstrip("data:").rstrip("/n") payload_json = json.loads(payload_data) if DEBUG: - print(payload_json) + try: + print(payload_json["output"]["text"]) + except KeyError: + pass return { "payload": payload_json, From d629d91fb6e0149879790a6ff1e8a26eb71578fa Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 13:03:52 -0800 Subject: [PATCH 07/14] oops --- scripts/throughput_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 43f2eee0d..9838e496f 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -220,7 +220,7 @@ def read_input_file(input_file: str) -> List[str]: with open(input_file, "r", newline="") as file: reader = csv.reader(file) # May have to ignore first line - return [row[0] for row in reader][1:] + return [row[1] for row in reader][1:] raise ValueError(f"Unsupported file type for input file {input_file}") From e50af0d28684610ce6deccef344bbb566aa9044f Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 13:04:59 -0800 Subject: [PATCH 08/14] oops x2 --- scripts/throughput_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 9838e496f..efdc7ca97 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -91,7 +91,7 @@ def send_request(url, request, user=None): payload_json = json.loads(payload_data) if DEBUG: try: - print(payload_json["output"]["text"]) + print(payload_json["output"]["text"], end="") except KeyError: pass From 2ee933958dba345a5ef6982e79fe51e22e18ccfb Mon Sep 17 
00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 13:06:36 -0800 Subject: [PATCH 09/14] oops x3 --- scripts/throughput_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index efdc7ca97..110abfd46 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -91,7 +91,7 @@ def send_request(url, request, user=None): payload_json = json.loads(payload_data) if DEBUG: try: - print(payload_json["output"]["text"], end="") + print(payload_json["output"]["text"], end="", flush=True) except KeyError: pass From cfb7e1a6d70339783f0ffc1ecd6be3773d24235c Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 13:13:56 -0800 Subject: [PATCH 10/14] . --- scripts/throughput_benchmarks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 110abfd46..a3b726d4d 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -229,6 +229,7 @@ def generate_prompt(num, hf_model, inputs: Optional[List]): choice = random.choice(inputs) if DEBUG: print(f"Using input {choice}") + print("---") return choice else: random.seed(1) From ff9c6ee743caa84bf935c5ce671c4e95d68ee367 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Wed, 21 Feb 2024 13:22:43 -0800 Subject: [PATCH 11/14] oops prompt sample is reused --- scripts/throughput_benchmarks.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index a3b726d4d..a3f03187e 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -9,7 +9,7 @@ import traceback from dataclasses import dataclass from enum import Enum -from typing import List, Optional +from typing import List, Optional, Union import numpy as np import requests @@ -59,6 +59,9 @@ def from_value(cls, value): def 
send_request(url, request, user=None): + if DEBUG: + print(f"Using input {request}") + print("---") start = time.time() response = requests.post( url, @@ -175,17 +178,21 @@ def generate_request( def send_requests( model: str, - prompt: str, + prompt: Union[str, List[str]], output_token_counts: List[int], use_localhost: bool, concurrency: int, framework: InferenceFramework, local_port: int = 5005, ): + if type(prompt) == str: + prompt = [prompt] thread_results: queue.Queue = queue.Queue() requests_queue: queue.Queue = queue.Queue() - for output_token_count in output_token_counts: - request = generate_request(framework, prompt, output_token_count, use_localhost) + for i, output_token_count in enumerate(output_token_counts): + request = generate_request( + framework, prompt[i % len(prompt)], output_token_count, use_localhost + ) requests_queue.put(request) threads = [] for i in range(concurrency): @@ -224,12 +231,11 @@ def read_input_file(input_file: str) -> List[str]: raise ValueError(f"Unsupported file type for input file {input_file}") -def generate_prompt(num, hf_model, inputs: Optional[List]): +def generate_prompt( + num, hf_model, inputs: Optional[List], num_samples: int = 1 +) -> Union[str, List[str]]: if inputs is not None: - choice = random.choice(inputs) - if DEBUG: - print(f"Using input {choice}") - print("---") + choice = random.sample(inputs, min(num_samples, len(inputs))) return choice else: random.seed(1) @@ -263,7 +269,7 @@ def run_benchmark( if input_file is not None: inputs = read_input_file(input_file) - prompt = generate_prompt(config.input_token_count, hf_model, inputs) + prompt = generate_prompt(config.input_token_count, hf_model, inputs, num_trials) prompt_num_tokens = config.input_token_count From 8c13b3f0bb6f8acb1807d7b011692a9b5d28923c Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Thu, 22 Feb 2024 02:40:51 +0000 Subject: [PATCH 12/14] revert the changes to main, I'm gonna just have it take in a distribution of output token counts --- 
scripts/throughput_benchmarks.py | 76 +++++--------------------------- 1 file changed, 11 insertions(+), 65 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index a3f03187e..c689d8cc5 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -3,13 +3,12 @@ import os import queue import random -import re import threading import time import traceback from dataclasses import dataclass from enum import Enum -from typing import List, Optional, Union +from typing import List, Optional import numpy as np import requests @@ -19,7 +18,6 @@ AUTH_USER_ID = os.getenv("AUTH_USER_ID") GATEWAY_URL = os.getenv("GATEWAY_URL") -DEBUG = os.getenv("DEBUG") app = typer.Typer(name="throughput-benchmarks", add_completion=False) MAX_CONTEXT_WINDOW = 4096 @@ -59,9 +57,6 @@ def from_value(cls, value): def send_request(url, request, user=None): - if DEBUG: - print(f"Using input {request}") - print("---") start = time.time() response = requests.post( url, @@ -92,11 +87,6 @@ def send_request(url, request, user=None): if payload.startswith("data:"): payload_data = payload.lstrip("data:").rstrip("/n") payload_json = json.loads(payload_data) - if DEBUG: - try: - print(payload_json["output"]["text"], end="", flush=True) - except KeyError: - pass return { "payload": payload_json, @@ -178,21 +168,17 @@ def generate_request( def send_requests( model: str, - prompt: Union[str, List[str]], + prompt: str, output_token_counts: List[int], use_localhost: bool, concurrency: int, framework: InferenceFramework, local_port: int = 5005, ): - if type(prompt) == str: - prompt = [prompt] thread_results: queue.Queue = queue.Queue() requests_queue: queue.Queue = queue.Queue() - for i, output_token_count in enumerate(output_token_counts): - request = generate_request( - framework, prompt[i % len(prompt)], output_token_count, use_localhost - ) + for output_token_count in output_token_counts: + request = generate_request(framework, prompt, 
output_token_count, use_localhost) requests_queue.put(request) threads = [] for i in range(concurrency): @@ -220,28 +206,11 @@ def send_requests( return results -# TODO test this -def read_input_file(input_file: str) -> List[str]: - # Only supports csvs for now - if re.match(r".*\.csv$", input_file): - with open(input_file, "r", newline="") as file: - reader = csv.reader(file) - # May have to ignore first line - return [row[1] for row in reader][1:] - raise ValueError(f"Unsupported file type for input file {input_file}") - - -def generate_prompt( - num, hf_model, inputs: Optional[List], num_samples: int = 1 -) -> Union[str, List[str]]: - if inputs is not None: - choice = random.sample(inputs, min(num_samples, len(inputs))) - return choice - else: - random.seed(1) - text = lorem.words(num // 2) # Roughly 2 tokens per lorem word - tokenizer = AutoTokenizer.from_pretrained(hf_model) - return tokenizer.decode(tokenizer.encode(text)[: num - 2]) +def generate_prompt(num, hf_model): + random.seed(1) + text = lorem.words(num // 2) # Roughly 2 tokens per lorem word + tokenizer = AutoTokenizer.from_pretrained(hf_model) + return tokenizer.decode(tokenizer.encode(text)[: num - 2]) def generate_output_token_counts(mean, std, num, input_token_count): @@ -262,14 +231,8 @@ def run_benchmark( concurrency: int, verbose: bool, local_port: int, - input_file: Optional[str] = None, ): - - inputs = None - - if input_file is not None: - inputs = read_input_file(input_file) - prompt = generate_prompt(config.input_token_count, hf_model, inputs, num_trials) + prompt = generate_prompt(config.input_token_count, hf_model) prompt_num_tokens = config.input_token_count @@ -294,8 +257,7 @@ def run_benchmark( elapsed = end - start results = [result for result in results if result is not None] - sampled_token_counts = [result["num_completion_tokens"] for result in results] - num_sampled_tokens = sum(sampled_token_counts) + num_sampled_tokens = sum([result["num_completion_tokens"] for result in 
results]) num_prompt_tokens = prompt_num_tokens * len(results) n = len(results) time_to_process_prompt = [] @@ -331,10 +293,6 @@ def run_benchmark( p90_time_to_first_token = np.percentile(time_to_first_token, 90) p95_time_to_first_token = np.percentile(time_to_first_token, 95) p99_time_to_first_token = np.percentile(time_to_first_token, 99) - p50_sampled_token_counts = np.percentile(sampled_token_counts, 50) - p90_sampled_token_counts = np.percentile(sampled_token_counts, 90) - p95_sampled_token_counts = np.percentile(sampled_token_counts, 95) - p99_sampled_token_counts = np.percentile(sampled_token_counts, 99) statistics = { "concurrency": concurrency, @@ -372,14 +330,6 @@ def run_benchmark( "total_num_tokens": total_num_tokens, "total_num_sampled_tokens": num_sampled_tokens, } - if input_file is not None: - sampled_token_counts_statistics = { - "p50_sampled_token_counts": p50_sampled_token_counts, - "p90_sampled_token_counts": p90_sampled_token_counts, - "p95_sampled_token_counts": p95_sampled_token_counts, - "p99_sampled_token_counts": p99_sampled_token_counts, - } - statistics.update(sampled_token_counts_statistics) if verbose: print(f"Statistics: {statistics}") @@ -396,7 +346,6 @@ def run_benchmarks( input_token_count: int, output_token_count_mean: int, num_trials: int = 50, - input_file: Optional[str] = None, output_file: Optional[str] = None, use_localhost: bool = False, concurrency: int = 1, @@ -426,7 +375,6 @@ def run_benchmarks( concurrency, verbose, local_port, - input_file, ) all_statistics.append(statistics) except Exception: @@ -448,7 +396,6 @@ def run_benchmarks_concurrency_range( input_token_count: int, output_token_count_mean: int, num_trials_per_concurrency: int = 5, - input_file: Optional[str] = None, output_file: Optional[str] = None, use_localhost: bool = False, concurrency_min: int = 1, @@ -469,7 +416,6 @@ def run_benchmarks_concurrency_range( input_token_count, output_token_count_mean, num_trials_per_concurrency * concurrency, - input_file, 
output_file, use_localhost, concurrency, From 49219b31ee44fa6fa874b7c719a25fa34d7d4a94 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Thu, 22 Feb 2024 02:56:13 +0000 Subject: [PATCH 13/14] output token count distribution --- scripts/throughput_benchmarks.py | 52 ++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index c689d8cc5..5830c577b 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -220,6 +220,31 @@ def generate_output_token_counts(mean, std, num, input_token_count): output[i] = min(output[i], MAX_CONTEXT_WINDOW - input_token_count) return output +def generate_output_token_counts_from_existing(distribution: List[int], num: int, input_token_count: int): + assert len(distribution) > 0, "Can't have a distribution with 0 tokens" + output = [] + # Sample without replacement so that we don't have as much variance + for _ in range(num // len(distribution)): + random.shuffle(distribution) + output.extend(distribution) + random.shuffle(distribution) + output.extend(distribution[:num % len(distribution)]) + assert len(output) == num + + for i in range(len(output)): + output[i] = min(output[i], MAX_CONTEXT_WINDOW - input_token_count) + return output + + +def read_distribution_from_file(fpath: str): + # Assumes the distribution is some json-formatted string that represents a list + try: + with open(fpath, "r") as fin: + return json.load(fin) + except FileNotFoundError: + print(f"File not found. 
Exiting.") + raise + def run_benchmark( model: str, @@ -231,17 +256,23 @@ def run_benchmark( concurrency: int, verbose: bool, local_port: int, + output_token_count_distribution: Optional[List] = None, ): prompt = generate_prompt(config.input_token_count, hf_model) prompt_num_tokens = config.input_token_count - output_token_counts = generate_output_token_counts( - config.output_token_count_mean, - config.output_token_count_std, - num_trials, - config.input_token_count, - ) + if output_token_count_distribution is not None: + output_token_counts = generate_output_token_counts_from_existing( + output_token_count_distribution, num_trials, config.input_token_count + ) + else: + output_token_counts = generate_output_token_counts( + config.output_token_count_mean, + config.output_token_count_std, + num_trials, + config.input_token_count, + ) start = time.time() results = send_requests( @@ -352,10 +383,16 @@ def run_benchmarks( verbose: bool = False, hf_model: Optional[str] = None, local_port: int = 5005, + output_token_count_distribution_file: Optional[str] = None, ): """Run benchmarks.""" all_statistics = [] config = BenchmarkConfig(input_token_count, output_token_count_mean) + + output_token_count_distribution = None + if output_token_count_distribution_file is not None: + output_token_count_distribution = read_distribution_from_file(output_token_count_distribution_file) + try: if verbose: print(f"Running benchmark for config {config}") @@ -375,6 +412,7 @@ def run_benchmarks( concurrency, verbose, local_port, + output_token_count_distribution, ) all_statistics.append(statistics) except Exception: @@ -404,6 +442,7 @@ def run_benchmarks_concurrency_range( verbose: bool = False, hf_model: Optional[str] = None, local_port: int = 5005, + output_token_count_distribution_file: Optional[str] = None, ): if output_file is not None: # Create empty file @@ -422,6 +461,7 @@ def run_benchmarks_concurrency_range( verbose, hf_model, local_port, + output_token_count_distribution_file, ) 
From e9771f7fadaebf0c36621a0b08865455dd97d3b8 Mon Sep 17 00:00:00 2001 From: Sean Shi Date: Mon, 26 Feb 2024 12:43:25 -0800 Subject: [PATCH 14/14] rename var to be more clear --- scripts/throughput_benchmarks.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/scripts/throughput_benchmarks.py b/scripts/throughput_benchmarks.py index 5830c577b..d67614a5f 100644 --- a/scripts/throughput_benchmarks.py +++ b/scripts/throughput_benchmarks.py @@ -220,7 +220,10 @@ def generate_output_token_counts(mean, std, num, input_token_count): output[i] = min(output[i], MAX_CONTEXT_WINDOW - input_token_count) return output -def generate_output_token_counts_from_existing(distribution: List[int], num: int, input_token_count: int): + +def generate_output_token_counts_from_existing( + distribution: List[int], num: int, input_token_count: int +): assert len(distribution) > 0, "Can't have a distribution with 0 tokens" output = [] # Sample without replacement so that we don't have as much variance @@ -228,7 +231,7 @@ def generate_output_token_counts_from_existing(distribution: List[int], num: int random.shuffle(distribution) output.extend(distribution) random.shuffle(distribution) - output.extend(distribution[:num % len(distribution)]) + output.extend(distribution[: num % len(distribution)]) assert len(output) == num for i in range(len(output)): @@ -242,7 +245,7 @@ def read_distribution_from_file(fpath: str): with open(fpath, "r") as fin: return json.load(fin) except FileNotFoundError: - print(f"File not found. Exiting.") + print("File not found. 
Exiting.") raise @@ -256,15 +259,15 @@ def run_benchmark( concurrency: int, verbose: bool, local_port: int, - output_token_count_distribution: Optional[List] = None, + response_token_count_distribution: Optional[List] = None, ): prompt = generate_prompt(config.input_token_count, hf_model) prompt_num_tokens = config.input_token_count - if output_token_count_distribution is not None: + if response_token_count_distribution is not None: output_token_counts = generate_output_token_counts_from_existing( - output_token_count_distribution, num_trials, config.input_token_count + response_token_count_distribution, num_trials, config.input_token_count ) else: output_token_counts = generate_output_token_counts( @@ -383,15 +386,17 @@ def run_benchmarks( verbose: bool = False, hf_model: Optional[str] = None, local_port: int = 5005, - output_token_count_distribution_file: Optional[str] = None, + response_token_count_distribution_file: Optional[str] = None, ): """Run benchmarks.""" all_statistics = [] config = BenchmarkConfig(input_token_count, output_token_count_mean) - output_token_count_distribution = None - if output_token_count_distribution_file is not None: - output_token_count_distribution = read_distribution_from_file(output_token_count_distribution_file) + response_token_count_distribution = None + if response_token_count_distribution_file is not None: + response_token_count_distribution = read_distribution_from_file( + response_token_count_distribution_file + ) try: if verbose: @@ -412,7 +417,7 @@ def run_benchmarks( concurrency, verbose, local_port, - output_token_count_distribution, + response_token_count_distribution, ) all_statistics.append(statistics) except Exception: @@ -442,7 +447,7 @@ def run_benchmarks_concurrency_range( verbose: bool = False, hf_model: Optional[str] = None, local_port: int = 5005, - output_token_count_distribution_file: Optional[str] = None, + response_token_count_distribution_file: Optional[str] = None, ): if output_file is not None: # Create 
empty file @@ -461,7 +466,7 @@ def run_benchmarks_concurrency_range( verbose, hf_model, local_port, - output_token_count_distribution_file, + response_token_count_distribution_file, )