From bbb148cf85f802675129e327c0e3fe15dd67bafa Mon Sep 17 00:00:00 2001
From: Jingyuan
Date: Thu, 13 Feb 2025 17:39:34 -0800
Subject: [PATCH] [Bug] GPU optimizer bug fix and document fix (#656)

* Bug fix
* Fix configuration for domain podautoscaler. Add a test case to verify that the URL created from metricSource is as expected: the endpoint should include the port; if it does not and a port is specified, the port will be appended to the endpoint.
* Lint fix
* Add license for new files.
* Lint fix on added unit test.
* Add authorization support
* Support parameterized benchmark
* Remove next_in parameter
* Bug fix
* Fix typo
* Bug fix
* Apply stream parameter
* Clean up responses.
* Bug fix
* If an error is not reported as a temporary error, we will not retry.
* GPU profile now supports TPAT (time per all tokens). Fix an error in benchmark that may cause failures when token_latencies is missing some data.
* Debug optimizer
* BIRD prompt dataset generation
* Update benchmark to support prompt dataset loading
* Benchmark now supports workload parameter
* Bug fix
* Log control
* Improve stability and lint fix.
* Bug fix
* Switch logs for gpu-optimizer to JSON format
* Added BIRD dataset with Azure timestamp script
* Add BIRD burst pattern workload generation
* Visualizer now supports workload file
* Print out workload input
* Bug fix
* Lint fix
* Remove timestamp offset
* Bug fix: calling _parse_profiles without the out_records parameter will not accumulate returns.
* Using the current ts to load the profile may be too early; revert to using an interval ago.
* Use the larger of the average request rate in the window and the current request rate to get sufficient resources.
* Tune up request rate temporarily.
* Bug fix: fix request rate to 8 temporarily
* Remove fixed rate
* Change load profile back
* Provide compatibility with v3 gateway profiles.
* Adjust development config
* Add config for gateway-plugin development
* Delayed scale-in deployment added
* Add trace to benchmark
* Roll back to old version without delayed scale-in
* Disregard pending requests for now.
* Bug fix
* Bug fix
* Adapt to latest profile about pending requests and update unittest.
* Output correct timestamp
* Output pending and total requests from load reader
* Ignore pending for now.
* Add throughput filter.
* Bug and lint fix
* Fix a bug when mat_tputs are 0
* Lint fix
* Fix benchmark's num_requests count
* Optimizer can now adopt deployment changes made with "kubectl apply"
* Add comments
* Bug fix
* Make signature prefer the higher index when choosing profiles.
* Bug fix: watch ScalingReplicaSet for label changes
* Bug fix
* Change back SLO preference. Optimize update logic.
* Refine GPU optimizer document and apply more generic default parameters.
* Update document to use a production vLLM configuration example. Fix benchmark and gen_profile to work inside the python module.
* Add samples/heterogeneous
* Clean up
* Modify load reader to support the latest workload. Fix a potential bug where, in corner cases, out-of-profile patterns are mapped to the closest profiled patterns, causing possible data loss.
* Fix doc and example
* Use 100 instead of 1 as the scale fraction.
* Remove unnecessary samples
* Lint fix

---------

Signed-off-by: Jingyuan
Co-authored-by: Jingyuan Zhang
Co-authored-by: Ning Wang
---
 .../patch_podautoscaler_a40.yaml              |  2 +-
 .../simulator/patch_podautoscaler_a100.yaml   |  2 +-
 docs/source/features/heterogeneous-gpu.rst    | 18 +++++------
 .../gpu_optimizer/load_monitor/load_reader.py | 12 ++++++-
 .../gpu_optimizer/load_monitor/monitor.py     |  6 +++-
 .../gpu_optimizer/optimizer/optimizer.py      | 10 ++++--
 .../deepseek-coder-7b-l20-deployment.yaml     |  1 -
 .../deepseek-coder-7b-l20-podautoscaler.yaml  |  4 +--
 .../deepseek-coder-7b-v100-podautoscaler.yaml |  2 +-
 samples/heterogeneous/kustomization.yaml      | 32 +++++++++++++++++++
 10 files changed, 69 insertions(+), 20 deletions(-)
 create mode 100644 samples/heterogeneous/kustomization.yaml

diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
index a85c14ba..d8bc78e4 100644
--- a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
+++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml
@@ -16,4 +16,4 @@ spec:
       endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
       path: /metrics/default/simulator-llama2-7b-a40
       targetMetric: "vllm:deployment_replicas"
-      targetValue: "1"
\ No newline at end of file
+      targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts.
\ No newline at end of file
diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml
index d6a7b70c..728a8a90 100644
--- a/development/app/config/simulator/patch_podautoscaler_a100.yaml
+++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml
@@ -16,4 +16,4 @@ spec:
       endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
       path: /metrics/default/simulator-llama2-7b-a100
       targetMetric: "vllm:deployment_replicas"
-      targetValue: "1"
\ No newline at end of file
+      targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts.
\ No newline at end of file
diff --git a/docs/source/features/heterogeneous-gpu.rst b/docs/source/features/heterogeneous-gpu.rst
index ed583254..74881d2a 100644
--- a/docs/source/features/heterogeneous-gpu.rst
+++ b/docs/source/features/heterogeneous-gpu.rst
@@ -24,7 +24,7 @@ Step 1: Deploy the heterogeneous deployments.
 
 One deployment and corresponding PodAutoscaler should be deployed for each GPU type.
 See `sample heterogeneous configuration `_
 for an example of heterogeneous configuration composed of two GPU types. The following codes
-deploy heterogeneous deployments using L20 and A10 GPU.
+deploy heterogeneous deployments using L20 and V100 GPU.
 
 .. code-block:: bash
 
@@ -45,9 +45,10 @@ Incoming requests are routed through the gateway and directed to the optimal pod
 
     kubectl get pods
 
     NAME                                      READY   STATUS    RESTARTS   AGE
-    deepseek-coder-7b-a10-96667667c-6gjql     2/2     Running   0          33s
+    deepseek-coder-7b-v100-96667667c-6gjql    2/2     Running   0          33s
     deepseek-coder-7b-l20-96667667c-7zj7k     2/2     Running   0          33s
 
+Step 2: Install aibrix python module:
 
 Step 2: Install aibrix python module:
@@ -74,32 +75,31 @@ Step 4: Decide SLO and generate profile, run `aibrix_gen_profile -h` for help.
 
     kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
     # Wait for port-forward taking effect.
-    aibrix_gen_profile deepseek-coder-7b-a10 --cost [cost1] [SLO-metric] [SLO-value] -o "redis://localhost:6379/?model=deepseek-coder-7b"
+    aibrix_gen_profile deepseek-coder-7b-v100 --cost [cost1] [SLO-metric] [SLO-value] -o "redis://localhost:6379/?model=deepseek-coder-7b"
     aibrix_gen_profile deepseek-coder-7b-l20 --cost [cost2] [SLO-metric] [SLO-value] -o "redis://localhost:6379/?model=deepseek-coder-7b"
 
 Now the GPU Optimizer is ready to work. You should observe that the number of workload pods changes in response to the requests sent to the gateway.
 Once the GPU optimizer finishes the scaling optimization, the output of the GPU optimizer is passed to PodAutoscaler as a metricSource via a designated HTTP endpoint for the final scaling decision.
 The following is an example of PodAutoscaler spec.
 
-A simple example of PodAutoscaler spec for a10 GPU is as follows:
+A simple example of PodAutoscaler spec for v100 GPU is as follows:
 
-.. literalinclude:: ../../../samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml
+.. literalinclude:: ../../../samples/heterogeneous/deepseek-coder-7b-v100-podautoscaler.yaml
    :language: yaml
-
 Miscellaneous
 -------------
 
-A new label label ``model.aibrix.ai/min_replicas`` is added to specifies the minimum number of replicas to maintain when there is no workload. We recommend setting this to 1 for at least one Deployment spec to ensure there is always one READY pod available. For example, while the GPU optimizer might recommend 0 replicas for an a10 GPU during periods of no activity, setting ``model.aibrix.ai/min_replicas: "1"`` will maintain one a10 replica. This label only affects the system when there is no workload - it is ignored when there are active requests.
+A new label ``model.aibrix.ai/min_replicas`` is added to specify the minimum number of replicas to maintain when there is no workload. We recommend setting this to 1 for at least one Deployment spec to ensure there is always one READY pod available. For example, while the GPU optimizer might recommend 0 replicas for a v100 GPU during periods of no activity, setting ``model.aibrix.ai/min_replicas: "1"`` will maintain one v100 replica. This label only affects the system when there is no workload - it is ignored when there are active requests.
 
 .. code-block:: yaml
 
   apiVersion: apps/v1
   kind: Deployment
   metadata:
-    name: deepseek-coder-7b-a10
+    name: deepseek-coder-7b-v100
     labels:
       model.aibrix.ai/name: "deepseek-coder-7b"
       model.aibrix.ai/min_replicas: "1" # min replica for gpu optimizer when no workloads.
   ... rest yaml deployments
 
-Important: The ``minReplicas`` field in the PodAutoscaler spec must be set to 0 to allow proper scaling behavior. Setting it to any value greater than 0 will interfere with the GPU optimizer's scaling decisions. For instance, if the GPU optimizer determines an optimal configuration of ``{a10: 0, l20: 4}`` but the a10 PodAutoscaler has ``minReplicas: 1``, the system won't be able to scale the a10 down to 0 as recommended.
+Important: The ``minReplicas`` field in the PodAutoscaler spec must be set to 0 to allow proper scaling behavior. Setting it to any value greater than 0 will interfere with the GPU optimizer's scaling decisions. For instance, if the GPU optimizer determines an optimal configuration of ``{v100: 0, l20: 4}`` but the v100 PodAutoscaler has ``minReplicas: 1``, the system won't be able to scale the v100 down to 0 as recommended.
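To make the ``targetValue`` guidance above concrete: assuming the PodAutoscaler applies the usual HPA-style rule of ceil(metric / target), and reading ``vllm:deployment_replicas`` as hundredths of a replica, a target of ``"100"`` rounds the optimizer's recommendation up to whole pods, while a smaller target leaves headroom for bursts. The helper below is only an illustrative sketch, not part of the AIBrix codebase.

.. code-block:: python

    import math

    def desired_replicas(metric_value: float, target_value: float, min_replicas: int = 0) -> int:
        """Illustrative HPA-style rule: scale until metric_value / replicas <= target_value."""
        return max(min_replicas, math.ceil(metric_value / target_value))

    # Hypothetical reading: the optimizer asks for 2.5 replicas' worth of capacity (250 hundredths).
    print(desired_replicas(250, 100))  # 3 pods with targetValue "100"
    print(desired_replicas(250, 80))   # 4 pods with a smaller target, ~25% headroom for bursts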
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/load_reader.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/load_reader.py
index 3ffcf9ad..eb05350d 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/load_reader.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/load_reader.py
@@ -14,6 +14,7 @@
 
 import json
 import logging
+import math
 import re
 from datetime import datetime
 from typing import Any, List, Optional, Protocol, Tuple, Union
@@ -147,7 +148,12 @@ class WorkloadReader:
 
     def __init__(self, filepath, scale: float = 1.0, interval: int = 10) -> None:
         if filepath != unittest_filepath:
-            self.df = pd.read_json(filepath)
+            try:
+                self.df = pd.read_json(filepath)
+            except Exception:
+                self.df = pd.read_json(filepath, lines=True)
+                self.df["Timestamp"] = self.df["timestamp"]
+                self.df["Requests"] = self.df["requests"]
         self.scale = scale
         self.interval = interval
 
@@ -180,6 +186,10 @@ def read(self, ts: float = 0.0) -> Tuple[List[LoadRecord], float]:
             self.log2_aggregate(self.tick_df["Prompt Length"] * self.scale, 1),
             self.log2_aggregate(self.tick_df["Output Length"] * self.scale, 1),
         ):
+            # Unlikely, just in case.
+            if math.isinf(output_tokens) or math.isinf(input_tokens):
+                continue
+
             records.append(
                 LoadRecord(
                     (self.tick - self.start),
diff --git a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
index e241323a..b8314090 100644
--- a/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
+++ b/python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
@@ -106,6 +106,7 @@ def __init__(
         deployment: Optional[DeploymentStates] = None,
         namespace: Optional[str] = None,
         profile_reader: Optional[ProfileReader] = None,
+        gpu_fraction: float = 100.0,
         debug: bool = False,
     ):
         """Initialize the model monitor.
@@ -119,6 +120,7 @@
             replicas: (optional) The initial number of replicas for the model deployment.
             interval: (optional) The interval (in seconds) at which to monitor the model. Defaults to 10 seconds.
             window: (optional) The window (in seconds) to consider for clustering. Defaults to 240 seconds.
+            gpu_fraction: (optional) The number of fraction units that one GPU is counted as. Defaults to 100.
             debug: (optional) Whether to enable debugging behavior. Defaults to False.
""" self.model_name = model_name @@ -129,6 +131,7 @@ def __init__( self.debug = debug self.done = False self.window = float(window) + self.gpu_fraction = gpu_fraction self._lock = threading.Lock() # Load reader @@ -139,7 +142,7 @@ def __init__( # Optimizer self._profiles: Dict[str, GPUProfile] = {} - self._optimizer = Optimizer() + self._optimizer = Optimizer(self.gpu_fraction) # Monitor states self._centers: Iterable[Centeroid] = Empty_Array @@ -276,6 +279,7 @@ def load_profiles(self, profile_reader: Optional[ProfileReader] = None) -> bool: profiles = profile_reader.read() for profile in profiles: + profile.cost /= self.gpu_fraction if self._update_profile(profile): logger.debug(f"Profile of {profile.gpu} updated.") diff --git a/python/aibrix/aibrix/gpu_optimizer/optimizer/optimizer.py b/python/aibrix/aibrix/gpu_optimizer/optimizer/optimizer.py index 16ea4e89..9959d6dc 100644 --- a/python/aibrix/aibrix/gpu_optimizer/optimizer/optimizer.py +++ b/python/aibrix/aibrix/gpu_optimizer/optimizer/optimizer.py @@ -27,11 +27,14 @@ class Optimizer: - def __init__(self, profiles: Optional[Iterable[GPUProfile]] = None): + def __init__( + self, gpu_fraction: float, profiles: Optional[Iterable[GPUProfile]] = None + ): self._config = MelangConfig() self._workload_distribution_template: Optional[np.ndarray] = None self._indexes: Optional[list] = None # Values ticks of tputs columns and rows self._log_indexes: Optional[list] = None # Cache the log2 value of index + self._gpu_fraction = gpu_fraction if profiles is not None: for profile in profiles: self.set_profile(profile) @@ -73,7 +76,7 @@ def set_workload_distribution( self._workload_distribution_template.fill(0) # Maintain the overall request scale disregard some request are not covered. - self._config.total_request_rate = total_request_rate + self._config.total_request_rate = total_request_rate * self._gpu_fraction # covered_request_rate is used to calculate the workload distribution. covered_request_rate = reduce( lambda cnt, center: cnt + center.rate, profiles, 0.0 @@ -82,7 +85,8 @@ def set_workload_distribution( for profile in profiles: try: signature = self._validate_workload_signature(profile) - self._workload_distribution_template[signature] = ( + # Merge possible multiple patterns (out of range patterns coinincident with border patterns) + self._workload_distribution_template[signature] += ( profile.rate / covered_request_rate ) # type: ignore logger.debug( diff --git a/samples/heterogeneous/deepseek-coder-7b-l20-deployment.yaml b/samples/heterogeneous/deepseek-coder-7b-l20-deployment.yaml index d36816ed..3f9568b1 100644 --- a/samples/heterogeneous/deepseek-coder-7b-l20-deployment.yaml +++ b/samples/heterogeneous/deepseek-coder-7b-l20-deployment.yaml @@ -3,7 +3,6 @@ kind: Deployment metadata: labels: adapter.model.aibrix.ai/enabled: "true" - model.aibrix.ai/min_replicas: "4" model.aibrix.ai/name: deepseek-coder-7b model.aibrix.ai/port: "8000" model.aibrix.ai/min_replicas: "1" # min replica when there is no workloads. diff --git a/samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml b/samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml index fd76d796..a8ecefdf 100644 --- a/samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml +++ b/samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml @@ -15,8 +15,8 @@ spec: path: /metrics/default/deepseek-coder-7b-l20 protocolType: http targetMetric: vllm:deployment_replicas - targetValue: "1" - minReplicas: 1 + targetValue: "100" # For stable workloads. 
Set to a fraction to tolerate bursts. + minReplicas: 0 scaleTargetRef: apiVersion: apps/v1 kind: Deployment diff --git a/samples/heterogeneous/deepseek-coder-7b-v100-podautoscaler.yaml b/samples/heterogeneous/deepseek-coder-7b-v100-podautoscaler.yaml index a3de4cd6..8c80ab1a 100644 --- a/samples/heterogeneous/deepseek-coder-7b-v100-podautoscaler.yaml +++ b/samples/heterogeneous/deepseek-coder-7b-v100-podautoscaler.yaml @@ -15,7 +15,7 @@ spec: path: /metrics/default/deepseek-coder-7b-v100 protocolType: http targetMetric: vllm:deployment_replicas - targetValue: "1" + targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts. minReplicas: 0 scaleTargetRef: apiVersion: apps/v1 diff --git a/samples/heterogeneous/kustomization.yaml b/samples/heterogeneous/kustomization.yaml new file mode 100644 index 00000000..a889052e --- /dev/null +++ b/samples/heterogeneous/kustomization.yaml @@ -0,0 +1,32 @@ +kind: Kustomization + +resources: +- deepseek-coder-7b-service.yaml +- deepseek-coder-7b-l20-deployment.yaml +- deepseek-coder-7b-l20-podautoscaler.yaml +- deepseek-coder-7b-v100-deployment.yaml +- deepseek-coder-7b-v100-podautoscaler.yaml + +patches: +- patch: |- # Use the '|' and '-' for inline patching, warm up 10 hosts and start with 7 + apiVersion: apps/v1 + kind: Deployment + metadata: + name: deepseek-coder-7b-v100 + labels: + model.aibrix.ai/min_replicas: "1" + target: + kind: Deployment + name: deepseek-coder-7b-v100 +- patch: |- # Use the '|' and '-' for inline patching, warm up 10 hosts and start with 7 + apiVersion: apps/v1 + kind: Deployment + metadata: + name: deepseek-coder-7b-l20 + labels: + model.aibrix.ai/min_replicas: "0" + target: + kind: Deployment + name: deepseek-coder-7b-l20 + +apiVersion: kustomize.config.k8s.io/v1beta1 \ No newline at end of file
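For readers tracing the ``gpu_fraction`` plumbing above: dividing each profile's cost by ``gpu_fraction`` in the monitor while multiplying the total request rate by the same factor in the optimizer appears intended to let the solver answer in hundredths of a replica rather than whole GPUs, which the sample PodAutoscalers' ``targetValue: "100"`` then converts back into pods. A minimal sketch with made-up numbers (not the real AIBrix profiles or Melange solver):

.. code-block:: python

    import math

    # Hypothetical profile numbers for one GPU type; not taken from real profiles.
    per_gpu_throughput = 10.0  # requests/s one replica can serve within SLO
    per_gpu_cost = 2.0         # relative cost of one replica
    request_rate = 37.0        # aggregate request rate seen by the load monitor
    gpu_fraction = 100.0       # one replica is counted as 100 fraction units

    # Whole-replica view: the answer can only move in steps of one replica.
    whole_units = math.ceil(request_rate / per_gpu_throughput)                 # 4
    whole_cost = whole_units * per_gpu_cost                                    # 8.0

    # Fractional view: rate scaled up and per-unit cost scaled down by gpu_fraction.
    frac_units = math.ceil(request_rate * gpu_fraction / per_gpu_throughput)   # 370
    frac_cost = frac_units * (per_gpu_cost / gpu_fraction)                     # 7.4

    print(frac_units)                   # finer-grained metric, in hundredths of a replica
    print(frac_units / gpu_fraction)    # 3.7 replicas' worth of capacity
    print(math.ceil(frac_units / 100))  # 4 pods once targetValue "100" is applied

The finer granularity mainly matters when comparing costs across GPU types; the final pod count is still rounded up by the PodAutoscaler.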