Add image build details and examples for multi-host inference (#278)
Add downstream details to support multi-host inference
Jeffwan authored Oct 9, 2024
1 parent 436be0b commit 85a7411
Showing 3 changed files with 117 additions and 0 deletions.
24 changes: 24 additions & 0 deletions config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
@@ -31,10 +31,34 @@ spec:
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: service
resources:
limits:
cpu: 1
memory: "1024Mi"
requests:
cpu: 1
memory: "1024Mi"
workerGroupSpecs:
- replicas: 1
minReplicas: 1
maxReplicas: 5
groupName: small-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:2.10.0
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: 1
memory: "1024Mi"
requests:
cpu: 1
memory: "1024Mi"
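
A quick way to exercise the updated sample, assuming the AIBrix operator and its CRDs are already installed in the cluster (a sketch, not part of the commit):

```
# Apply the updated sample and watch the head/worker pods come up
kubectl apply -f config/samples/orchestration_v1alpha1_rayclusterreplicaset.yaml
kubectl get pods -w
```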
19 changes: 19 additions & 0 deletions docs/tutorial/distributed/README.md
@@ -1,5 +1,24 @@
# Run vLLM Distributed Inference with Ray

## Container Image

> Note: some upstream work has not been merged yet, so a few downstream changes are required.
```
FROM vllm/vllm-openai:v0.6.2
RUN apt update && apt install -y wget # needed for the health check configured later
RUN pip3 install "ray[default]" # needed for the health check configured later
COPY utils.py /usr/local/lib/python3.12/dist-packages/vllm/executor/ray_utils.py
ENTRYPOINT [""]
```

> Note: copy utils.py from the upstream version and remove the placement group validation logic. See [#228](https://github.com/aibrix/aibrix/issues/228) for more details.
> Note: there is no need to downgrade Ray to v2.10.0; only the rayproject/ray image seems to have issues.
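
With utils.py patched as described above, a minimal build-and-push sketch; the tag mirrors the distributed image listed below, and push access to that registry is assumed:

```
# Build the downstream vLLM image (the patched utils.py must be in the build context)
docker build -t aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed .
docker push aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
```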
The following container image combination supports distributed multi-host inference (see the sketch after the list for pointing the operator at the patched image):
- aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/kuberay-operator:v1.2.1-patch
- aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.6.2-distributed
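
A minimal sketch for switching an existing KubeRay operator deployment to the patched image; the `kuberay-system` namespace and the deployment/container names are assumptions about your install:

```
kubectl -n kuberay-system set image deployment/kuberay-operator \
  kuberay-operator=aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/kuberay-operator:v1.2.1-patch
```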

## Environment Setup

### Configure the GPU Cloud Instance
74 changes: 74 additions & 0 deletions docs/tutorial/distributed/fleet.yaml
@@ -0,0 +1,74 @@
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: RayClusterFleet
metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
name: facebook-opt-13b
spec:
replicas: 1
selector:
matchLabels:
models.aibricks.ai: facebook-opt-13b
strategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 25%
type: RollingUpdate
template:
metadata:
labels:
models.aibricks.ai: facebook-opt-13b
annotations:
ray.io/overwrite-container-cmd: "true"
spec:
rayVersion: '2.10.0' # should match the Ray version in the image of the containers
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
template:
spec:
containers:
- name: ray-head
image: rayproject/ray:2.10.0
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: service
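          # ray.io/overwrite-container-cmd is "true", so KubeRay injects its generated
          # `ray start` command via $KUBERAY_GEN_RAY_START_CMD; vllm serve then runs on the head.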
command: ["/bin/bash", "-lc", "--"]
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD"; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
requests:
cpu: "200m"
nvidia.com/gpu: 1
workerGroupSpecs:
# the pod replicas in this worker group
- replicas: 1
minReplicas: 1
maxReplicas: 5
groupName: small-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray:2.10.0
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: "1000m"
nvidia.com/gpu: 1
requests:
cpu: "200m"
nvidia.com/gpu: 1
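
A deploy-and-smoke-test sketch, assuming the operator and CRDs are installed; the head pod name below is hypothetical (look it up with `kubectl get pods`):

```
kubectl apply -f docs/tutorial/distributed/fleet.yaml
kubectl get pods -w   # wait for the head and worker pods to be Running

# Forward the vLLM service port from the head pod (pod name is illustrative)
kubectl port-forward pod/facebook-opt-13b-head-xxxxx 8000:8000 &

# The fleet serves facebook/opt-125m via the OpenAI-compatible API
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "facebook/opt-125m", "prompt": "Hello, my name is", "max_tokens": 16}'
```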
