feat: Add Ray head Pod high availability with Redis (awslabs#555)
Co-authored-by: Bryant Biggs <[email protected]>
ratnopamc and bryantbiggs authored Jun 12, 2024
1 parent a84f3b2 commit 9e52517
Showing 14 changed files with 464 additions and 1 deletion.
54 changes: 54 additions & 0 deletions ai-ml/trainium-inferentia/elastic-cache-redis.tf
@@ -0,0 +1,54 @@
module "elasticache" {
create = var.enable_rayserve_ha_elastic_cache_redis
source = "terraform-aws-modules/elasticache/aws"
version = "1.2.0"

cluster_id = local.name
create_cluster = true
create_replication_group = false

engine_version = "7.1"
node_type = "cache.t4g.small"

apply_immediately = true

# Security Group
vpc_id = module.vpc.vpc_id
security_group_rules = {
ingress_vpc = {
# Default type is `ingress`
# Default port is based on the default engine port
description = "VPC traffic"
cidr_ipv4 = module.vpc.vpc_cidr_block
}

ingress_from_eks_worker_node_tcp = {
description = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node"
protocol = "tcp"
from_port = 6379
referenced_security_group_id = module.eks.node_security_group_id
to_port = 6379
type = "ingress"
}
}

# Subnet Group
subnet_group_name = local.name
subnet_group_description = "${title(local.name)} subnet group"
subnet_ids = module.vpc.private_subnets

# Parameter Group
create_parameter_group = true
parameter_group_name = local.name
parameter_group_family = "redis7"
parameter_group_description = "${title(local.name)} parameter group"
parameters = [
{
name = "latency-tracking"
value = "yes"
}
]

tags = local.tags

}
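
To sanity-check the cache cluster after `terraform apply`, a minimal sketch (assuming the AWS CLI is configured for the same account and region; `<cluster-id>` is a placeholder for the value of `local.name` used above) that looks up the node endpoint the Ray head will later connect to:

```bash
# Sketch only: fetch the endpoint of the single-node cache cluster created above.
aws elasticache describe-cache-clusters \
  --cache-cluster-id <cluster-id> \
  --show-cache-node-info \
  --query 'CacheClusters[0].CacheNodes[0].Endpoint.Address' \
  --output text
```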
5 changes: 5 additions & 0 deletions ai-ml/trainium-inferentia/outputs.tf
@@ -2,3 +2,8 @@ output "configure_kubectl" {
  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
  value       = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}"
}

output "elastic_cache_redis_cluster_arn" {
description = "Cluster arn of the cache cluster"
value = module.elasticache.cluster_arn
}
6 changes: 6 additions & 0 deletions ai-ml/trainium-inferentia/variables.tf
@@ -140,3 +140,9 @@ variable "kms_key_admin_roles" {
  type    = list(string)
  default = []
}

variable "enable_rayserve_ha_elastic_cache_redis" {
description = "Flag to enable Ray Head High Availability with Elastic Cache for Redis"
type = bool
default = false
}
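
Because the flag defaults to `false`, the ElastiCache resources are only created when it is switched on. A minimal sketch of enabling it at apply time with the standard Terraform CLI (variable name taken from this diff):

```bash
# Sketch: enable the Redis-backed Ray head HA path for this blueprint.
terraform apply -var="enable_rayserve_ha_elastic_cache_redis=true"
```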
186 changes: 186 additions & 0 deletions gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml
@@ -0,0 +1,186 @@
#----------------------------------------------------------------------
# NOTE: For deployment instructions, refer to the DoEKS website.
#----------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: mistral
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
  namespace: mistral
data:
  hf-token: $HUGGING_FACE_HUB_TOKEN
---
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: mistral
  namespace: mistral
  annotations:
    ray.io/ft-enabled: "true" # enables Global Control Service (GCS) fault tolerance
spec:
  serviceUnhealthySecondThreshold: 900
  deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: mistral-deployment
        import_path: "ray_serve_mistral:entrypoint"
        route_prefix: "/"
        runtime_env:
          env_vars:
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            NEURON_CC_FLAGS: "-O1"
            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            NEURON_CORES: "2"
        deployments:
          - name: mistral-7b
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 2
              max_replicas: 12
              look_back_period_s: 2
              downscale_delay_s: 30
              upscale_delay_s: 2
              target_num_ongoing_requests_per_replica: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 10
              resources: {"neuron_cores": 2}
              runtime_env:
                env_vars:
                  LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
  rayClusterConfig:
    rayVersion: '2.22.0'
    enableInTreeAutoscaling: true
    headGroupSpec:
      headService:
        metadata:
          name: mistral
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
              imagePullPolicy: IfNotPresent # Reuse the image cached on the node; pull only if it is missing
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 6379
                  name: redis
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: "2"
                  memory: "20G"
                requests:
                  cpu: "2"
                  memory: "20G"
              env:
                - name: RAY_REDIS_ADDRESS
                  value: $EXT_REDIS_ENDPOINT:6379
                - name: LD_LIBRARY_PATH
                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: hf-token
          nodeSelector:
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
            - name: ray-logs
              emptyDir: {}
    workerGroupSpecs:
      - groupName: inf2
        replicas: 1
        minReplicas: 1
        maxReplicas: 5
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
                imagePullPolicy: Always # Ensure the image is always pulled when updated
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                # Each Serve replica uses 2 NeuronCores, so one inf2.24xlarge
                # (6 Neuron devices / 12 NeuronCores) can host up to 6 replicas concurrently
                resources:
                  limits:
                    cpu: "90" # All vCPUs of inf2.24xlarge (96) minus 6 vCPUs reserved for DaemonSet overhead
                    memory: "360G" # All memory of inf2.24xlarge minus 24G reserved for DaemonSet overhead
                    aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
                  requests:
                    cpu: "90" # All vCPUs of inf2.24xlarge (96) minus 6 vCPUs reserved for DaemonSet overhead
                    memory: "360G" # All memory of inf2.24xlarge minus 24G reserved for DaemonSet overhead
                    aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
                env:
                  - name: LD_LIBRARY_PATH
                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token
                        key: hf-token
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
            tolerations:
              - key: "aws.amazon.com/neuron"
                operator: "Exists"
                effect: "NoSchedule"
              - key: "hub.jupyter.org/dedicated"
                operator: "Equal"
                value: "user"
                effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mistral
  namespace: mistral
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: "/$1"
spec:
  ingressClassName: nginx
  rules:
    - http:
        paths:
          # Ray Dashboard
          - path: /dashboard/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: mistral
                port:
                  number: 8265
          # Ray Serve
          - path: /serve/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: mistral
                port:
                  number: 8000
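
The manifest leaves `$HUGGING_FACE_HUB_TOKEN` and `$EXT_REDIS_ENDPOINT` as placeholders. One possible way to fill them in before applying, sketched here with `envsubst` — the tool choice and placeholder values are assumptions, not something this commit prescribes:

```bash
# Sketch only: substitute the two placeholders and apply the RayService manifest.
# The Secret uses `data:`, so the token value must be base64-encoded.
export HUGGING_FACE_HUB_TOKEN=$(echo -n "<your-hf-token>" | base64)
# Primary endpoint of the ElastiCache Redis cluster created by elastic-cache-redis.tf.
export EXT_REDIS_ENDPOINT="<redis-cluster-endpoint>"
# Restrict envsubst to these two variables so $LD_LIBRARY_PATH and the
# nginx "/$1" rewrite annotation in the manifest are left untouched.
envsubst '$HUGGING_FACE_HUB_TOKEN $EXT_REDIS_ENDPOINT' \
  < ray-service-mistral-ft.yaml | kubectl apply -f -
```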
@@ -61,6 +61,7 @@ spec:
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
          containers:
1 change: 0 additions & 1 deletion website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -237,7 +237,6 @@ Finally, we'll provide instructions for cleaning up and deprovisioning the resou

**Step 1:** Delete the Gradio app and the Mistral inference deployment


```bash
cd gen-ai/inference/mistral-7b-rayserve-inf2
kubectl delete -f gradio-ui.yaml
Binary file added website/docs/gen-ai/inference/img/answer-1.png