feat: Add Ray head Pod high availability with Redis (awslabs#555)
Co-authored-by: Bryant Biggs <[email protected]>
ratnopamc and bryantbiggs authored Jun 12, 2024
1 parent a84f3b2 commit 9e52517
Showing 14 changed files with 464 additions and 1 deletion.
54 changes: 54 additions & 0 deletions ai-ml/trainium-inferentia/elastic-cache-redis.tf
@@ -0,0 +1,54 @@
module "elasticache" {
create = var.enable_rayserve_ha_elastic_cache_redis
source = "terraform-aws-modules/elasticache/aws"
version = "1.2.0"

cluster_id = local.name
create_cluster = true
create_replication_group = false

engine_version = "7.1"
node_type = "cache.t4g.small"

apply_immediately = true

# Security Group
vpc_id = module.vpc.vpc_id
security_group_rules = {
ingress_vpc = {
# Default type is `ingress`
# Default port is based on the default engine port
description = "VPC traffic"
cidr_ipv4 = module.vpc.vpc_cidr_block
}

ingress_from_eks_worker_node_tcp = {
description = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node"
protocol = "tcp"
from_port = 6379
referenced_security_group_id = module.eks.node_security_group_id
to_port = 6379
type = "ingress"
}
}

# Subnet Group
subnet_group_name = local.name
subnet_group_description = "${title(local.name)} subnet group"
subnet_ids = module.vpc.private_subnets

# Parameter Group
create_parameter_group = true
parameter_group_name = local.name
parameter_group_family = "redis7"
parameter_group_description = "${title(local.name)} parameter group"
parameters = [
{
name = "latency-tracking"
value = "yes"
}
]

tags = local.tags

}
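
To sanity-check the cache cluster after `terraform apply`, a minimal sketch (assuming the AWS CLI is configured for the same account and region; `<cluster-id>` is a placeholder for the value of `local.name` used above) that looks up the node endpoint the Ray head will later connect to:

```bash
# Sketch only: fetch the endpoint of the single-node cache cluster created above.
aws elasticache describe-cache-clusters \
  --cache-cluster-id <cluster-id> \
  --show-cache-node-info \
  --query 'CacheClusters[0].CacheNodes[0].Endpoint.Address' \
  --output text
```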
5 changes: 5 additions & 0 deletions ai-ml/trainium-inferentia/outputs.tf
@@ -2,3 +2,8 @@ output "configure_kubectl" {
  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
  value       = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}"
}

output "elastic_cache_redis_cluster_arn" {
description = "Cluster arn of the cache cluster"
value = module.elasticache.cluster_arn
}
6 changes: 6 additions & 0 deletions ai-ml/trainium-inferentia/variables.tf
@@ -140,3 +140,9 @@ variable "kms_key_admin_roles" {
  type    = list(string)
  default = []
}

variable "enable_rayserve_ha_elastic_cache_redis" {
description = "Flag to enable Ray Head High Availability with Elastic Cache for Redis"
type = bool
default = false
}
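
Because the flag defaults to `false`, the ElastiCache resources are only created when it is switched on. A minimal sketch of enabling it at apply time with the standard Terraform CLI (variable name taken from this diff):

```bash
# Sketch: enable the Redis-backed Ray head HA path for this blueprint.
terraform apply -var="enable_rayserve_ha_elastic_cache_redis=true"
```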
186 changes: 186 additions & 0 deletions gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml
@@ -0,0 +1,186 @@
#----------------------------------------------------------------------
# NOTE: For deployment instructions, refer to the DoEKS website.
#----------------------------------------------------------------------
apiVersion: v1
kind: Namespace
metadata:
  name: mistral
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token
  namespace: mistral
data:
  hf-token: $HUGGING_FACE_HUB_TOKEN
---
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: mistral
  namespace: mistral
  annotations:
    ray.io/ft-enabled: "true" # enables Global Control Service (GCS) fault tolerance
spec:
  serviceUnhealthySecondThreshold: 900
  deploymentUnhealthySecondThreshold: 300
  serveConfigV2: |
    applications:
      - name: mistral-deployment
        import_path: "ray_serve_mistral:entrypoint"
        route_prefix: "/"
        runtime_env:
          env_vars:
            MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
            NEURON_CC_FLAGS: "-O1"
            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            NEURON_CORES: "2"
        deployments:
          - name: mistral-7b
            autoscaling_config:
              metrics_interval_s: 0.2
              min_replicas: 2
              max_replicas: 12
              look_back_period_s: 2
              downscale_delay_s: 30
              upscale_delay_s: 2
              target_num_ongoing_requests_per_replica: 1
            graceful_shutdown_timeout_s: 5
            max_concurrent_queries: 100
            ray_actor_options:
              num_cpus: 10
              resources: {"neuron_cores": 2}
              runtime_env:
                env_vars:
                  LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
  rayClusterConfig:
    rayVersion: '2.22.0'
    enableInTreeAutoscaling: true
    headGroupSpec:
      headService:
        metadata:
          name: mistral
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
          containers:
            - name: head
              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
              imagePullPolicy: IfNotPresent # Reuse the image cached on the node; pull only if it is missing
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 6379
                  name: redis
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: "2"
                  memory: "20G"
                requests:
                  cpu: "2"
                  memory: "20G"
              env:
                - name: RAY_REDIS_ADDRESS
                  value: $EXT_REDIS_ENDPOINT:6379
                - name: LD_LIBRARY_PATH
                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: hf-token
          nodeSelector:
            instanceType: mixed-x86
            provisionerType: Karpenter
            workload: rayhead
          volumes:
            - name: ray-logs
              emptyDir: {}
    workerGroupSpecs:
      - groupName: inf2
        replicas: 1
        minReplicas: 1
        maxReplicas: 5
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: worker
                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
                imagePullPolicy: Always # Ensure the image is always pulled when updated
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                # Each Serve replica uses 2 NeuronCores, so one inf2.24xlarge
                # (6 Neuron devices / 12 NeuronCores) can host up to 6 replicas concurrently
                resources:
                  limits:
                    cpu: "90" # All vCPUs of inf2.24xlarge (96) minus 6 vCPUs reserved for DaemonSet overhead
                    memory: "360G" # All memory of inf2.24xlarge minus 24G reserved for DaemonSet overhead
                    aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
                  requests:
                    cpu: "90" # All vCPUs of inf2.24xlarge (96) minus 6 vCPUs reserved for DaemonSet overhead
                    memory: "360G" # All memory of inf2.24xlarge minus 24G reserved for DaemonSet overhead
                    aws.amazon.com/neuron: "6" # All 6 Neuron devices (12 NeuronCores) of inf2.24xlarge
                env:
                  - name: LD_LIBRARY_PATH
                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                  - name: HUGGING_FACE_HUB_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token
                        key: hf-token
            nodeSelector:
              instanceType: inferentia-inf2
              provisionerType: Karpenter
            tolerations:
              - key: "aws.amazon.com/neuron"
                operator: "Exists"
                effect: "NoSchedule"
              - key: "hub.jupyter.org/dedicated"
                operator: "Equal"
                value: "user"
                effect: "NoSchedule"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mistral
  namespace: mistral
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: "/$1"
spec:
  ingressClassName: nginx
  rules:
    - http:
        paths:
          # Ray Dashboard
          - path: /dashboard/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: mistral
                port:
                  number: 8265
          # Ray Serve
          - path: /serve/(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: mistral
                port:
                  number: 8000
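
The manifest leaves `$HUGGING_FACE_HUB_TOKEN` and `$EXT_REDIS_ENDPOINT` as placeholders. One possible way to fill them in before applying, sketched here with `envsubst` — the tool choice and placeholder values are assumptions, not something this commit prescribes:

```bash
# Sketch only: substitute the two placeholders and apply the RayService manifest.
# The Secret uses `data:`, so the token value must be base64-encoded.
export HUGGING_FACE_HUB_TOKEN=$(echo -n "<your-hf-token>" | base64)
# Primary endpoint of the ElastiCache Redis cluster created by elastic-cache-redis.tf.
export EXT_REDIS_ENDPOINT="<redis-cluster-endpoint>"
# Restrict envsubst to these two variables so $LD_LIBRARY_PATH and the
# nginx "/$1" rewrite annotation in the manifest are left untouched.
envsubst '$HUGGING_FACE_HUB_TOKEN $EXT_REDIS_ENDPOINT' \
  < ray-service-mistral-ft.yaml | kubectl apply -f -
```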
@@ -61,6 +61,7 @@ spec:
          namespace: mistral
      rayStartParams:
        dashboard-host: '0.0.0.0'
        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
      template:
        spec:
          containers:
1 change: 0 additions & 1 deletion website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -237,7 +237,6 @@ Finally, we'll provide instructions for cleaning up and deprovisioning the resou

**Step 1:** Delete the Gradio app and the Mistral inference deployment


```bash
cd gen-ai/inference/mistral-7b-rayserve-inf2
kubectl delete -f gradio-ui.yaml
Binary file added website/docs/gen-ai/inference/img/answer-1.png