forked from awslabs/data-on-eks
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add Ray head Pod high availability with Redis (awslabs#555)
Co-authored-by: Bryant Biggs <[email protected]>
- Loading branch information
1 parent
a84f3b2
commit 9e52517
Showing
14 changed files
with
464 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# ElastiCache Redis cluster built with the terraform-aws-modules/elasticache module. | ||
# Nothing is created unless var.enable_rayserve_ha_elastic_cache_redis is true. | ||
# Presumably this is the external Redis used for Ray head Pod high availability - confirm against the Ray manifests. | ||
module "elasticache" { | ||
create = var.enable_rayserve_ha_elastic_cache_redis | ||
source = "terraform-aws-modules/elasticache/aws" | ||
version = "1.2.0" | ||
|
||
# Standalone cache cluster; no replication group is created. | ||
# NOTE(review): without a replication group there is presumably no replica/automatic failover - confirm this is acceptable for the HA use case. | ||
cluster_id = local.name | ||
create_cluster = true | ||
create_replication_group = false | ||
|
||
# Engine version must stay compatible with parameter_group_family ("redis7") set below. | ||
engine_version = "7.1" | ||
node_type = "cache.t4g.small" | ||
|
||
# NOTE(review): presumably applies modifications right away instead of waiting for the maintenance window - confirm this is wanted outside of demos. | ||
apply_immediately = true | ||
|
||
# Security Group | ||
vpc_id = module.vpc.vpc_id | ||
security_group_rules = { | ||
# Rule 1: allow traffic from the whole VPC CIDR on the engine's default port. | ||
ingress_vpc = { | ||
# Default type is `ingress` | ||
# Default port is based on the default engine port | ||
description = "VPC traffic" | ||
cidr_ipv4 = module.vpc.vpc_cidr_block | ||
} | ||
|
||
# Rule 2: allow TCP 6379 from the EKS node security group. | ||
# NOTE(review): the VPC-wide rule above may already cover this traffic - confirm both rules are needed. | ||
# NOTE(review): verify the module reads `protocol`; it may expect `ip_protocol` for rule maps. | ||
ingress_from_eks_worker_node_tcp = { | ||
description = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node" | ||
protocol = "tcp" | ||
from_port = 6379 | ||
referenced_security_group_id = module.eks.node_security_group_id | ||
to_port = 6379 | ||
type = "ingress" | ||
} | ||
} | ||
|
||
# Subnet Group | ||
# Cache nodes are placed in the VPC's private subnets. | ||
subnet_group_name = local.name | ||
subnet_group_description = "${title(local.name)} subnet group" | ||
subnet_ids = module.vpc.private_subnets | ||
|
||
# Parameter Group | ||
# Dedicated parameter group that sets the single `latency-tracking` parameter to "yes". | ||
create_parameter_group = true | ||
parameter_group_name = local.name | ||
parameter_group_family = "redis7" | ||
parameter_group_description = "${title(local.name)} parameter group" | ||
parameters = [ | ||
{ | ||
name = "latency-tracking" | ||
value = "yes" | ||
} | ||
] | ||
|
||
tags = local.tags | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
186 changes: 186 additions & 0 deletions
186
gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral-ft.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
#---------------------------------------------------------------------- | ||
# NOTE: For deployment instructions, refer to the DoEKS website. | ||
#---------------------------------------------------------------------- | ||
# Namespace that the Secret, RayService and Ingress in this file are created in. | ||
apiVersion: v1 | ||
kind: Namespace | ||
metadata: | ||
name: mistral | ||
--- | ||
# Secret read by the Ray head and worker containers as HUGGING_FACE_HUB_TOKEN (via secretKeyRef hf-token/hf-token). | ||
# $HUGGING_FACE_HUB_TOKEN is a placeholder; presumably it is substituted before the manifest is applied - confirm. | ||
# NOTE(review): values under `data` must be base64-encoded; make sure the substituted value already is. | ||
apiVersion: v1 | ||
kind: Secret | ||
metadata: | ||
name: hf-token | ||
namespace: mistral | ||
data: | ||
hf-token: $HUGGING_FACE_HUB_TOKEN | ||
--- | ||
# RayService "mistral": a Ray Serve application plus the Ray cluster (one head group, one inf2 worker group) that runs it. | ||
apiVersion: ray.io/v1 | ||
kind: RayService | ||
metadata: | ||
name: mistral | ||
namespace: mistral | ||
annotations: | ||
ray.io/ft-enabled: "true" # enables Global Control Service(GCS) fault tolerance | ||
spec: | ||
serviceUnhealthySecondThreshold: 900 | ||
deploymentUnhealthySecondThreshold: 300 | ||
# The Ray Serve config follows as a literal block string; comments are deliberately not added inside it. | ||
# It declares one application (mistral-deployment) with one autoscaled deployment (mistral-7b, 2 to 12 replicas). | ||
# NOTE(review): the LD_LIBRARY_PATH values reference $LD_LIBRARY_PATH - confirm something expands it and that pre-apply templating does not blank it. | ||
serveConfigV2: | | ||
applications: | ||
- name: mistral-deployment | ||
import_path: "ray_serve_mistral:entrypoint" | ||
route_prefix: "/" | ||
runtime_env: | ||
env_vars: | ||
MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2" | ||
NEURON_CC_FLAGS: "-O1" | ||
LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH" | ||
NEURON_CORES: "2" | ||
deployments: | ||
- name: mistral-7b | ||
autoscaling_config: | ||
metrics_interval_s: 0.2 | ||
min_replicas: 2 | ||
max_replicas: 12 | ||
look_back_period_s: 2 | ||
downscale_delay_s: 30 | ||
upscale_delay_s: 2 | ||
target_num_ongoing_requests_per_replica: 1 | ||
graceful_shutdown_timeout_s: 5 | ||
max_concurrent_queries: 100 | ||
ray_actor_options: | ||
num_cpus: 10 | ||
resources: {"neuron_cores": 2} | ||
runtime_env: | ||
env_vars: | ||
LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH" | ||
# Ray cluster definition: Ray version, autoscaling, head group and worker groups. | ||
rayClusterConfig: | ||
rayVersion: '2.22.0' | ||
enableInTreeAutoscaling: true | ||
headGroupSpec: | ||
# The head Service is named "mistral"; the Ingress in this file routes to a Service of that name. | ||
headService: | ||
metadata: | ||
name: mistral | ||
namespace: mistral | ||
rayStartParams: | ||
dashboard-host: '0.0.0.0' | ||
num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod | ||
template: | ||
spec: | ||
containers: | ||
- name: head | ||
image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest | ||
imagePullPolicy: IfNotPresent # Reuses an image already cached on the node. NOTE(review): the worker uses Always and the tag is :latest, so head and workers could run different builds - confirm intended | ||
# Run `ray stop` before the container is terminated. | ||
lifecycle: | ||
preStop: | ||
exec: | ||
command: ["/bin/sh", "-c", "ray stop"] | ||
ports: | ||
- containerPort: 6379 | ||
name: redis | ||
- containerPort: 8265 | ||
name: dashboard | ||
- containerPort: 10001 | ||
name: client | ||
- containerPort: 8000 | ||
name: serve | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
# Requests equal limits for both CPU and memory. | ||
resources: | ||
limits: | ||
cpu: "2" | ||
memory: "20G" | ||
requests: | ||
cpu: "2" | ||
memory: "20G" | ||
env: | ||
# Address of the external Redis, on port 6379. | ||
# $EXT_REDIS_ENDPOINT is a placeholder not defined in this manifest; presumably substituted before apply - confirm. | ||
- name: RAY_REDIS_ADDRESS | ||
value: $EXT_REDIS_ENDPOINT:6379 | ||
- name: LD_LIBRARY_PATH | ||
value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH" | ||
- name: HUGGING_FACE_HUB_TOKEN | ||
valueFrom: | ||
secretKeyRef: | ||
name: hf-token | ||
key: hf-token | ||
# Head Pod only schedules onto nodes carrying all three labels below. | ||
nodeSelector: | ||
instanceType: mixed-x86 | ||
provisionerType: Karpenter | ||
workload: rayhead | ||
volumes: | ||
- name: ray-logs | ||
emptyDir: {} | ||
workerGroupSpecs: | ||
# Worker group "inf2": starts with 1 replica, bounded between 1 and 5. | ||
- groupName: inf2 | ||
replicas: 1 | ||
minReplicas: 1 | ||
maxReplicas: 5 | ||
rayStartParams: {} | ||
template: | ||
spec: | ||
containers: | ||
- name: worker | ||
image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest | ||
imagePullPolicy: Always # Ensure the image is always pulled when updated | ||
lifecycle: | ||
preStop: | ||
exec: | ||
command: ["/bin/sh", "-c", "ray stop"] | ||
# Each Serve replica reserves 2 neuron_cores (see ray_actor_options in serveConfigV2). NOTE(review): the earlier "6 requests per second" claim is not shown by this config; it looks like a per-worker replica count, not a rate - confirm | ||
resources: | ||
limits: | ||
cpu: "90" # All vCPUs of inf2.24xlarge; 6vCPU daemonset overhead | ||
memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead | ||
aws.amazon.com/neuron: "6" # All Neuron devices of inf2.24xlarge. NOTE(review): this resource presumably counts devices, not cores - confirm | ||
requests: | ||
cpu: "90" # All vCPUs of inf2.24xlarge; 6vCPU daemonset overhead | ||
memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead | ||
aws.amazon.com/neuron: "6" # All Neuron devices of inf2.24xlarge. NOTE(review): this resource presumably counts devices, not cores - confirm | ||
env: | ||
- name: LD_LIBRARY_PATH | ||
value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH" | ||
- name: HUGGING_FACE_HUB_TOKEN | ||
valueFrom: | ||
secretKeyRef: | ||
name: hf-token | ||
key: hf-token | ||
nodeSelector: | ||
instanceType: inferentia-inf2 | ||
provisionerType: Karpenter | ||
# Tolerate the NoSchedule taints with the keys below so workers can be scheduled onto nodes that carry them. | ||
tolerations: | ||
- key: "aws.amazon.com/neuron" | ||
operator: "Exists" | ||
effect: "NoSchedule" | ||
- key: "hub.jupyter.org/dedicated" | ||
operator: "Equal" | ||
value: "user" | ||
effect: "NoSchedule" | ||
--- | ||
# NGINX Ingress exposing the "mistral" Service: /dashboard/* goes to port 8265 and /serve/* goes to port 8000. | ||
apiVersion: networking.k8s.io/v1 | ||
kind: Ingress | ||
metadata: | ||
name: mistral | ||
namespace: mistral | ||
annotations: | ||
# "$1" is the (.*) capture from the matching path below, so the /dashboard or /serve prefix is dropped before forwarding. | ||
nginx.ingress.kubernetes.io/rewrite-target: "/$1" | ||
spec: | ||
ingressClassName: nginx | ||
rules: | ||
- http: | ||
paths: | ||
# Ray Dashboard | ||
- path: /dashboard/(.*) | ||
pathType: ImplementationSpecific | ||
backend: | ||
service: | ||
name: mistral | ||
port: | ||
number: 8265 | ||
# Ray Serve | ||
- path: /serve/(.*) | ||
pathType: ImplementationSpecific | ||
backend: | ||
service: | ||
name: mistral | ||
port: | ||
number: 8000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.