forked from rh-aiservices-bu/models-aas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
granite-code-vllm-raw.yaml
84 lines (84 loc) · 2.09 KB
/
granite-code-vllm-raw.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
---
# ServingRuntime: vLLM (RHOAI 2.12 image) serving runtime template with a
# 6144-token max model length. Exposes an OpenAI-compatible REST API on 8080.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  annotations:
    opendatahub.io/apiProtocol: REST
    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
    openshift.io/display-name: "vLLM-RHOAI 2.12-max-len: 6144"
    opendatahub.io/template-display-name: "vLLM-RHOAI 2.12-max-len: 6144"
    opendatahub.io/template-name: vllm-2.12-6144
  name: vllm-2.12-6144
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  annotations:
    # Scrape vLLM's Prometheus metrics from the API server port.
    prometheus.io/path: /metrics
    prometheus.io/port: '8080'
  containers:
    - args:
        - --port=8080
        # KServe mounts the model artifacts at /mnt/models.
        - --model=/mnt/models
        # {{.Name}} is templated by KServe with the InferenceService name.
        - --served-model-name={{.Name}}
        - --distributed-executor-backend=mp
        # Context window cap; must fit in GPU memory for the chosen model.
        - --max-model-len=6144
      command:
        - python
        - '-m'
        - vllm.entrypoints.openai.api_server
      env:
        # Redirect the Hugging Face cache to a writable location.
        - name: HF_HOME
          value: /tmp/hf_home
      image: 'quay.io/modh/vllm:rhoai-2.12'
      name: kserve-container
      ports:
        - containerPort: 8080
          protocol: TCP
      volumeMounts:
        # Shared memory for vLLM's multiprocessing executor backend.
        - mountPath: /dev/shm
          name: shm
  # One model per runtime instance.
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: vLLM
  volumes:
    # tmpfs-backed /dev/shm; 2Gi limit for inter-process tensor transfer.
    - emptyDir:
        medium: Memory
        sizeLimit: 2Gi
      name: shm
---
# InferenceService: deploys IBM Granite 8B Code Instruct (128k) on the
# vllm-2.12-6144 ServingRuntime as a raw Kubernetes Deployment (no Knative).
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  annotations:
    openshift.io/display-name: Granite-8b-code-instruct-128k
    # RawDeployment bypasses Knative serverless (no scale-to-zero).
    serving.kserve.io/deploymentMode: RawDeployment
  name: granite-8b-code-instruct-128k
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  predictor:
    # Pin to exactly one replica.
    maxReplicas: 1
    minReplicas: 1
    model:
      modelFormat:
        name: vLLM
      name: ''
      resources:
        limits:
          cpu: '8'
          memory: 24Gi
          nvidia.com/gpu: '1'
        requests:
          cpu: '4'
          memory: 16Gi
          nvidia.com/gpu: '1'
      # Must match the ServingRuntime metadata.name defined above.
      runtime: vllm-2.12-6144
      storage:
        # Data-connection (S3) secret holding the model bucket credentials.
        key: aws-connection-models
        path: ibm-granite/granite-8b-code-instruct-128k/
    tolerations:
      # Schedule onto GPU nodes tainted with this shared-A10G taint.
      - effect: NoSchedule
        key: nvidia.com/gpu
        operator: Equal
        value: 'NVIDIA-A10G-SHARED'