#!/bin/bash

# Clean up when the user sends Ctrl+C.
cleanup() {
    # Background jobs started with & ignore SIGINT in non-interactive
    # shells, so kill the api server explicitly before stopping Ray.
    if [ -n "$server_pid" ]; then
        kill $server_pid 2>/dev/null
    fi
    echo "Stopping ray server"
    ray stop
    echo "Ray server stopped."
    exit 0
}
# Trap Ctrl+C (SIGINT) and run cleanup.
trap cleanup SIGINT
# Configure on all nodes: every node needs the head node's IP address.
# Skip loopback so a routable interface address is picked.
HEAD_NODE_IP_ADDRESS=$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1)
export HEAD_NODE_IP=$HEAD_NODE_IP_ADDRESS
# Configure on head node.
export HEAD_NODE=1
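# Presumably Llumnix uses HEAD_NODE_IP to locate the Ray head and HEAD_NODE=1
# to mark this node as the head; worker nodes would set only HEAD_NODE_IP.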
# Run llumnix api server
# Get GPU count
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo "Detected GPUs: $gpu_count"
HOST="localhost"
PORT="8003"
INITIAL_INSTANCES=$gpu_count
MODEL_PATH="/root/models/facebook/opt-6.7b/"
DRAFT_MODEL_PATH="/root/models/facebook/opt-125m/"
MAX_MODEL_LEN=2048
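# Flags as in the original script: --initial-instances starts one engine
# instance per detected GPU, --launch-ray-cluster brings up a local Ray
# cluster, and --engine-use-ray / --worker-use-ray run the engine and its
# workers as Ray actors. Speculative decoding uses the opt-125m draft model
# with 5 lookahead tokens.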
python -m llumnix.entrypoints.vllm.api_server \
    --host $HOST \
    --port $PORT \
    --initial-instances $INITIAL_INSTANCES \
    --launch-ray-cluster \
    --model $MODEL_PATH \
    --engine-use-ray \
    --worker-use-ray \
    --speculative-model $DRAFT_MODEL_PATH \
    --num-speculative-tokens 5 \
    --use-v2-block-manager \
    --max-model-len $MAX_MODEL_LEN &
server_pid=$!
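# $! is the PID of the most recent background job, i.e. the api server.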
echo "Run llumnix api server successfully"
# Run llumnix server benchmark
cd benchmark || exit 1
NUM_PROMPTS=1
QPS=1
FILE="/root/vllm/ShareGPT_V3_unfiltered_cleaned_split.json"
# Check if dataset exists
if [ -e "$FILE" ]; then
    echo "$FILE already exists"
else
    echo "$FILE does not exist, downloading it ..."
    # Download to the expected path rather than the current directory.
    wget -O "$FILE" https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
    echo "$FILE download done."
fi
# Poll the readiness endpoint for up to 180 seconds.
timeout 180 bash -c "until curl -s $HOST:$PORT/is_ready > /dev/null; do sleep 1; done" || { echo "Server not ready after 180s, aborting."; exit 1; }
echo "Server ready."
python benchmark_serving.py \
    --ip_ports $HOST:$PORT \
    --tokenizer $MODEL_PATH \
    --random_prompt_count $NUM_PROMPTS \
    --dataset_type "sharegpt" \
    --dataset_path $FILE \
    --qps $QPS \
    --distribution "poisson" \
    --log_latencies \
    --fail_on_response_failure
kill $server_pid
echo "Stopping ray server"
ray stop
echo "Ray server stopped."
# Aggregate a two-column results file: sum column a and column b.
# NOTE: $filename is not defined above; it must point to a space-separated,
# two-column file (e.g. a latency log from the benchmark run).
: "${filename:?filename must be set to a two-column results file}"
# Initialize the accumulators.
sum_a=0
sum_b=0
# Read the file line by line.
while IFS=' ' read -r a b; do
    # Accumulate a and b.
    sum_a=$((sum_a + a))
    sum_b=$((sum_b + b))
done < "$filename"
# Compute b / a.
if [ "$sum_a" -ne 0 ]; then
    ratio=$(echo "scale=2; $sum_b / $sum_a" | bc)
    echo "Sum of a: $sum_a"
    echo "Sum of b: $sum_b"
    echo "b / a: $ratio"
else
    echo "Sum of a is 0, cannot divide by zero."
fi
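# A one-pass alternative (a sketch, assuming the same two-column format):
# awk '{a+=$1; b+=$2} END {if (a != 0) printf "b / a: %.2f\n", b/a}' "$filename"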