Skip to content

Commit

Permalink
feat(loadtest): add loadtest tools (TabbyML#906)
Browse files Browse the repository at this point in the history
* add loadtest tools

* update

* [autofix.ci] apply automated fixes

* add readme

* cleanup legacy loadtest

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
  • Loading branch information
wsxiaoys and autofix-ci[bot] authored Nov 28, 2023
1 parent b47abc6 commit edd33a3
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 37 deletions.
7 changes: 0 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
loadtest:
ifdef TABBY_API_HOST
k6 run tests/*.loadtest.js
else
$(error TABBY_API_HOST is undefined)
endif

fix:
cargo machete --fix || true
cargo +nightly fmt
Expand Down
6 changes: 6 additions & 0 deletions python/tabby-loadtest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Run loadtest with tabby on modal GPUs

Steps:
1. Adjust `TABBY_API_HOST` in `run.sh` to match your modal deployment url.
2. Add the models you want to benchmark at the end of `run.sh`.
3. Run `run.sh`; each result is appended to `record.csv`.
48 changes: 48 additions & 0 deletions python/tabby-loadtest/loadtest.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import http from "k6/http";
import { check, sleep } from "k6";
import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js";

// Number of concurrent virtual users, read from the PARALLELISM environment
// variable (run.sh exports it before invoking k6). Always parse as base 10 so
// the value is never interpreted with another radix.
const PARALLELISM = parseInt(__ENV.PARALLELISM, 10);

// k6 run configuration: reach PARALLELISM virtual users within one second,
// then hold that level for thirty seconds.
export const options = {
  stages: ["1s", "30s"].map((duration) => ({ duration, target: PARALLELISM })),
  // The limits below were tuned against TabbyML/StarCoder-1B served by an
  // NVIDIA T4 GPU.
  thresholds: {
    http_req_failed: ["rate<0.001"],
    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"],
  },
};

// One virtual-user iteration: post a fixed Python completion request to the
// Tabby server, check for HTTP 200, then pause for half a second.
export default () => {
  const url = `${__ENV.TABBY_API_HOST}/v1/completions`;
  const body = JSON.stringify({
    language: "python",
    segments: {
      prefix: "def binarySearch(arr, left, right, x):\n mid = (left +",
    },
  });
  const params = { headers: { "Content-Type": "application/json" } };

  const res = http.post(url, body, params);
  check(res, { success: (r) => r.status === 200 });
  sleep(0.5);
};

/**
 * k6 end-of-test hook: writes one comma-separated line to `metrics.txt`
 * containing QPS, average, median, p90 and p95 latency, in that order.
 *
 * Latencies from `http_req_duration` are divided by 1000 before being written.
 * QPS is derived as PARALLELISM / average latency.
 * NOTE(review): this formula does not include the 0.5 s sleep each iteration
 * performs, so it is an estimate rather than a measured request rate - confirm
 * that this is the intended definition.
 *
 * @param {object} data - k6 summary data.
 * @returns {object} Map of output file name to file content.
 */
export function handleSummary(data) {
  const {
    avg,
    med,
    "p(90)": p90,
    "p(95)": p95,
  } = data.metrics.http_req_duration.values;

  const [avgLatency, medLatency, p90Latency, p95Latency] = [avg, med, p90, p95].map(
    (value) => value / 1000,
  );
  const qps = PARALLELISM / avgLatency;

  const line = [qps, avgLatency, medLatency, p90Latency, p95Latency]
    .map((value) => rounded(value))
    .join(",");

  return { "metrics.txt": line };
}

/**
 * Round a number to two decimal places.
 *
 * @param {number} x - Value to round.
 * @returns {number} `x` rounded to the nearest hundredth.
 */
function rounded(x) {
  const hundredths = Math.round(x * 100);
  return hundredths / 100;
}
59 changes: 59 additions & 0 deletions python/tabby-loadtest/record.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
GPU,Model,Parallelism,QPS,Latency (Avg)," Latency (Med)"," Latency (p90)"," Latency (p95)"," Passed"
T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS"
T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED"
T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED"
A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS"
A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS"
A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED"
A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED"
A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS"
A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED"
A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS"
A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS"
A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED"
A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED"
A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED"
T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED"
A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS
A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED
A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS
A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED
A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED
A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED
A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED
T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED
T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED
T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED
A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED
A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED
A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS
A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED
A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED
A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED
A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED
A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED
A100,TabbyML/StarCoder-3B,5,3.21,1.56,1.56,1.68,1.72,SUCCESS
A100,TabbyML/StarCoder-3B,7,4.29,1.63,1.61,1.78,1.8,SUCCESS
A100,TabbyML/StarCoder-3B,8,4.63,1.73,1.73,1.92,2.03,SUCCESS
A10G,TabbyML/DeepseekCoder-6.7B,5,1.3,3.85,3.83,4.25,4.31,FAILED
A10G,TabbyML/DeepseekCoder-6.7B,3,1.14,2.63,2.6,2.81,2.86,FAILED
A10G,TabbyML/DeepseekCoder-6.7B,2,0.83,2.4,2.4,2.48,2.5,FAILED
A100,TabbyML/DeepseekCoder-6.7B,9,3.14,2.87,2.85,3.08,3.13,FAILED
A100,TabbyML/DeepseekCoder-6.7B,5,2.08,2.4,2.46,2.58,2.63,FAILED
A100,TabbyML/DeepseekCoder-6.7B,3,1.32,2.27,2.3,2.54,2.69,FAILED
A100,TabbyML/DeepseekCoder-6.7B,2,1.2,1.67,1.66,1.84,1.93,SUCCESS
A100,TabbyML/CodeLlama-7B,9,3.69,2.44,2.45,2.59,2.63,FAILED
A100,TabbyML/CodeLlama-7B,5,2.14,2.34,2.31,2.61,3.26,FAILED
A100,TabbyML/CodeLlama-7B,3,1.52,1.97,2.02,2.3,2.37,FAILED
A100,TabbyML/CodeLlama-7B,5,2.37,2.11,2.13,2.24,2.26,FAILED
A100,TabbyML/CodeLlama-7B,3,1.59,1.89,1.95,2.04,2.07,FAILED
A100,TabbyML/CodeLlama-7B,2,1.45,1.38,1.39,1.54,1.56,SUCCESS
A100,TabbyML/CodeLlama-13B,5,1.21,4.14,4.15,4.38,4.5,FAILED
A100,TabbyML/CodeLlama-13B,3,0.89,3.36,3.4,3.71,3.73,FAILED
A100,TabbyML/CodeLlama-13B,2,0.73,2.75,2.73,2.92,3.06,FAILED
107 changes: 107 additions & 0 deletions python/tabby-loadtest/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash

# Append one row to record.csv.
#
# Arguments:
#   $1 - the trailing columns, already comma-separated (metrics followed by
#        SUCCESS or FAILED)
# Reads GPU_CONFIG, MODEL_ID and PARALLELISM for the leading columns.
record() {
  # Quoted expansions keep the row intact: an unquoted echo would collapse
  # whitespace and expand glob characters in any of the values.
  printf '%s,%s,%s,%s\n' "$GPU_CONFIG" "$MODEL_ID" "$PARALLELISM" "$1" >> record.csv
}

# Stop the deployed "tabby-server-loadtest" Modal app, if there is one.
#
# Sets MODAL_APP_ID to the second whitespace-separated field of the matching
# `modal app list` row (presumably the app id - confirm against the CLI output).
cleanup() {
  MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')

  # Only stop when an id was actually found. The previous `-z` test was
  # inverted: it skipped the stop whenever an app was deployed and ran
  # `modal app stop` without an id when nothing was deployed.
  if [ -n "$MODAL_APP_ID" ]; then
    modal app stop "$MODAL_APP_ID"
  fi
}

# Deploy the Tabby server on Modal for one (GPU, model, parallelism)
# combination, run the k6 load test against it, and append the outcome to
# record.csv.
#
# Arguments:
#   $1 - GPU type, exported as GPU_CONFIG for server.py
#   $2 - model id, exported as MODEL_ID for server.py
#   $3 - parallelism, exported as PARALLELISM for server.py and loadtest.js
# Returns:
#   0 when k6 exits successfully; non-zero when the deployment, the warmup or
#   the k6 run fails.
loadtest() {
  export GPU_CONFIG=$1
  export MODEL_ID=$2
  export PARALLELISM=$3

  # Do not benchmark if the deployment failed: a stale deployment could
  # otherwise be measured and recorded under the new configuration.
  if ! modal deploy server.py >&2; then
    >&2 echo "modal deploy failed"
    return 1
  fi

  # NOTE(review): `modal deploy` runs in the foreground, so $! does not hold
  # its PID here; the export is kept only for backward compatibility.
  export MODAL_PROCESS_ID=$!
  # Allow the endpoint to be overridden from the environment; the default is
  # the original hard-coded deployment URL.
  export TABBY_API_HOST="${TABBY_API_HOST:-https://wsxiaoys--tabby-server-loadtest-app.modal.run}"

  # wait for warmup
  >&2 echo "Waiting for warmup..."

  n=0
  while [[ "$(curl -s -o /dev/null -w '%{http_code}' "$TABBY_API_HOST/v1/health")" != "200" ]]; do
    if [ "$n" -ge 5 ]; then
      # error after 5 retries; stop the app so it does not stay deployed.
      >&2 echo "Server did not become healthy, giving up."
      cleanup
      return 1
    fi

    sleep 10
    n=$((n+1))
  done

  >&2 echo "Start load testing..."

  k6 run loadtest.js >&2
  SUCCESS=$?

  if [ -f metrics.txt ]; then
    METRICS=$(cat metrics.txt)
    rm -f metrics.txt

    if [ "$SUCCESS" -ne 0 ]; then
      record "$METRICS,FAILED"
    else
      record "$METRICS,SUCCESS"
    fi
  else
    # k6 did not write a summary, so there is nothing meaningful to record;
    # make sure the run is reported as failed.
    >&2 echo "metrics.txt not found, skipping record"
    if [ "$SUCCESS" -eq 0 ]; then
      SUCCESS=1
    fi
  fi

  cleanup

  return "$SUCCESS"
}

# Binary search over the integer range [$1, $2] using a probe command.
#
# Arguments:
#   $1 - lower bound (never probed itself: every probe is above the current min)
#   $2 - upper bound
#   $3 - command invoked with a candidate value; success raises the lower
#        bound to that value, failure lowers the upper bound below it
#
# Uses the global variables min, max, command and current; when the loop ends
# the result is left in `min`.
dichotomic_search() {
  min=$1
  max=$2
  command=$3

  while (( $min < $max ))
  do
    # Midpoint rounded up, so the candidate is always greater than min.
    current=$(( (min + max + 1 ) / 2 ))

    if $command $current; then
      min=$current
    else
      max=$((current - 1))
    fi
  done
}

# Run one load test on a T4 GPU for the current MODEL_ID.
#   $1 - parallelism to test
test_t4() {
  # Quote the arguments so an empty or unusual value cannot shift positions.
  loadtest T4 "$MODEL_ID" "$1"
}

# Run one load test on an A10G GPU for the current MODEL_ID.
#   $1 - parallelism to test
test_a10g() {
  # Quote the arguments so an empty or unusual value cannot shift positions.
  loadtest A10G "$MODEL_ID" "$1"
}

# Run one load test on an A100 GPU for the current MODEL_ID.
#   $1 - parallelism to test
test_a100() {
  # Quote the arguments so an empty or unusual value cannot shift positions.
  loadtest A100 "$MODEL_ID" "$1"
}

# Benchmark a model on T4, A10G and A100, searching parallelism within
# 1..12, 1..32 and 1..64 respectively.
#   $1 - model id, exported as MODEL_ID
test_1b3b_model() {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 12 test_t4
  dichotomic_search 1 32 test_a10g
  dichotomic_search 1 64 test_a100
}

# Benchmark a model on A100 only, searching parallelism within 1..8.
#   $1 - model id, exported as MODEL_ID
test_7b_model() {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 8 test_a100
}

# Benchmark a model on A100 only, searching parallelism within 1..8.
#   $1 - model id, exported as MODEL_ID
test_13b_model() {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 8 test_a100
}

# Benchmarks to run. Each call searches over parallelism values for the given
# model and appends every attempt to record.csv.
# test_7b_model TabbyML/CodeLlama-7B
test_13b_model TabbyML/CodeLlama-13B
97 changes: 97 additions & 0 deletions python/tabby-loadtest/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Usage:
modal serve server.py
"""

import os
from modal import Image, Stub, asgi_app

# Benchmark configuration. Each value can be overridden through the
# environment variable of the same name; IMAGE_NAME is fixed.
IMAGE_NAME = "tabbyml/tabby:0.6.0"
GPU_CONFIG = os.getenv("GPU_CONFIG", "T4")
MODEL_ID = os.getenv("MODEL_ID", "TabbyML/StarCoder-1B")
# Kept as a string: it is passed through an environment variable and on the
# command line, and converted with int() where a number is needed.
PARALLELISM = os.getenv("PARALLELISM", "4")


def download_model():
    """Download the model named by the MODEL_ID environment variable.

    Invokes `/opt/tabby/bin/tabby download --model <MODEL_ID>`.

    Raises:
        subprocess.CalledProcessError: if the download command exits with a
            non-zero status, so that a failed download is not silently ignored.
    """
    import os
    import subprocess

    model_id = os.environ.get("MODEL_ID")
    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            model_id,
        ],
        # Fail loudly instead of continuing without the model.
        check=True,
    )


# Container image for the load-test server, built on top of the Tabby image.
# The order of the steps is significant and is kept as-is.
image = (
    Image.from_registry(IMAGE_NAME, add_python="3.11")
    # download_model reads MODEL_ID from the environment, so set it first.
    .env({"MODEL_ID": MODEL_ID})
    # Reset the base image's entrypoint.
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .pip_install("asgi-proxy-lib")
    # app() reads PARALLELISM from the environment at startup.
    .env({"PARALLELISM": PARALLELISM})
)

# Modal app definition. The name must stay in sync with run.sh, whose cleanup
# step searches the `modal app list` output for "tabby-server-loadtest".
stub = Stub("tabby-server-loadtest", image=image)


@stub.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=int(PARALLELISM),
    container_idle_timeout=120,
    timeout=360,
)
@asgi_app()
def app():
    """Start `tabby serve` on port 8000 and return an ASGI proxy to it.

    The model and parallelism are taken from the MODEL_ID and PARALLELISM
    environment variables. The function blocks until the server accepts TCP
    connections on 127.0.0.1:8000.

    Raises:
        RuntimeError: if the tabby process exits before it accepts connections.
    """
    import os
    import socket
    import subprocess
    import time
    from asgi_proxy import asgi_proxy

    serve_command = [
        "/opt/tabby/bin/tabby",
        "serve",
        "--model",
        os.environ.get("MODEL_ID"),
        "--port",
        "8000",
        "--device",
        "cuda",
        "--parallelism",
        os.environ.get("PARALLELISM"),
    ]
    # Same environment as this process, with usage collection disabled.
    child_env = {**os.environ, "TABBY_DISABLE_USAGE_COLLECTION": "1"}
    launcher = subprocess.Popen(serve_command, env=child_env)

    # Poll once per second until the webserver at 127.0.0.1:8000 accepts a
    # connection.
    while True:
        try:
            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
        except (socket.timeout, ConnectionRefusedError):
            # If the server process has already exited, a connection can never
            # be made, so stop waiting.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            time.sleep(1.0)
        else:
            break

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")
30 changes: 0 additions & 30 deletions tests/default.loadtest.js

This file was deleted.

0 comments on commit edd33a3

Please sign in to comment.