forked from TabbyML/tabby
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(loadtest): add loadtest tools (TabbyML#906)
* add loadtest tools * update * [autofix.ci] apply automated fixes * add readme * cleanup legacy loadtest --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
b47abc6
commit edd33a3
Showing
7 changed files
with
317 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Run loadtest with tabby on modal GPUs | ||
|
||
Steps: | ||
1. Adjust `TABBY_API_HOST` in `run.sh` to match your modal deployment url. | ||
2. Add the models you want to benchmark at the end of `run.sh`. | ||
3. Run `run.sh`; the results are appended to `record.csv`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import http from "k6/http"; | ||
import { check, sleep } from "k6"; | ||
import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js"; | ||
|
||
// Number of concurrent virtual users, supplied through the PARALLELISM
// environment variable. Parsed with an explicit radix and validated up front:
// an unset or non-numeric value would otherwise become NaN and silently
// propagate into the stage targets and the QPS figure in handleSummary.
const PARALLELISM = Number.parseInt(__ENV.PARALLELISM, 10);
if (!Number.isInteger(PARALLELISM) || PARALLELISM < 1) {
  throw new Error(
    `PARALLELISM must be a positive integer, got "${__ENV.PARALLELISM}"`
  );
}

// k6 run configuration: ramp to PARALLELISM virtual users within one second,
// then hold that level for thirty seconds.
export const options = {
  stages: [
    { duration: "1s", target: PARALLELISM },
    { duration: "30s", target: PARALLELISM },
  ],
  // Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
  thresholds: {
    http_req_failed: ["rate<0.001"],
    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"],
  },
};
|
||
// One virtual-user iteration: POST a fixed Python completion request to the
// Tabby server named by TABBY_API_HOST, record whether it answered 200, then
// pause for half a second before the next iteration.
export default () => {
  const url = `${__ENV.TABBY_API_HOST}/v1/completions`;
  const body = JSON.stringify({
    language: "python",
    segments: {
      prefix: "def binarySearch(arr, left, right, x):\n mid = (left +",
    },
  });
  const params = { headers: { "Content-Type": "application/json" } };

  const response = http.post(url, body, params);
  check(response, { success: ({ status }) => status === 200 });
  sleep(0.5);
};
|
||
/**
 * k6 end-of-test hook: condenses the run into a single CSV fragment.
 *
 * Reads the avg / med / p(90) / p(95) values of the `http_req_duration`
 * metric, divides each by 1000 (k6 reports this metric in milliseconds, so
 * the results are seconds), and derives QPS as PARALLELISM divided by the
 * average latency.
 *
 * @param {object} data - Summary object passed in by k6.
 * @returns {object} Map with one entry, "metrics.txt", whose content is
 *   "qps,avg,med,p90,p95" with every number rounded via `rounded`.
 */
export function handleSummary(data) {
  const { values: durations } = data.metrics.http_req_duration;
  const scaleDown = (raw) => raw / 1000;

  const avgLatency = scaleDown(durations.avg);
  const columns = [
    PARALLELISM / avgLatency,
    avgLatency,
    scaleDown(durations.med),
    scaleDown(durations["p(90)"]),
    scaleDown(durations["p(95)"]),
  ];

  return {
    "metrics.txt": columns.map((value) => rounded(value)).join(","),
  };
}
|
||
/**
 * Rounds a number to two decimal places.
 *
 * @param {number} x - Value to round.
 * @returns {number} `x` scaled by 100, rounded to the nearest integer, and
 *   scaled back.
 */
function rounded(x) {
  const scale = 100;
  const scaled = Math.round(x * scale);
  return scaled / scale;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
GPU,Model,Parallelism,QPS,Latency (Avg)," Latency (Med)"," Latency (p90)"," Latency (p95)"," Passed" | ||
T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS" | ||
T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED" | ||
T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED" | ||
A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS" | ||
A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS" | ||
A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED" | ||
A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED" | ||
A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS" | ||
A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED" | ||
A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS" | ||
A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS" | ||
A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED" | ||
A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED" | ||
A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS" | ||
T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED" | ||
T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS" | ||
T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS" | ||
T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED" | ||
A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS | ||
A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED | ||
A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS | ||
A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS | ||
A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS | ||
A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED | ||
A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED | ||
A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS | ||
A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED | ||
A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED | ||
T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED | ||
T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED | ||
T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED | ||
A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED | ||
A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED | ||
A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS | ||
A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED | ||
A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED | ||
A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED | ||
A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED | ||
A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED | ||
A100,TabbyML/StarCoder-3B,5,3.21,1.56,1.56,1.68,1.72,SUCCESS | ||
A100,TabbyML/StarCoder-3B,7,4.29,1.63,1.61,1.78,1.8,SUCCESS | ||
A100,TabbyML/StarCoder-3B,8,4.63,1.73,1.73,1.92,2.03,SUCCESS | ||
A10G,TabbyML/DeepseekCoder-6.7B,5,1.3,3.85,3.83,4.25,4.31,FAILED | ||
A10G,TabbyML/DeepseekCoder-6.7B,3,1.14,2.63,2.6,2.81,2.86,FAILED | ||
A10G,TabbyML/DeepseekCoder-6.7B,2,0.83,2.4,2.4,2.48,2.5,FAILED | ||
A100,TabbyML/DeepseekCoder-6.7B,9,3.14,2.87,2.85,3.08,3.13,FAILED | ||
A100,TabbyML/DeepseekCoder-6.7B,5,2.08,2.4,2.46,2.58,2.63,FAILED | ||
A100,TabbyML/DeepseekCoder-6.7B,3,1.32,2.27,2.3,2.54,2.69,FAILED | ||
A100,TabbyML/DeepseekCoder-6.7B,2,1.2,1.67,1.66,1.84,1.93,SUCCESS | ||
A100,TabbyML/CodeLlama-7B,9,3.69,2.44,2.45,2.59,2.63,FAILED | ||
A100,TabbyML/CodeLlama-7B,5,2.14,2.34,2.31,2.61,3.26,FAILED | ||
A100,TabbyML/CodeLlama-7B,3,1.52,1.97,2.02,2.3,2.37,FAILED | ||
A100,TabbyML/CodeLlama-7B,5,2.37,2.11,2.13,2.24,2.26,FAILED | ||
A100,TabbyML/CodeLlama-7B,3,1.59,1.89,1.95,2.04,2.07,FAILED | ||
A100,TabbyML/CodeLlama-7B,2,1.45,1.38,1.39,1.54,1.56,SUCCESS | ||
A100,TabbyML/CodeLlama-13B,5,1.21,4.14,4.15,4.38,4.5,FAILED | ||
A100,TabbyML/CodeLlama-13B,3,0.89,3.36,3.4,3.71,3.73,FAILED | ||
A100,TabbyML/CodeLlama-13B,2,0.73,2.75,2.73,2.92,3.06,FAILED |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/bin/bash | ||
|
||
# Append one result row to record.csv in the current directory.
#
# The row is "<GPU_CONFIG>,<MODEL_ID>,<PARALLELISM>,<$1>", where $1 is the
# already comma-joined tail of the row (metrics plus pass/fail status).
# Every expansion is quoted so the values are written verbatim rather than
# being word-split or glob-expanded by the shell.
record() {
  printf '%s,%s,%s,%s\n' "$GPU_CONFIG" "$MODEL_ID" "$PARALLELISM" "$1" >> record.csv
}
|
||
# Stop every deployed Modal app whose `modal app list` row mentions
# tabby-server-loadtest.
#
# The original guard used `-z`, which is true only when NO app ID was found,
# so a running app was never stopped. Looping over the IDs fixes that: an
# empty list is a no-op, and several matches are stopped one by one.
cleanup() {
  MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')

  local app_id
  # Intentionally unquoted: one ID per whitespace-separated word.
  for app_id in $MODAL_APP_ID; do
    modal app stop "$app_id"
  done
}
|
||
# Deploy the Tabby server on Modal and load-test it once.
#
# Arguments:
#   $1  GPU type, exported as GPU_CONFIG
#   $2  model id, exported as MODEL_ID
#   $3  number of concurrent clients, exported as PARALLELISM
#
# Appends one row to record.csv (via `record`) once k6 has run, and returns
# k6's exit status. Returns 1 without recording when the deployment fails or
# the server does not become healthy in time.
loadtest() {
  export GPU_CONFIG=$1
  export MODEL_ID=$2
  export PARALLELISM=$3

  # Progress output goes to stderr throughout.
  # A failed deployment must not be measured: the health check below could
  # otherwise be answered by a previous deployment of the same app.
  if ! >&2 modal deploy server.py; then
    >&2 echo "modal deploy failed"
    return 1
  fi

  # NOTE(review): `$!` is the PID of the most recent *background* job, and the
  # deploy above runs in the foreground, so this is probably empty. Kept only
  # because the variable is exported — confirm nothing relies on it.
  export MODAL_PROCESS_ID=$!
  export TABBY_API_HOST=https://wsxiaoys--tabby-server-loadtest-app.modal.run

  # wait for warmup
  >&2 echo "Waiting for warmup..."

  local n=0
  while [[ "$(curl -s -o /dev/null -w '%{http_code}' "$TABBY_API_HOST/v1/health")" != "200" ]]; do
    if [ "$n" -ge 5 ]; then
      # error after 5 retries; stop the app instead of leaving it deployed.
      >&2 echo "Server did not become healthy, giving up."
      cleanup
      return 1
    fi

    sleep 10
    n=$((n+1))
  done

  >&2 echo "Start load testing..."

  # Drop any leftover file so a crashed k6 run cannot reuse old metrics.
  rm -f metrics.txt

  # k6 exits non-zero when a threshold declared in loadtest.js is crossed.
  >&2 k6 run loadtest.js
  SUCCESS=$?

  if [ -f metrics.txt ]; then
    METRICS=$(cat metrics.txt)
    rm metrics.txt
  else
    # No summary was written: keep the CSV columns aligned with five empty
    # metric fields and make sure the run is reported as failed.
    >&2 echo "metrics.txt was not produced by k6"
    METRICS=",,,,"
    if [ "$SUCCESS" -eq 0 ]; then
      SUCCESS=1
    fi
  fi

  if [ "$SUCCESS" -ne 0 ]; then
    record "$METRICS,FAILED"
  else
    record "$METRICS,SUCCESS"
  fi

  cleanup

  return "$SUCCESS"
}
|
||
# Binary-search the integer range [$1, $2] for the largest value accepted by
# a probe command.
#
# Arguments:
#   $1  lower bound (inclusive)
#   $2  upper bound (inclusive)
#   $3  command invoked as `<command> <candidate>`; exit status 0 means the
#       candidate passed
#
# A passing candidate raises the lower bound, a failing one lowers the upper
# bound; the loop stops when the bounds meet. Nothing is printed — the probe
# command is responsible for recording its own results.
#
# All working variables are local so the search no longer overwrites
# same-named globals of the calling script.
function dichotomic_search {
  local min=$1
  local max=$2
  local command=$3
  local current

  while (( min < max )); do
    # Compute the mean between min and max, rounded up to the superior unit
    current=$(( (min + max + 1) / 2 ))

    # $command is left unquoted so a command carrying its own leading
    # arguments keeps working.
    if $command "$current"; then
      min=$current
    else
      max=$((current - 1))
    fi
  done
}
|
||
# Probe for dichotomic_search: load-test the current MODEL_ID on a T4 GPU
# with $1 concurrent clients. Arguments are quoted so an empty MODEL_ID cannot
# shift the parallelism into the model-id position.
test_t4() {
  loadtest T4 "$MODEL_ID" "$1"
}
|
||
# Probe for dichotomic_search: load-test the current MODEL_ID on an A10G GPU
# with $1 concurrent clients. Arguments are quoted so an empty MODEL_ID cannot
# shift the parallelism into the model-id position.
test_a10g() {
  loadtest A10G "$MODEL_ID" "$1"
}
|
||
# Probe for dichotomic_search: load-test the current MODEL_ID on an A100 GPU
# with $1 concurrent clients. Arguments are quoted so an empty MODEL_ID cannot
# shift the parallelism into the model-id position.
test_a100() {
  loadtest A100 "$MODEL_ID" "$1"
}
|
||
# Benchmark the model named by $1 on all three GPU types.
# Exports MODEL_ID, then searches parallelism 1..12 with test_t4, 1..32 with
# test_a10g and 1..64 with test_a100, in that order.
function test_1b3b_model {
  MODEL_ID="$1"
  export MODEL_ID

  local plan
  # Each entry is "<upper bound> <probe function>"; it is expanded unquoted
  # on purpose so it splits into two arguments.
  for plan in "12 test_t4" "32 test_a10g" "64 test_a100"; do
    dichotomic_search 1 $plan
  done
}
|
||
# Benchmark the model named by $1 on A100 only.
# Exports MODEL_ID, then searches parallelism 1..8 with test_a100.
function test_7b_model {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 8 test_a100
}
|
||
# Benchmark the model named by $1 on A100 only.
# Exports MODEL_ID, then searches parallelism 1..8 with test_a100.
function test_13b_model {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 8 test_a100
}
|
||
# Script entry point: each active line below benchmarks one model. Add or
# uncomment calls here to choose which models are measured.
# test_7b_model TabbyML/CodeLlama-7B | ||
test_13b_model TabbyML/CodeLlama-13B |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
"""Usage: | ||
modal deploy server.py | ||
""" | ||
|
||
import os | ||
from modal import Image, Stub, asgi_app | ||
|
||
# Deployment settings, each read from the environment with a fallback default.
# PARALLELISM stays a string here; it is converted with int() where a number
# is required.
IMAGE_NAME = "tabbyml/tabby:0.6.0"
GPU_CONFIG = os.getenv("GPU_CONFIG", "T4")
MODEL_ID = os.getenv("MODEL_ID", "TabbyML/StarCoder-1B")
PARALLELISM = os.getenv("PARALLELISM", "4")
|
||
|
||
def download_model():
    """Download the model named by the MODEL_ID environment variable.

    Runs `/opt/tabby/bin/tabby download --model <MODEL_ID>`.

    Raises:
        RuntimeError: if MODEL_ID is unset or empty.
        subprocess.CalledProcessError: if the download command exits with a
            non-zero status, so a failed download is not silently ignored.
    """
    import os
    import subprocess

    model_id = os.environ.get("MODEL_ID")
    if not model_id:
        raise RuntimeError("MODEL_ID environment variable must be set")

    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            model_id,
        ],
        # Propagate a failed download instead of continuing without the model.
        check=True,
    )
|
||
|
||
# Container image, built step by step on top of the Tabby registry image.
image = Image.from_registry(IMAGE_NAME, add_python="3.11")
image = image.env(dict(MODEL_ID=MODEL_ID))
# Clear the base image's entrypoint.
image = image.dockerfile_commands("ENTRYPOINT []")
# Run download_model as part of the image definition.
image = image.run_function(download_model)
image = image.pip_install("asgi-proxy-lib")
# NOTE(review): PARALLELISM is set only after the download step, presumably so
# that changing it does not invalidate the earlier steps — confirm.
image = image.env(dict(PARALLELISM=PARALLELISM))

stub = Stub("tabby-server-loadtest", image=image)
|
||
|
||
@stub.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=int(PARALLELISM),
    container_idle_timeout=120,
    timeout=360,
)
@asgi_app()
def app():
    """Start `tabby serve` in a subprocess and return an ASGI proxy to it.

    The server is launched on port 8000 with the CUDA device, using the
    MODEL_ID and PARALLELISM environment variables. The function blocks until
    127.0.0.1:8000 accepts TCP connections, then returns a proxy app that
    targets http://localhost:8000.

    Raises:
        RuntimeError: if the tabby process exits before it starts accepting
            connections.
    """
    import os
    import socket
    import subprocess
    import time
    from asgi_proxy import asgi_proxy

    server_env = os.environ.copy()
    server_env["TABBY_DISABLE_USAGE_COLLECTION"] = "1"

    serve_command = [
        "/opt/tabby/bin/tabby",
        "serve",
        "--model",
        os.environ.get("MODEL_ID"),
        "--port",
        "8000",
        "--device",
        "cuda",
        "--parallelism",
        os.environ.get("PARALLELISM"),
    ]
    launcher = subprocess.Popen(serve_command, env=server_env)

    def tabby_ready():
        """Return True once 127.0.0.1:8000 accepts a TCP connection."""
        try:
            probe = socket.create_connection(("127.0.0.1", 8000), timeout=1)
        except (socket.timeout, ConnectionRefusedError):
            # If the server process has already exited, a connection can
            # never succeed, so fail instead of polling forever.
            exit_code = launcher.poll()
            if exit_code is None:
                return False
            raise RuntimeError(f"launcher exited unexpectedly with code {exit_code}")
        probe.close()
        return True

    # Poll once per second until the web server is reachable.
    while True:
        if tabby_ready():
            break
        time.sleep(1.0)

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")
This file was deleted.
Oops, something went wrong.