Add a node spin up benchmark application (togethercomputer#101)
justusc authored Apr 25, 2023
1 parent 71dd823 commit e64116e
Showing 3 changed files with 234 additions and 0 deletions.
68 changes: 68 additions & 0 deletions tools/README.md
@@ -0,0 +1,68 @@
# OpenChatKit Tools

## convert_to_hf_gptneox.py

## model_load_benchmark.py

The command to run the model load benchmark tool is:
```shell
$ python3 model_load_benchmark.py -i benchmark_input.json -o benchmark_results.json -d cuda:0
```

```
usage: model_load_benchmark.py [-h] -i INPUT -o OUTPUT [-d DEVICE] [-r REPEAT_INFER]
Benchmark downloading, loading, and running inference for a set of ML models.
optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
Input JSON file containing the models to benchmark
-o OUTPUT, --output OUTPUT
Output JSON file with model benchmark results
-d DEVICE, --device DEVICE
CUDA device name, e.g. "cuda:0"
-r REPEAT_INFER, --repeat-infer REPEAT_INFER
Number of times to repeat inference for warm timings
```

The input file is a JSON file with the names and paths of the models to be tested. For example:
```JSON
{
"GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B",
"Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B",
"GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B",
"GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1",
"GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0"
}
```

The output is a JSON file with the following measurements for each model:
1. tokenizer download time in seconds -- `tokenizer_download_sec`
2. tokenizer load time in seconds -- `tokenizer_load_sec`
3. model download time in seconds -- `model_download_sec`
4. model load to RAM time in seconds -- `model_load_to_ram_sec`
5. model transfer to GPU time in seconds -- `model_transfer_to_gpu_sec`
6. inference time in seconds (input is "Hello, world!") -- `inference_sec`
7. total time in seconds (sum of all the above) -- `total_sec`
8. warm-start inference time in seconds (the average over `REPEAT_INFER` inference runs) -- `inference_warm_sec`
9. model main memory footprint in MB -- `model_main_memory_MB`
10. model GPU memory footprint in MB -- `model_gpu_memory_MB`

An example of the output is:
```JSON
{
"GPT-JT-6B-v1": {
"tokenizer_download_sec": 1.52,
"tokenizer_load_sec": 0.10,
"model_download_sec": 124.70,
"model_load_to_ram_sec": 127.81,
"model_main_memory_MB": 12297.10,
"model_transfer_to_gpu_sec": 3.29,
"model_gpu_memory_MB": 12219.74,
"inference_sec": 0.93,
"inference_warm_sec": 0.047,
"total_sec": 258.38
}
}
```
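
To compare results across models at a glance, the output file can be summarized with a short script like the one below (a minimal sketch; the file name matches the command above and the field names match the example output):
```python
import json

# Load the benchmark results written by model_load_benchmark.py
with open("benchmark_results.json") as f:
    results = json.load(f)

# Print the total and warm-start inference time for each model
for model_name, timings in results.items():
    print(f"{model_name}: total={timings['total_sec']:.2f} sec, "
          f"warm inference={timings['inference_warm_sec']:.3f} sec")
```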
7 changes: 7 additions & 0 deletions tools/benchmark_input.json
@@ -0,0 +1,7 @@
{
"GPT-NeoXT-Chat-Base-20B": "togethercomputer/GPT-NeoXT-Chat-Base-20B",
"Pythia-Chat-Base-7B": "togethercomputer/Pythia-Chat-Base-7B",
"GPT-JT-Moderation-6B": "togethercomputer/GPT-JT-Moderation-6B",
"GPT-JT-6B-v1": "togethercomputer/GPT-JT-6B-v1",
"GPT-JT-6B-v0": "togethercomputer/GPT-JT-6B-v0"
}
159 changes: 159 additions & 0 deletions tools/model_load_benchmark.py
@@ -0,0 +1,159 @@
import argparse
import json
import time
import torch
import re
import psutil
from transformers import AutoTokenizer, AutoModelForCausalLM

# Benchmark tokenizer/model download, load, and inference times for each model in model_dict.
def benchmark(model_dict: dict, device_name: str, repeat_infer: int):

# Initialize the benchmark results dictionary
results_dict = {}

# Check that we have CUDA GPUs available before running the benchmark
if not torch.cuda.is_available():
print("ERROR: CUDA GPUs are not available, benchmark not run")
return results_dict

device = torch.device(device_name)

# Track this process's memory usage so the model's main-memory footprint can be measured
process = psutil.Process()

print(f'Using device {device}')

# Loop through the models to test
for model_name, model_path in model_dict.items():
# purge unused cached memory
torch.cuda.empty_cache()

print(f"Testing model: {model_name}")

# Measure the time it takes to download the tokenizer data and load the tokenizer
tokenizer_download_start_time = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_path, force_download=True)
tokenizer_download_end_time = time.time()

# Discard the tokenizer so the next load is timed from the local cache only
tokenizer = None

# Measure the time it takes to load the tokenizer
tokenizer_load_start_time = time.time()
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer_load_end_time = time.time()

tokenizer_load_sec = tokenizer_load_end_time - tokenizer_load_start_time
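# The forced download above also loaded the tokenizer, so subtract the cached load time to isolate the download time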
tokenizer_download_sec = tokenizer_download_end_time - tokenizer_download_start_time - tokenizer_load_sec

print(f"Testing model: {model_name} --- tokenizer download time = {tokenizer_download_sec:.3} sec")
print(f"Testing model: {model_name} --- tokenize load time = {tokenizer_load_sec:.3} sec")

# Measure the time it takes to download and load the model into main memory
model_download_start_time = time.time()
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True, force_download=True)
model_download_end_time = time.time()

# Discard the model so the next load is timed from the local cache into RAM
model = None

# Measure the time it takes to load the model into main memory
memory_used_main_start = process.memory_info().rss
model_load_to_ram_start_time = time.time()
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, torchscript=True)
model_load_to_ram_end_time = time.time()
memory_used_main_end = process.memory_info().rss

model_load_to_ram_sec = model_load_to_ram_end_time - model_load_to_ram_start_time
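# The forced download above also loaded the model, so subtract the cached load-to-RAM time to isolate the download time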
model_download_sec = model_download_end_time - model_download_start_time - model_load_to_ram_sec
model_main_memory_bytes = memory_used_main_end - memory_used_main_start

print(f"Testing model: {model_name} --- model download time = {model_download_sec:.3} sec")
print(f"Testing model: {model_name} --- model load to RAM time = {model_load_to_ram_sec:.3} sec")
print(f"Testing model: {model_name} --- model main memory size = {model_main_memory_bytes} bytes")

# Measure the time it takes to load the model from main memory to the GPU
gpu_memory_start = torch.cuda.memory_allocated(device)
model_xfer_to_gpu_start_time = time.time()
model = model.to(device)
model_xfer_to_gpu_end_time = time.time()
gpu_memory_end = torch.cuda.memory_allocated(device)

model_xfer_to_gpu_sec = model_xfer_to_gpu_end_time - model_xfer_to_gpu_start_time
model_gpu_memory_bytes = gpu_memory_end - gpu_memory_start

print(f"Testing model: {model_name} --- model transfer to GPU time = {model_xfer_to_gpu_sec:.3} sec")
print(f"Testing model: {model_name} --- model GPU memory size = {model_gpu_memory_bytes} bytes")

# Measure the time it takes to run inference from a cold start
inference_start_time = time.time()
inputs = tokenizer("Hello, world!", return_tensors="pt").to(device)
outputs = model(**inputs)
inference_end_time = time.time()
inference_sec = inference_end_time - inference_start_time

print(f"Testing model: {model_name} --- inference time = {inference_sec:.3} sec")

# Measure the average inference time from a warm start
inference_warm_start_time = time.time()
for _ in range(repeat_infer):
inputs = tokenizer("Hello, world!", return_tensors="pt").to(device)
outputs = model(**inputs)
inference_warm_end_time = time.time()
inference_warm_sec = (inference_warm_end_time - inference_warm_start_time) / float(repeat_infer)

print(f"Testing model: {model_name} --- inference warm time = {inference_warm_sec:.3} sec")

# Total cold-start time; warm inference and memory footprints are reported separately
total_sec = tokenizer_download_sec + tokenizer_load_sec + model_download_sec + model_load_to_ram_sec + model_xfer_to_gpu_sec + inference_sec

print(f"Testing model: {model_name} --- total time = {total_sec:.3} sec")

# Add the results to the dictionary
results_dict[model_name] = {
"tokenizer_download_sec": tokenizer_download_sec,
"tokenizer_load_sec": tokenizer_load_sec,
"model_download_sec": model_download_sec,
"model_load_to_ram_sec": model_load_to_ram_sec,
"model_main_memory_MB": float(model_main_memory_bytes) / 1000000.0,
"model_transfer_to_gpu_sec": model_xfer_to_gpu_sec,
"model_gpu_memory_MB": float(model_gpu_memory_bytes) / 1000000.0,
"inference_sec": inference_sec,
"inference_warm_sec": inference_warm_sec,
"total_sec": total_sec
}

# Unload the model
model = None
torch.cuda.empty_cache()

return results_dict

# Define the main function
def main(input_file: str, output_file: str, device_name: str, repeat_infer: int):

# Load the models to test from the input JSON file
with open(input_file, "r") as f:
model_dict = json.load(f)

# Run the benchmark
results_dict = benchmark(model_dict, device_name, repeat_infer)

# Write the results to the JSON output file
# Use a regular expression to truncate floating point values in the output JSON
# (two decimal places, or two significant digits for values below one)
json_data = re.sub(r'"(.*?)":\s*(0\.0*\d{2}|\d+\.\d{2})\d*(,?\n)', r'"\1": \2\3', json.dumps(results_dict, indent=4))
with open(output_file, 'w') as f:
f.write(json_data)

if __name__ == "__main__":
# Create an argument parser
parser = argparse.ArgumentParser(description='Benchmark downloading, loading, and running inference for a set of ML models.')
parser.add_argument('-i', '--input', required=True, help='Input JSON file containing the models to benchmark')
parser.add_argument('-o', '--output', required=True, help='Output JSON file with model benchmark results')
parser.add_argument('-d', '--device', required=False, default='cuda:0', help='CUDA device name, e.g. "cuda:0"')
parser.add_argument('-r', '--repeat-infer', required=False, default=30, type=int, help='Number of times to repeat inference for warm timings')

# Parse the command line arguments
args = parser.parse_args()

# Process the data
main(args.input, args.output, args.device, max(args.repeat_infer, 1))
