Commit 2d10e5a: Update script
nox-410 committed Jul 26, 2022
1 parent eacff92
Showing 10 changed files with 171 additions and 86 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -24,7 +24,7 @@ Supporting opset11, use ./testing/torch2onnx.py to get some supported models.

```bash
nnfusion model.onnx -f onnx -ftune_output_file=model.json -fconst_folding_backend="CUDA" &&
python3 -m run_compiler model.json tuned.json --device 0 --topk 10 &&
python3 -m run_compiler model.json tuned.json --device 0 --topk 20 &&
nnfusion model.onnx -f onnx -ftune_output_file=model.json -fconst_folding_backend="CUDA" -ftune_input_file=tuned.json &&
rm -rf nnfusion_rt/cuda_codegen/build/ && cmake -S nnfusion_rt/cuda_codegen/ -B nnfusion_rt/cuda_codegen/build/ &&
make -C nnfusion_rt/cuda_codegen/build/
```
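Not part of the commit: the pipeline above starts from a model.onnx plus an inputs.npz. A minimal export sketch using the torch2onnx helper and the shufflenet model factory that appear later in this diff, assuming testing/ and testing/model are on the import path:

```python
# Illustrative only: torch2onnx(prefix, model, inputs) and shufflenet come
# from testing/torch2onnx.py and testing/model/pytorch/__init__.py in this
# diff; the import paths are assumptions.
from model.pytorch import shufflenet
from torch2onnx import torch2onnx

model, inputs = shufflenet(batch_size=1)  # returns (model, (input_tensor,))
torch2onnx("temp", model, inputs)         # writes temp/model.onnx and temp/inputs.npz
```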
1 change: 1 addition & 0 deletions python/memopt/__init__.py
@@ -1,3 +1,4 @@
import torch
from .IRpass import *
from .scope import get_scope, Scope
from .schedule_rewrite import CodeGenerator
2 changes: 1 addition & 1 deletion python/memopt/utils.py
@@ -59,7 +59,7 @@ def append_host_call(self):
if (cudaEventRecord(stop, 0) != cudaSuccess) return -1;
if (cudaEventSynchronize(stop) != cudaSuccess) return -1;
cudaEventElapsedTime(&ms, start, stop);
int repeats = min(100, int(ceil(300.0 / ms)));
int repeats = int(ceil(100.0 / ms));
cudaEventRecord(start, 0);
for (int _ = 0; _ < repeats; _++)
{};
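The new line replaces the capped ~300 ms budget (at most 100 repeats) with an uncapped loop sized to run for roughly 100 ms in total. A rough Python analogue of that host-side pattern, for illustration only (torch.cuda.Event stands in for the generated cudaEvent_t code; this helper is not part of the repo):

```python
import math
import torch

def bench(fn, target_ms=100.0):
    start = torch.cuda.Event(enable_timing=True)
    stop = torch.cuda.Event(enable_timing=True)
    start.record(); fn(); stop.record()  # one-shot run to estimate latency
    stop.synchronize()
    ms = start.elapsed_time(stop)
    repeats = int(math.ceil(target_ms / ms))  # mirrors ceil(100.0 / ms) above
    start.record()
    for _ in range(repeats):
        fn()
    stop.record()
    stop.synchronize()
    return start.elapsed_time(stop) / repeats  # per-call latency in ms
```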
99 changes: 51 additions & 48 deletions testing/ansor.py
@@ -1,55 +1,58 @@
import onnx
import numpy as np
import os.path as osp
import time
import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_executor

prefix = "/home/v-yiningshi/learn_tvm/testing/temp/bert"
target = tvm.target.cuda(arch="sm_70")
# import tensorflow.compat.v1 as tf
# pt_model = open(osp.join(prefix, "classifier.pb"), "rb")
# graph_def = tf.GraphDef()
# graph_def.ParseFromString(pt_model.read())
# mod, params = relay.frontend.from_tensorflow(graph_def, "NHWC")
# feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))
# shape_dict = {key: value.shape for key, value in feed_dict.items()}
onnx_model = onnx.load(osp.join(prefix, "model.onnx"))
mod, params = relay.frontend.from_onnx(onnx_model)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
log_file = osp.join(prefix, "ansor_tune.log")

for idx, task in enumerate(tasks):
print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
print(task.compute_dag)

def run_tuning():
print("Begin tuning...")
measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10, device=3)

tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=len(tasks) * 512,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)

tuner.tune(tune_option)

# run_tuning()

# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
lib = relay.build(mod, target=target, params=params)

# Create graph executor
dev = tvm.device(str(target), 3)
module = graph_executor.GraphModule(lib["default"](dev))

# Evaluate
print("Evaluate inference time cost...")
print(module.benchmark(dev, min_repeat_ms=500, end_to_end=False))
import argparse

def run_ansor(prefix, device, skip_tuning):
    target = tvm.target.cuda(arch="sm_70")
    onnx_model = onnx.load(osp.join(prefix, "model.onnx"))
    mod, params = relay.frontend.from_onnx(onnx_model)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    log_file = osp.join(prefix, "ansor_tune.log")

    for idx, task in enumerate(tasks):
        print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

    num_trials = len(tasks) * 800
    if osp.exists(log_file):
        with open(log_file, "r") as f:
            cur_records = len(f.readlines())
        num_trials -= cur_records
    if num_trials > 0 and not skip_tuning:
        print("Begin tuning...")
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10, device=device)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, params=params)

    # Create graph executor
    dev = tvm.device(str(target), device)
    module = graph_executor.GraphModule(lib["default"](dev))

    # Evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, min_repeat_ms=500, end_to_end=False))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', type=str, default="temp")
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--skip', action="store_true")
    args = parser.parse_args()
    start_time = time.time()
    run_ansor(args.prefix, args.device, args.skip)
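To make the resume logic above concrete (numbers illustrative only): with 30 extracted tasks the budget is 30 × 800 = 24,000 trials; if ansor_tune.log already holds 9,000 records from an earlier run, only the remaining 15,000 trials are measured, so an interrupted tuning session continues instead of restarting.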
18 changes: 4 additions & 14 deletions testing/model/pytorch/__init__.py
@@ -19,7 +19,7 @@ def shufflenet(batch_size):
    input = torch.randn(batch_size, 3, 224, 224)
    return model, (input, )

def SqueezeNet(batch_size):
def squeezenet(batch_size):
    from .squeezenet import SqueezeNet as Net
    model = Net()
    input = torch.randn(batch_size, 3, 224, 224)
@@ -83,19 +83,9 @@ def transformer(batch_size):
    return model, inputs

def vit(batch_size):
    from vit_pytorch import ViT
    model = ViT(
        image_size = 256,
        patch_size = 32,
        num_classes = 1000,
        dim = 1024,
        depth = 6,
        heads = 16,
        mlp_dim = 2048,
        dropout = 0.1,
        emb_dropout = 0.1
    )
    input = torch.randn(batch_size, 3, 256, 256)
    from timm.models import vit_small_patch32_224 as Net
    model = Net()
    input = torch.randn(batch_size, 3, 224, 224)
    return model, (input, )

def localvit(batch_size):
7 changes: 2 additions & 5 deletions testing/run_tf.py
@@ -1,4 +1,3 @@
from ast import arg
import tensorflow as tf
import numpy as np
import argparse
@@ -8,8 +7,6 @@
import onnx
from onnx_tf.backend import prepare

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

def load_graph(onnx_file):
    onnx_model = onnx.load(onnx_file)
    tf_rep = prepare(onnx_model, device="cuda")
@@ -47,8 +44,8 @@ def get_runtime():
    tic = time.time()
    _ = sess.run(outputs, feed_dict=feed_dict)
    return (time.time() - tic) * 1000
_ = [get_runtime() for i in range(50)] # warmup
times = [get_runtime() for i in range(100)]
_ = [get_runtime() for i in range(200)] # warmup
times = [get_runtime() for i in range(800)]
print(np.mean(times), np.min(times), np.max(times))

if __name__ == "__main__":
43 changes: 43 additions & 0 deletions testing/run_tf2.py
@@ -0,0 +1,43 @@
import tensorflow as tf
import numpy as np
import argparse
import os
import time
import os.path as osp
import tempfile

def load_graph(onnx_file):
    import onnx
    from onnx_tf.backend import prepare
    onnx_model = onnx.load(onnx_file)
    tf_rep = prepare(onnx_model, device="cuda")
    exported = tempfile.TemporaryDirectory()
    tf_rep.export_graph(exported.name)
    return exported

def run_tf(prefix, xla=False):
    if xla:
        tf.config.optimizer.set_jit(True)
    exported = load_graph(osp.join(prefix, "model.onnx"))
    feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))

    saved_model_loaded = tf.saved_model.load(
        exported.name, tags=[tf.saved_model.SERVING])
    graph_func = saved_model_loaded.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

    def get_runtime():
        tic = time.time()
        _ = graph_func(**feed_dict)
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(200)] # warmup
    times = [get_runtime() for i in range(800)]
    print(np.mean(times), np.min(times), np.max(times))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--xla', action="store_true")
    parser.add_argument('--prefix', type=str, default="temp")
    parser.add_argument('--device', type=int, default=0)
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
    run_tf(args.prefix, xla=args.xla)
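Given the flags registered above, a typical invocation would be `python3 testing/run_tf2.py --prefix temp --device 0 --xla` (illustrative); `--xla` turns on TensorFlow's XLA JIT for the graph function before the timed runs.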
52 changes: 52 additions & 0 deletions testing/run_tf_trt.py
@@ -0,0 +1,52 @@
import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt
import numpy as np
import argparse
import os
import time
import os.path as osp
import tempfile

def load_graph(onnx_file):
    import onnx
    from onnx_tf.backend import prepare
    onnx_model = onnx.load(onnx_file)
    tf_rep = prepare(onnx_model, device="cuda")
    exported = tempfile.TemporaryDirectory()
    tf_rep.export_graph(exported.name)
    return exported

def run_tf(prefix):
    exported = load_graph(osp.join(prefix, "model.onnx"))
    feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))

    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS
    converter = trt.TrtGraphConverterV2(
        input_saved_model_dir=exported.name,
        conversion_params=conversion_params)
    converter.convert()
    def my_input_fn():
        yield tuple(feed_dict.values())
    converter.build(input_fn=my_input_fn)
    convert_exported = tempfile.TemporaryDirectory()
    converter.save(convert_exported.name)

    saved_model_loaded = tf.saved_model.load(
        convert_exported.name, tags=[tf.saved_model.SERVING])
    graph_func = saved_model_loaded.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

    def get_runtime():
        tic = time.time()
        _ = graph_func(**feed_dict)
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(200)] # warmup
    times = [get_runtime() for i in range(800)]
    print(np.mean(times), np.min(times), np.max(times))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', type=str, default="temp")
    parser.add_argument('--device', type=int, default=0)
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
    run_tf(args.prefix)
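Note that `converter.build(input_fn=my_input_fn)` constructs the TensorRT engines ahead of time from the sample inputs yielded by `my_input_fn`, so engine building happens during conversion rather than inside the timed get_runtime loop.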
14 changes: 7 additions & 7 deletions testing/run_trt.py
@@ -5,8 +5,6 @@
import argparse
import torch

torch.cuda.set_device(3)

def run_trt(prefix):
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
@@ -35,13 +33,13 @@ def run_trt(prefix):
    feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))
    for item in feed_dict.values():
        input_tensor.append(torch.from_numpy(item))
    for i, tensor in enumerate(input_tensor):
        tensors[i] = tensor.cuda()

    context = engine.create_execution_context()
    buffer = [tensor.data_ptr() for tensor in tensors]
    def get_runtime():
        tic = time.time()
        for i, tensor in enumerate(input_tensor):
            tensors[i] = tensor.cuda()
        buffer = [tensor.data_ptr() for tensor in tensors]
        context.execute(1, buffer)
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(50)] # warmup
@@ -50,6 +48,8 @@ def get_runtime():
    # print(tensors[1])

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', type=str, default="temp")
    args = parser.parse_args()
    torch.random.manual_seed(0)
    prefix = "temp"
    run_trt(prefix)
    run_trt(args.prefix)
19 changes: 9 additions & 10 deletions testing/torch2onnx.py
@@ -30,24 +30,23 @@ def torch2onnx(prefix, model, inputs):
        do_constant_folding=False,
        opset_version=11)
    # tofp16model( osp.join(prefix, "model.onnx"), osp.join(prefix, "model_fp16.onnx"))
    # feed_dict = dict(zip(input_names, inputs))
    # np.savez(osp.join(prefix, "inputs.npz"), **feed_dict)
    feed_dict = dict(zip(input_names, inputs))
    np.savez(osp.join(prefix, "inputs.npz"), **feed_dict)

def run_torch(model, inputs):
    model = model.cuda()
    model.eval()
    cu_inputs = []
    for item in inputs:
        cu_inputs.append(item.cuda() if isinstance(item, torch.Tensor) else item)
    def get_runtime():
        torch.cuda.synchronize()
        tic = time.time()
        cu_inputs = []
        for item in inputs:
            cu_inputs.append(item.cuda() if isinstance(item, torch.Tensor) else item)
        with torch.no_grad():
            _ = model(*cu_inputs)
        _ = model(*cu_inputs)
        torch.cuda.synchronize()
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(50)] # warmup
    times = [get_runtime() for i in range(100)]
    with torch.no_grad():
        _ = [get_runtime() for i in range(50)] # warmup
        times = [get_runtime() for i in range(100)]
    print("mean: {}ms min: {}ms max: {}ms".format(np.mean(times), np.min(times), np.max(times)))

if __name__ == "__main__":
