Commit 2d10e5a: Update script
nox-410 committed Jul 26, 2022
1 parent eacff92
Showing 10 changed files with 171 additions and 86 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -24,7 +24,7 @@ Supporting opset11, use ./testing/torch2onnx.py to get some supported models.

```bash
nnfusion model.onnx -f onnx -ftune_output_file=model.json -fconst_folding_backend="CUDA" &&
python3 -m run_compiler model.json tuned.json --device 0 --topk 10 &&
python3 -m run_compiler model.json tuned.json --device 0 --topk 20 &&
nnfusion model.onnx -f onnx -ftune_output_file=model.json -fconst_folding_backend="CUDA" -ftune_input_file=tuned.json &&
rm -rf nnfusion_rt/cuda_codegen/build/ && cmake -S nnfusion_rt/cuda_codegen/ -B nnfusion_rt/cuda_codegen/build/ &&
make -C nnfusion_rt/cuda_codegen/build/
```
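Not part of the commit: the pipeline above starts from a model.onnx plus an inputs.npz. A minimal export sketch using the torch2onnx helper and the shufflenet model factory that appear later in this diff, assuming testing/ and testing/model are on the import path:

```python
# Illustrative only: torch2onnx(prefix, model, inputs) and shufflenet come
# from testing/torch2onnx.py and testing/model/pytorch/__init__.py in this
# diff; the import paths are assumptions.
from model.pytorch import shufflenet
from torch2onnx import torch2onnx

model, inputs = shufflenet(batch_size=1)  # returns (model, (input_tensor,))
torch2onnx("temp", model, inputs)         # writes temp/model.onnx and temp/inputs.npz
```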
1 change: 1 addition & 0 deletions python/memopt/__init__.py
@@ -1,3 +1,4 @@
import torch
from .IRpass import *
from .scope import get_scope, Scope
from .schedule_rewrite import CodeGenerator
2 changes: 1 addition & 1 deletion python/memopt/utils.py
@@ -59,7 +59,7 @@ def append_host_call(self):
if (cudaEventRecord(stop, 0) != cudaSuccess) return -1;
if (cudaEventSynchronize(stop) != cudaSuccess) return -1;
cudaEventElapsedTime(&ms, start, stop);
int repeats = min(100, int(ceil(300.0 / ms)));
int repeats = int(ceil(100.0 / ms));
cudaEventRecord(start, 0);
for (int _ = 0; _ < repeats; _++)
{};
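The new line replaces the capped ~300 ms budget (at most 100 repeats) with an uncapped loop sized to run for roughly 100 ms in total. A rough Python analogue of that host-side pattern, for illustration only (torch.cuda.Event stands in for the generated cudaEvent_t code; this helper is not part of the repo):

```python
import math
import torch

def bench(fn, target_ms=100.0):
    start = torch.cuda.Event(enable_timing=True)
    stop = torch.cuda.Event(enable_timing=True)
    start.record(); fn(); stop.record()  # one-shot run to estimate latency
    stop.synchronize()
    ms = start.elapsed_time(stop)
    repeats = int(math.ceil(target_ms / ms))  # mirrors ceil(100.0 / ms) above
    start.record()
    for _ in range(repeats):
        fn()
    stop.record()
    stop.synchronize()
    return start.elapsed_time(stop) / repeats  # per-call latency in ms
```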
99 changes: 51 additions & 48 deletions testing/ansor.py
@@ -1,55 +1,58 @@
import onnx
import numpy as np
import os.path as osp
import time
import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_executor

prefix = "/home/v-yiningshi/learn_tvm/testing/temp/bert"
target = tvm.target.cuda(arch="sm_70")
# import tensorflow.compat.v1 as tf
# pt_model = open(osp.join(prefix, "classifier.pb"), "rb")
# graph_def = tf.GraphDef()
# graph_def.ParseFromString(pt_model.read())
# mod, params = relay.frontend.from_tensorflow(graph_def, "NHWC")
# feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))
# shape_dict = {key: value.shape for key, value in feed_dict.items()}
onnx_model = onnx.load(osp.join(prefix, "model.onnx"))
mod, params = relay.frontend.from_onnx(onnx_model)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
log_file = osp.join(prefix, "ansor_tune.log")

for idx, task in enumerate(tasks):
print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
print(task.compute_dag)

def run_tuning():
print("Begin tuning...")
measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10, device=3)

tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=len(tasks) * 512,
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)

tuner.tune(tune_option)

# run_tuning()

# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
lib = relay.build(mod, target=target, params=params)

# Create graph executor
dev = tvm.device(str(target), 3)
module = graph_executor.GraphModule(lib["default"](dev))

# Evaluate
print("Evaluate inference time cost...")
print(module.benchmark(dev, min_repeat_ms=500, end_to_end=False))
import argparse

def run_ansor(prefix, device, skip_tuning):
    target = tvm.target.cuda(arch="sm_70")
    onnx_model = onnx.load(osp.join(prefix, "model.onnx"))
    mod, params = relay.frontend.from_onnx(onnx_model)
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    log_file = osp.join(prefix, "ansor_tune.log")

    for idx, task in enumerate(tasks):
        print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

    num_trials = len(tasks) * 800
    if osp.exists(log_file):
        with open(log_file, "r") as f:
            cur_records = len(f.readlines())
        num_trials -= cur_records
    if num_trials > 0 and not skip_tuning:
        print("Begin tuning...")
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10, device=device)
        tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            lib = relay.build(mod, target=target, params=params)

    # Create graph executor
    dev = tvm.device(str(target), device)
    module = graph_executor.GraphModule(lib["default"](dev))

    # Evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, min_repeat_ms=500, end_to_end=False))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', type=str, default="temp")
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--skip', action="store_true")
    args = parser.parse_args()
    start_time = time.time()
    run_ansor(args.prefix, args.device, args.skip)
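To make the resume logic above concrete (numbers illustrative only): with 30 extracted tasks the budget is 30 × 800 = 24,000 trials; if ansor_tune.log already holds 9,000 records from an earlier run, only the remaining 15,000 trials are measured, so an interrupted tuning session continues instead of restarting.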
18 changes: 4 additions & 14 deletions testing/model/pytorch/__init__.py
@@ -19,7 +19,7 @@ def shufflenet(batch_size):
    input = torch.randn(batch_size, 3, 224, 224)
    return model, (input, )

def SqueezeNet(batch_size):
def squeezenet(batch_size):
    from .squeezenet import SqueezeNet as Net
    model = Net()
    input = torch.randn(batch_size, 3, 224, 224)
@@ -83,19 +83,9 @@ def transformer(batch_size):
    return model, inputs

def vit(batch_size):
    from vit_pytorch import ViT
    model = ViT(
        image_size = 256,
        patch_size = 32,
        num_classes = 1000,
        dim = 1024,
        depth = 6,
        heads = 16,
        mlp_dim = 2048,
        dropout = 0.1,
        emb_dropout = 0.1
    )
    input = torch.randn(batch_size, 3, 256, 256)
    from timm.models import vit_small_patch32_224 as Net
    model = Net()
    input = torch.randn(batch_size, 3, 224, 224)
    return model, (input, )

def localvit(batch_size):
7 changes: 2 additions & 5 deletions testing/run_tf.py
@@ -1,4 +1,3 @@
from ast import arg
import tensorflow as tf
import numpy as np
import argparse
@@ -8,8 +7,6 @@
import onnx
from onnx_tf.backend import prepare

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

def load_graph(onnx_file):
    onnx_model = onnx.load(onnx_file)
    tf_rep = prepare(onnx_model, device="cuda")
@@ -47,8 +44,8 @@ def get_runtime():
    tic = time.time()
    _ = sess.run(outputs, feed_dict=feed_dict)
    return (time.time() - tic) * 1000
_ = [get_runtime() for i in range(50)] # warmup
times = [get_runtime() for i in range(100)]
_ = [get_runtime() for i in range(200)] # warmup
times = [get_runtime() for i in range(800)]
print(np.mean(times), np.min(times), np.max(times))

if __name__ == "__main__":
43 changes: 43 additions & 0 deletions testing/run_tf2.py
@@ -0,0 +1,43 @@
import tensorflow as tf
import numpy as np
import argparse
import os
import time
import os.path as osp
import tempfile

def load_graph(onnx_file):
    import onnx
    from onnx_tf.backend import prepare
    onnx_model = onnx.load(onnx_file)
    tf_rep = prepare(onnx_model, device="cuda")
    exported = tempfile.TemporaryDirectory()
    tf_rep.export_graph(exported.name)
    return exported

def run_tf(prefix, xla=False):
    if xla:
        tf.config.optimizer.set_jit(True)
    exported = load_graph(osp.join(prefix, "model.onnx"))
    feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))

    saved_model_loaded = tf.saved_model.load(
        exported.name, tags=[tf.saved_model.SERVING])
    graph_func = saved_model_loaded.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

    def get_runtime():
        tic = time.time()
        _ = graph_func(**feed_dict)
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(200)] # warmup
    times = [get_runtime() for i in range(800)]
    print(np.mean(times), np.min(times), np.max(times))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--xla', action="store_true")
    parser.add_argument('--prefix', type=str, default="temp")
    parser.add_argument('--device', type=int, default=0)
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
    run_tf(args.prefix, xla=args.xla)
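Given the flags registered above, a typical invocation would be `python3 testing/run_tf2.py --prefix temp --device 0 --xla` (illustrative); `--xla` turns on TensorFlow's XLA JIT for the graph function before the timed runs.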
52 changes: 52 additions & 0 deletions testing/run_tf_trt.py
@@ -0,0 +1,52 @@
import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt
import numpy as np
import argparse
import os
import time
import os.path as osp
import tempfile

def load_graph(onnx_file):
    import onnx
    from onnx_tf.backend import prepare
    onnx_model = onnx.load(onnx_file)
    tf_rep = prepare(onnx_model, device="cuda")
    exported = tempfile.TemporaryDirectory()
    tf_rep.export_graph(exported.name)
    return exported

def run_tf(prefix):
    exported = load_graph(osp.join(prefix, "model.onnx"))
    feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))

    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS
    converter = trt.TrtGraphConverterV2(
        input_saved_model_dir=exported.name,
        conversion_params=conversion_params)
    converter.convert()
    def my_input_fn():
        yield tuple(feed_dict.values())
    converter.build(input_fn=my_input_fn)
    convert_exported = tempfile.TemporaryDirectory()
    converter.save(convert_exported.name)

    saved_model_loaded = tf.saved_model.load(
        convert_exported.name, tags=[tf.saved_model.SERVING])
    graph_func = saved_model_loaded.signatures[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

    def get_runtime():
        tic = time.time()
        _ = graph_func(**feed_dict)
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(200)] # warmup
    times = [get_runtime() for i in range(800)]
    print(np.mean(times), np.min(times), np.max(times))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', type=str, default="temp")
    parser.add_argument('--device', type=int, default=0)
    args = parser.parse_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
    run_tf(args.prefix)
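Note that `converter.build(input_fn=my_input_fn)` constructs the TensorRT engines ahead of time from the sample inputs yielded by `my_input_fn`, so engine building happens during conversion rather than inside the timed get_runtime loop.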
14 changes: 7 additions & 7 deletions testing/run_trt.py
@@ -5,8 +5,6 @@
import argparse
import torch

torch.cuda.set_device(3)

def run_trt(prefix):
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
@@ -35,13 +33,13 @@ def run_trt(prefix):
    feed_dict = dict(np.load(osp.join(prefix, "inputs.npz"), allow_pickle=True))
    for item in feed_dict.values():
        input_tensor.append(torch.from_numpy(item))
    for i, tensor in enumerate(input_tensor):
        tensors[i] = tensor.cuda()

    context = engine.create_execution_context()
    buffer = [tensor.data_ptr() for tensor in tensors]
    def get_runtime():
        tic = time.time()
        for i, tensor in enumerate(input_tensor):
            tensors[i] = tensor.cuda()
        buffer = [tensor.data_ptr() for tensor in tensors]
        context.execute(1, buffer)
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(50)] # warmup
@@ -50,6 +48,8 @@ def get_runtime():
    # print(tensors[1])

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', type=str, default="temp")
    args = parser.parse_args()
    torch.random.manual_seed(0)
    prefix = "temp"
    run_trt(prefix)
    run_trt(args.prefix)
19 changes: 9 additions & 10 deletions testing/torch2onnx.py
@@ -30,24 +30,23 @@ def torch2onnx(prefix, model, inputs):
        do_constant_folding=False,
        opset_version=11)
    # tofp16model( osp.join(prefix, "model.onnx"), osp.join(prefix, "model_fp16.onnx"))
    # feed_dict = dict(zip(input_names, inputs))
    # np.savez(osp.join(prefix, "inputs.npz"), **feed_dict)
    feed_dict = dict(zip(input_names, inputs))
    np.savez(osp.join(prefix, "inputs.npz"), **feed_dict)

def run_torch(model, inputs):
    model = model.cuda()
    model.eval()
    cu_inputs = []
    for item in inputs:
        cu_inputs.append(item.cuda() if isinstance(item, torch.Tensor) else item)
    def get_runtime():
        torch.cuda.synchronize()
        tic = time.time()
        cu_inputs = []
        for item in inputs:
            cu_inputs.append(item.cuda() if isinstance(item, torch.Tensor) else item)
        with torch.no_grad():
            _ = model(*cu_inputs)
        _ = model(*cu_inputs)
        torch.cuda.synchronize()
        return (time.time() - tic) * 1000
    _ = [get_runtime() for i in range(50)] # warmup
    times = [get_runtime() for i in range(100)]
    with torch.no_grad():
        _ = [get_runtime() for i in range(50)] # warmup
        times = [get_runtime() for i in range(100)]
    print("mean: {}ms min: {}ms max: {}ms".format(np.mean(times), np.min(times), np.max(times)))

if __name__ == "__main__":
