#!/usr/bin/env python
"""bench_tvm.py: benchmark TVM-compiled transformer models (ONNX or PyTorch exports)."""
import argparse

import numpy as np
import tvm
from tvm.contrib import graph_executor


def load_model(prefix):
    """Load a compiled TVM module plus its graph JSON and serialized params."""
    lib = tvm.runtime.load_module("{}.so".format(prefix))
    with open("{}.json".format(prefix), "r") as fh:
        graph = fh.read()
    with open("{}.params".format(prefix), "rb") as fh:
        params = fh.read()
    return lib, graph, params
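

# Expected artifact layout, inferred from load_model and the prefixes built in
# the main loop below; the "bert" name here is purely illustrative:
#   models/bert/bert-1-128.so      compiled operator library
#   models/bert/bert-1-128.json    graph JSON
#   models/bert/bert-1-128.params  serialized parameters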


def benchmark(prefix, batch, seq, device, N=1):
    """Return the mean inference latency in milliseconds for one compiled model."""
    # N is kept for CLI compatibility; timing uses time_evaluator below, which
    # gives more stable numbers than a manual time.time() loop over N runs.
    lib0, graph0, params0 = load_model(prefix)
    feed_dict = {
        "input_ids": np.random.randint(0, 10000, size=[batch, seq]).astype("int64"),
        "attention_mask": np.ones([batch, seq]).astype("int64"),
    }
    # DistilBERT and RoBERTa variants do not take token_type_ids.
    if "distilbert" not in prefix and "roberta" not in prefix:
        feed_dict["token_type_ids"] = np.zeros([batch, seq]).astype("int64")
    if device == "cpu":
        dev = tvm.cpu(0)
    elif device == "gpu":
        dev = tvm.cuda(0)
    else:
        raise RuntimeError("Unknown device={}".format(device))
    m0 = graph_executor.create(graph0, lib0, dev)
    m0.load_params(params0)
    m0.set_input(**feed_dict)
    # Warm up before measuring.
    for _ in range(10):
        m0.run()
    ftimer = m0.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=10)
    dt = np.mean(ftimer().results)  # seconds per run, averaged over repeats
    return dt * 1000  # milliseconds
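
# Minimal standalone use of benchmark(), assuming a model compiled to the
# layout sketched above (the path is an assumption about your setup):
#   latency_ms = benchmark("models/bert/bert-1-128", batch=1, seq=128, device="cpu")
#   print("mean latency: {:.2f} ms".format(latency_ms))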


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark TVM-compiled models")
    parser.add_argument("--model", type=str, required=False)
    parser.add_argument(
        "--backend", type=str, required=False, default="cpu", choices=["cpu", "gpu"]
    )
    parser.add_argument("--batch", type=int, required=False)
    parser.add_argument("--seq", type=int, required=False)
    parser.add_argument("--N", type=int, required=False, default=100)
    parser.add_argument("--type", type=str, required=True, choices=["onnx", "pt"])
    args = parser.parse_args()

    model_name = args.model
    batch, seq = args.batch, args.seq
    backend = args.backend
    N = args.N
    model_type = args.type

    # Benchmark a single model if --model is given; otherwise benchmark every
    # model listed in models.txt (one name per line).
    if model_name:
        model_names = [model_name]
    else:
        with open("models.txt") as fh:
            model_names = [model.rstrip() for model in fh.readlines()]

    batches = [batch] if batch else [1, 4]
    seqs = [seq] if seq else [32, 64, 128, 256]
    for batch in batches:
        print(
            "---------------begin profiling {}-tvm batch={}------------------".format(
                model_type, batch
            )
        )
        # One CSV row per model: name, then latency (ms) per sequence length.
        for model_name in model_names:
            line = "{}".format(model_name)
            for seq in seqs:
                if model_type == "onnx":
                    model_prefix = "models/{}/{}-{}-{}".format(
                        model_name, model_name, batch, seq
                    )
                else:
                    model_prefix = "pt_models/{}/{}-{}-{}".format(
                        model_name, model_name, batch, seq
                    )
                latency = benchmark(model_prefix, batch, seq, backend, N=N)
                line += ",{}".format(latency)
            print(line)
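
# Example invocations (model names and artifact paths are assumptions about
# the surrounding setup):
#   python bench_tvm.py --type onnx --backend cpu --model bert --batch 1 --seq 128
#   python bench_tvm.py --type pt --backend gpu
# The second form sweeps batch in {1, 4} and seq in {32, 64, 128, 256} over
# every model listed in models.txt.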