-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathplot_benchmark.py
143 lines (125 loc) · 9.33 KB
/
plot_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from matplotlib import pyplot as plt
from taichi.benchmark import benchmark as benchmark_taichi
from cuda.benchmark import benchmark as benchmark_cuda
import sys
import os
# Pre-recorded benchmark results, used instead of a live run when the script is
# started with the `sample` argument (see the `__main__` block).
# Layout: backend name ("cublas", "thrust", "taichi") -> list of records.
# Every record has "N", "time", "gflops" and "gbs"; the "thrust" and "taichi"
# records additionally carry a "fold" value (1, 2, 4, 8 or 16), while the
# "cublas" records have no "fold" key at all.
# NOTE(review): "gbs" is plotted on an axis labelled "Bandwidth (GB/s)"; the
# unit of "time" is not stated anywhere in this file -- TODO confirm.
sample_res = {"cublas": [{"N": 256, "time": 0.002, "gflops": 58.671, "gbs": 352.029}, {"N": 512, "time": 0.003, "gflops": 170.522, "gbs": 1023.134}, {"N": 1024, "time": 0.021, "gflops": 101.976, "gbs": 611.855}, {"N": 2048, "time": 0.075, "gflops": 111.558, "gbs": 669.347}, {"N": 4096, "time": 0.293, "gflops": 114.517, "gbs": 687.102}], "thrust": [{"N": 256, "fold": 1, "time": 0.006, "gflops": 23.711, "gbs": 142.269}, {"N": 256, "fold": 2, "time": 0.005, "gflops": 47.977, "gbs": 143.93}, {"N": 256, "fold": 4, "time": 0.006, "gflops": 94.805, "gbs": 142.207}, {"N": 256, "fold": 8, "time": 0.006, "gflops": 184.875, "gbs": 138.657}, {"N": 256, "fold": 16, "time": 0.006, "gflops": 334.826, "gbs": 125.56}, {"N": 512, "fold": 1, "time": 0.006, "gflops": 81.535, "gbs": 489.212}, {"N": 512, "fold": 2, "time": 0.006, "gflops": 163.743, "gbs": 491.228}, {"N": 512, "fold": 4, "time": 0.006, "gflops": 325.564, "gbs": 488.346}, {"N": 512, "fold": 8, "time": 0.007, "gflops": 641.9, "gbs": 481.425}, {"N": 512, "fold": 16, "time": 0.007, "gflops": 1188.255, "gbs": 445.596}, {"N": 1024, "fold": 1, "time": 0.024, "gflops": 88.949, "gbs": 533.694}, {"N": 1024, "fold": 2, "time": 0.024, "gflops": 178.07, "gbs": 534.211}, {"N": 1024, "fold": 4, "time": 0.023, "gflops": 357.312, "gbs": 535.968}, {"N": 1024, "fold": 8, "time": 0.024, "gflops": 709.66, "gbs": 532.245}, {"N": 1024, "fold": 16, "time": 0.024, "gflops": 1392.069, "gbs": 522.026}, {"N": 2048, "fold": 1, "time": 0.079, "gflops": 106.381, "gbs": 638.284}, {"N": 2048, "fold": 2, "time": 0.079, "gflops": 212.751, "gbs": 638.252}, {"N": 2048, "fold": 4, "time": 0.079, "gflops": 425.298, "gbs": 637.948}, {"N": 2048, "fold": 8, "time": 0.079, "gflops": 849.912, "gbs": 637.434}, {"N": 2048, "fold": 16, "time": 0.079, "gflops": 1689.668, "gbs": 633.625}, {"N": 4096, "fold": 1, "time": 0.296, "gflops": 113.187, "gbs": 679.122}, {"N": 4096, "fold": 2, "time": 0.296, "gflops": 226.36, "gbs": 679.079}, {"N": 4096, "fold": 4, "time": 
0.296, "gflops": 452.762, "gbs": 679.144}, {"N": 4096, "fold": 8, "time": 0.297, "gflops": 905.013, "gbs": 678.76}, {"N": 4096, "fold": 16, "time": 0.297, "gflops": 1807.705, "gbs": 677.889}], "taichi": [{"N": 256, "fold": 1, "time": 0.006319954199716448, "gflops": 20.739390802211936, "gbs": 124.43634481327163}, {"N": 256, "fold": 2, "time": 0.0062749893870204685, "gflops": 41.77600691122012, "gbs": 125.32802073366038}, {"N": 256, "fold": 4, "time": 0.006239291187375783, "gflops": 84.03005794325061, "gbs": 126.04508691487594}, {"N": 256, "fold": 8, "time": 0.006320460978895426, "gflops": 165.90182322164276, "gbs": 124.42636741623207}, {"N": 256, "fold": 16, "time": 0.0063150141853839156, "gflops": 332.0898320155564, "gbs": 124.53368700583364}, {"N": 512, "fold": 1, "time": 0.006552474806085229, "gflops": 80.0137376359079, "gbs": 480.08242581544744}, {"N": 512, "fold": 2, "time": 0.00637908037751913, "gflops": 164.37729859861065, "gbs": 493.131895795832}, {"N": 512, "fold": 4, "time": 0.006395104434341192, "gflops": 327.9308448410106, "gbs": 491.8962672615159}, {"N": 512, "fold": 8, "time": 0.006447763834148646, "gflops": 650.5052151237501, "gbs": 487.8789113428126}, {"N": 512, "fold": 16, "time": 0.006529038399457931, "gflops": 1284.815234154, "gbs": 481.8057128077501}, {"N": 1024, "fold": 1, "time": 0.02107226480729878, "gflops": 99.52190802355575, "gbs": 597.1314481413345}, {"N": 1024, "fold": 2, "time": 0.021460041217505933, "gflops": 195.4471549000808, "gbs": 586.3414647002425}, {"N": 1024, "fold": 4, "time": 0.020669475989416243, "gflops": 405.8452185384558, "gbs": 608.7678278076838}, {"N": 1024, "fold": 8, "time": 0.02099389820359647, "gflops": 799.1472492291065, "gbs": 599.3604369218299}, {"N": 1024, "fold": 16, "time": 0.021242901170626282, "gflops": 1579.5597658947613, "gbs": 592.3349122105355}, {"N": 2048, "fold": 1, "time": 0.07358282720670103, "gflops": 114.00225186286492, "gbs": 684.0135111771896}, {"N": 2048, "fold": 2, "time": 0.07370002781972289, 
"gflops": 227.64192221254828, "gbs": 682.9257666376449}, {"N": 2048, "fold": 4, "time": 0.07357445200905204, "gflops": 456.0609163065424, "gbs": 684.0913744598137}, {"N": 2048, "fold": 8, "time": 0.07350112521089613, "gflops": 913.0317911112943, "gbs": 684.7738433334707}, {"N": 2048, "fold": 16, "time": 0.0736398187931627, "gflops": 1822.6243654535153, "gbs": 683.4841370450682}, {"N": 4096, "fold": 1, "time": 0.2832261357922107, "gflops": 118.47223034747495, "gbs": 710.8333820848497}, {"N": 4096, "fold": 2, "time": 0.2831576330121607, "gflops": 237.00178337455554, "gbs": 711.0053501236667}, {"N": 4096, "fold": 4, "time": 0.2836653312202543, "gflops": 473.1552051941996, "gbs": 709.7328077912994}, {"N": 4096, "fold": 8, "time": 0.28355305101722483, "gflops": 946.6851265997965, "gbs": 710.0138449498473}, {"N": 4096, "fold": 16, "time": 0.28309709001332517, "gflops": 1896.419747637568, "gbs": 711.157405364088}]}
def run_benchmarks():
    """Run the CUDA and Taichi benchmarks and return one combined mapping.

    The mapping returned by ``benchmark_cuda()`` is used as the base and the
    Taichi results (collected with ``max_nesting=16``) are stored in it under
    the ``'taichi'`` key.

    Returns:
        The object returned by ``benchmark_cuda()`` with a ``'taichi'`` entry
        added.
    """
    combined = benchmark_cuda()
    combined['taichi'] = benchmark_taichi(max_nesting=16)
    return combined
def extract_gflops(result_dict):
    """Collect the GFLOPS figures of the un-nested benchmark runs.

    A record counts as un-nested when its "fold" value is 1 or when it has
    no "fold" key at all.

    Args:
        result_dict: Iterable of result records (dicts) with "N" and
            "gflops" keys and an optional "fold" key.

    Returns:
        A tuple ``(x, y)`` of two equally long lists: the "N" values and the
        matching "gflops" values, in input order.

    Raises:
        KeyError: If a selected record lacks "N" or "gflops".
    """
    x = []
    y = []
    for res in result_dict:
        nesting_factor = res.get("fold")
        # Identity test for the missing-key case; `== None` is non-idiomatic
        # and can be hijacked by a custom __eq__.
        if nesting_factor is None or nesting_factor == 1:
            x.append(res["N"])
            y.append(res["gflops"])
    return x, y
def extract_bw(result_dict):
    """Collect the "gbs" (bandwidth) figures of the un-nested benchmark runs.

    A record counts as un-nested when its "fold" value is 1 or when it has
    no "fold" key at all.

    Args:
        result_dict: Iterable of result records (dicts) with "N" and "gbs"
            keys and an optional "fold" key.

    Returns:
        A tuple ``(x, y)`` of two equally long lists: the "N" values and the
        matching "gbs" values, in input order.

    Raises:
        KeyError: If a selected record lacks "N" or "gbs".
    """
    x = []
    y = []
    for res in result_dict:
        nesting_factor = res.get("fold")
        # Identity test for the missing-key case; `== None` is non-idiomatic
        # and can be hijacked by a custom __eq__.
        if nesting_factor is None or nesting_factor == 1:
            x.append(res["N"])
            y.append(res["gbs"])
    return x, y
def extract_nested(result_dict, N=4096):
    """Collect the GFLOPS figures of every run on arrays of size ``N``.

    Args:
        result_dict: Iterable of result records (dicts) with "N" and
            "gflops" keys and an optional "fold" key.
        N: Only records whose "N" equals this value are considered.

    Returns:
        A tuple ``(x, y)``. ``y`` holds the "gflops" value of every matching
        record; ``x`` holds the "fold" value of the matching records that
        have one. The two lists therefore differ in length when some
        matching records carry no "fold" key: such a record still
        contributes to ``y`` (``plot_nested`` depends on this to obtain the
        single cuBLAS baseline value).

    Raises:
        KeyError: If a record lacks "N", or a matching record lacks "gflops".
    """
    x = []
    y = []
    for res in result_dict:
        if res["N"] != N:
            continue
        nesting_factor = res.get("fold")
        # Identity test: a missing "fold" is signalled by None specifically.
        if nesting_factor is not None:
            x.append(nesting_factor)
        y.append(res["gflops"])
    return x, y
def plot_compute(results):
    """Draw the grouped GFLOPS bar chart and save it as fig/compute_bench.png.

    The series returned by ``extract_gflops`` for the 'taichi', 'thrust' and
    'cublas' entries are drawn side by side: for the k-th array size the
    bars sit at x = 4*k, 4*k + 1 and 4*k + 2. Tick labels are built from the
    Taichi sizes and placed under the Thrust (middle) bars.

    Args:
        results: Mapping with 'taichi', 'thrust' and 'cublas' keys whose
            values are record lists accepted by ``extract_gflops``.
    """
    # Conversions behind the right-hand bandwidth axis: a fixed factor of 6.0
    # between GFLOPS and GB/s.
    # NOTE(review): presumably the bytes-moved-per-flop ratio of this
    # benchmark -- confirm against the benchmark kernels.
    def gflops_to_gbs(value):
        return value * 6.0

    def gbs_to_gflops(value):
        return value / 6.0

    fig, ax = plt.subplots()

    sizes, taichi_gflops = extract_gflops(results['taichi'])
    tick_labels = [f"{size}x{size}" for size in sizes]
    ax.bar(list(range(0, 4 * len(taichi_gflops), 4)), taichi_gflops)

    _, thrust_gflops = extract_gflops(results['thrust'])
    thrust_slots = list(range(1, 4 * len(thrust_gflops) + 1, 4))
    ax.bar(thrust_slots, thrust_gflops)
    ax.set_xticks(thrust_slots, tick_labels)

    _, cublas_gflops = extract_gflops(results['cublas'])
    ax.bar(list(range(2, 4 * len(cublas_gflops) + 2, 4)), cublas_gflops)

    ax.legend(['Taichi', 'CUDA/Thrust', 'CUDA/cuBLAS'])
    ax.set_xlabel("Array shape")
    ax.set_ylabel("Performance (GFLOPS)")

    # Vertical guide at x = 7, i.e. in the gap between the second and third
    # bar groups; it is annotated below as the L2 cache size.
    ax.axvline(x=7, color='grey', linestyle='dashed')

    bandwidth_axis = ax.secondary_yaxis(
        'right', functions=(gflops_to_gbs, gbs_to_gflops))
    bandwidth_axis.set_ylabel("Bandwidth (GB/s)")
    ax.text(7, 120, 'L2 Cache=5MB', rotation=270)

    # Horizontal guide at 760 GB/s, expressed on the GFLOPS scale.
    plt.axhline(y=760 / 6.0, color='grey', linestyle='dashed')
    plt.text(11, 770 / 6.0, 'DRAM Bandwidth=760GB/s')

    ax.set_title("SAXPY benchmark on 2D arrays")
    plt.savefig("fig/compute_bench.png", dpi=150)
def plot_nested(results, N=4096):
    """Plot GFLOPS against nesting factor for one array size and save it.

    Taichi and Thrust contribute one bar per nesting factor; the single
    cuBLAS value for this size is repeated next to every group as a
    reference. The figure is written to ``fig/nesting_bench_<N>.png``.

    Args:
        results: Mapping with 'taichi', 'thrust' and 'cublas' keys whose
            values are record lists accepted by ``extract_nested``.
        N: Array size to select records for.

    Raises:
        AssertionError: If the cuBLAS results do not contain exactly one
            record for ``N``.
    """
    plt.figure()

    _, taichi_gflops = extract_nested(results['taichi'], N)
    plt.bar(list(range(0, 4 * len(taichi_gflops), 4)), taichi_gflops)

    folds, thrust_gflops = extract_nested(results['thrust'], N)
    thrust_slots = list(range(1, 4 * len(thrust_gflops) + 1, 4))
    plt.bar(thrust_slots, thrust_gflops)
    # Tick labels are the Thrust nesting factors, under the Thrust bars.
    plt.xticks(thrust_slots, [str(fold) for fold in folds])
    group_count = len(thrust_gflops)

    _, cublas_gflops = extract_nested(results['cublas'], N)
    assert len(cublas_gflops) == 1
    # Repeat the lone cuBLAS figure once per Thrust group.
    cublas_series = [cublas_gflops[0]] * group_count
    plt.bar(list(range(2, 4 * len(cublas_series) + 2, 4)), cublas_series)

    plt.legend(['Taichi', 'CUDA/Thrust', 'CUDA/cuBLAS'], loc='upper left')
    plt.xlabel("Nesting factor")
    plt.ylabel("Performance (GFLOPS)")
    plt.title(f"Nested saxpy compute benchmark on {N}x{N} arrays")
    plt.savefig(f"fig/nesting_bench_{N}.png", dpi=150)
def plot_nested_all_in_one(results, N_arr=(512, 4096)):
    """Plot the nesting benchmark for several array sizes in one figure.

    One subplot per entry of ``N_arr`` is laid out in a single row. Each
    subplot shows Taichi and Thrust GFLOPS per nesting factor, with the
    single cuBLAS value for that size repeated next to every group. The
    figure is written to ``fig/nesting_bench_all.png``.

    Args:
        results: Mapping with 'taichi', 'thrust' and 'cublas' keys whose
            values are record lists accepted by ``extract_nested``.
        N_arr: Sequence of array sizes, one subplot each. The default is an
            immutable tuple so it cannot be altered between calls.

    Raises:
        AssertionError: If the cuBLAS results do not contain exactly one
            record for one of the requested sizes.
    """
    # squeeze=False always yields a 2-D grid of axes. Without it a
    # single-element N_arr makes plt.subplots return a bare Axes object and
    # indexing it fails.
    fig, ax_grid = plt.subplots(
        1, len(N_arr), figsize=(14, 4.5), squeeze=False)

    for ax, N in zip(ax_grid[0], N_arr):
        ax.set_ylim([0, 2000])

        _, y = extract_nested(results['taichi'], N)
        bar_pos = [slot * 4 for slot in range(len(y))]
        ax.bar(bar_pos, y)

        nesting_factors, y = extract_nested(results['thrust'], N)
        bar_pos = [slot * 4 + 1 for slot in range(len(y))]
        ax.bar(bar_pos, y)
        # Tick labels are the Thrust nesting factors, under the Thrust bars.
        labels = ["{}".format(factor) for factor in nesting_factors]
        ax.set_xticks(bar_pos, labels)
        bar_count = len(y)

        _, y = extract_nested(results['cublas'], N)
        assert len(y) == 1
        # Repeat the lone cuBLAS figure once per Thrust group.
        y = [y[0]] * bar_count
        bar_pos = [slot * 4 + 2 for slot in range(len(y))]
        ax.bar(bar_pos, y)

        ax.legend(['Taichi', 'CUDA/Thrust', 'CUDA/cuBLAS'], loc='upper left')
        ax.set_xlabel("Nesting factor")
        ax.set_ylabel("Performance (GFLOPS)")
        ax.title.set_text(
            "Nested SAXPY benchmark on {}x{} arrays".format(N, N))

    # Save through the figure handle rather than pyplot's "current figure".
    fig.savefig("fig/nesting_bench_all.png", dpi=150)
if __name__ == '__main__':
    # All plots are written below fig/. exist_ok=True tolerates an existing
    # directory but still raises if 'fig' exists as a regular file, instead
    # of failing later when the first plot is saved.
    os.makedirs('fig', exist_ok=True)

    # `python plot_benchmark.py sample` plots the bundled sample_res data;
    # any other invocation runs the benchmarks first.
    if len(sys.argv) >= 2 and sys.argv[1] == 'sample':
        results = sample_res
    else:
        results = run_benchmarks()

    plot_compute(results)
    plot_nested(results, 4096)
    plot_nested(results, 512)
    plot_nested_all_in_one(results)