#!/usr/bin/env python3
# Keep the CPU or GPU busy with a loop of matrix multiplications, that's it.
# Specify the matrix dimension and the number of loops as command-line arguments.
# Example 1:
# ./busy_pytorch_blas.py 8000 3
# Example 2:
# ./busy_pytorch_blas.py 8000 3 -dcuda:0 -pFP32
# Example 3 (for SLURM):
# srun -J test_blas -p szsc --nodes=1 --time=2:00 bash -c "module load apps/PyTorch; python3 $HOME/test_usage/busy_pytorch_blas.py 8000 3"
# Change in PyTorch v1.12:
# Disable TF32 for matmul by default and add high-level control of fp32 matmul precision
# https://dev-discuss.pytorch.org/t/pytorch-and-tensorfloat32/504
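# Each iteration prints a timestamped line with the elapsed wall time and the achieved GFLOPS.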
import sys
from time import time, localtime, strftime
import torch
from torch import randn, norm, eye
print('Torch version : ', torch.__version__)
print("CUDA available: ", torch.cuda.is_available());
device_name = 'cuda' # can be cpu, cuda, cuda:0
str_precision = 'fp32' # default precision
# simple command-line parsing: -d<device> selects the device, -p<precision> the dtype
argv_nonoption = []
for arg in sys.argv[1:]:
    if arg.startswith('-d'):
        device_name = arg[2:]
    elif arg.startswith('-p'):
        str_precision = arg[2:].lower()
    else:
        argv_nonoption.append(arg)
# https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype
# https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
# TF32 was the default for PyTorch v1.7~v1.11; we do not want that here
# https://dev-discuss.pytorch.org/t/pytorch-and-tensorfloat32/504
torch.backends.cuda.matmul.allow_tf32 = False
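# On Ampere and newer GPUs, TF32 silently rounds fp32 matmul inputs to a
# 10-bit mantissa; disable it so 'fp32' really means IEEE binary32.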
if str_precision in ['fp32', 'float32', 'float', 'binary32']:
    # FP32: 1 + 8 + 23 (32 bits)
    dtype = torch.float
elif str_precision in ['fp64', 'float64', 'double', 'binary64']:
    # FP64: 1 + 11 + 52 (64 bits)
    dtype = torch.double
elif str_precision in ['fp16', 'float16', 'half', 'binary16']:
    # FP16: 1 + 5 + 10 (16 bits)
    dtype = torch.float16
elif str_precision in ['tf32', 'tensorfloat32']:
    # TensorFloat32 (TF32) operation
    # TF32: 1 + 8 + 10 (19 bits)
    dtype = torch.float
    torch.backends.cuda.matmul.allow_tf32 = True
    #torch.backends.cudnn.allow_tf32 = True
elif str_precision in ['bf16', 'bfloat16']:
    # Bfloat16: 1 + 8 + 7 (16 bits)
    dtype = torch.bfloat16
else:
    raise ValueError('Unknown precision: %s' % str_precision)
#device = torch.device("cpu")
device = torch.device(device_name)
info_cuda = ' ver=' + torch.version.cuda if device.type == 'cuda' and torch.version.cuda else ''
print('Using device :', device_name, info_cuda)
print('Data type :', str_precision)
if device.type != 'cpu' and not torch.cuda.is_available():
    print('Selected device not available. (Check torch GPU support or CUDA hardware availability)')
    sys.exit(1)
# Keep CPU or GPU busy, that's it.
# n : size of matrix
# k_max : number of loops
def busy_gemm(n = 8000, k_max = 2**31-2):
    is_gpu = device.type != 'cpu'
    # construct the timer
    if is_gpu:
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        cuda_stream = torch.cuda.Stream(device)
        torch.cuda.set_stream(cuda_stream) # although discouraged
        #print('Current cuda_stream: ', torch.cuda.current_stream())
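        # CUDA kernel launches are asynchronous, so a host-side wall clock would
        # mostly measure launch overhead; CUDA events time the work on the GPU itself.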
        def start_timer():
            start_event.record()
            return None
        def stop_timer(st):
            end_event.record()
            # Wait for the events to be recorded!
            torch.cuda.synchronize(device)
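            # elapsed_time() returns milliseconds; convert to seconds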
            t = start_event.elapsed_time(end_event) / 1000.0
            return t
    else:
        def start_timer():
            t0 = time()
            return t0
        def stop_timer(t0):
            t = time() - t0
            return t
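    # one n-by-n matrix multiplication costs about 2*n^3 floating-point operations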
    gflo = n ** 3 * 2 / 1e9
    v = randn(n, 1, device=device, dtype=dtype)
    v = v / norm(v)
    u = randn(n, 1, device=device, dtype=dtype)
    u = u / norm(u)
    # a (not very) random, but quickly generated, orthogonal matrix
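    # a = (I - 2*u*u^T)(I - 2*v*v^T), a product of two Householder reflections,
    # hence orthogonal, so its powers stay bounded throughout the loop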
    a = eye(n, device=device, dtype=dtype) \
        - 2 * u.mm(u.t()) - 2 * v.mm(v.t()) \
        + (4 * u * (u.t().mm(v))).mm(v.t())
    c = a
    for k in range(1, k_max+1):
        timer = start_timer()
        c = c.mm(a)  # the core payload
        t = stop_timer(timer)
        s = strftime("%Y-%m-%d %H:%M:%S %Z", localtime())
        print('%s, t=%.3f, #%d, GFLOPS=%5.1f.' % (s, t, k, gflo/t))
if __name__ == '__main__':
    param = [int(i) for i in argv_nonoption]
    busy_gemm(*param)
# Measured GFLOPS:
#                  fp32     fp64
# CPU W-2145 :   1554.5    750.5
# 2080Ti     :  13332.0    516.1
# AMD ?      :  10499.4   5414.9
# V100       :  13601.2   6860.6