-
Notifications
You must be signed in to change notification settings - Fork 0
/
copypadmul2_62.ptx
109 lines (96 loc) · 2.59 KB
/
copypadmul2_62.ptx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20732876
// Cuda compilation tools, release 8.0, V8.0.26
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_62
.address_size 64
// .globl copypadmul2
// ---------------------------------------------------------------------
// copypadmul2: each thread copies one float from a source buffer to a
// destination buffer, scaling it on the way. Source and destination are
// addressed with separate 3D -> linear index computations, so the two
// buffers may have different x/y extents.
//
// Parameters (roles as established by how this body uses them):
//   param_0  (u64) destination base pointer; written with st.global.f32
//   param_1  (u32) destination x extent (row length of the dest index)
//   param_2  (u32) destination y extent (used in the dest index)
//   param_3  (u32) never loaded by this body
//   param_4  (u64) source base pointer; read with ld.global.nc.f32
//   param_5  (u32) x bound for the thread index and source x extent
//   param_6  (u32) y bound for the thread index and source y extent
//   param_7  (u32) z bound for the thread index
//   param_8  (u64) optional per-element factor array; may be 0 (null)
//   param_9  (f32) scalar factor
//   param_10 (u64) optional per-element factor array; may be 0 (null)
//
// Effect for a thread (x, y, z) with x < param_5, y < param_6, z < param_7:
//   s = (z * param_6 + y) * param_5 + x
//   d = (z * param_2 + y) * param_1 + x
//   a = param_9            when param_8  == 0, else param_8[s] * param_9
//   b = 1.0                when param_10 == 0, else param_10[s]
//   param_0[d] = (float(double(a) * C) * b) * param_4[s]
// with C = 0d3EB515370F99F6CB (about 1.2566e-6).
// Threads outside the bounds return without touching memory.
//
// NOTE(review): the .u32 size parameters are compared and multiplied as
// signed 32-bit values (setp.lt.s32 / mad.lo.s32), so the linear indices
// wrap once they exceed the signed 32-bit range -- confirm that callers
// never pass buffers that large.
// ---------------------------------------------------------------------
.visible .entry copypadmul2(
.param .u64 copypadmul2_param_0,
.param .u32 copypadmul2_param_1,
.param .u32 copypadmul2_param_2,
.param .u32 copypadmul2_param_3,
.param .u64 copypadmul2_param_4,
.param .u32 copypadmul2_param_5,
.param .u32 copypadmul2_param_6,
.param .u32 copypadmul2_param_7,
.param .u64 copypadmul2_param_8,
.param .f32 copypadmul2_param_9,
.param .u64 copypadmul2_param_10
)
{
// Virtual register banks: predicates, f32 values, 32-bit integers,
// f64 temporaries (only for the constant multiply), 64-bit addresses.
.reg .pred %p<8>;
.reg .f32 %f<14>;
.reg .b32 %r<22>;
.reg .f64 %fd<3>;
.reg .b64 %rd<17>;
// Load kernel arguments. param_3 is intentionally absent: nothing below
// reads it.
//   %rd1 = dest pointer      %r5 = dest x extent   %r6 = dest y extent
//   %rd2 = source pointer    %r7/%r8/%r9 = x/y/z bounds (source extents)
//   %rd3 = optional factor array for param_9
//   %f12 = scalar factor     %rd4 = optional second factor array
ld.param.u64 %rd1, [copypadmul2_param_0];
ld.param.u32 %r5, [copypadmul2_param_1];
ld.param.u32 %r6, [copypadmul2_param_2];
ld.param.u64 %rd2, [copypadmul2_param_4];
ld.param.u32 %r7, [copypadmul2_param_5];
ld.param.u32 %r8, [copypadmul2_param_6];
ld.param.u32 %r9, [copypadmul2_param_7];
ld.param.u64 %rd3, [copypadmul2_param_8];
ld.param.f32 %f12, [copypadmul2_param_9];
ld.param.u64 %rd4, [copypadmul2_param_10];
// %r1 = global x index = ntid.x * ctaid.x + tid.x
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
// %r2 = global y index = ntid.y * ctaid.y + tid.y
mov.u32 %r13, %ntid.y;
mov.u32 %r14, %ctaid.y;
mov.u32 %r15, %tid.y;
mad.lo.s32 %r2, %r13, %r14, %r15;
// %r3 = global z index = ntid.z * ctaid.z + tid.z
mov.u32 %r16, %ntid.z;
mov.u32 %r17, %ctaid.z;
mov.u32 %r18, %tid.z;
mad.lo.s32 %r3, %r16, %r17, %r18;
// Bounds check: %p5 = (x < param_5) && (y < param_6) && (z < param_7),
// evaluated as signed 32-bit comparisons.
setp.lt.s32 %p1, %r1, %r7;
setp.lt.s32 %p2, %r2, %r8;
and.pred %p3, %p1, %p2;
setp.lt.s32 %p4, %r3, %r9;
and.pred %p5, %p3, %p4;
// Out-of-range threads skip straight to the return.
@!%p5 bra BB0_6;
// Unconditional branch to the label that immediately follows; this is
// equivalent to falling through.
bra.uni BB0_1;
BB0_1:
// %r4 = source linear index s = (z * param_6 + y) * param_5 + x
mad.lo.s32 %r19, %r3, %r8, %r2;
mad.lo.s32 %r4, %r19, %r7, %r1;
// If param_8 is null, keep %f12 = param_9 unchanged.
setp.eq.s64 %p6, %rd3, 0;
@%p6 bra BB0_3;
// Otherwise %f12 = param_8[s] * param_9. The byte offset is the
// sign-extended index times 4 (sizeof f32).
cvta.to.global.u64 %rd5, %rd3;
mul.wide.s32 %rd6, %r4, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.nc.f32 %f6, [%rd7];
mul.f32 %f12, %f6, %f12;
BB0_3:
// %f13 defaults to 1.0 (0f3F800000) and is replaced by param_10[s]
// only when param_10 is non-null.
setp.eq.s64 %p7, %rd4, 0;
mov.f32 %f13, 0f3F800000;
@%p7 bra BB0_5;
cvta.to.global.u64 %rd8, %rd4;
mul.wide.s32 %rd9, %r4, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.nc.f32 %f13, [%rd10];
BB0_5:
// Convert the generic dest/source pointers to global-space addresses.
cvta.to.global.u64 %rd11, %rd1;
cvta.to.global.u64 %rd12, %rd2;
// %f8 = float(double(%f12) * C): the multiply is done in f64 and then
// rounded to nearest f32. C = 0d3EB515370F99F6CB is about 1.2566e-6,
// numerically 4*pi*1e-7 -- presumably the vacuum permeability mu0;
// confirm against the CUDA source.
cvt.f64.f32 %fd1, %f12;
mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB;
cvt.rn.f32.f64 %f8, %fd2;
// %f9 = source[s]
mul.wide.s32 %rd13, %r4, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.f32 %f9, [%rd14];
// %f11 = (%f8 * %f13) * %f9. The products are formed in this order;
// f32 rounding makes the order part of the result.
mul.f32 %f10, %f8, %f13;
mul.f32 %f11, %f10, %f9;
// %r21 = destination linear index d = (z * param_2 + y) * param_1 + x
mad.lo.s32 %r20, %r3, %r6, %r2;
mad.lo.s32 %r21, %r20, %r5, %r1;
// Store the result to dest[d].
mul.wide.s32 %rd15, %r21, 4;
add.s64 %rd16, %rd11, %rd15;
st.global.f32 [%rd16], %f11;
BB0_6:
// Common exit for in-range and out-of-range threads.
ret;
}