Skip to content

Commit

Permalink
Add ptx code for all supported architectures
Browse files Browse the repository at this point in the history
ptx version 6.5
architectures 30 32 35 37 50 52 53 60 61 62 70 72 75
  • Loading branch information
JeroenMulkers authored and godsic committed Jun 18, 2020
1 parent e6edd89 commit 83b34cc
Show file tree
Hide file tree
Showing 49 changed files with 27,702 additions and 156 deletions.
306 changes: 306 additions & 0 deletions cuda/copypadmul2_wrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,17 @@ func k_copypadmul2_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.
// copypadmul2_map looks up the PTX source of the copypadmul2 kernel by
// compute capability (for example 32 selects the build for target sm_32).
// Key 0 is associated with the empty string.
var copypadmul2_map = map[int]string{
	0:  "",
	30: copypadmul2_ptx_30,
	32: copypadmul2_ptx_32,
	35: copypadmul2_ptx_35,
	37: copypadmul2_ptx_37,
	50: copypadmul2_ptx_50,
	52: copypadmul2_ptx_52,
	53: copypadmul2_ptx_53,
	60: copypadmul2_ptx_60,
	61: copypadmul2_ptx_61,
	62: copypadmul2_ptx_62,
	70: copypadmul2_ptx_70,
	72: copypadmul2_ptx_72,
	75: copypadmul2_ptx_75,
}

// copypadmul2 PTX code for various compute capabilities.
Expand Down Expand Up @@ -200,6 +203,107 @@ BB0_6:
}
`
// copypadmul2_ptx_32 holds the PTX text (ISA version 6.5, 64-bit addresses)
// of the copypadmul2 kernel built for target sm_32.
//
// What the PTX does, per thread:
//   - x, y, z = ntid*ctaid + tid along each of the three launch dimensions.
//   - Returns without storing if x >= param_5, y >= param_6 or z >= param_7
//     (signed compares).
//   - i = (z*param_6 + y)*param_5 + x indexes the f32 arrays behind param_4,
//     param_8 and param_10; all three are read with ld.global.nc.
//   - scale = param_9, multiplied by param_8[i] only when param_8 is non-null.
//   - weight = param_10[i] when param_10 is non-null, otherwise 1.0
//     (0f3F800000).
//   - scale is widened to f64, multiplied by 0d3EB515370F99F6CB (about
//     1.2566e-6) and rounded back to f32; the stored value is that result
//     times weight times param_4[i].
//   - The store goes to param_0[(z*param_2 + y)*param_1 + x].
//
// param_3 is declared but never loaded. Index arithmetic is 32-bit signed
// (mad.lo.s32) and is only widened to a byte offset by mul.wide.s32.
// NOTE(review): the f64 constant looks like the vacuum permeability
// 4*pi*1e-7 -- confirm against the kernel's CUDA source.
copypadmul2_ptx_32 = `
.version 6.5
.target sm_32
.address_size 64
// .globl copypadmul2
.visible .entry copypadmul2(
.param .u64 copypadmul2_param_0,
.param .u32 copypadmul2_param_1,
.param .u32 copypadmul2_param_2,
.param .u32 copypadmul2_param_3,
.param .u64 copypadmul2_param_4,
.param .u32 copypadmul2_param_5,
.param .u32 copypadmul2_param_6,
.param .u32 copypadmul2_param_7,
.param .u64 copypadmul2_param_8,
.param .f32 copypadmul2_param_9,
.param .u64 copypadmul2_param_10
)
{
.reg .pred %p<8>;
.reg .f32 %f<14>;
.reg .b32 %r<22>;
.reg .f64 %fd<3>;
.reg .b64 %rd<17>;
ld.param.u64 %rd1, [copypadmul2_param_0];
ld.param.u32 %r5, [copypadmul2_param_1];
ld.param.u32 %r6, [copypadmul2_param_2];
ld.param.u64 %rd2, [copypadmul2_param_4];
ld.param.u32 %r7, [copypadmul2_param_5];
ld.param.u32 %r8, [copypadmul2_param_6];
ld.param.u32 %r9, [copypadmul2_param_7];
ld.param.u64 %rd3, [copypadmul2_param_8];
ld.param.f32 %f12, [copypadmul2_param_9];
ld.param.u64 %rd4, [copypadmul2_param_10];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
mov.u32 %r13, %ntid.y;
mov.u32 %r14, %ctaid.y;
mov.u32 %r15, %tid.y;
mad.lo.s32 %r2, %r13, %r14, %r15;
mov.u32 %r16, %ntid.z;
mov.u32 %r17, %ctaid.z;
mov.u32 %r18, %tid.z;
mad.lo.s32 %r3, %r16, %r17, %r18;
setp.ge.s32 %p1, %r1, %r7;
setp.ge.s32 %p2, %r2, %r8;
or.pred %p3, %p1, %p2;
setp.ge.s32 %p4, %r3, %r9;
or.pred %p5, %p3, %p4;
@%p5 bra BB0_6;
mad.lo.s32 %r19, %r3, %r8, %r2;
mad.lo.s32 %r4, %r19, %r7, %r1;
setp.eq.s64 %p6, %rd3, 0;
@%p6 bra BB0_3;
cvta.to.global.u64 %rd5, %rd3;
mul.wide.s32 %rd6, %r4, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.nc.f32 %f6, [%rd7];
mul.f32 %f12, %f6, %f12;
BB0_3:
setp.eq.s64 %p7, %rd4, 0;
mov.f32 %f13, 0f3F800000;
@%p7 bra BB0_5;
cvta.to.global.u64 %rd8, %rd4;
mul.wide.s32 %rd9, %r4, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.nc.f32 %f13, [%rd10];
BB0_5:
cvta.to.global.u64 %rd11, %rd1;
cvta.to.global.u64 %rd12, %rd2;
mul.wide.s32 %rd13, %r4, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.f32 %f8, [%rd14];
cvt.f64.f32 %fd1, %f12;
mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB;
cvt.rn.f32.f64 %f9, %fd2;
mul.f32 %f10, %f9, %f13;
mul.f32 %f11, %f10, %f8;
mad.lo.s32 %r20, %r3, %r6, %r2;
mad.lo.s32 %r21, %r20, %r5, %r1;
mul.wide.s32 %rd15, %r21, 4;
add.s64 %rd16, %rd11, %rd15;
st.global.f32 [%rd16], %f11;
BB0_6:
ret;
}
`
copypadmul2_ptx_35 = `
.version 6.5
Expand Down Expand Up @@ -907,6 +1011,107 @@ BB0_6:
}
`
// copypadmul2_ptx_62 holds the PTX text (ISA version 6.5, 64-bit addresses)
// of the copypadmul2 kernel built for target sm_62.
//
// What the PTX does, per thread:
//   - x, y, z = ntid*ctaid + tid along each of the three launch dimensions.
//   - Returns without storing if x >= param_5, y >= param_6 or z >= param_7
//     (signed compares).
//   - i = (z*param_6 + y)*param_5 + x indexes the f32 arrays behind param_4,
//     param_8 and param_10; all three are read with ld.global.nc.
//   - scale = param_9, multiplied by param_8[i] only when param_8 is non-null.
//   - weight = param_10[i] when param_10 is non-null, otherwise 1.0
//     (0f3F800000).
//   - scale is widened to f64, multiplied by 0d3EB515370F99F6CB (about
//     1.2566e-6) and rounded back to f32; the stored value is that result
//     times weight times param_4[i].
//   - The store goes to param_0[(z*param_2 + y)*param_1 + x].
//
// param_3 is declared but never loaded. Index arithmetic is 32-bit signed
// (mad.lo.s32) and is only widened to a byte offset by mul.wide.s32.
// NOTE(review): the f64 constant looks like the vacuum permeability
// 4*pi*1e-7 -- confirm against the kernel's CUDA source.
copypadmul2_ptx_62 = `
.version 6.5
.target sm_62
.address_size 64
// .globl copypadmul2
.visible .entry copypadmul2(
.param .u64 copypadmul2_param_0,
.param .u32 copypadmul2_param_1,
.param .u32 copypadmul2_param_2,
.param .u32 copypadmul2_param_3,
.param .u64 copypadmul2_param_4,
.param .u32 copypadmul2_param_5,
.param .u32 copypadmul2_param_6,
.param .u32 copypadmul2_param_7,
.param .u64 copypadmul2_param_8,
.param .f32 copypadmul2_param_9,
.param .u64 copypadmul2_param_10
)
{
.reg .pred %p<8>;
.reg .f32 %f<14>;
.reg .b32 %r<22>;
.reg .f64 %fd<3>;
.reg .b64 %rd<17>;
ld.param.u64 %rd1, [copypadmul2_param_0];
ld.param.u32 %r5, [copypadmul2_param_1];
ld.param.u32 %r6, [copypadmul2_param_2];
ld.param.u64 %rd2, [copypadmul2_param_4];
ld.param.u32 %r7, [copypadmul2_param_5];
ld.param.u32 %r8, [copypadmul2_param_6];
ld.param.u32 %r9, [copypadmul2_param_7];
ld.param.u64 %rd3, [copypadmul2_param_8];
ld.param.f32 %f12, [copypadmul2_param_9];
ld.param.u64 %rd4, [copypadmul2_param_10];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
mov.u32 %r13, %ntid.y;
mov.u32 %r14, %ctaid.y;
mov.u32 %r15, %tid.y;
mad.lo.s32 %r2, %r13, %r14, %r15;
mov.u32 %r16, %ntid.z;
mov.u32 %r17, %ctaid.z;
mov.u32 %r18, %tid.z;
mad.lo.s32 %r3, %r16, %r17, %r18;
setp.ge.s32 %p1, %r1, %r7;
setp.ge.s32 %p2, %r2, %r8;
or.pred %p3, %p1, %p2;
setp.ge.s32 %p4, %r3, %r9;
or.pred %p5, %p3, %p4;
@%p5 bra BB0_6;
mad.lo.s32 %r19, %r3, %r8, %r2;
mad.lo.s32 %r4, %r19, %r7, %r1;
setp.eq.s64 %p6, %rd3, 0;
@%p6 bra BB0_3;
cvta.to.global.u64 %rd5, %rd3;
mul.wide.s32 %rd6, %r4, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.nc.f32 %f6, [%rd7];
mul.f32 %f12, %f6, %f12;
BB0_3:
setp.eq.s64 %p7, %rd4, 0;
mov.f32 %f13, 0f3F800000;
@%p7 bra BB0_5;
cvta.to.global.u64 %rd8, %rd4;
mul.wide.s32 %rd9, %r4, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.nc.f32 %f13, [%rd10];
BB0_5:
cvta.to.global.u64 %rd11, %rd1;
cvta.to.global.u64 %rd12, %rd2;
mul.wide.s32 %rd13, %r4, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.f32 %f8, [%rd14];
cvt.f64.f32 %fd1, %f12;
mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB;
cvt.rn.f32.f64 %f9, %fd2;
mul.f32 %f10, %f9, %f13;
mul.f32 %f11, %f10, %f8;
mad.lo.s32 %r20, %r3, %r6, %r2;
mad.lo.s32 %r21, %r20, %r5, %r1;
mul.wide.s32 %rd15, %r21, 4;
add.s64 %rd16, %rd11, %rd15;
st.global.f32 [%rd16], %f11;
BB0_6:
ret;
}
`
copypadmul2_ptx_70 = `
.version 6.5
Expand Down Expand Up @@ -1008,6 +1213,107 @@ BB0_6:
}
`
// copypadmul2_ptx_72 holds the PTX text (ISA version 6.5, 64-bit addresses)
// of the copypadmul2 kernel built for target sm_72.
//
// What the PTX does, per thread:
//   - x, y, z = ntid*ctaid + tid along each of the three launch dimensions.
//   - Returns without storing if x >= param_5, y >= param_6 or z >= param_7
//     (signed compares).
//   - i = (z*param_6 + y)*param_5 + x indexes the f32 arrays behind param_4,
//     param_8 and param_10; all three are read with ld.global.nc.
//   - scale = param_9, multiplied by param_8[i] only when param_8 is non-null.
//   - weight = param_10[i] when param_10 is non-null, otherwise 1.0
//     (0f3F800000).
//   - scale is widened to f64, multiplied by 0d3EB515370F99F6CB (about
//     1.2566e-6) and rounded back to f32; the stored value is that result
//     times weight times param_4[i].
//   - The store goes to param_0[(z*param_2 + y)*param_1 + x].
//
// param_3 is declared but never loaded. Index arithmetic is 32-bit signed
// (mad.lo.s32) and is only widened to a byte offset by mul.wide.s32.
// NOTE(review): the f64 constant looks like the vacuum permeability
// 4*pi*1e-7 -- confirm against the kernel's CUDA source.
copypadmul2_ptx_72 = `
.version 6.5
.target sm_72
.address_size 64
// .globl copypadmul2
.visible .entry copypadmul2(
.param .u64 copypadmul2_param_0,
.param .u32 copypadmul2_param_1,
.param .u32 copypadmul2_param_2,
.param .u32 copypadmul2_param_3,
.param .u64 copypadmul2_param_4,
.param .u32 copypadmul2_param_5,
.param .u32 copypadmul2_param_6,
.param .u32 copypadmul2_param_7,
.param .u64 copypadmul2_param_8,
.param .f32 copypadmul2_param_9,
.param .u64 copypadmul2_param_10
)
{
.reg .pred %p<8>;
.reg .f32 %f<14>;
.reg .b32 %r<22>;
.reg .f64 %fd<3>;
.reg .b64 %rd<17>;
ld.param.u64 %rd1, [copypadmul2_param_0];
ld.param.u32 %r5, [copypadmul2_param_1];
ld.param.u32 %r6, [copypadmul2_param_2];
ld.param.u64 %rd2, [copypadmul2_param_4];
ld.param.u32 %r7, [copypadmul2_param_5];
ld.param.u32 %r8, [copypadmul2_param_6];
ld.param.u32 %r9, [copypadmul2_param_7];
ld.param.u64 %rd3, [copypadmul2_param_8];
ld.param.f32 %f12, [copypadmul2_param_9];
ld.param.u64 %rd4, [copypadmul2_param_10];
mov.u32 %r10, %ntid.x;
mov.u32 %r11, %ctaid.x;
mov.u32 %r12, %tid.x;
mad.lo.s32 %r1, %r10, %r11, %r12;
mov.u32 %r13, %ntid.y;
mov.u32 %r14, %ctaid.y;
mov.u32 %r15, %tid.y;
mad.lo.s32 %r2, %r13, %r14, %r15;
mov.u32 %r16, %ntid.z;
mov.u32 %r17, %ctaid.z;
mov.u32 %r18, %tid.z;
mad.lo.s32 %r3, %r16, %r17, %r18;
setp.ge.s32 %p1, %r1, %r7;
setp.ge.s32 %p2, %r2, %r8;
or.pred %p3, %p1, %p2;
setp.ge.s32 %p4, %r3, %r9;
or.pred %p5, %p3, %p4;
@%p5 bra BB0_6;
mad.lo.s32 %r19, %r3, %r8, %r2;
mad.lo.s32 %r4, %r19, %r7, %r1;
setp.eq.s64 %p6, %rd3, 0;
@%p6 bra BB0_3;
cvta.to.global.u64 %rd5, %rd3;
mul.wide.s32 %rd6, %r4, 4;
add.s64 %rd7, %rd5, %rd6;
ld.global.nc.f32 %f6, [%rd7];
mul.f32 %f12, %f6, %f12;
BB0_3:
setp.eq.s64 %p7, %rd4, 0;
mov.f32 %f13, 0f3F800000;
@%p7 bra BB0_5;
cvta.to.global.u64 %rd8, %rd4;
mul.wide.s32 %rd9, %r4, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.nc.f32 %f13, [%rd10];
BB0_5:
cvta.to.global.u64 %rd11, %rd1;
cvta.to.global.u64 %rd12, %rd2;
mul.wide.s32 %rd13, %r4, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.nc.f32 %f8, [%rd14];
cvt.f64.f32 %fd1, %f12;
mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB;
cvt.rn.f32.f64 %f9, %fd2;
mul.f32 %f10, %f9, %f13;
mul.f32 %f11, %f10, %f8;
mad.lo.s32 %r20, %r3, %r6, %r2;
mad.lo.s32 %r21, %r20, %r5, %r1;
mul.wide.s32 %rd15, %r21, 4;
add.s64 %rd16, %rd11, %rd15;
st.global.f32 [%rd16], %f11;
BB0_6:
ret;
}
`
copypadmul2_ptx_75 = `
.version 6.5
Expand Down
Loading

0 comments on commit 83b34cc

Please sign in to comment.