-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdevice_functions.cpp
387 lines (313 loc) · 12.7 KB
/
device_functions.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip/device_functions.h>
#include <hc.hpp>
#include <grid_launch.h>
#include <hc_math.hpp>
#include "device_util.h"
__device__ float __double2float_rd(double x) { return (double)x; }
__device__ float __double2float_rn(double x) { return (double)x; }
__device__ float __double2float_ru(double x) { return (double)x; }
__device__ float __double2float_rz(double x) { return (double)x; }
__device__ int __double2hiint(double x) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[1];
}
__device__ int __double2loint(double x) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[0];
}
__device__ int __double2int_rd(double x) { return (int)x; }
__device__ int __double2int_rn(double x) { return (int)x; }
__device__ int __double2int_ru(double x) { return (int)x; }
__device__ int __double2int_rz(double x) { return (int)x; }
__device__ long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ long long int __double2ll_rz(double x) { return (long long int)x; }
__device__ unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
__device__ unsigned long long int __double2ull_rd(double x) { return (unsigned long long int)x; }
__device__ unsigned long long int __double2ull_rn(double x) { return (unsigned long long int)x; }
__device__ unsigned long long int __double2ull_ru(double x) { return (unsigned long long int)x; }
__device__ unsigned long long int __double2ull_rz(double x) { return (unsigned long long int)x; }
__device__ long long int __double_as_longlong(double x) {
static_assert(sizeof(long long) == sizeof(double), "");
long long tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
__device__ long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ long long int __float2ll_rz(float x) { return (long long int)x; }
__device__ unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
__device__ unsigned long long int __float2ull_rd(float x) { return (unsigned long long int)x; }
__device__ unsigned long long int __float2ull_rn(float x) { return (unsigned long long int)x; }
__device__ unsigned long long int __float2ull_ru(float x) { return (unsigned long long int)x; }
__device__ unsigned long long int __float2ull_rz(float x) { return (unsigned long long int)x; }
__device__ int __float_as_int(float x) {
static_assert(sizeof(int) == sizeof(float), "");
int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ unsigned int __float_as_uint(float x) {
static_assert(sizeof(unsigned int) == sizeof(float), "");
unsigned int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __hiloint2double(int32_t hi, int32_t lo) {
static_assert(sizeof(double) == sizeof(uint64_t), "");
uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
double tmp1;
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__ double __int2double_rn(int x) { return (double)x; }
__device__ float __int2float_rd(int x) { return (float)x; }
__device__ float __int2float_rn(int x) { return (float)x; }
__device__ float __int2float_ru(int x) { return (float)x; }
__device__ float __int2float_rz(int x) { return (float)x; }
__device__ float __int_as_float(int x) {
static_assert(sizeof(float) == sizeof(int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ll2double_rd(long long int x) { return (double)x; }
__device__ double __ll2double_rn(long long int x) { return (double)x; }
__device__ double __ll2double_ru(long long int x) { return (double)x; }
__device__ double __ll2double_rz(long long int x) { return (double)x; }
__device__ float __ll2float_rd(long long int x) { return (float)x; }
__device__ float __ll2float_rn(long long int x) { return (float)x; }
__device__ float __ll2float_ru(long long int x) { return (float)x; }
__device__ float __ll2float_rz(long long int x) { return (float)x; }
__device__ double __longlong_as_double(long long int x) {
static_assert(sizeof(double) == sizeof(long long), "");
double tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return x;
}
__device__ double __uint2double_rn(int x) { return (double)x; }
__device__ float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ float __uint2float_rz(unsigned int x) { return (float)x; }
__device__ float __uint_as_float(unsigned int x) {
static_assert(sizeof(float) == sizeof(unsigned int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ double __ull2double_rz(unsigned long long int x) { return (double)x; }
__device__ float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ float __ull2float_rz(unsigned long long int x) { return (float)x; }
/*
Integer Intrinsics
*/
// integer intrinsic function __poc __clz __ffs __brev
__device__ unsigned int __popc(unsigned int input) { return hc::__popcount_u32_b32(input); }
__device__ unsigned int __popcll(unsigned long long int input) {
return hc::__popcount_u32_b64(input);
}
__device__ unsigned int __clz(unsigned int input) {
#ifdef NVCC_COMPAT
return input == 0 ? 32 : hc::__firstbit_u32_u32(input);
#else
return hc::__firstbit_u32_u32(input);
#endif
}
__device__ unsigned int __clzll(unsigned long long int input) {
#ifdef NVCC_COMPAT
return input == 0 ? 64 : hc::__firstbit_u32_u64(input);
#else
return hc::__firstbit_u32_u64(input);
#endif
}
__device__ unsigned int __clz(int input) {
#ifdef NVCC_COMPAT
return input == 0 ? 32 : hc::__firstbit_u32_s32(input);
#else
return hc::__firstbit_u32_s32(input);
#endif
}
__device__ unsigned int __clzll(long long int input) {
#ifdef NVCC_COMPAT
return input == 0 ? 64 : hc::__firstbit_u32_s64(input);
#else
return hc::__firstbit_u32_s64(input);
#endif
}
__device__ unsigned int __ffs(unsigned int input) {
#ifdef NVCC_COMPAT
return hc::__lastbit_u32_u32(input) + 1;
#else
return hc::__lastbit_u32_u32(input);
#endif
}
__device__ unsigned int __ffsll(unsigned long long int input) {
#ifdef NVCC_COMPAT
return hc::__lastbit_u32_u64(input) + 1;
#else
return hc::__lastbit_u32_u64(input);
#endif
}
__device__ unsigned int __ffs(int input) {
#ifdef NVCC_COMPAT
return hc::__lastbit_u32_s32(input) + 1;
#else
return hc::__lastbit_u32_s32(input);
#endif
}
__device__ unsigned int __ffsll(long long int input) {
#ifdef NVCC_COMPAT
return hc::__lastbit_u32_s64(input) + 1;
#else
return hc::__lastbit_u32_s64(input);
#endif
}
__device__ unsigned int __brev(unsigned int input) { return hc::__bitrev_b32(input); }
__device__ unsigned long long int __brevll(unsigned long long int input) {
return hc::__bitrev_b64(input);
}
struct ucharHolder {
union {
unsigned char c[4];
unsigned int ui;
};
} __attribute__((aligned(4)));
struct uchar2Holder {
union {
unsigned int ui[2];
unsigned char c[8];
};
} __attribute__((aligned(8)));
__device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
struct uchar2Holder cHoldVal;
struct ucharHolder cHoldKey;
struct ucharHolder cHoldOut;
cHoldKey.ui = s;
cHoldVal.ui[0] = x;
cHoldVal.ui[1] = y;
cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
return cHoldOut.ui;
}
__device__ long long __mul64hi(long long int x, long long int y) {
ulong x0 = (ulong)x & 0xffffffffUL;
long x1 = x >> 32;
ulong y0 = (ulong)y & 0xffffffffUL;
long y1 = y >> 32;
ulong z0 = x0*y0;
long t = x1*y0 + (z0 >> 32);
long z1 = t & 0xffffffffL;
long z2 = t >> 32;
z1 = x0*y1 + z1;
return x1*y1 + z2 + (z1 >> 32);
}
__device__ unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
ulong x0 = x & 0xffffffffUL;
ulong x1 = x >> 32;
ulong y0 = y & 0xffffffffUL;
ulong y1 = y >> 32;
ulong z0 = x0*y0;
ulong t = x1*y0 + (z0 >> 32);
ulong z1 = t & 0xffffffffUL;
ulong z2 = t >> 32;
z1 = x0*y1 + z1;
return x1*y1 + z2 + (z1 >> 32);
}
/*
HIP specific device functions
*/
__device__ unsigned __hip_ds_bpermute(int index, unsigned src) {
return hc::__amdgcn_ds_bpermute(index, src);
}
__device__ float __hip_ds_bpermutef(int index, float src) {
return hc::__amdgcn_ds_bpermute(index, src);
}
__device__ unsigned __hip_ds_permute(int index, unsigned src) {
return hc::__amdgcn_ds_permute(index, src);
}
__device__ float __hip_ds_permutef(int index, float src) {
return hc::__amdgcn_ds_permute(index, src);
}
__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern) {
return hc::__amdgcn_ds_swizzle(src, pattern);
}
__device__ float __hip_ds_swizzlef(float src, int pattern) {
return hc::__amdgcn_ds_swizzle(src, pattern);
}
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) {
return hc::__amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
}
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
__device__ char4 __hip_hc_add8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.a & MASK1;
unsigned one2 = in2.a & MASK1;
out.a = (one1 + one2) & MASK1;
one1 = in1.a & MASK2;
one2 = in2.a & MASK2;
out.a = out.a | ((one1 + one2) & MASK2);
return out;
}
__device__ char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.a & MASK1;
unsigned one2 = in2.a & MASK1;
out.a = (one1 - one2) & MASK1;
one1 = in1.a & MASK2;
one2 = in2.a & MASK2;
out.a = out.a | ((one1 - one2) & MASK2);
return out;
}
__device__ char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.a & MASK1;
unsigned one2 = in2.a & MASK1;
out.a = (one1 * one2) & MASK1;
one1 = in1.a & MASK2;
one2 = in2.a & MASK2;
out.a = out.a | ((one1 * one2) & MASK2);
return out;
}