-
Notifications
You must be signed in to change notification settings - Fork 104
/
Copy pathinline_ptx.h
64 lines (52 loc) · 2.03 KB
/
inline_ptx.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#pragma once
/*
Inline ptx instructions for low-level control of code generation.
Primarily these are for doing stores avoiding L1 cache and minimal
impact on L2 (streaming through L2).
*/
// Define a different pointer storage size for 64 and 32 bit
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
#define __PTR "l"
#else
#define __PTR "r"
#endif
namespace quda {
__device__ inline void load_streaming_double2(double2 &a, const double2* addr)
{
double x, y;
asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
a.x = x; a.y = y;
}
__device__ inline void load_streaming_float4(float4 &a, const float4* addr)
{
float x, y, z, w;
asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
a.x = x; a.y = y; a.z = z; a.w = w;
}
__device__ inline void load_global_float4(float4 &a, const float4* addr)
{
float x, y, z, w;
asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
a.x = x; a.y = y; a.z = z; a.w = w;
}
__device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
{
asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w));
}
__device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
{
asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w));
}
__device__ inline void store_streaming_double2(double2* addr, double x, double y)
{
asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), "d"(x), "d"(y));
}
__device__ inline void store_streaming_float2(float2* addr, float x, float y)
{
asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), "f"(x), "f"(y));
}
__device__ inline void store_streaming_short2(short2* addr, short x, short y)
{
asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
}
} // namespace quda