// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAYHASH_VEC2_H_
#define HIGHWAYHASH_VEC2_H_
// Defines SIMD vector classes ("V4x64U") with overloaded arithmetic operators:
// const V4x64U masked_sum = (a + b) & m;
// This is shorter and more readable than compiler intrinsics:
// const __m256i masked_sum = _mm256_and_si256(_mm256_add_epi64(a, b), m);
// There is typically no runtime cost for these abstractions.
//
// The naming convention is VNxBBT where N is the number of lanes, BB the
// number of bits per lane and T is the lane type: unsigned integer (U),
// signed integer (I), or floating-point (F).
//
// Requires reasonable C++11 support (VC2015) and an AVX2-capable CPU.
#include <immintrin.h>
#include <cstdint>
#include "code_annotation.h"
// 256-bit AVX-2 vector with 4 uint64_t lanes.
class V4x64U {
 public:
  using T = uint64_t;
  static constexpr size_t kNumLanes = sizeof(__m256i) / sizeof(T);

  // Leaves v_ uninitialized - typically used for output parameters.
  INLINE V4x64U() {}

  // Lane 0 (p_0) is the lowest.
  INLINE V4x64U(T p_3, T p_2, T p_1, T p_0)
      : v_(_mm256_set_epi64x(p_3, p_2, p_1, p_0)) {}
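  // Example (illustrative): V4x64U(3, 2, 1, 0) places 0 in lane 0 and 3 in
  // lane 3, matching the argument order of _mm256_set_epi64x.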

  // Broadcasts i to all lanes.
  INLINE explicit V4x64U(T i)
      : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {}

  // Converts to/from intrinsics.
  INLINE explicit V4x64U(const __m256i& v) : v_(v) {}

  INLINE operator __m256i() const { return v_; }

  INLINE V4x64U& operator=(const __m256i& v) {
    v_ = v;
    return *this;
  }

  // _mm256_setzero_epi64 generates suboptimal code. Instead set
  // z = x - x (given an existing "x"), or x == x to set all bits.
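  // For example (sketch): given an already-initialized V4x64U x,
  //   const V4x64U zero = x - x;       // all lanes become 0
  //   const V4x64U all_ones = x == x;  // all bits become 1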

  INLINE V4x64U& operator=(const V4x64U& other) {
    v_ = other.v_;
    return *this;
  }

  INLINE V4x64U& operator+=(const V4x64U& other) {
    v_ = _mm256_add_epi64(v_, other);
    return *this;
  }

  INLINE V4x64U& operator-=(const V4x64U& other) {
    v_ = _mm256_sub_epi64(v_, other);
    return *this;
  }

  INLINE V4x64U& operator&=(const V4x64U& other) {
    v_ = _mm256_and_si256(v_, other);
    return *this;
  }

  INLINE V4x64U& operator|=(const V4x64U& other) {
    v_ = _mm256_or_si256(v_, other);
    return *this;
  }

  INLINE V4x64U& operator^=(const V4x64U& other) {
    v_ = _mm256_xor_si256(v_, other);
    return *this;
  }

  INLINE V4x64U& operator<<=(const int count) {
    v_ = _mm256_slli_epi64(v_, count);
    return *this;
  }

  INLINE V4x64U& operator<<=(const __m128i& count) {
    v_ = _mm256_sll_epi64(v_, count);
    return *this;
  }

  INLINE V4x64U& operator>>=(const int count) {
    v_ = _mm256_srli_epi64(v_, count);
    return *this;
  }

  INLINE V4x64U& operator>>=(const __m128i& count) {
    v_ = _mm256_srl_epi64(v_, count);
    return *this;
  }

 private:
  __m256i v_;
};

// Nonmember functions implemented in terms of member functions.

static INLINE V4x64U operator+(const V4x64U& left, const V4x64U& right) {
  V4x64U t(left);
  return t += right;
}

static INLINE V4x64U operator-(const V4x64U& left, const V4x64U& right) {
  V4x64U t(left);
  return t -= right;
}

static INLINE V4x64U operator<<(const V4x64U& v, const int count) {
  V4x64U t(v);
  return t <<= count;
}

static INLINE V4x64U operator>>(const V4x64U& v, const int count) {
  V4x64U t(v);
  return t >>= count;
}

static INLINE V4x64U operator<<(const V4x64U& v, const __m128i& count) {
  V4x64U t(v);
  return t <<= count;
}

static INLINE V4x64U operator>>(const V4x64U& v, const __m128i& count) {
  V4x64U t(v);
  return t >>= count;
}

static INLINE V4x64U operator&(const V4x64U& left, const V4x64U& right) {
  V4x64U t(left);
  return t &= right;
}

static INLINE V4x64U operator|(const V4x64U& left, const V4x64U& right) {
  V4x64U t(left);
  return t |= right;
}

static INLINE V4x64U operator^(const V4x64U& left, const V4x64U& right) {
  V4x64U t(left);
  return t ^= right;
}
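
// Example (illustrative sketch, not part of the original header): with these
// operators, a left-rotate of every lane by a compile-time constant k can be
// composed as
//   const V4x64U rotated = (v << k) | (v >> (64 - k));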

// Load/Store.

// "from" must be vector-aligned.
static INLINE V4x64U Load(const uint64_t* RESTRICT const from) {
  return V4x64U(_mm256_load_si256(reinterpret_cast<const __m256i*>(from)));
}

static INLINE V4x64U LoadU(const uint64_t* RESTRICT const from) {
  return V4x64U(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)));
}

// "to" must be vector-aligned.
static INLINE void Store(const V4x64U& v, uint64_t* RESTRICT const to) {
  _mm256_store_si256(reinterpret_cast<__m256i*>(to), v);
}

static INLINE void StoreU(const V4x64U& v, uint64_t* RESTRICT const to) {
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), v);
}

// Writes directly to (aligned) memory, bypassing the cache. This is useful
// for data that will not be read again in the near future.
static INLINE void Stream(const V4x64U& v, uint64_t* RESTRICT const to) {
  _mm256_stream_si256(reinterpret_cast<__m256i*>(to), v);
}
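
// Example round trip (sketch, assuming a 32-byte-aligned buffer):
//   alignas(32) uint64_t buf[V4x64U::kNumLanes] = {1, 2, 3, 4};
//   V4x64U v = Load(buf);  // lane 0 holds buf[0] == 1
//   v += V4x64U(7);        // broadcasts 7 and adds it to every lane
//   Store(v, buf);         // buf is now {8, 9, 10, 11}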

// Miscellaneous functions.

// Returns values & ~neg_mask (the first argument is bit-negated).
static INLINE V4x64U AndNot(const V4x64U& neg_mask, const V4x64U& values) {
  return V4x64U(_mm256_andnot_si256(neg_mask, values));
}

static INLINE V4x64U UnpackLow(const V4x64U& low, const V4x64U& high) {
  return V4x64U(_mm256_unpacklo_epi64(low, high));
}

static INLINE V4x64U UnpackHigh(const V4x64U& low, const V4x64U& high) {
  return V4x64U(_mm256_unpackhi_epi64(low, high));
}
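
// Example (illustrative): writing vectors as {lane3, lane2, lane1, lane0},
// with low = {l3, l2, l1, l0} and high = {h3, h2, h1, h0}:
//   UnpackLow(low, high)  == {h2, l2, h0, l0}
//   UnpackHigh(low, high) == {h3, l3, h1, l1}
// i.e. the interleaving occurs independently within each 128-bit half.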

// There are no greater-than comparison instructions for unsigned T.
// operator== sets every bit of a lane when the two values match, else zero.
static INLINE V4x64U operator==(const V4x64U& left, const V4x64U& right) {
  return V4x64U(_mm256_cmpeq_epi64(left, right));
}
#endif // #ifndef HIGHWAYHASH_VEC2_H_