-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmemchr.c
290 lines (216 loc) · 9.05 KB
/
memchr.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
/***
*memchr.c - search block of memory for a given character
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
*Purpose:
* defines memchr() - search memory until a character is
* found or a limit is reached.
*
*******************************************************************************/
#include <string.h>
#pragma function(memchr)
/***
*char *memchr(buf, chr, cnt) - search memory for given character.
*
*Purpose:
* Searches at buf for the given character, stopping when chr is
* first found or cnt bytes have been searched through.
*
*Entry:
* void *buf - memory buffer to be searched
* int chr - character to search for
* size_t cnt - max number of bytes to search
*
*Exit:
* returns pointer to first occurence of chr in buf
* returns NULL if chr not found in the first cnt bytes
*
*Exceptions:
*
*******************************************************************************/
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
#include <arm64string.h>
void * __cdecl memchr (
const void * buf,
int chr,
size_t cnt
)
{
__n64 *src_a64, characters64, match64, chmatch64, chmatchrev64;
vector_t *src_a;
unsigned __int64 ch_lword0, mask;
int ch_bitoffset, offset;
size_t count;
char *end;
if (cnt < 8) {
// Must not read bytes beyond cnt, so process them bytewise if they are less than a vector width.
// In any case, for less than about 4 bytes, bytewise is faster than the vector implementation.
unsigned char *chbuf = (unsigned char *)buf;
while ( cnt && (*chbuf++ != (unsigned char)chr) ) {
cnt--;
}
return(cnt ? (void *)--chbuf : NULL);
}
count = cnt;
// Start by getting the aligned XMMWORD containing the first
// characters of the buf. This is done first to partially
// cover any memory access latency.
// No perf benefit from adjusting for 8 byte alignment.
src_a64 = (__n64 *)buf;
characters64 = *(src_a64);
// Now create patterns to check for a terminating zero or match.
// These characters are copied to every position of a XMMWORD.
match64 = neon_dupr8(chr);
// Do initial check. Compare against each pattern to get bytewise flags for
// each match.
chmatch64 = neon_cmeq8(characters64, match64);
// reduce overhead in short case. handle first 64bits (half vector) quickly
chmatchrev64 = neon_rev64_8(chmatch64);
ch_lword0 = neon_umov64(chmatchrev64, 0);
// Scan the lword0, lword1 for the position of the first character match
ch_bitoffset = _CountLeadingZeros64(ch_lword0);
if (ch_bitoffset != 64) {
end = (char *)((ch_bitoffset >> 3) + (intptr_t)(src_a64));
return (end < ((char *)buf) + cnt) ? end : NULL;
}
count -= 8;
src_a64++;
src_a = (vector_t*) src_a64;
if (count >= 16) {
vector_t characters, match, chmatch, chmatchrev;
unsigned __int64 lword1, lword0;
__n64 uaddlvq32;
match = neon_dupqr8(chr);
// Unrolling by a factor of 8 is slightly faster in long cases
// but much slower in short cases.
// Unroll by a factor of 4. Use 2 x LDP instruction to load 4 Q regs.
// The loop is 7% faster than unrolling twice
while ((count >= 64))
{
vector_t characters2, chmatch2;
vector_t characters3, chmatch3;
vector_t characters4, chmatch4;
vector_t chmatch12, chmatch34, chmatch1234;
// Load and check each subsequent XMMWORD.
characters = *(src_a++);
chmatch = neon_cmeqq8(characters, match);
characters2 = *(src_a++);
chmatch2 = neon_cmeqq8(characters2, match);
characters3 = *(src_a++);
chmatch3 = neon_cmeqq8(characters3, match);
characters4 = *(src_a++);
chmatch4 = neon_cmeqq8(characters4, match);
chmatch12 = neon_orrq(chmatch, chmatch2);
chmatch34 = neon_orrq(chmatch3, chmatch4);
chmatch1234 = neon_orrq(chmatch12, chmatch34);
// unsigned add across long vector. non zero means we have a match
uaddlvq32 = neon_uaddlvq32(chmatch1234);
mask = neon_umov64(uaddlvq32, 0);
count -= 64;
if (mask != 0) {
// There is a match in chmatch or chmatch2 or chmatch3 or chmatch4.
uaddlvq32 = neon_uaddlvq32(chmatch12);
mask = neon_umov64(uaddlvq32, 0);
if (mask != 0) { // match is in vectors 1 or 2
uaddlvq32 = neon_uaddlvq32(chmatch);
mask = neon_umov64(uaddlvq32, 0);
if (mask != 0) { // match is in vector 1
src_a -= 4;
chmatchrev = neon_rev64q_8(chmatch);
} else { // match is in vector 2
src_a -= 3;
chmatchrev = neon_rev64q_8(chmatch2);
}
} else { // match is in vectors 3 or 4
uaddlvq32 = neon_uaddlvq32(chmatch3);
mask = neon_umov64(uaddlvq32, 0);
if (mask != 0) { // match is in vector 3
src_a -= 2;
chmatchrev = neon_rev64q_8(chmatch3);
} else { // match is in vector 4
src_a -= 1;
chmatchrev = neon_rev64q_8(chmatch4);
}
}
// move the chmatch into lword1, lword0
lword0 = neon_umovq64(chmatchrev, 0);
lword1 = neon_umovq64(chmatchrev, 1);
// Scan the lword1,lword0 for the position of the first match
ch_bitoffset = _CountLeadingZeros128(lword0, lword1);
offset = ch_bitoffset >> 3;
// Add the offset to the XMMWORD address to get a pointer
// to the significant character.
end = (char *)(offset + (intptr_t)(src_a));
// Address is in range. Return pointer;
return end;
}
}
// Check each XMMWORD until a match or the end of the buf is found.
while (count >= 16)
{
// Load and check each subsequent XMMWORD.
characters = *(src_a++);
chmatch = neon_cmeqq8(characters, match);
// unsigned add across long vector. non zero means we have a match
uaddlvq32 = neon_uaddlvq32(chmatch);
mask = neon_umov64(uaddlvq32, 0);
count -= 16;
if (mask != 0) {
chmatchrev = neon_rev64q_8(chmatch);
// move the chmatch into lword1, lword0
lword0 = neon_umovq64(chmatchrev, 0);
lword1 = neon_umovq64(chmatchrev, 1);
// Scan the lword1,lword0 for the position of the first match
ch_bitoffset = _CountLeadingZeros128(lword0, lword1);
offset = ch_bitoffset >> 3;
// Add the offset to the XMMWORD address to get a pointer
// to the significant character.
end = (char *)(offset + (intptr_t)(src_a - 1));
// We know address is in range, so return pointer;
return end;
}
}
}
// Must not access memory beyond cnt.
// So, for the last 16 bytes, do not use a 16 byte vector access
// If we have 8 bytes left, it is safe to use an 8 byte vector access.
if (count >= 8) {
src_a64 = (__n64 *)src_a;
characters64 = *src_a64;
chmatch64 = neon_cmeq8(characters64, match64);
// reduce overhead in short case. handle first 64bits (half vector) quickly
chmatchrev64 = neon_rev64_8(chmatch64);
ch_lword0 = neon_umov64(chmatchrev64, 0);
// Scan the lword0, lword1 for the position of the first character match
ch_bitoffset = _CountLeadingZeros64(ch_lword0);
if (ch_bitoffset != 64) {
end = (char *)((ch_bitoffset >> 3) + (intptr_t)(src_a64));
return end;
}
count -= 8;
src_a64++;
src_a = (vector_t *)src_a64;
}
// Must not access memory beyond cnt.
// So, for the last <8 bytes, use bytewise access.
unsigned char *chbuf = (unsigned char *)src_a;
while ( count && (*chbuf++ != (unsigned char)chr) ) {
count--;
}
return(count ? (void *)--chbuf : NULL);
}
#else
void * __cdecl memchr (
const void * buf,
int chr,
size_t cnt
)
{
while ( cnt && (*(unsigned char *)buf != (unsigned char)chr) ) {
buf = (unsigned char *)buf + 1;
cnt--;
}
return(cnt ? (void *)buf : NULL);
}
#endif