Skip to content

Commit 8d24966

Browse files
committed
[x86] allow pairs of PCMPEQ for vector-sized integer equality comparisons (PR33325)
This is an extension of D31156 with the goal that we'll allow memcmp() == 0 expansion for x86 to use 2 pairs of loads per block. The memcmp expansion pass (formerly part of CGP) will generate this kind of pattern with oversized integer compares, so we want to transform these into x86-specific vector nodes before legalization splits things into scalar chunks.

See PR33325 for more details: https://bugs.llvm.org/show_bug.cgi?id=33325

Differential Revision: https://reviews.llvm.org/D41618

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@321656 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent d8de4ce commit 8d24966

File tree

2 files changed

+227
-95
lines changed

2 files changed

+227
-95
lines changed

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 31 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36316,13 +36316,23 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
3631636316
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
3631736317
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
3631836318

36319-
// We're looking for an oversized integer equality comparison, but ignore a
36320-
// comparison with zero because that gets special treatment in EmitTest().
36319+
// We're looking for an oversized integer equality comparison.
3632136320
SDValue X = SetCC->getOperand(0);
3632236321
SDValue Y = SetCC->getOperand(1);
3632336322
EVT OpVT = X.getValueType();
3632436323
unsigned OpSize = OpVT.getSizeInBits();
36325-
if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
36324+
if (!OpVT.isScalarInteger() || OpSize < 128)
36325+
return SDValue();
36326+
36327+
// Ignore a comparison with zero because that gets special treatment in
36328+
// EmitTest(). But make an exception for the special case of a pair of
36329+
// logically-combined vector-sized operands compared to zero. This pattern may
36330+
// be generated by the memcmp expansion pass with oversized integer compares
36331+
// (see PR33325).
36332+
bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
36333+
X.getOperand(0).getOpcode() == ISD::XOR &&
36334+
X.getOperand(1).getOpcode() == ISD::XOR;
36335+
if (isNullConstant(Y) && !IsOrXorXorCCZero)
3632636336
return SDValue();
3632736337

3632836338
// Bail out if we know that this is not really just an oversized integer.
@@ -36337,15 +36347,29 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
3633736347
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
3633836348
(OpSize == 256 && Subtarget.hasAVX2())) {
3633936349
EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
36340-
SDValue VecX = DAG.getBitcast(VecVT, X);
36341-
SDValue VecY = DAG.getBitcast(VecVT, Y);
36342-
36350+
SDValue Cmp;
36351+
if (IsOrXorXorCCZero) {
36352+
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
36353+
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
36354+
// Use 2 vector equality compares and 'and' the results before doing a
36355+
// MOVMSK.
36356+
SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
36357+
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
36358+
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
36359+
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
36360+
SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
36361+
SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
36362+
Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
36363+
} else {
36364+
SDValue VecX = DAG.getBitcast(VecVT, X);
36365+
SDValue VecY = DAG.getBitcast(VecVT, Y);
36366+
Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
36367+
}
3634336368
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
3634436369
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
3634536370
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
3634636371
// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
3634736372
// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
36348-
SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
3634936373
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
3635036374
SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
3635136375
MVT::i32);

test/CodeGen/X86/setcc-wide-types.ll

Lines changed: 196 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -193,22 +193,33 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
193193
; if we allowed 2 pairs of 16-byte loads per block.
194194

195195
define i32 @ne_i128_pair(i128* %a, i128* %b) {
196-
; ANY-LABEL: ne_i128_pair:
197-
; ANY: # %bb.0:
198-
; ANY-NEXT: movq (%rdi), %rax
199-
; ANY-NEXT: movq 8(%rdi), %rcx
200-
; ANY-NEXT: xorq (%rsi), %rax
201-
; ANY-NEXT: xorq 8(%rsi), %rcx
202-
; ANY-NEXT: movq 24(%rdi), %rdx
203-
; ANY-NEXT: movq 16(%rdi), %rdi
204-
; ANY-NEXT: xorq 16(%rsi), %rdi
205-
; ANY-NEXT: orq %rax, %rdi
206-
; ANY-NEXT: xorq 24(%rsi), %rdx
207-
; ANY-NEXT: orq %rcx, %rdx
208-
; ANY-NEXT: xorl %eax, %eax
209-
; ANY-NEXT: orq %rdi, %rdx
210-
; ANY-NEXT: setne %al
211-
; ANY-NEXT: retq
196+
; SSE2-LABEL: ne_i128_pair:
197+
; SSE2: # %bb.0:
198+
; SSE2-NEXT: movdqu (%rdi), %xmm0
199+
; SSE2-NEXT: movdqu 16(%rdi), %xmm1
200+
; SSE2-NEXT: movdqu (%rsi), %xmm2
201+
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
202+
; SSE2-NEXT: movdqu 16(%rsi), %xmm0
203+
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
204+
; SSE2-NEXT: pand %xmm2, %xmm0
205+
; SSE2-NEXT: pmovmskb %xmm0, %ecx
206+
; SSE2-NEXT: xorl %eax, %eax
207+
; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
208+
; SSE2-NEXT: setne %al
209+
; SSE2-NEXT: retq
210+
;
211+
; AVXANY-LABEL: ne_i128_pair:
212+
; AVXANY: # %bb.0:
213+
; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
214+
; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
215+
; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
216+
; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
217+
; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
218+
; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
219+
; AVXANY-NEXT: xorl %eax, %eax
220+
; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
221+
; AVXANY-NEXT: setne %al
222+
; AVXANY-NEXT: retq
212223
%a0 = load i128, i128* %a
213224
%b0 = load i128, i128* %b
214225
%xor1 = xor i128 %a0, %b0
@@ -227,22 +238,33 @@ define i32 @ne_i128_pair(i128* %a, i128* %b) {
227238
; if we allowed 2 pairs of 16-byte loads per block.
228239

229240
define i32 @eq_i128_pair(i128* %a, i128* %b) {
230-
; ANY-LABEL: eq_i128_pair:
231-
; ANY: # %bb.0:
232-
; ANY-NEXT: movq (%rdi), %rax
233-
; ANY-NEXT: movq 8(%rdi), %rcx
234-
; ANY-NEXT: xorq (%rsi), %rax
235-
; ANY-NEXT: xorq 8(%rsi), %rcx
236-
; ANY-NEXT: movq 24(%rdi), %rdx
237-
; ANY-NEXT: movq 16(%rdi), %rdi
238-
; ANY-NEXT: xorq 16(%rsi), %rdi
239-
; ANY-NEXT: orq %rax, %rdi
240-
; ANY-NEXT: xorq 24(%rsi), %rdx
241-
; ANY-NEXT: orq %rcx, %rdx
242-
; ANY-NEXT: xorl %eax, %eax
243-
; ANY-NEXT: orq %rdi, %rdx
244-
; ANY-NEXT: sete %al
245-
; ANY-NEXT: retq
241+
; SSE2-LABEL: eq_i128_pair:
242+
; SSE2: # %bb.0:
243+
; SSE2-NEXT: movdqu (%rdi), %xmm0
244+
; SSE2-NEXT: movdqu 16(%rdi), %xmm1
245+
; SSE2-NEXT: movdqu (%rsi), %xmm2
246+
; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
247+
; SSE2-NEXT: movdqu 16(%rsi), %xmm0
248+
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
249+
; SSE2-NEXT: pand %xmm2, %xmm0
250+
; SSE2-NEXT: pmovmskb %xmm0, %ecx
251+
; SSE2-NEXT: xorl %eax, %eax
252+
; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
253+
; SSE2-NEXT: sete %al
254+
; SSE2-NEXT: retq
255+
;
256+
; AVXANY-LABEL: eq_i128_pair:
257+
; AVXANY: # %bb.0:
258+
; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
259+
; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
260+
; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
261+
; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
262+
; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
263+
; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
264+
; AVXANY-NEXT: xorl %eax, %eax
265+
; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
266+
; AVXANY-NEXT: sete %al
267+
; AVXANY-NEXT: retq
246268
%a0 = load i128, i128* %a
247269
%b0 = load i128, i128* %b
248270
%xor1 = xor i128 %a0, %b0
@@ -261,34 +283,77 @@ define i32 @eq_i128_pair(i128* %a, i128* %b) {
261283
; if we allowed 2 pairs of 32-byte loads per block.
262284

263285
define i32 @ne_i256_pair(i256* %a, i256* %b) {
264-
; ANY-LABEL: ne_i256_pair:
265-
; ANY: # %bb.0:
266-
; ANY-NEXT: movq 16(%rdi), %r9
267-
; ANY-NEXT: movq 24(%rdi), %r11
268-
; ANY-NEXT: movq (%rdi), %r8
269-
; ANY-NEXT: movq 8(%rdi), %r10
270-
; ANY-NEXT: xorq 8(%rsi), %r10
271-
; ANY-NEXT: xorq 24(%rsi), %r11
272-
; ANY-NEXT: xorq (%rsi), %r8
273-
; ANY-NEXT: xorq 16(%rsi), %r9
274-
; ANY-NEXT: movq 48(%rdi), %rdx
275-
; ANY-NEXT: movq 32(%rdi), %rax
276-
; ANY-NEXT: movq 56(%rdi), %rcx
277-
; ANY-NEXT: movq 40(%rdi), %rdi
278-
; ANY-NEXT: xorq 40(%rsi), %rdi
279-
; ANY-NEXT: xorq 56(%rsi), %rcx
280-
; ANY-NEXT: orq %r11, %rcx
281-
; ANY-NEXT: orq %rdi, %rcx
282-
; ANY-NEXT: orq %r10, %rcx
283-
; ANY-NEXT: xorq 32(%rsi), %rax
284-
; ANY-NEXT: xorq 48(%rsi), %rdx
285-
; ANY-NEXT: orq %r9, %rdx
286-
; ANY-NEXT: orq %rax, %rdx
287-
; ANY-NEXT: orq %r8, %rdx
288-
; ANY-NEXT: xorl %eax, %eax
289-
; ANY-NEXT: orq %rcx, %rdx
290-
; ANY-NEXT: setne %al
291-
; ANY-NEXT: retq
286+
; SSE2-LABEL: ne_i256_pair:
287+
; SSE2: # %bb.0:
288+
; SSE2-NEXT: movq 16(%rdi), %r9
289+
; SSE2-NEXT: movq 24(%rdi), %r11
290+
; SSE2-NEXT: movq (%rdi), %r8
291+
; SSE2-NEXT: movq 8(%rdi), %r10
292+
; SSE2-NEXT: xorq 8(%rsi), %r10
293+
; SSE2-NEXT: xorq 24(%rsi), %r11
294+
; SSE2-NEXT: xorq (%rsi), %r8
295+
; SSE2-NEXT: xorq 16(%rsi), %r9
296+
; SSE2-NEXT: movq 48(%rdi), %rdx
297+
; SSE2-NEXT: movq 32(%rdi), %rax
298+
; SSE2-NEXT: movq 56(%rdi), %rcx
299+
; SSE2-NEXT: movq 40(%rdi), %rdi
300+
; SSE2-NEXT: xorq 40(%rsi), %rdi
301+
; SSE2-NEXT: xorq 56(%rsi), %rcx
302+
; SSE2-NEXT: orq %r11, %rcx
303+
; SSE2-NEXT: orq %rdi, %rcx
304+
; SSE2-NEXT: orq %r10, %rcx
305+
; SSE2-NEXT: xorq 32(%rsi), %rax
306+
; SSE2-NEXT: xorq 48(%rsi), %rdx
307+
; SSE2-NEXT: orq %r9, %rdx
308+
; SSE2-NEXT: orq %rax, %rdx
309+
; SSE2-NEXT: orq %r8, %rdx
310+
; SSE2-NEXT: xorl %eax, %eax
311+
; SSE2-NEXT: orq %rcx, %rdx
312+
; SSE2-NEXT: setne %al
313+
; SSE2-NEXT: retq
314+
;
315+
; AVX1-LABEL: ne_i256_pair:
316+
; AVX1: # %bb.0:
317+
; AVX1-NEXT: movq 16(%rdi), %r9
318+
; AVX1-NEXT: movq 24(%rdi), %r11
319+
; AVX1-NEXT: movq (%rdi), %r8
320+
; AVX1-NEXT: movq 8(%rdi), %r10
321+
; AVX1-NEXT: xorq 8(%rsi), %r10
322+
; AVX1-NEXT: xorq 24(%rsi), %r11
323+
; AVX1-NEXT: xorq (%rsi), %r8
324+
; AVX1-NEXT: xorq 16(%rsi), %r9
325+
; AVX1-NEXT: movq 48(%rdi), %rdx
326+
; AVX1-NEXT: movq 32(%rdi), %rax
327+
; AVX1-NEXT: movq 56(%rdi), %rcx
328+
; AVX1-NEXT: movq 40(%rdi), %rdi
329+
; AVX1-NEXT: xorq 40(%rsi), %rdi
330+
; AVX1-NEXT: xorq 56(%rsi), %rcx
331+
; AVX1-NEXT: orq %r11, %rcx
332+
; AVX1-NEXT: orq %rdi, %rcx
333+
; AVX1-NEXT: orq %r10, %rcx
334+
; AVX1-NEXT: xorq 32(%rsi), %rax
335+
; AVX1-NEXT: xorq 48(%rsi), %rdx
336+
; AVX1-NEXT: orq %r9, %rdx
337+
; AVX1-NEXT: orq %rax, %rdx
338+
; AVX1-NEXT: orq %r8, %rdx
339+
; AVX1-NEXT: xorl %eax, %eax
340+
; AVX1-NEXT: orq %rcx, %rdx
341+
; AVX1-NEXT: setne %al
342+
; AVX1-NEXT: retq
343+
;
344+
; AVX256-LABEL: ne_i256_pair:
345+
; AVX256: # %bb.0:
346+
; AVX256-NEXT: vmovdqu (%rdi), %ymm0
347+
; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
348+
; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
349+
; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
350+
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
351+
; AVX256-NEXT: vpmovmskb %ymm0, %ecx
352+
; AVX256-NEXT: xorl %eax, %eax
353+
; AVX256-NEXT: cmpl $-1, %ecx
354+
; AVX256-NEXT: setne %al
355+
; AVX256-NEXT: vzeroupper
356+
; AVX256-NEXT: retq
292357
%a0 = load i256, i256* %a
293358
%b0 = load i256, i256* %b
294359
%xor1 = xor i256 %a0, %b0
@@ -307,34 +372,77 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
307372
; if we allowed 2 pairs of 32-byte loads per block.
308373

309374
define i32 @eq_i256_pair(i256* %a, i256* %b) {
310-
; ANY-LABEL: eq_i256_pair:
311-
; ANY: # %bb.0:
312-
; ANY-NEXT: movq 16(%rdi), %r9
313-
; ANY-NEXT: movq 24(%rdi), %r11
314-
; ANY-NEXT: movq (%rdi), %r8
315-
; ANY-NEXT: movq 8(%rdi), %r10
316-
; ANY-NEXT: xorq 8(%rsi), %r10
317-
; ANY-NEXT: xorq 24(%rsi), %r11
318-
; ANY-NEXT: xorq (%rsi), %r8
319-
; ANY-NEXT: xorq 16(%rsi), %r9
320-
; ANY-NEXT: movq 48(%rdi), %rdx
321-
; ANY-NEXT: movq 32(%rdi), %rax
322-
; ANY-NEXT: movq 56(%rdi), %rcx
323-
; ANY-NEXT: movq 40(%rdi), %rdi
324-
; ANY-NEXT: xorq 40(%rsi), %rdi
325-
; ANY-NEXT: xorq 56(%rsi), %rcx
326-
; ANY-NEXT: orq %r11, %rcx
327-
; ANY-NEXT: orq %rdi, %rcx
328-
; ANY-NEXT: orq %r10, %rcx
329-
; ANY-NEXT: xorq 32(%rsi), %rax
330-
; ANY-NEXT: xorq 48(%rsi), %rdx
331-
; ANY-NEXT: orq %r9, %rdx
332-
; ANY-NEXT: orq %rax, %rdx
333-
; ANY-NEXT: orq %r8, %rdx
334-
; ANY-NEXT: xorl %eax, %eax
335-
; ANY-NEXT: orq %rcx, %rdx
336-
; ANY-NEXT: sete %al
337-
; ANY-NEXT: retq
375+
; SSE2-LABEL: eq_i256_pair:
376+
; SSE2: # %bb.0:
377+
; SSE2-NEXT: movq 16(%rdi), %r9
378+
; SSE2-NEXT: movq 24(%rdi), %r11
379+
; SSE2-NEXT: movq (%rdi), %r8
380+
; SSE2-NEXT: movq 8(%rdi), %r10
381+
; SSE2-NEXT: xorq 8(%rsi), %r10
382+
; SSE2-NEXT: xorq 24(%rsi), %r11
383+
; SSE2-NEXT: xorq (%rsi), %r8
384+
; SSE2-NEXT: xorq 16(%rsi), %r9
385+
; SSE2-NEXT: movq 48(%rdi), %rdx
386+
; SSE2-NEXT: movq 32(%rdi), %rax
387+
; SSE2-NEXT: movq 56(%rdi), %rcx
388+
; SSE2-NEXT: movq 40(%rdi), %rdi
389+
; SSE2-NEXT: xorq 40(%rsi), %rdi
390+
; SSE2-NEXT: xorq 56(%rsi), %rcx
391+
; SSE2-NEXT: orq %r11, %rcx
392+
; SSE2-NEXT: orq %rdi, %rcx
393+
; SSE2-NEXT: orq %r10, %rcx
394+
; SSE2-NEXT: xorq 32(%rsi), %rax
395+
; SSE2-NEXT: xorq 48(%rsi), %rdx
396+
; SSE2-NEXT: orq %r9, %rdx
397+
; SSE2-NEXT: orq %rax, %rdx
398+
; SSE2-NEXT: orq %r8, %rdx
399+
; SSE2-NEXT: xorl %eax, %eax
400+
; SSE2-NEXT: orq %rcx, %rdx
401+
; SSE2-NEXT: sete %al
402+
; SSE2-NEXT: retq
403+
;
404+
; AVX1-LABEL: eq_i256_pair:
405+
; AVX1: # %bb.0:
406+
; AVX1-NEXT: movq 16(%rdi), %r9
407+
; AVX1-NEXT: movq 24(%rdi), %r11
408+
; AVX1-NEXT: movq (%rdi), %r8
409+
; AVX1-NEXT: movq 8(%rdi), %r10
410+
; AVX1-NEXT: xorq 8(%rsi), %r10
411+
; AVX1-NEXT: xorq 24(%rsi), %r11
412+
; AVX1-NEXT: xorq (%rsi), %r8
413+
; AVX1-NEXT: xorq 16(%rsi), %r9
414+
; AVX1-NEXT: movq 48(%rdi), %rdx
415+
; AVX1-NEXT: movq 32(%rdi), %rax
416+
; AVX1-NEXT: movq 56(%rdi), %rcx
417+
; AVX1-NEXT: movq 40(%rdi), %rdi
418+
; AVX1-NEXT: xorq 40(%rsi), %rdi
419+
; AVX1-NEXT: xorq 56(%rsi), %rcx
420+
; AVX1-NEXT: orq %r11, %rcx
421+
; AVX1-NEXT: orq %rdi, %rcx
422+
; AVX1-NEXT: orq %r10, %rcx
423+
; AVX1-NEXT: xorq 32(%rsi), %rax
424+
; AVX1-NEXT: xorq 48(%rsi), %rdx
425+
; AVX1-NEXT: orq %r9, %rdx
426+
; AVX1-NEXT: orq %rax, %rdx
427+
; AVX1-NEXT: orq %r8, %rdx
428+
; AVX1-NEXT: xorl %eax, %eax
429+
; AVX1-NEXT: orq %rcx, %rdx
430+
; AVX1-NEXT: sete %al
431+
; AVX1-NEXT: retq
432+
;
433+
; AVX256-LABEL: eq_i256_pair:
434+
; AVX256: # %bb.0:
435+
; AVX256-NEXT: vmovdqu (%rdi), %ymm0
436+
; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
437+
; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
438+
; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
439+
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
440+
; AVX256-NEXT: vpmovmskb %ymm0, %ecx
441+
; AVX256-NEXT: xorl %eax, %eax
442+
; AVX256-NEXT: cmpl $-1, %ecx
443+
; AVX256-NEXT: sete %al
444+
; AVX256-NEXT: vzeroupper
445+
; AVX256-NEXT: retq
338446
%a0 = load i256, i256* %a
339447
%b0 = load i256, i256* %b
340448
%xor1 = xor i256 %a0, %b0

0 commit comments

Comments
 (0)