diff --git llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86ISelLowering.cpp index 1d2d90d543c0..9592137b3484 100644 --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42513,10 +42513,12 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, case X86ISD::VPERMV3: { // Combine VPERMV3 to widened VPERMV if the two source operands can be // freely concatenated. - if (VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.useAVX512Regs())) { + MVT WideVT = VT.getDoubleNumVectorElementsVT(); + MVT MaskVT = N.getOperand(1).getSimpleValueType(); + bool CanConcat = VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.useAVX512Regs()); + if (CanConcat) { SDValue Ops[] = {N.getOperand(0), N.getOperand(2)}; - MVT WideVT = VT.getDoubleNumVectorElementsVT(); if (SDValue ConcatSrc = combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) { SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, @@ -42530,9 +42532,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SmallVector<int, 32> Mask; if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) { assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); + // See if we can concatenate the commuted operands. + if (CanConcat) { + if (SDValue ConcatSrc = combineConcatVectorOps( + DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG, DCI, + Subtarget)) { + ShuffleVectorSDNode::commuteMask(Mask); + SDValue NewMask = + getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true); + NewMask = widenSubVector(NewMask, false, Subtarget, DAG, DL, + WideVT.getSizeInBits()); + SDValue Perm = + DAG.getNode(X86ISD::VPERMV, DL, WideVT, NewMask, ConcatSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, + DAG.getVectorIdxConstant(0, DL)); + } + } SDValue V1 = peekThroughBitcasts(N.getOperand(0)); SDValue V2 = peekThroughBitcasts(N.getOperand(2)); - MVT MaskVT = N.getOperand(1).getSimpleValueType(); // Canonicalize to VPERMV if both sources are the same. if (V1 == V2) { for (int &M : Mask) diff --git llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 4c4d5cb3166a..951a2b4cafa2 100644 --- llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3776,12 +3774,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -3911,11 +3908,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4037,11 +4033,10 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4151,10 +4146,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 16f061474346..c0afc0cfe2c0 100644 --- llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index aac5847061cb..fd9b46e82e0b 100644 --- llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -227,11 +227,12 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1] -; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9> @@ -243,11 +244,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9> @@ -304,10 +305,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16 define <16 x i16> 
@test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] -; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> ret <16 x i16> %res @@ -315,11 +315,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -330,11 +330,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -344,11 +343,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <1 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} 
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -359,11 +358,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -373,11 +371,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <1 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -388,11 +386,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -440,10 +437,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: 
test_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,11,23,26,29,5,21,30] +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30> @@ -452,11 +448,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30> @@ -468,11 +464,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30> @@ -657,11 +652,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] -; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16> @@ -673,11 +668,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i1 define <16 x i16> 
@test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16> @@ -731,10 +725,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 -; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,21,17,30,30,29,1] +; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -744,11 +737,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -761,11 +754,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -778,11 +770,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 +; 
CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -795,11 +787,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [23,22,20,22,28,20,11,17] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -1114,11 +1105,12 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,3,3,4] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> @@ -1130,11 +1122,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,3,3,4] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> @@ -1503,11 +1495,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,11,14,3,8,9,13,7] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 
{%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> @@ -1519,11 +1511,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,11,14,3,8,9,13,7] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> @@ -1535,11 +1526,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [12,6,9,13,12,10,0,2] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> @@ -1551,11 +1542,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,6,9,13,12,10,0,2] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> @@ -1654,11 +1644,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,13,11,10,7,13,15,14] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd 
(%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1671,11 +1661,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,7,13,15,14] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1721,9 +1710,10 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,0,7,2] +; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> @@ -1732,11 +1722,12 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,0,7,2] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> @@ -1748,11 +1739,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,0,7,2] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> @@ -2374,11 +2365,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; 
CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4] -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,7,6,0] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: @@ -2398,11 +2389,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: @@ -2422,11 +2412,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1] -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,1,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: @@ -2446,11 +2436,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: @@ -2535,11 +2524,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5] -; CHECK-NEXT: vpermi2q 
(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> @@ -2551,11 +2540,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> @@ -2656,11 +2644,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5] -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,7,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: @@ -2680,11 +2668,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: @@ -2946,9 +2933,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1] -; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5] +; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x 
float>, ptr %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> @@ -2957,12 +2945,13 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1] -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,2,4,5] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> @@ -2974,12 +2963,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> @@ -2991,12 +2980,13 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2] -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,3,3,6] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> @@ -3008,12 +2998,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2] -; CHECK-NEXT: vxorps %xmm3, 
%xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,3,3,6] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> @@ -3497,12 +3487,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: @@ -3524,12 +3514,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: @@ -3551,9 +3540,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> @@ -3562,12 +3551,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, 
<8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> @@ -3579,12 +3568,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> @@ -3644,12 +3632,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,10,6,15,0,0,0,0] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,14,7,12,6,14,7] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3662,12 +3650,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,10,6,15,0,0,0,0] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,2,14,7,12,6,14,7] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3680,13 +3667,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT: # xmm2 = mem[0,0]
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
-; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm2 = [12,6,12,6,12,6,12,6]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3699,13 +3685,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT: # xmm2 = mem[0,0]
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [12,6,12,6,12,6,12,6]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -4527,12 +4511,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,4,1,5]
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,0,5,1]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
@@ -4544,12 +4528,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,4,1,5]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,0,5,1]
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
@@ -4593,9 +4576,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp,
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,6,0,5]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4604,12 +4587,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,6,0,5]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4621,12 +4604,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,6,0,5]
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
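For the masked 8xdouble cases above, the new output hinges on one index rewrite: the two permute sources are commuted so that they concatenate into the plain 64-byte load at (%rdi), and every mask element that pointed at the first source (the high half of that load) then moves up by the element count, while every element that pointed at the second (the low half) moves down by it. A minimal standalone C++ sketch of that remapping, checked against the masks visible in the CHECK lines; it mirrors the effect of the combine and is not the LLVM helper itself:

#include <cstdio>

// Swap which source each shuffle-mask index refers to: for a two-source
// mask over NumElts-wide vectors, the index ranges [0,NumElts) and
// [NumElts,2*NumElts) trade places.
static void commuteMask(int *Mask, int NumElts) {
  for (int i = 0; i < NumElts; ++i)
    Mask[i] = Mask[i] < NumElts ? Mask[i] + NumElts : Mask[i] - NumElts;
}

int main() {
  // The vpermi2pd mask [2,4,1,5] over {32(%rdi), (%rdi)} becomes the
  // vpermpd mask [6,0,5,1] over the single zmm load at (%rdi).
  int Mask[4] = {2, 4, 1, 5};
  commuteMask(Mask, 4);
  printf("[%d,%d,%d,%d]\n", Mask[0], Mask[1], Mask[2], Mask[3]); // [6,0,5,1]
  return 0;
}

The same remapping at the relevant element width accounts for the other rewritten masks in this patch, e.g. [0,2,4,1] -> [4,6,0,5] above and [0,0,0,20,26,0,6,12] -> [16,16,16,4,10,16,22,28] in the i16 stride-6 hunks below.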
diff --git llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index c3b53211978a..9d0183c816b1 100644
--- llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -582,20 +582,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512-NEXT: vmovq %xmm0, (%rdx)
-; AVX512-NEXT: vmovq %xmm2, (%rcx)
+; AVX512-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512-NEXT: vmovq %xmm1, (%r8)
-; AVX512-NEXT: vmovq %xmm4, (%r9)
-; AVX512-NEXT: vmovq %xmm5, (%rax)
+; AVX512-NEXT: vmovq %xmm5, (%r9)
+; AVX512-NEXT: vmovq %xmm2, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -613,20 +613,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512-FCP-NEXT: vmovq %xmm1, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rax)
+; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -645,20 +645,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-NEXT: vmovq %xmm1, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm4, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rax)
+; AVX512DQ-NEXT: vmovq %xmm5, (%r9)
+; AVX512DQ-NEXT: vmovq %xmm2, (%rax)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -676,20 +676,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -2876,22 +2876,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7
@@ -2933,22 +2931,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7
@@ -2990,22 +2986,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7
@@ -3047,22 +3041,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7
diff --git llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index d9383f524f1d..34f23213500c 100644
--- llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -103,16 +103,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1
+; AVX512-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2
+; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%rdx)
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride3_vf2:
@@ -131,16 +130,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i32_stride3_vf2:
@@ -159,16 +157,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf2:
@@ -187,16 +184,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %wide.vec = load <6 x i32>, ptr %in.vec, align 64
 %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
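The stride-3 rewrite above drops the xmm vpermi2d, which needed the two 16-byte halves in separate registers, because a single 8-lane vpermps can reach the whole 32-byte region; only the low two lanes of each result are stored. A scalar model of what the three vmovlps stores compute, with arbitrary test values (a sketch of the semantics only, not tied to any LLVM API):

#include <cassert>

int main() {
  // Eight dwords at (%rdi): the six live elements of the <6 x i32> load
  // plus two lanes of padding that the wide permute may read but the
  // 8-byte stores never use.
  int in[8] = {10, 11, 12, 13, 14, 15, 98, 99};
  int out0[2], out1[2], out2[2];
  for (int j = 0; j < 2; ++j) {
    out0[j] = in[3 * j + 0]; // (%rsi): vpermilps mem[0,3,2,3], low half
    out1[j] = in[3 * j + 1]; // (%rdx): vpermps with mask [1,4,6,7], low half
    out2[j] = in[3 * j + 2]; // (%rcx): vbroadcastss 8(%rdi) + vblendps
  }
  assert(out0[0] == 10 && out0[1] == 13);
  assert(out1[0] == 11 && out1[1] == 14);
  assert(out2[0] == 12 && out2[1] == 15);
  return 0;
}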
diff --git llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 955a7ffcec79..7948141f6bec 100644
--- llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -239,17 +239,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
 ; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
 ; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
 ; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -304,17 +303,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
 ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
@@ -369,17 +367,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
 ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
@@ -434,17 +431,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10)
 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
diff --git llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index c9b10d9cc866..ec7a708fc0b0 100644
--- llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
diff --git llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 5ba2257e2b49..14c2a60a5b99 100644
--- llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
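The vec256_i16 hunks in the last two files show the combine end to end: the old sequence split the zmm into ymm halves and re-merged them with a two-source vpermi2w, while the new one applies a single vpermw over the equivalent 32-entry index space and needs no extract. A scalar model checking, for the i64_factor4 masks above, that both formulations select the same words (it assumes nothing beyond the CHECK lines; the test data is arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  uint16_t zmm0[32];
  for (int i = 0; i < 32; ++i)
    zmm0[i] = 1000 + i;

  // Old: vextracti64x4 $1 yields the high half, then vpermi2w selects
  // from {high half (indices 0..15), low half (indices 16..31)}.
  const int OldMask[16] = {16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15};
  // New: vpermw indexes the unsplit zmm directly.
  const int NewMask[16] = {0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31};

  for (int i = 0; i < 16; ++i) {
    int M = OldMask[i];
    uint16_t OldRes = M < 16 ? zmm0[16 + M] : zmm0[M - 16];
    assert(OldRes == zmm0[NewMask[i]]);
  }
  return 0;
}

As with the other touched tests, these CHECK lines look machine-generated in the llvm/utils/update_llc_test_checks.py style, so they are best refreshed with that tool rather than edited by hand.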