release/22.x: [AArch64] Fix shufflevector miscompilation on aarch64_be (#193076)#193744
release/22.x: [AArch64] Fix shufflevector miscompilation on aarch64_be (#193076)#193744folkertdev wants to merge 2 commits intollvm:release/22.xfrom
shufflevector miscompilation on aarch64_be (#193076)#193744Conversation
…3076) A function like ```llvm define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) { entry: %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ret <4 x i16> %r } ``` will use the `xtn` instruction, which for each 32-bit vector element keeps only the lower 16 bits, so effectively this is a truncation. However, if the vector actually has 16-bit elements, then the conversion from a shuffle to a truncation is only valid on LE, not on BE. On BE, `uzp1` should be used instead. So this PR moves some logic to right after a check for LE, so that BE does not miscompile. I filed this as an issue on `rust-lang-rust` as rust-lang/rust#155459, and we had some discussion at [#t-libs/stdarch > &llvm#96;simd_shuffle!&llvm#96; behaves strangely on &llvm#96;aarch64_be&llvm#96;](https://rust-lang.zulipchat.com/#narrow/channel/208962-t-libs.2Fstdarch/topic/.60simd_shuffle.21.60.20behaves.20strangely.20on.20.60aarch64_be.60/with/587766433).
|
@llvm/pr-subscribers-backend-aarch64 Author: Folkert de Vries (folkertdev) ChangesFull diff: https://github.com/llvm/llvm-project/pull/193744.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a55b1facb103f..aa22f480c1537 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24419,33 +24419,6 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == AArch64ISD::UZP2)
return SDValue();
- // uzp1(x, undef) -> concat(truncate(x), undef)
- if (Op1.isUndef()) {
- EVT BCVT = MVT::Other, HalfVT = MVT::Other;
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
- break;
- case MVT::v16i8:
- BCVT = MVT::v8i16;
- HalfVT = MVT::v8i8;
- break;
- case MVT::v8i16:
- BCVT = MVT::v4i32;
- HalfVT = MVT::v4i16;
- break;
- case MVT::v4i32:
- BCVT = MVT::v2i64;
- HalfVT = MVT::v2i32;
- break;
- }
- if (BCVT != MVT::Other) {
- SDValue BC = DAG.getBitcast(BCVT, Op0);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
- DAG.getUNDEF(HalfVT));
- }
- }
-
if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
return Urshr;
@@ -24487,6 +24460,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
+ // uzp1(x, undef) -> concat(truncate(x), undef)
+ if (Op1.isUndef()) {
+ EVT BCVT = MVT::Other, HalfVT = MVT::Other;
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ break;
+ case MVT::v16i8:
+ BCVT = MVT::v8i16;
+ HalfVT = MVT::v8i8;
+ break;
+ case MVT::v8i16:
+ BCVT = MVT::v4i32;
+ HalfVT = MVT::v4i16;
+ break;
+ case MVT::v4i32:
+ BCVT = MVT::v2i64;
+ HalfVT = MVT::v2i32;
+ break;
+ }
+ if (BCVT != MVT::Other) {
+ SDValue BC = DAG.getBitcast(BCVT, Op0);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+ DAG.getPOISON(HalfVT));
+ }
+ }
+
// uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
// Example:
// nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
diff --git a/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
new file mode 100644
index 0000000000000..4e60d99dbed36
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
+; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
+
+define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECKLE-LABEL: test_reconstructshuffle:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: mov b2, v0.b[3]
+; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
+; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
+; CHECKLE-NEXT: mov v2.b[6], v0.b[0]
+; CHECKLE-NEXT: zip2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
+; CHECKLE-NEXT: bic v0.4h, #255, lsl #8
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: test_reconstructshuffle:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.16b, v0.16b
+; CHECKBE-NEXT: rev64 v1.16b, v1.16b
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT: mov b2, v0.b[3]
+; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
+; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
+; CHECKBE-NEXT: mov v2.b[6], v0.b[0]
+; CHECKBE-NEXT: zip2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
+; CHECKBE-NEXT: bic v0.4h, #255, lsl #8
+; CHECKBE-NEXT: rev64 v0.4h, v0.4h
+; CHECKBE-NEXT: ret
+ %tmp1 = shufflevector <16 x i8> %a, <16 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %tmp2 = shufflevector <16 x i8> %b, <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+ %tmp3 = add <4 x i8> %tmp1, %tmp2
+ %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <8 x i8> @xtn_shuffle_even_v16i8(<16 x i8> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: xtn v0.8b, v0.8h
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.16b, v0.16b
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: uzp1 v0.16b, v0.16b, v0.16b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
+ %r = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: xtn v0.4h, v0.4s
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.8h, v0.8h
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECKBE-NEXT: rev64 v0.4h, v0.4h
+; CHECKBE-NEXT: ret
+ %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i32> @xtn_shuffle_uzp1_poison_v4i32(<4 x i32> %a) {
+; CHECKLE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: xtn v0.2s, v0.2d
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.4s, v0.4s
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: uzp1 v0.4s, v0.4s, v0.4s
+; CHECKBE-NEXT: rev64 v0.4s, v0.4s
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
+ %r = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+ ret <4 x i32> %r
+
+}
diff --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
deleted file mode 100644
index 65da95e0163f4..0000000000000
--- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
-; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
-
-define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
-; CHECKLE-LABEL: test_reconstructshuffle:
-; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: mov b2, v0.b[3]
-; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
-; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
-; CHECKLE-NEXT: mov v2.b[6], v0.b[0]
-; CHECKLE-NEXT: zip2 v0.8b, v1.8b, v0.8b
-; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
-; CHECKLE-NEXT: bic v0.4h, #255, lsl #8
-; CHECKLE-NEXT: ret
-;
-; CHECKBE-LABEL: test_reconstructshuffle:
-; CHECKBE: // %bb.0:
-; CHECKBE-NEXT: rev64 v0.16b, v0.16b
-; CHECKBE-NEXT: rev64 v1.16b, v1.16b
-; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT: mov b2, v0.b[3]
-; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
-; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
-; CHECKBE-NEXT: mov v2.b[6], v0.b[0]
-; CHECKBE-NEXT: zip2 v0.8b, v1.8b, v0.8b
-; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
-; CHECKBE-NEXT: bic v0.4h, #255, lsl #8
-; CHECKBE-NEXT: rev64 v0.4h, v0.4h
-; CHECKBE-NEXT: ret
- %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- %tmp2 = shufflevector <16 x i8> %b, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
- %tmp3 = add <4 x i8> %tmp1, %tmp2
- %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
- ret <4 x i16> %tmp4
-}
|
| SDValue BC = DAG.getBitcast(BCVT, Op0); | ||
| SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC); | ||
| return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc, | ||
| DAG.getPOISON(HalfVT)); |
There was a problem hiding this comment.
on this branch the original uses getUNDEF here, which caused a merge conflict. I suspect we want to actually use getPOISON if we can?
There was a problem hiding this comment.
It does sound better for trunk but it might be worth being conservative on the branch and just keeping the code as-is, just moving it later. I'm not sure what else has gone into trunk since the branch was made in regards to poison vs undef in SDAG.
There was a problem hiding this comment.
Ah, makes sense. Fixed.
|
Do we know when this miscompilation was introduced? |
|
LLVM 19 apparently https://godbolt.org/z/6jnq6Ef8e, so, a while ago |
|
Given that this miscompilation was introduced in LLVM 19 and was only reported/fixed now, it doesn't seem like a problem that is a widely hit issue, or a recent regression, so I don't think it meets the criteria for backporting. It should be in the upcoming LLVM 23 release which will be available in a few months. Let me know if you have any strong objections to this and we can reconsider. |
|
It's not a new bug but it is kind of a bad one with a very straightforward fix. Backporting it would unblock work on |
|
I think it would be fine to take this one. It fixes a miscompile and it's a conservative change (i.e. all it does is disable an optimization). |
backport 7425ab9
pr: #193076