release/22.x: [AArch64] Fix `shufflevector` miscompilation on `aarch64_be` (#193076) by folkertdev · Pull Request #193744 · llvm/llvm-project

folkertdev · 2026-04-23T13:40:50Z

backport 7425ab9
pr: #193076

…3076) A function like ```llvm define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) { entry: %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ret <4 x i16> %r } ``` will use the `xtn` instruction, which for each 32-bit vector element keeps only the lower 16 bits, so effectively this is a truncation. However, if the vector actually has 16-bit elements, then the conversion from a shuffle to a truncation is only valid on LE, not on BE. On BE, `uzp1` should be used instead. So this PR moves some logic to right after a check for LE, so that BE does not miscompile. I filed this as an issue on `rust-lang-rust` as rust-lang/rust#155459, and we had some discussion at [#t-libs/stdarch > &llvm#96;simd_shuffle!&llvm#96; behaves strangely on &llvm#96;aarch64_be&llvm#96;](https://rust-lang.zulipchat.com/#narrow/channel/208962-t-libs.2Fstdarch/topic/.60simd_shuffle.21.60.20behaves.20strangely.20on.20.60aarch64_be.60/with/587766433).

llvmbot · 2026-04-23T13:41:31Z

@llvm/pr-subscribers-backend-aarch64

Author: Folkert de Vries (folkertdev)

Changes

backport 7425ab9
pr: #193076

Full diff: https://github.com/llvm/llvm-project/pull/193744.diff

3 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+27-27)
(added) llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll (+95)
(removed) llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll (-39)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a55b1facb103f..aa22f480c1537 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24419,33 +24419,6 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   if (N->getOpcode() == AArch64ISD::UZP2)
     return SDValue();
 
-  // uzp1(x, undef) -> concat(truncate(x), undef)
-  if (Op1.isUndef()) {
-    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
-    switch (ResVT.getSimpleVT().SimpleTy) {
-    default:
-      break;
-    case MVT::v16i8:
-      BCVT = MVT::v8i16;
-      HalfVT = MVT::v8i8;
-      break;
-    case MVT::v8i16:
-      BCVT = MVT::v4i32;
-      HalfVT = MVT::v4i16;
-      break;
-    case MVT::v4i32:
-      BCVT = MVT::v2i64;
-      HalfVT = MVT::v2i32;
-      break;
-    }
-    if (BCVT != MVT::Other) {
-      SDValue BC = DAG.getBitcast(BCVT, Op0);
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
-      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
-                         DAG.getUNDEF(HalfVT));
-    }
-  }
-
   if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
     return Urshr;
 
@@ -24487,6 +24460,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   if (!DAG.getDataLayout().isLittleEndian())
     return SDValue();
 
+  // uzp1(x, undef) -> concat(truncate(x), undef)
+  if (Op1.isUndef()) {
+    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
+    switch (ResVT.getSimpleVT().SimpleTy) {
+    default:
+      break;
+    case MVT::v16i8:
+      BCVT = MVT::v8i16;
+      HalfVT = MVT::v8i8;
+      break;
+    case MVT::v8i16:
+      BCVT = MVT::v4i32;
+      HalfVT = MVT::v4i16;
+      break;
+    case MVT::v4i32:
+      BCVT = MVT::v2i64;
+      HalfVT = MVT::v2i32;
+      break;
+    }
+    if (BCVT != MVT::Other) {
+      SDValue BC = DAG.getBitcast(BCVT, Op0);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+                         DAG.getPOISON(HalfVT));
+    }
+  }
+
   // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
   // Example:
   // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
diff --git a/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
new file mode 100644
index 0000000000000..4e60d99dbed36
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
+; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
+
+define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECKLE-LABEL: test_reconstructshuffle:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    mov b2, v0.b[3]
+; CHECKLE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKLE-NEXT:    mov v2.b[2], v0.b[2]
+; CHECKLE-NEXT:    mov v2.b[4], v0.b[1]
+; CHECKLE-NEXT:    mov v2.b[6], v0.b[0]
+; CHECKLE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
+; CHECKLE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: test_reconstructshuffle:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
+; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT:    mov b2, v0.b[3]
+; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT:    mov v2.b[2], v0.b[2]
+; CHECKBE-NEXT:    mov v2.b[4], v0.b[1]
+; CHECKBE-NEXT:    mov v2.b[6], v0.b[0]
+; CHECKBE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
+; CHECKBE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %tmp2 = shufflevector <16 x i8> %b, <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %tmp3 = add <4 x i8> %tmp1, %tmp2
+  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+  ret <4 x i16> %tmp4
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <8 x i8> @xtn_shuffle_even_v16i8(<16 x i8> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    xtn v0.8b, v0.8h
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %r = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i8> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    xtn v0.4h, v0.4s
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8h, v0.8h
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
+; CHECKBE-NEXT:    ret
+  %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i32> @xtn_shuffle_uzp1_poison_v4i32(<4 x i32> %a) {
+; CHECKLE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    xtn v0.2s, v0.2d
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    ret
+  %r = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+  ret <4 x i32> %r
+
+}
diff --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
deleted file mode 100644
index 65da95e0163f4..0000000000000
--- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
-; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
-
-define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
-; CHECKLE-LABEL: test_reconstructshuffle:
-; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    mov b2, v0.b[3]
-; CHECKLE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKLE-NEXT:    mov v2.b[2], v0.b[2]
-; CHECKLE-NEXT:    mov v2.b[4], v0.b[1]
-; CHECKLE-NEXT:    mov v2.b[6], v0.b[0]
-; CHECKLE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
-; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
-; CHECKLE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECKLE-NEXT:    ret
-;
-; CHECKBE-LABEL: test_reconstructshuffle:
-; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
-; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
-; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT:    mov b2, v0.b[3]
-; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT:    mov v2.b[2], v0.b[2]
-; CHECKBE-NEXT:    mov v2.b[4], v0.b[1]
-; CHECKBE-NEXT:    mov v2.b[6], v0.b[0]
-; CHECKBE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
-; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
-; CHECKBE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
-; CHECKBE-NEXT:    ret
-  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  %tmp2 = shufflevector <16 x i8> %b, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  %tmp3 = add <4 x i8> %tmp1, %tmp2
-  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
-  ret <4 x i16> %tmp4
-}

folkertdev · 2026-04-23T13:41:36Z

+      SDValue BC = DAG.getBitcast(BCVT, Op0);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+                         DAG.getPOISON(HalfVT));


on this branch the original uses getUNDEF here, which caused a merge conflict. I suspect we want to actually use getPOISON if we can?

It does sound better for trunk but it might be worth being conservative on the branch and just keeping the code as-is, just moving it later. I'm not sure what else has gone into trunk since the branch was made in regards to poison vs undef in SDAG.

Ah, makes sense. Fixed.

dyung · 2026-04-23T22:10:52Z

Do we know when this miscompilation was introduced?

folkertdev · 2026-04-23T22:13:48Z

LLVM 19 apparently https://godbolt.org/z/6jnq6Ef8e, so, a while ago

dyung · 2026-04-24T00:03:48Z

Given that this miscompilation was introduced in LLVM 19 and was only reported/fixed now, it doesn't seem like a problem that is a widely hit issue, or a recent regression, so I don't think it meets the criteria for backporting. It should be in the upcoming LLVM 23 release which will be available in a few months.

Let me know if you have any strong objections to this and we can reconsider.

folkertdev · 2026-04-24T12:47:35Z

It's not a new bug but it is kind of a bad one with a very straightforward fix. Backporting it would unblock work on rust-lang/stdarch, specifically rust-lang/stdarch#2088.

nikic · 2026-04-30T07:43:47Z

I think it would be fine to take this one. It fixes a miscompile and it's a conservative change (i.e. all it does is disable an optimization).

folkertdev assigned davemgreen Apr 23, 2026

llvmbot added the backend:AArch64 label Apr 23, 2026

folkertdev commented Apr 23, 2026

View reviewed changes

folkertdev added this to the LLVM 22.x Release milestone Apr 23, 2026

github-project-automation Bot added this to LLVM Release Status Apr 23, 2026

github-project-automation Bot moved this to Needs Triage in LLVM Release Status Apr 23, 2026

folkertdev requested a review from davemgreen April 23, 2026 18:17

folkertdev unassigned davemgreen Apr 23, 2026

keep using undef

a430908

dyung moved this from Needs Triage to Needs Review in LLVM Release Status Apr 23, 2026

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

release/22.x: [AArch64] Fix `shufflevector` miscompilation on `aarch64_be` (#193076)#193744

release/22.x: [AArch64] Fix `shufflevector` miscompilation on `aarch64_be` (#193076)#193744
folkertdev wants to merge 2 commits intollvm:release/22.xfrom
folkertdev:backport-aarch64_be-shufflevector-fix

folkertdev commented Apr 23, 2026

Uh oh!

llvmbot commented Apr 23, 2026

Uh oh!

folkertdev Apr 23, 2026

Uh oh!

davemgreen Apr 23, 2026

Uh oh!

folkertdev Apr 23, 2026

Uh oh!

dyung commented Apr 23, 2026

Uh oh!

folkertdev commented Apr 23, 2026

Uh oh!

dyung commented Apr 24, 2026

Uh oh!

folkertdev commented Apr 24, 2026

Uh oh!

nikic commented Apr 30, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

Conversation

folkertdev commented Apr 23, 2026

Uh oh!

llvmbot commented Apr 23, 2026

Uh oh!

folkertdev Apr 23, 2026

Choose a reason for hiding this comment

Uh oh!

davemgreen Apr 23, 2026

Choose a reason for hiding this comment

Uh oh!

folkertdev Apr 23, 2026

Choose a reason for hiding this comment

Uh oh!

dyung commented Apr 23, 2026

Uh oh!

folkertdev commented Apr 23, 2026

Uh oh!

dyung commented Apr 24, 2026

Uh oh!

folkertdev commented Apr 24, 2026

Uh oh!

nikic commented Apr 30, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants