Skip to content

release/22.x: [AArch64] Fix shufflevector miscompilation on aarch64_be (#193076)#193744

Open
folkertdev wants to merge 2 commits intollvm:release/22.xfrom
folkertdev:backport-aarch64_be-shufflevector-fix
Open

release/22.x: [AArch64] Fix shufflevector miscompilation on aarch64_be (#193076)#193744
folkertdev wants to merge 2 commits intollvm:release/22.xfrom
folkertdev:backport-aarch64_be-shufflevector-fix

Conversation

@folkertdev
Copy link
Copy Markdown
Contributor

backport 7425ab9
pr: #193076

…3076)

A function like

```llvm
define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) {
entry:
  %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i16> %r
}
```

will use the `xtn` instruction, which for each 32-bit vector element
keeps only the lower 16 bits, so effectively this is a truncation.
However, if the vector actually has 16-bit elements, then the conversion
from a shuffle to a truncation is only valid on LE, not on BE. On BE,
`uzp1` should be used instead. So this PR moves some logic to right
after a check for LE, so that BE does not miscompile.

I filed this as an issue on `rust-lang-rust` as
rust-lang/rust#155459, and we had some
discussion at [#t-libs/stdarch > &llvm#96;simd_shuffle!&llvm#96; behaves
strangely on
&llvm#96;aarch64_be&llvm#96;](https://rust-lang.zulipchat.com/#narrow/channel/208962-t-libs.2Fstdarch/topic/.60simd_shuffle.21.60.20behaves.20strangely.20on.20.60aarch64_be.60/with/587766433).
@llvmbot
Copy link
Copy Markdown
Member

llvmbot commented Apr 23, 2026

@llvm/pr-subscribers-backend-aarch64

Author: Folkert de Vries (folkertdev)

Changes

backport 7425ab9
pr: #193076


Full diff: https://github.com/llvm/llvm-project/pull/193744.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+27-27)
  • (added) llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll (+95)
  • (removed) llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll (-39)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a55b1facb103f..aa22f480c1537 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24419,33 +24419,6 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   if (N->getOpcode() == AArch64ISD::UZP2)
     return SDValue();
 
-  // uzp1(x, undef) -> concat(truncate(x), undef)
-  if (Op1.isUndef()) {
-    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
-    switch (ResVT.getSimpleVT().SimpleTy) {
-    default:
-      break;
-    case MVT::v16i8:
-      BCVT = MVT::v8i16;
-      HalfVT = MVT::v8i8;
-      break;
-    case MVT::v8i16:
-      BCVT = MVT::v4i32;
-      HalfVT = MVT::v4i16;
-      break;
-    case MVT::v4i32:
-      BCVT = MVT::v2i64;
-      HalfVT = MVT::v2i32;
-      break;
-    }
-    if (BCVT != MVT::Other) {
-      SDValue BC = DAG.getBitcast(BCVT, Op0);
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
-      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
-                         DAG.getUNDEF(HalfVT));
-    }
-  }
-
   if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
     return Urshr;
 
@@ -24487,6 +24460,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   if (!DAG.getDataLayout().isLittleEndian())
     return SDValue();
 
+  // uzp1(x, undef) -> concat(truncate(x), undef)
+  if (Op1.isUndef()) {
+    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
+    switch (ResVT.getSimpleVT().SimpleTy) {
+    default:
+      break;
+    case MVT::v16i8:
+      BCVT = MVT::v8i16;
+      HalfVT = MVT::v8i8;
+      break;
+    case MVT::v8i16:
+      BCVT = MVT::v4i32;
+      HalfVT = MVT::v4i16;
+      break;
+    case MVT::v4i32:
+      BCVT = MVT::v2i64;
+      HalfVT = MVT::v2i32;
+      break;
+    }
+    if (BCVT != MVT::Other) {
+      SDValue BC = DAG.getBitcast(BCVT, Op0);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+                         DAG.getPOISON(HalfVT));
+    }
+  }
+
   // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
   // Example:
   // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
diff --git a/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
new file mode 100644
index 0000000000000..4e60d99dbed36
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
+; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
+
+define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECKLE-LABEL: test_reconstructshuffle:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    mov b2, v0.b[3]
+; CHECKLE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKLE-NEXT:    mov v2.b[2], v0.b[2]
+; CHECKLE-NEXT:    mov v2.b[4], v0.b[1]
+; CHECKLE-NEXT:    mov v2.b[6], v0.b[0]
+; CHECKLE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
+; CHECKLE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: test_reconstructshuffle:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
+; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT:    mov b2, v0.b[3]
+; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT:    mov v2.b[2], v0.b[2]
+; CHECKBE-NEXT:    mov v2.b[4], v0.b[1]
+; CHECKBE-NEXT:    mov v2.b[6], v0.b[0]
+; CHECKBE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
+; CHECKBE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %tmp2 = shufflevector <16 x i8> %b, <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %tmp3 = add <4 x i8> %tmp1, %tmp2
+  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+  ret <4 x i16> %tmp4
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <8 x i8> @xtn_shuffle_even_v16i8(<16 x i8> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    xtn v0.8b, v0.8h
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    uzp1 v0.16b, v0.16b, v0.16b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %r = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i8> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    xtn v0.4h, v0.4s
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8h, v0.8h
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    uzp1 v0.8h, v0.8h, v0.8h
+; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
+; CHECKBE-NEXT:    ret
+  %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i32> @xtn_shuffle_uzp1_poison_v4i32(<4 x i32> %a) {
+; CHECKLE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    xtn v0.2s, v0.2d
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECKBE-NEXT:    rev64 v0.4s, v0.4s
+; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT:    ret
+  %r = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+  ret <4 x i32> %r
+
+}
diff --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
deleted file mode 100644
index 65da95e0163f4..0000000000000
--- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
-; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
-
-define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
-; CHECKLE-LABEL: test_reconstructshuffle:
-; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    mov b2, v0.b[3]
-; CHECKLE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKLE-NEXT:    mov v2.b[2], v0.b[2]
-; CHECKLE-NEXT:    mov v2.b[4], v0.b[1]
-; CHECKLE-NEXT:    mov v2.b[6], v0.b[0]
-; CHECKLE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
-; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
-; CHECKLE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECKLE-NEXT:    ret
-;
-; CHECKBE-LABEL: test_reconstructshuffle:
-; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    rev64 v0.16b, v0.16b
-; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
-; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT:    mov b2, v0.b[3]
-; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT:    mov v2.b[2], v0.b[2]
-; CHECKBE-NEXT:    mov v2.b[4], v0.b[1]
-; CHECKBE-NEXT:    mov v2.b[6], v0.b[0]
-; CHECKBE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
-; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
-; CHECKBE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECKBE-NEXT:    rev64 v0.4h, v0.4h
-; CHECKBE-NEXT:    ret
-  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  %tmp2 = shufflevector <16 x i8> %b, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  %tmp3 = add <4 x i8> %tmp1, %tmp2
-  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
-  ret <4 x i16> %tmp4
-}

SDValue BC = DAG.getBitcast(BCVT, Op0);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
DAG.getPOISON(HalfVT));
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on this branch the original uses getUNDEF here, which caused a merge conflict. I suspect we want to actually use getPOISON if we can?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does sound better for trunk but it might be worth being conservative on the branch and just keeping the code as-is, just moving it later. I'm not sure what else has gone into trunk since the branch was made in regards to poison vs undef in SDAG.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, makes sense. Fixed.

@dyung
Copy link
Copy Markdown
Collaborator

dyung commented Apr 23, 2026

Do we know when this miscompilation was introduced?

@dyung dyung moved this from Needs Triage to Needs Review in LLVM Release Status Apr 23, 2026
@folkertdev
Copy link
Copy Markdown
Contributor Author

LLVM 19 apparently https://godbolt.org/z/6jnq6Ef8e, so, a while ago

@dyung
Copy link
Copy Markdown
Collaborator

dyung commented Apr 24, 2026

Given that this miscompilation was introduced in LLVM 19 and was only reported/fixed now, it doesn't seem like a problem that is a widely hit issue, or a recent regression, so I don't think it meets the criteria for backporting. It should be in the upcoming LLVM 23 release which will be available in a few months.

Let me know if you have any strong objections to this and we can reconsider.

@folkertdev
Copy link
Copy Markdown
Contributor Author

It's not a new bug but it is kind of a bad one with a very straightforward fix. Backporting it would unblock work on rust-lang/stdarch, specifically rust-lang/stdarch#2088.

@nikic
Copy link
Copy Markdown
Contributor

nikic commented Apr 30, 2026

I think it would be fine to take this one. It fixes a miscompile and it's a conservative change (i.e. all it does is disable an optimization).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

Status: Needs Review

Development

Successfully merging this pull request may close these issues.

5 participants