[AMDGPU] Add a sched group mask for LDSDMA instructions #190872
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

The existing VMEM masks are not fine-grained enough for some use cases. For example, if users want to control async loads, using VMEM may cause the compiler to pick instructions it shouldn't.

This PR adds a new sched group mask for LDSDMA instructions. It is a subclass of VMEM, but only targets isLDSDMA instructions.

Patch is 25.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190872.diff

4 Files Affected:
- (modified) llvm/docs/AMDGPUUsage.rst
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td
- (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
- (added) llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8d2bcfeca2a41..f0e0eca2336cd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1605,6 +1605,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
- 0x0100: All DS read instructions may be scheduled across sched_barrier.
- 0x0200: All DS write instructions may be scheduled across sched_barrier.
- 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier.
+ - 0x0800: All LDSDMA instructions may be scheduled across sched_barrier.
llvm.amdgcn.sched.group.barrier Creates schedule groups with specific properties to create custom scheduling
pipelines. The ordering between groups is enforced by the instruction scheduler.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f576972183eca..de6cea665bfc5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -360,6 +360,10 @@ def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
// MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
// MASK = 0x0000 0100: ALL DS read instructions may be scheduled accoss SCHED_BARRIER.
// MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0400: All Transcendental (e.g. V_EXP) instructions may be
+// scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0800: All LDSDMA instructions may be scheduled across
+// SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index d49ec90e4c212..2aa6dbed75b17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -77,8 +77,9 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
+ LDSDMA = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE | TRANS,
+ DS_READ | DS_WRITE | TRANS | LDSDMA,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -2471,7 +2472,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = true;
else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
- TII->isVMEM(MI))
+ (TII->isVMEM(MI) || TII->isLDSDMA(MI)))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
@@ -2498,6 +2499,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;
+ else if (((SGMask & SchedGroupMask::LDSDMA) != SchedGroupMask::NONE) &&
+ TII->isLDSDMA(MI))
+ Result = true;
+
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
@@ -2663,12 +2668,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
(InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::ALU;
- // VMEM implies VMEM_READ, VMEM_WRITE.
+ // VMEM implies VMEM_READ, VMEM_WRITE, LDSDMA.
if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
- InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
- // VMEM_READ, VMEM_WRITE implies VMEM.
+ InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE &
+ ~SchedGroupMask::LDSDMA;
+ // VMEM_READ, VMEM_WRITE, LDSDMA implies VMEM.
else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
- (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
+ (InvertedMask & SchedGroupMask::VMEM_WRITE) ==
+ SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::LDSDMA) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::VMEM;
// DS implies DS_READ, DS_WRITE.
diff --git a/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir b/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir
new file mode 100644
index 0000000000000..82358f80329cb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir
@@ -0,0 +1,342 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+#
+# This test validates SCHED_GROUP_BARRIER pipelines that enforce a repeating
+# 2 LDSDMA loads + 1 compute scheduling pattern.
+
+--- |
+ define amdgpu_kernel void @sched_group_barrier_mask_2048_tensor() { ret void }
+ define amdgpu_kernel void @sched_group_barrier_mask_16_tensor() { ret void }
+ define amdgpu_kernel void @sched_group_barrier_mask_2048_async_loads() { ret void }
+ define amdgpu_kernel void @sched_group_barrier_mask_16_async_loads() { ret void }
+ define amdgpu_kernel void @sched_group_barrier_mask_2048_interleave_2load_1valu() { ret void }
+...
+
+---
+name: sched_group_barrier_mask_2048_tensor
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_mask_2048_tensor
+ ; CHECK: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]], [[DEF7]], implicit $exec
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF8]], [[DEF9]], implicit $exec
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+ $exec = IMPLICIT_DEF
+ %0:sgpr_128 = IMPLICIT_DEF
+ %1:sgpr_256 = IMPLICIT_DEF
+ %2:sgpr_128 = IMPLICIT_DEF
+ %3:sgpr_256 = IMPLICIT_DEF
+ %4:sgpr_128 = IMPLICIT_DEF
+ %5:sgpr_256 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ %30:vgpr_32 = V_ADD_U32_e32 %20, %21, implicit $exec
+ %31:vgpr_32 = V_ADD_U32_e32 %22, %23, implicit $exec
+ %32:vgpr_32 = V_ADD_U32_e32 %30, %31, implicit $exec
+ SCHED_GROUP_BARRIER 2048, 2, 0
+ SCHED_GROUP_BARRIER 2, 1, 0
+ SCHED_GROUP_BARRIER 2048, 2, 0
+ SCHED_GROUP_BARRIER 2, 1, 0
+ SCHED_GROUP_BARRIER 2048, 2, 0
+ SCHED_GROUP_BARRIER 2, 1, 0
+ S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_16_tensor
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_mask_16_tensor
+ ; CHECK: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]], [[DEF7]], implicit $exec
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[DEF8]], implicit $exec
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[DEF9]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 2
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 2
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 2
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+ $exec = IMPLICIT_DEF
+ %0:sgpr_128 = IMPLICIT_DEF
+ %1:sgpr_256 = IMPLICIT_DEF
+ %2:sgpr_128 = IMPLICIT_DEF
+ %3:sgpr_256 = IMPLICIT_DEF
+ %4:sgpr_128 = IMPLICIT_DEF
+ %5:sgpr_256 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ %30:vgpr_32 = V_ADD_U32_e32 %20, %21, implicit $exec
+ %31:vgpr_32 = V_ADD_U32_e32 %30, %22, implicit $exec
+ %32:vgpr_32 = V_ADD_U32_e32 %31, %23, implicit $exec
+ SCHED_GROUP_BARRIER 16, 2, 2
+ SCHED_GROUP_BARRIER 2, 1, 2
+ SCHED_GROUP_BARRIER 16, 2, 2
+ SCHED_GROUP_BARRIER 2, 1, 2
+ SCHED_GROUP_BARRIER 16, 2, 2
+ SCHED_GROUP_BARRIER 2, 1, 2
+ S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_2048_async_loads
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_mask_2048_async_loads
+ ; CHECK: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF1]], [[DEF]], 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF3]], [[DEF2]], 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF1]], [[DEF3]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF5]], [[DEF4]], 32, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF7]], [[DEF6]], 48, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF5]], [[DEF7]], implicit $exec
+ ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF9]], [[DEF8]], 64, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF11]], [[DEF10]], 80, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF9]], [[DEF11]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 1
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 1
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 1
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 1
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 1
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 1
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+ $exec = IMPLICIT_DEF
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vreg_64_align2 = IMPLICIT_DEF
+ %3:vreg_64_align2 = IMPLICIT_DEF
+ %4:vreg_64_align2 = IMPLICIT_DEF
+ %5:vreg_64_align2 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 %10, %0, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 %11, %1, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 %12, %2, 32, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 %13, %3, 48, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 %14, %4, 64, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 %15, %5, 80, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ %30:vgpr_32 = V_ADD_U32_e32 %10, %11, implicit $exec
+ %31:vgpr_32 = V_ADD_U32_e32 %12, %13, implicit $exec
+ %32:vgpr_32 = V_ADD_U32_e32 %14, %15, implicit $exec
+ SCHED_GROUP_BARRIER 2048, 2, 1
+ SCHED_GROUP_BARRIER 2, 1, 1
+ SCHED_GROUP_BARRIER 2048, 2, 1
+ SCHED_GROUP_BARRIER 2, 1, 1
+ SCHED_GROUP_BARRIER 2048, 2, 1
+ SCHED_GROUP_BARRIER 2, 1, 1
+ S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_16_async_loads
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sched_group_barrier_mask_16_async_loads
+ ; CHECK: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF1]], [[DEF]], 0, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF3]], [[DEF2]], 16, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF1]], [[DEF5]], implicit $exec
+ ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF5]], [[DEF4]], 32, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF7]], [[DEF6]], 48, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[DEF9]], implicit $exec
+ ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF9]], [[DEF8]], 64, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF11]], [[DEF10]], 80, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF7]], [[DEF11]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 3
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 3
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 3
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 3
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 3
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 3
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+ $exec = IMPLICIT_DEF
+ $m0 = S_MOV_B32 1
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vreg_64_align2 = IMPLICIT_DEF
+ %3:vreg_64_align2 = IMPLICIT_DEF
+ %4:vreg_64_align2 = IMPLICIT_DEF
+ %5:vreg_64_align2 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLI...
[truncated]
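
For reference, here is a minimal sketch of driving the new mask from LLVM IR. The kernel below is hypothetical (the function name and the pipeline shape are made up to mirror the MIR test above), but the intrinsic names and signatures are the ones the patch documents; 0x0800 = 2048 is the new LDSDMA mask and 0x0002 is VALU.

; Hypothetical module; feed to llc -mtriple=amdgcn-amd-amdhsa to experiment.
declare void @llvm.amdgcn.sched.barrier(i32 immarg)
declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg)

define amdgpu_kernel void @ldsdma_pipeline_example() {
entry:
  ; ... LDSDMA loads and VALU arithmetic would be emitted here ...
  ; Only LDSDMA instructions may be scheduled across this barrier.
  call void @llvm.amdgcn.sched.barrier(i32 2048)
  ; Request a repeating pipeline of 2 LDSDMA ops followed by 1 VALU op,
  ; matching the SCHED_GROUP_BARRIER 2048/2 pairs in the test.
  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 2, i32 0)
  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
  call void @llvm.amdgcn.sched.group.barrier(i32 2048, i32 2, i32 0)
  call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
  ret void
}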
Force-pushed from 05eb81a to 49b4343.
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed.
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed.
Force-pushed from 49b4343 to 3e0e255.
; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[DEF8]], implicit $exec
; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF

In contrast to sched_group_barrier_mask_2048_tensor, this IMPLICIT_DEF got interleaved here, but it's probably not an issue, I guess.
- 0x0100: All DS read instructions may be scheduled across sched_barrier.
- 0x0200: All DS write instructions may be scheduled across sched_barrier.
- 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier.
- 0x0800: All LDSDMA instructions may be scheduled across sched_barrier.

This interface is ludicrous and will require making up new arbitrary masks forever.
The existing VMEM masks are not fine-grained enough for some use cases. For example, if users want to control async loads, using VMEM may cause the compiler to pick instructions it shouldn't.
This PR adds a new sched group mask for LDSDMA instructions. It is a subclass of VMEM, but only targets isLDSDMA instructions.
Fixes ROCM-20517.
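
To see why the inversion logic in the patch must treat LDSDMA as a VMEM subclass, here is a standalone trace of the sched_barrier inversion for mask 0x0800 (a sketch, not the LLVM sources: the enum bits are copied from the patch, and only the VMEM-related fixup of invertSchedBarrierMask is modeled). Because the VMEM classifier now also matches LDSDMA, leaving VMEM set in the inverted (blocking) group would wrongly pin the very LDSDMA instructions the barrier allowed to cross, so the VMEM bit is dropped too.

#include <cstdint>
#include <cstdio>

// Bit values copied from the SchedGroupMask enum in the patch.
enum : uint32_t {
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  LDSDMA = 1u << 11,
  ALL = 0xFFFu, // ALU through LDSDMA, bits 0-11
};

int main() {
  uint32_t Mask = LDSDMA;     // sched_barrier(0x800): only LDSDMA may cross
  uint32_t Inv = ~Mask & ALL; // everything else must not be moved across
  if ((Inv & VMEM) == 0) {
    // VMEM implies VMEM_READ, VMEM_WRITE, LDSDMA.
    Inv &= ~(VMEM_READ | VMEM_WRITE | LDSDMA);
  } else if ((Inv & VMEM_READ) == 0 || (Inv & VMEM_WRITE) == 0 ||
             (Inv & LDSDMA) == 0) {
    // Some VMEM subclass may cross, so drop VMEM from the blocking group;
    // otherwise the VMEM classifier would re-capture that subclass.
    Inv &= ~VMEM;
  }
  std::printf("inverted mask = %#x\n", Inv); // prints 0x7ef
  return 0;
}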