diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8d2bcfeca2a41..f0e0eca2336cd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1605,6 +1605,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                              - 0x0100: All DS read instructions may be scheduled across sched_barrier.
                                              - 0x0200: All DS write instructions may be scheduled across sched_barrier.
                                              - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier.
+                                             - 0x0800: All LDSDMA instructions may be scheduled across sched_barrier.
 
   llvm.amdgcn.sched.group.barrier            Creates schedule groups with specific properties to create custom scheduling
                                              pipelines. The ordering between groups is enforced by the instruction scheduler.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f576972183eca..8c67ec81ae90e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -360,6 +360,8 @@ def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
 // MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
 // MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
 // MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across SCHED_BARRIER.
+// MASK = 0x0000 0800: All LDSDMA instructions may be scheduled across SCHED_BARRIER.
 def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects,
                                 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index d49ec90e4c212..2aa6dbed75b17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -77,8 +77,9 @@ enum class SchedGroupMask {
   DS_READ = 1u << 8,
   DS_WRITE = 1u << 9,
   TRANS = 1u << 10,
+  LDSDMA = 1u << 11,
   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
-        DS_READ | DS_WRITE | TRANS,
+        DS_READ | DS_WRITE | TRANS | LDSDMA,
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };
@@ -2471,7 +2472,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
-           TII->isVMEM(MI))
+           (TII->isVMEM(MI) || TII->isLDSDMA(MI)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
@@ -2498,6 +2499,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
            TII->isTRANS(MI))
     Result = true;
 
+  else if (((SGMask & SchedGroupMask::LDSDMA) != SchedGroupMask::NONE) &&
+           TII->isLDSDMA(MI))
+    Result = true;
+
   LLVM_DEBUG(
       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
              << (Result ? " could classify " : " unable to classify ") << MI);
@@ -2663,12 +2668,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
       (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::ALU;
 
-  // VMEM implies VMEM_READ, VMEM_WRITE.
+  // VMEM implies VMEM_READ, VMEM_WRITE, LDSDMA.
   if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
-    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
-  // VMEM_READ, VMEM_WRITE implies VMEM.
+    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE &
+                    ~SchedGroupMask::LDSDMA;
+  // VMEM_READ, VMEM_WRITE, LDSDMA imply VMEM.
   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
-           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
+           (InvertedMask & SchedGroupMask::VMEM_WRITE) ==
+               SchedGroupMask::NONE ||
+           (InvertedMask & SchedGroupMask::LDSDMA) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VMEM;
 
   // DS implies DS_READ, DS_WRITE.
diff --git a/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir b/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir
new file mode 100644
index 0000000000000..82358f80329cb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir
@@ -0,0 +1,342 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+#
+# This test validates SCHED_GROUP_BARRIER pipelines that enforce a repeating
+# pattern of two LDSDMA loads followed by one compute instruction.
+
+--- |
+  define amdgpu_kernel void @sched_group_barrier_mask_2048_tensor() { ret void }
+  define amdgpu_kernel void @sched_group_barrier_mask_16_tensor() { ret void }
+  define amdgpu_kernel void @sched_group_barrier_mask_2048_async_loads() { ret void }
+  define amdgpu_kernel void @sched_group_barrier_mask_16_async_loads() { ret void }
+  define amdgpu_kernel void @sched_group_barrier_mask_2048_interleave_2load_1valu() { ret void }
+...
+
+---
+name: sched_group_barrier_mask_2048_tensor
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_group_barrier_mask_2048_tensor
+    ; CHECK: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]], [[DEF7]], implicit $exec
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF8]], [[DEF9]], implicit $exec
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+    $exec = IMPLICIT_DEF
+    %0:sgpr_128 = IMPLICIT_DEF
+    %1:sgpr_256 = IMPLICIT_DEF
+    %2:sgpr_128 = IMPLICIT_DEF
+    %3:sgpr_256 = IMPLICIT_DEF
+    %4:sgpr_128 = IMPLICIT_DEF
+    %5:sgpr_256 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    %30:vgpr_32 = V_ADD_U32_e32 %20, %21, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e32 %22, %23, implicit $exec
+    %32:vgpr_32 = V_ADD_U32_e32 %30, %31, implicit $exec
+    SCHED_GROUP_BARRIER 2048, 2, 0
+    SCHED_GROUP_BARRIER 2, 1, 0
+    SCHED_GROUP_BARRIER 2048, 2, 0
+    SCHED_GROUP_BARRIER 2, 1, 0
+    SCHED_GROUP_BARRIER 2048, 2, 0
+    SCHED_GROUP_BARRIER 2, 1, 0
+    S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_16_tensor
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_group_barrier_mask_16_tensor
+    ; CHECK: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]], [[DEF7]], implicit $exec
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF]], [[DEF5]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[DEF8]], implicit $exec
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF2]], [[DEF1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2_gfx1250 [[DEF4]], [[DEF3]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[DEF9]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 2
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 2
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 2
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 2
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+    $exec = IMPLICIT_DEF
+    %0:sgpr_128 = IMPLICIT_DEF
+    %1:sgpr_256 = IMPLICIT_DEF
+    %2:sgpr_128 = IMPLICIT_DEF
+    %3:sgpr_256 = IMPLICIT_DEF
+    %4:sgpr_128 = IMPLICIT_DEF
+    %5:sgpr_256 = IMPLICIT_DEF
+    %20:vgpr_32 = IMPLICIT_DEF
+    %21:vgpr_32 = IMPLICIT_DEF
+    %22:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %0, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %2, %1, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    TENSOR_LOAD_TO_LDS_d2_gfx1250 %4, %3, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+    %30:vgpr_32 = V_ADD_U32_e32 %20, %21, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e32 %30, %22, implicit $exec
+    %32:vgpr_32 = V_ADD_U32_e32 %31, %23, implicit $exec
+    SCHED_GROUP_BARRIER 16, 2, 2
+    SCHED_GROUP_BARRIER 2, 1, 2
+    SCHED_GROUP_BARRIER 16, 2, 2
+    SCHED_GROUP_BARRIER 2, 1, 2
+    SCHED_GROUP_BARRIER 16, 2, 2
+    SCHED_GROUP_BARRIER 2, 1, 2
+    S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_2048_async_loads
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_group_barrier_mask_2048_async_loads
+    ; CHECK: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF1]], [[DEF]], 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF3]], [[DEF2]], 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF1]], [[DEF3]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF5]], [[DEF4]], 32, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF7]], [[DEF6]], 48, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF5]], [[DEF7]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF9]], [[DEF8]], 64, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF11]], [[DEF10]], 80, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF9]], [[DEF11]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 1
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 1
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 1
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 1
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 1
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 1
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+    $exec = IMPLICIT_DEF
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vreg_64_align2 = IMPLICIT_DEF
+    %3:vreg_64_align2 = IMPLICIT_DEF
+    %4:vreg_64_align2 = IMPLICIT_DEF
+    %5:vreg_64_align2 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %10, %0, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %11, %1, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %12, %2, 32, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %13, %3, 48, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %14, %4, 64, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %15, %5, 80, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    %30:vgpr_32 = V_ADD_U32_e32 %10, %11, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e32 %12, %13, implicit $exec
+    %32:vgpr_32 = V_ADD_U32_e32 %14, %15, implicit $exec
+    SCHED_GROUP_BARRIER 2048, 2, 1
+    SCHED_GROUP_BARRIER 2, 1, 1
+    SCHED_GROUP_BARRIER 2048, 2, 1
+    SCHED_GROUP_BARRIER 2, 1, 1
+    SCHED_GROUP_BARRIER 2048, 2, 1
+    SCHED_GROUP_BARRIER 2, 1, 1
+    S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_16_async_loads
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_group_barrier_mask_16_async_loads
+    ; CHECK: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: $m0 = S_MOV_B32 1
+    ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF1]], [[DEF]], 0, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF3]], [[DEF2]], 16, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF1]], [[DEF5]], implicit $exec
+    ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF5]], [[DEF4]], 32, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF7]], [[DEF6]], 48, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[DEF9]], implicit $exec
+    ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF9]], [[DEF8]], 64, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: CLUSTER_LOAD_ASYNC_TO_LDS_B32 [[DEF11]], [[DEF10]], 80, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF7]], [[DEF11]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 3
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 3
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 3
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 3
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 3
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 3
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+    $exec = IMPLICIT_DEF
+    $m0 = S_MOV_B32 1
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vreg_64_align2 = IMPLICIT_DEF
+    %3:vreg_64_align2 = IMPLICIT_DEF
+    %4:vreg_64_align2 = IMPLICIT_DEF
+    %5:vreg_64_align2 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    CLUSTER_LOAD_ASYNC_TO_LDS_B32 %10, %0, 0, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    CLUSTER_LOAD_ASYNC_TO_LDS_B32 %11, %1, 16, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    CLUSTER_LOAD_ASYNC_TO_LDS_B32 %12, %2, 32, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    CLUSTER_LOAD_ASYNC_TO_LDS_B32 %13, %3, 48, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    CLUSTER_LOAD_ASYNC_TO_LDS_B32 %14, %4, 64, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    CLUSTER_LOAD_ASYNC_TO_LDS_B32 %15, %5, 80, 0, implicit-def dead $asynccnt, implicit $m0, implicit $exec, implicit $asynccnt
+    %30:vgpr_32 = V_ADD_U32_e32 %10, %12, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e32 %11, %14, implicit $exec
+    %32:vgpr_32 = V_ADD_U32_e32 %13, %15, implicit $exec
+    SCHED_GROUP_BARRIER 16, 2, 3
+    SCHED_GROUP_BARRIER 2, 1, 3
+    SCHED_GROUP_BARRIER 16, 2, 3
+    SCHED_GROUP_BARRIER 2, 1, 3
+    SCHED_GROUP_BARRIER 16, 2, 3
+    SCHED_GROUP_BARRIER 2, 1, 3
+    S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
+
+---
+name: sched_group_barrier_mask_2048_interleave_2load_1valu
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_group_barrier_mask_2048_interleave_2load_1valu
+    ; CHECK: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF1]], [[DEF]], 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF3]], [[DEF2]], 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF1]], [[DEF3]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF5]], [[DEF4]], 32, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF7]], [[DEF6]], 48, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF5]], [[DEF7]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF9]], [[DEF8]], 64, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 [[DEF11]], [[DEF10]], 80, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF9]], [[DEF11]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 4
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 4
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 4
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 4
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2048, 2, 4
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]], implicit [[V_ADD_U32_e32_1]], implicit [[V_ADD_U32_e32_2]]
+    $exec = IMPLICIT_DEF
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vreg_64_align2 = IMPLICIT_DEF
+    %3:vreg_64_align2 = IMPLICIT_DEF
+    %4:vreg_64_align2 = IMPLICIT_DEF
+    %5:vreg_64_align2 = IMPLICIT_DEF
+    %10:vgpr_32 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vgpr_32 = IMPLICIT_DEF
+    %13:vgpr_32 = IMPLICIT_DEF
+    %14:vgpr_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %10, %0, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %11, %1, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %12, %2, 32, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %13, %3, 48, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %14, %4, 64, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 %15, %5, 80, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+    %30:vgpr_32 = V_ADD_U32_e32 %10, %11, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e32 %12, %13, implicit $exec
+    %32:vgpr_32 = V_ADD_U32_e32 %14, %15, implicit $exec
+    SCHED_GROUP_BARRIER 2048, 2, 4
+    SCHED_GROUP_BARRIER 2, 1, 4
+    SCHED_GROUP_BARRIER 2048, 2, 4
+    SCHED_GROUP_BARRIER 2, 1, 4
+    SCHED_GROUP_BARRIER 2048, 2, 4
+    SCHED_GROUP_BARRIER 2, 1, 4
+    S_ENDPGM 0, implicit %30, implicit %31, implicit %32
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll b/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
index c20dbba42ccd4..6ce91deab8ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/sched.barrier.inverted.mask.ll
@@ -5,8 +5,8 @@
 
 
 
-; Inverted 1008: 01111110000
-; GCN: After Inverting, SchedGroup Mask: 1008
+; Inverted 3056: 101111110000
+; GCN: After Inverting, SchedGroup Mask: 3056
 define amdgpu_kernel void @invert1() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 1) #1
@@ -14,8 +14,8 @@ entry:
   ret void
 }
 
-; Inverted 2044: 11111111100
-; GCN: After Inverting, SchedGroup Mask: 2044
+; Inverted 4092: 111111111100
+; GCN: After Inverting, SchedGroup Mask: 4092
 define amdgpu_kernel void @invert2() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 2) #1
@@ -23,8 +23,8 @@ entry:
   ret void
 }
 
-; Inverted 2042: 11111111010
-; GCN: After Inverting, SchedGroup Mask: 2042
+; Inverted 4090: 111111111010
+; GCN: After Inverting, SchedGroup Mask: 4090
 define amdgpu_kernel void @invert4() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 4) #1
@@ -32,8 +32,8 @@ entry:
   ret void
 }
 
-; Inverted 2038: 11111110110
-; GCN: After Inverting, SchedGroup Mask: 2038
+; Inverted 4086: 111111110110
+; GCN: After Inverting, SchedGroup Mask: 4086
 define amdgpu_kernel void @invert8() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 8) #1
@@ -41,7 +41,7 @@ entry:
   ret void
 }
 
-; Inverted 1935: 11110001111
+; Inverted 1935: 011110001111
 ; GCN: After Inverting, SchedGroup Mask: 1935
 define amdgpu_kernel void @invert16() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 16) #1
@@ -50,8 +50,8 @@ entry:
   ret void
 }
 
-; Inverted 1999: 11111001111
-; GCN: After Inverting, SchedGroup Mask: 1999
+; Inverted 4047: 111111001111
+; GCN: After Inverting, SchedGroup Mask: 4047
 define amdgpu_kernel void @invert32() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 32) #1
@@ -59,8 +59,8 @@ entry:
   ret void
 }
 
-; Inverted 1967: 11110101111
-; GCN: After Inverting, SchedGroup Mask: 1967
+; Inverted 4015: 111110101111
+; GCN: After Inverting, SchedGroup Mask: 4015
 define amdgpu_kernel void @invert64() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 64) #1
@@ -68,8 +68,8 @@ entry:
   ret void
 }
 
-; Inverted 1151: 10001111111
-; GCN: After Inverting, SchedGroup Mask: 1151
+; Inverted 3199: 110001111111
+; GCN: After Inverting, SchedGroup Mask: 3199
 define amdgpu_kernel void @invert128() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 128) #1
@@ -77,8 +77,8 @@ entry:
   ret void
 }
 
-; Inverted 1663: 11001111111
-; GCN: After Inverting, SchedGroup Mask: 1663
+; Inverted 3711: 111001111111
+; GCN: After Inverting, SchedGroup Mask: 3711
 define amdgpu_kernel void @invert256() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 256) #1
@@ -86,8 +86,8 @@ entry:
   ret void
 }
 
-; Inverted 1407: 10101111111
-; GCN: After Inverting, SchedGroup Mask: 1407
+; Inverted 3455: 110101111111
+; GCN: After Inverting, SchedGroup Mask: 3455
 define amdgpu_kernel void @invert512() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 512) #1
@@ -95,8 +95,8 @@ entry:
   ret void
 }
 
-; Inverted 1022: 01111111110
-; GCN: After Inverting, SchedGroup Mask: 1022
+; Inverted 3070: 101111111110
+; GCN: After Inverting, SchedGroup Mask: 3070
 define amdgpu_kernel void @invert1024() #0 {
 entry:
   call void @llvm.amdgcn.sched.barrier(i32 1024) #1
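
Note for context (not part of the patch): from source code, the same shape the MIR tests enforce can be requested through the clang builtin that int_amdgcn_sched_group_barrier exposes. Below is a minimal, hypothetical HIP-style sketch; the kernel name and elided body are illustrative only. Mask 2048 (0x0800) selects the new LDSDMA group and mask 2 (0x0002) selects VALU, so each barrier pair asks the scheduler for two LDSDMA instructions followed by one VALU instruction in sync group 0.

// Hypothetical kernel sketch: mask 2048 = 0x0800 (LDSDMA), mask 2 = 0x0002 (VALU).
__global__ void ldsdma_pipeline_sketch(/* buffers elided */) {
  // ... LDS DMA loads and VALU arithmetic would go here ...
  __builtin_amdgcn_sched_group_barrier(2048, 2, 0); // group: next 2 LDSDMA instructions
  __builtin_amdgcn_sched_group_barrier(2, 1, 0);    // group: then 1 VALU instruction
  __builtin_amdgcn_sched_group_barrier(2048, 2, 0); // repeat the pair to extend the pipeline
  __builtin_amdgcn_sched_group_barrier(2, 1, 0);
}

Correspondingly, __builtin_amdgcn_sched_barrier(0x0800) lets only LDSDMA instructions be scheduled across a full scheduling barrier, per the AMDGPUUsage.rst addition above.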
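
The updated values in sched.barrier.inverted.mask.ll follow mechanically from invertSchedBarrierMask. As a cross-check, here is a small standalone C++ sketch, not the backend code itself: plain uint32_t stands in for the SchedGroupMask bitmask enum, and the complement is masked to ALL, which models LLVM's BitmaskEnum semantics. It reproduces the inversion rules including the new LDSDMA bit.

#include <cstdint>
#include <cstdio>

// Bit positions mirror SchedGroupMask in AMDGPUIGroupLP.cpp.
enum : uint32_t {
  ALU = 1u << 0, VALU = 1u << 1, SALU = 1u << 2, MFMA = 1u << 3,
  VMEM = 1u << 4, VMEM_READ = 1u << 5, VMEM_WRITE = 1u << 6,
  DS = 1u << 7, DS_READ = 1u << 8, DS_WRITE = 1u << 9,
  TRANS = 1u << 10, LDSDMA = 1u << 11,
  ALL = (1u << 12) - 1
};

// Model of invertSchedBarrierMask: complement the mask, then drop bits that
// are implied by (or imply) a bit that was cleared.
static uint32_t invert(uint32_t Mask) {
  uint32_t Inv = ~Mask & ALL;
  // ALU implies VALU, SALU, MFMA, TRANS; any of those implies ALU.
  if (!(Inv & ALU))
    Inv &= ~(VALU | SALU | MFMA | TRANS);
  else if (!(Inv & VALU) || !(Inv & SALU) || !(Inv & MFMA) || !(Inv & TRANS))
    Inv &= ~ALU;
  // VMEM implies VMEM_READ, VMEM_WRITE, LDSDMA, and vice versa.
  if (!(Inv & VMEM))
    Inv &= ~(VMEM_READ | VMEM_WRITE | LDSDMA);
  else if (!(Inv & VMEM_READ) || !(Inv & VMEM_WRITE) || !(Inv & LDSDMA))
    Inv &= ~VMEM;
  // DS implies DS_READ, DS_WRITE, and vice versa.
  if (!(Inv & DS))
    Inv &= ~(DS_READ | DS_WRITE);
  else if (!(Inv & DS_READ) || !(Inv & DS_WRITE))
    Inv &= ~DS;
  return Inv;
}

int main() {
  // Prints 3056 for mask 1, 1935 for mask 16, 3070 for mask 1024, and so on,
  // matching the updated "GCN: After Inverting" CHECK lines.
  for (uint32_t M = 1; M <= 1024; M <<= 1)
    printf("mask %4u -> inverted %4u\n", M, invert(M));
  return 0;
}

For example, mask 16 (VMEM) gives ~16 & 0xFFF = 4079, and clearing the implied VMEM_READ (32), VMEM_WRITE (64), and new LDSDMA (2048) bits leaves 1935, exactly the value that stays unchanged in @invert16; every other kernel's expectation gains or loses the 2048 bit through these implications, which is where 3056, 4092, 3199, etc. come from.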