Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7026,8 +7026,10 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;

// Pre-compute costs for instructions that are forced-scalar or profitable to
// scalarize. Their costs will be computed separately in the legacy cost
// model.
// scalarize. For most such instructions, their scalarization costs are
// accounted for here using the legacy cost model. However, some opcodes
// are excluded from these precomputed scalarization costs and are instead
// modeled later by the VPlan cost model (see UseVPlanCostModel below).
for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
continue;
Expand All @@ -7039,8 +7041,21 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
});
Cost += ForcedCost;
}

// Returns true when \p I is an integer division or remainder (SDiv, UDiv,
// SRem, URem). Such instructions are left out of the precomputed
// scalarization costs in the loop below.
auto UseVPlanCostModel = [](Instruction *I) -> bool {
  const unsigned Opcode = I->getOpcode();
  const bool IsDivision =
      Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
  const bool IsRemainder =
      Opcode == Instruction::SRem || Opcode == Instruction::URem;
  return IsDivision || IsRemainder;
};
for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
if (UseVPlanCostModel(Scalarized) ||
CostCtx.skipCostComputation(Scalarized, VF.isVector()))
continue;
CostCtx.SkipCostComputation.insert(Scalarized);
LLVM_DEBUG({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@ target triple = "aarch64--linux-gnu"

; This test checks that we correctly compute the scalarized operands for a
; user-specified vectorization factor when interleaving is disabled. We use
; -force-vector-interleave=1 to disable all interleaving calculations. A cost of
; 4 for %var4 indicates that we would scalarize its operand (%var3), giving
; %var4 a lower scalarization overhead.
; -force-vector-interleave=1 to disable all interleaving calculations.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
; COST: Cost of 5 for VF 2: REPLICATE ir<%var4> = udiv ir<%var2>, ir<%var3> (S->V)
;
;
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu"
; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
;
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp2>, ir<%tmp3> (S->V)
;
define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
entry:
Expand Down Expand Up @@ -135,8 +135,8 @@ for.end:
;
; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Cost of 5 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp3 = add nsw i32 %tmp2, %x
; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp2>, ir<%tmp3> (S->V)
;

define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
Expand Down Expand Up @@ -222,7 +222,7 @@ for.end:
; Cost of sdiv:
; (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
; Cost of udiv:
; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
; (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
; Cost of sub:
; (sub(2) + extractelement(4)) / 2 = 3
; Cost of store:
Expand All @@ -233,11 +233,11 @@ for.end:
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp5 = sub i32 %tmp4, %x
; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x>
; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp3> = sdiv ir<%tmp1>, ir<%tmp2>
; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp3>, ir<%tmp2>
;
define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
entry:
Expand Down
Loading