diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c34781bb7447..758341d27aa77 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7026,8 +7026,10 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, return Cost; // Pre-compute costs for instructions that are forced-scalar or profitable to - // scalarize. Their costs will be computed separately in the legacy cost - // model. + // scalarize. Their scalarization costs are accounted for here using the + // legacy cost model. The exception is the profitable-to-scalarize opcodes + // matched by UseVPlanCostModel below (sdiv, udiv, srem, urem): they are + // skipped here and left to be costed later by the VPlan cost model. for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) { if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector())) continue; @@ -7039,8 +7041,21 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, }); Cost += ForcedCost; } + + auto UseVPlanCostModel = [](Instruction *I) -> bool { + switch (I->getOpcode()) { + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return true; + default: + return false; + } + }; for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) { - if (CostCtx.skipCostComputation(Scalarized, VF.isVector())) + if (UseVPlanCostModel(Scalarized) || + CostCtx.skipCostComputation(Scalarized, VF.isVector())) continue; CostCtx.SkipCostComputation.insert(Scalarized); LLVM_DEBUG({ diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll index 1f3949172b758..ba8a4c735b94d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -8,12 +8,10 @@ target triple = 
"aarch64--linux-gnu" ; This test checks that we correctly compute the scalarized operands for a ; user-specified vectorization factor when interleaving is disabled. We use -; -force-vector-interleave=1 to disable all interleaving calculations. A cost of -; 4 for %var4 indicates that we would scalarize it's operand (%var3), giving -; %var4 a lower scalarization overhead. +; -force-vector-interleave=1 to disable all interleaving calculations. ; ; COST-LABEL: predicated_udiv_scalarized_operand -; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3 +; COST: Cost of 5 for VF 2: REPLICATE ir<%var4> = udiv ir<%var2>, ir<%var3> (S->V) ; ; define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll index d84a6e27e5473..b9b91be9b7a65 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu" ; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 ; ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp2>, ir<%tmp3> (S->V) ; define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) { entry: @@ -135,8 +135,8 @@ for.end: ; ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Cost of 5 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3 ; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp2>, ir<%tmp3> (S->V) ; define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { @@ -222,7 +222,7 @@ for.end: ; 
Cost of sdiv: ; (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 ; Cost of udiv: -; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 +; (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5 ; Cost of sub: ; (sub(2) + extractelement(4)) / 2 = 3 ; Cost of store: @@ -233,11 +233,11 @@ for.end: ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x ; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4 -; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2 ; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4 ; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp5 = sub i32 %tmp4, %x ; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x> +; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp3> = sdiv ir<%tmp1>, ir<%tmp2> +; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp3>, ir<%tmp2> ; define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) { entry: