Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions helion/_compiler/tile_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import dataclasses
import functools
import itertools
import math
import operator
from typing import TYPE_CHECKING
from typing import NamedTuple
Expand Down Expand Up @@ -297,31 +296,6 @@ def get_tl_range_kwargs(config: Config, block_idx: int) -> list[str]:
range_num_stages = env.config_spec.range_num_stages.config_get(
config.range_num_stages, block_idx, 0
)
num_stages = config.num_stages

if "tensor_descriptor" in config.indexing:
# Tensor descriptor + multi-stage pipelines in addition to unrolling tend to cause
# CUDA "misaligned address" or "unspecified launch failure" errors.
if range_num_stages > 0:
range_num_stages = 0
if range_unroll_factor > 0 and num_stages > 1:
range_unroll_factor = 0
elif (
range_num_stages > 1
and range_unroll_factor > 1
and env.block_sizes[block_idx].size
and env.block_sizes[block_idx].numel.is_number
):
# Unrolling can cause CUDA IMA with pipelining
# We want to ensure new step size + pipeline is within bounds
loop_numel = int(env.block_sizes[block_idx].numel)
block_size = int(env.block_sizes[block_idx].from_config_assert(config))
step = range_unroll_factor * block_size
last_offset = ((loop_numel - 1) // block_size) * block_size
remainder = loop_numel - last_offset
range_num_stages = min(
max(1, int(math.ceil(remainder / step))), range_num_stages
)

if range_unroll_factor > 0:
kwargs.append(f"loop_unroll_factor={range_unroll_factor}")
Expand Down
44 changes: 0 additions & 44 deletions test/test_loops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,50 +1397,6 @@ def three_pass_kernel(x: torch.Tensor) -> torch.Tensor:

torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)

@patch.object(_compat, "_supports_tensor_descriptor", lambda: False)
@skipIfTileIR("tileir backend will ignore `range_unroll_factors` hint")
@skipIfNotTriton("range loop hints are Triton-specific")
@skipIfXPU("Accuracy issue on XPU backend")
def test_unroll_with_pipelining(self):
    """Check a tiled matmul that requests both loop unrolling and pipelining.

    The kernel is compiled with ``range_unroll_factors=[4, 4]`` and
    ``range_num_stages=[4, 2]`` while ``_supports_tensor_descriptor`` is
    patched to return ``False``.  The test verifies that the output matches
    ``torch.matmul`` within tolerance and that the generated code contains
    ``num_stages=1``.
    """

    @helion.kernel(static_shapes=True)
    def matmul(
        x: torch.Tensor,
        y: torch.Tensor,
    ) -> torch.Tensor:
        m, k = x.size()
        k2, n = y.size()
        assert k == k2, f"size mismatch {k} != {k2}"
        out = torch.empty(
            [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
        )
        for tile_m, tile_n in hl.tile([m, n]):
            acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
            for tile_k in hl.tile(k):
                acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
            out[tile_m, tile_n] = acc
        return out

    # Square bfloat16 operands; lhs is drawn before rhs to keep the RNG order.
    side = 256
    lhs = torch.randn(side, side, device=DEVICE, dtype=torch.bfloat16)
    rhs = torch.randn(side, side, device=DEVICE, dtype=torch.bfloat16)

    # Configuration under test: unrolling and multi-stage pipelining together.
    config_kwargs = {
        "block_sizes": [64, 16, 16],
        "indexing": "block_ptr",
        "loop_orders": [[1, 0]],
        "pid_type": "persistent_blocked",
        "range_num_stages": [4, 2],
        "range_unroll_factors": [4, 4],
    }
    generated, actual = code_and_output(matmul, (lhs, rhs), **config_kwargs)

    torch.testing.assert_close(
        actual, torch.matmul(lhs, rhs), atol=1e-2, rtol=1e-2
    )

    # The compiler's adjustment of num_stages / unroll factors is expected to
    # leave a ``num_stages=1`` hint in the emitted code.
    self.assertIn("num_stages=1", generated)

def test_loop_with_symbolic_bounds(self):
@helion.kernel(
config=helion.Config(
Expand Down
9 changes: 0 additions & 9 deletions test/test_tensor_descriptor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import re
import unittest

import torch
Expand Down Expand Up @@ -303,14 +302,6 @@ def jsd_forward_kernel(

torch.testing.assert_close(loss, baseline_loss, rtol=5e-2, atol=5e-3)
self.assertIn(get_tensor_descriptor_fn_name(), code)
range_stage_values = [
int(match)
for line in code.splitlines()
if "tl.range" in line
for match in re.findall(r"num_stages=(\d+)", line)
]
# range_num_stages=4 is clamped to 0, so doesn't show up as num_stages in the tl.range call
self.assertEqual(len(range_stage_values), 0)

@skipUnlessTensorDescriptor("Tensor descriptor support is required")
def test_tiny_matmul_tile_fallback(self) -> None:
Expand Down
Loading