diff --git a/helion/autotuner/config_fragment.py b/helion/autotuner/config_fragment.py
index f9d75aea7..ad3d916bf 100644
--- a/helion/autotuner/config_fragment.py
+++ b/helion/autotuner/config_fragment.py
@@ -247,6 +247,32 @@ def differential_mutation(self, a: object, b: object, c: object) -> int:
         return a
 
 
+class DefaultBiasedIntegerFragment(IntegerFragment):
+    """Integer fragment whose ``random()`` is biased toward its default.
+
+    With probability ``default_probability`` a draw returns ``self.default()``;
+    otherwise it defers to ``IntegerFragment.random()`` unfiltered (presumably
+    that draw can still land on the default -- confirm against the base class).
+    """
+
+    def __init__(
+        self,
+        low: int,
+        high: int,
+        default_val: int | None = None,
+        *,
+        default_probability: float,
+    ) -> None:
+        super().__init__(low, high, default_val)
+        self.default_probability = default_probability
+
+    def random(self) -> int:
+        """Return the default with ``default_probability``, else a base draw."""
+        if random.random() < self.default_probability:
+            return self.default()
+        return super().random()
+
+
 @dataclasses.dataclass
 class EnumFragment(ConfigSpecFragment):
     choices: tuple[object, ...]
@@ -285,6 +311,28 @@ def encode(self, value: object) -> list[float]:
         return [1.0 if i == choice_idx else 0.0 for i in range(len(self.choices))]
 
 
+@dataclasses.dataclass
+class DefaultBiasedEnumFragment(EnumFragment):
+    """Enum fragment whose ``random()`` is biased toward the default choice."""
+
+    # Probability that random() returns self.default().
+    default_probability: float
+
+    def random(self) -> object:
+        """Return the default with ``default_probability``, else another choice.
+
+        Falls back to the default when it is the only choice, because
+        ``random.choice`` raises ``IndexError`` on an empty sequence.
+        """
+        default = self.default()
+        if random.random() < self.default_probability:
+            return default
+        alternatives = [choice for choice in self.choices if choice != default]
+        if not alternatives:
+            return default
+        return random.choice(alternatives)
+
+
 class BooleanFragment(ConfigSpecFragment):
     def default(self) -> bool:
         return False
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
index f4fffa9a0..8b9979c98 100644
--- a/helion/autotuner/config_spec.py
+++ b/helion/autotuner/config_spec.py
@@ -119,6 +119,8 @@
 from .config_fragment import BlockSizeFragment
 from .config_fragment import BooleanFragment
 from .config_fragment import ConfigSpecFragment
+from .config_fragment import DefaultBiasedEnumFragment
+from .config_fragment import DefaultBiasedIntegerFragment
 from .config_fragment import EnumFragment
 from .config_fragment import IntegerFragment
 from 
.config_fragment import ListOf
@@ -347,6 +349,10 @@ def _get_backend_tunable_keys() -> frozenset[str]:
 EPILOGUE_SUBTILE_DEFAULT_CHOICES = (None, 2)
 EPILOGUE_SUBTILE_MIN_K_HINT = 1024
 EPILOGUE_SUBTILE_MIN_K_HINT_EXTENDED = 16384
+# random() default-sampling probabilities used by RangeUnrollFactorSpec and
+# RangeNumStagesSpec (int) and by RangeWarpSpecializeSpec (enum).
+RANGE_INT_RANDOM_DEFAULT_PROBABILITY = 0.95
+RANGE_WARP_SPECIALIZE_RANDOM_DEFAULT_PROBABILITY = 0.90
 # maxnreg values: None means no limit, otherwise limit to this many registers per thread
 # Lower values allow higher occupancy but may hurt performance for register-heavy kernels
 VALID_MAXNREG = (None, 32, 64, 128, 256)
@@ -3056,16 +3062,33 @@ def _fill_missing(self) -> None:
 
 class RangeUnrollFactorSpec(_OptionalIntSpec):
     def _fragment(self, base: ConfigSpec) -> IntegerFragment:
-        return IntegerFragment(0, 4, 0)
+        # Same 0..4 range and default 0 as before; only random() is biased.
+        return DefaultBiasedIntegerFragment(
+            low=0,
+            high=4,
+            default_val=0,
+            default_probability=RANGE_INT_RANDOM_DEFAULT_PROBABILITY,
+        )
 
 
 class RangeWarpSpecializeSpec(_OptionalBoolSpec):
-    pass
+    def _fragment(self, base: ConfigSpec) -> EnumFragment:
+        # Tri-state knob: None, False or True.
+        return DefaultBiasedEnumFragment(
+            choices=(None, False, True),
+            default_probability=RANGE_WARP_SPECIALIZE_RANDOM_DEFAULT_PROBABILITY,
+        )
 
 
 class RangeNumStagesSpec(_OptionalIntSpec):
     def _fragment(self, base: ConfigSpec) -> IntegerFragment:
-        return IntegerFragment(0, 4, 0)
+        # Same 0..4 range and default 0 as before; only random() is biased.
+        return DefaultBiasedIntegerFragment(
+            low=0,
+            high=4,
+            default_val=0,
+            default_probability=RANGE_INT_RANDOM_DEFAULT_PROBABILITY,
+        )
 
 
 class RangeMultiBufferSpec(_OptionalBoolSpec):
diff --git a/test/test_autotuner.expected b/test/test_autotuner.expected
index ffb5baf75..39094ff42 100644
--- a/test/test_autotuner.expected
+++ b/test/test_autotuner.expected
@@ -3,39 +3,39 @@ Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environmen
 
 --- assertExpectedJournal(TestAutotuner.test_config_fragment0)
 helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None, 
None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) -helion.Config(atomic_indexing=[], block_sizes=[16, 32, 16], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[8], load_eviction_policies=['last', 'first'], loop_orders=[[1, 0]], num_stages=8, num_warps=8, pid_type='flat', range_flattens=[None, True], range_multi_buffers=[None, True], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, True]) -helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[1], load_eviction_policies=['first', 'last'], loop_orders=[[1, 0]], maxnreg=64, num_sm_multiplier=4, num_stages=8, num_warps=32, pid_type='persistent_interleaved', range_flattens=[None, False], range_multi_buffers=[None, None], range_num_stages=[0, 4], range_unroll_factors=[0, 1], range_warp_specializes=[False, False]) -helion.Config(atomic_indexing=[], block_sizes=[32, 64, 32], indexing=['pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[16], load_eviction_policies=['last', 'first'], loop_orders=[[0, 1]], num_sm_multiplier=2, num_stages=3, num_warps=32, pid_type='persistent_interleaved', range_flattens=[None, True], range_multi_buffers=[False, False], range_num_stages=[4, 1], range_unroll_factors=[0, 1], range_warp_specializes=[True, None]) -helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['last', 'first'], loop_orders=[[0, 1]], maxnreg=128, num_sm_multiplier=4, num_stages=4, num_warps=4, pid_type='persistent_interleaved', range_flattens=[False, False], range_multi_buffers=[False, False], range_num_stages=[0, 2], range_unroll_factors=[0, 4], range_warp_specializes=[True, None]) -helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['pointer', 'tensor_descriptor', 
'tensor_descriptor'], l2_groupings=[4], load_eviction_policies=['', 'last'], loop_orders=[[0, 1]], maxnreg=64, num_sm_multiplier=32, num_stages=5, num_warps=4, pid_type='persistent_blocked', range_flattens=[None, True], range_multi_buffers=[False, False], range_num_stages=[3, 4], range_unroll_factors=[3, 2], range_warp_specializes=[None, None]) -helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[32], load_eviction_policies=['first', ''], loop_orders=[[1, 0]], maxnreg=64, num_sm_multiplier=4, num_stages=5, num_warps=4, pid_type='persistent_interleaved', range_flattens=[None, True], range_multi_buffers=[True, False], range_num_stages=[1, 4], range_unroll_factors=[3, 3], range_warp_specializes=[None, False]) -helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['pointer', 'pointer', 'tensor_descriptor'], l2_groupings=[4], load_eviction_policies=['last', 'first'], loop_orders=[[0, 1]], maxnreg=128, num_sm_multiplier=64, num_stages=6, num_warps=2, pid_type='persistent_interleaved', range_flattens=[None, True], range_multi_buffers=[None, None], range_num_stages=[2, 0], range_unroll_factors=[0, 0], range_warp_specializes=[True, None]) -helion.Config(atomic_indexing=[], block_sizes=[16, 16, 32], indexing=['tensor_descriptor', 'pointer', 'tensor_descriptor'], l2_groupings=[16], load_eviction_policies=['', 'last'], loop_orders=[[1, 0]], num_stages=2, num_warps=2, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 3], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) -helion.Config(atomic_indexing=[], block_sizes=[32, 64, 16], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[64], load_eviction_policies=['last', 'last'], loop_orders=[[0, 1]], num_stages=4, num_warps=32, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 1], range_unroll_factors=[0, 2], 
range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[8], load_eviction_policies=['first', ''], loop_orders=[[1, 0]], num_sm_multiplier=64, num_stages=3, num_warps=2, pid_type='persistent_interleaved', range_flattens=[False, None], range_multi_buffers=[False, False], range_num_stages=[0, 3], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[16], load_eviction_policies=['', 'last'], loop_orders=[[1, 0]], num_stages=5, num_warps=2, pid_type='flat', range_flattens=[None, True], range_multi_buffers=[None, True], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[32, 32, 16], indexing=['pointer', 'pointer', 'tensor_descriptor'], l2_groupings=[4], load_eviction_policies=['first', 'first'], loop_orders=[[0, 1]], maxnreg=64, num_sm_multiplier=16, num_stages=2, num_warps=4, pid_type='persistent_blocked', range_flattens=[True, None], range_multi_buffers=[False, True], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[64], load_eviction_policies=['first', ''], loop_orders=[[0, 1]], maxnreg=256, num_sm_multiplier=32, num_stages=7, num_warps=4, pid_type='persistent_blocked', range_flattens=[None, False], range_multi_buffers=[True, True], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[64], load_eviction_policies=['last', 'last'], loop_orders=[[1, 0]], maxnreg=128, 
num_sm_multiplier=8, num_stages=7, num_warps=8, pid_type='persistent_blocked', range_flattens=[False, False], range_multi_buffers=[None, True], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[16, 16, 16], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[2], load_eviction_policies=['last', 'first'], loop_orders=[[1, 0]], maxnreg=256, num_sm_multiplier=8, num_stages=1, num_warps=4, pid_type='persistent_interleaved', range_flattens=[True, True], range_multi_buffers=[True, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[128, 16, 32], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[32], load_eviction_policies=['', 'first'], loop_orders=[[1, 0]], num_stages=7, num_warps=2, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, True]) +helion.Config(atomic_indexing=[], block_sizes=[32, 16, 16], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[2], load_eviction_policies=['last', 'last'], loop_orders=[[1, 0]], num_stages=4, num_warps=2, pid_type='flat', range_flattens=[None, True], range_multi_buffers=[None, None], range_num_stages=[0, 3], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) +helion.Config(atomic_indexing=[], block_sizes=[16, 16, 64], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[8], load_eviction_policies=['', ''], loop_orders=[[1, 0]], maxnreg=64, num_sm_multiplier=8, num_stages=2, num_warps=16, pid_type='persistent_interleaved', range_flattens=[False, None], range_multi_buffers=[True, False], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None]) --- assertExpectedJournal(TestAutotuner.test_config_fragment1) helion.Config(atomic_indexing=[], 
block_sizes=[8, 16, 16], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[4, 256, 128], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'tensor_descriptor'], l2_groupings=[64], load_eviction_policies=['first', ''], loop_orders=[[2, 1, 0]], maxnreg=32, num_sm_multiplier=16, num_stages=1, num_warps=8, pid_type='persistent_blocked', range_flattens=[True], range_multi_buffers=[None], range_unroll_factors=[1], range_warp_specializes=[False]) -helion.Config(atomic_indexing=[], block_sizes=[1, 128, 32], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[4], load_eviction_policies=['first', 'last'], loop_orders=[[2, 1, 0]], num_stages=5, num_warps=2, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[1, 64, 512], flatten_loops=[True], indexing=['pointer', 'pointer', 'tensor_descriptor'], l2_groupings=[2], load_eviction_policies=['last', 'last'], loop_orders=[[1, 0, 2]], maxnreg=32, num_sm_multiplier=8, num_stages=5, num_warps=2, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[1, 32, 256], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['last', 'first'], loop_orders=[[2, 0, 1]], maxnreg=64, num_sm_multiplier=1, num_stages=3, num_warps=16, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[False], range_unroll_factors=[3], 
range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[4, 64, 32], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[1], load_eviction_policies=['first', 'first'], loop_orders=[[2, 0, 1]], maxnreg=128, num_sm_multiplier=8, num_stages=2, num_warps=8, pid_type='persistent_interleaved', range_flattens=[False], range_multi_buffers=[True], range_unroll_factors=[4], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[2, 16, 128], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[16], load_eviction_policies=['', 'last'], loop_orders=[[2, 1, 0]], maxnreg=64, num_sm_multiplier=16, num_stages=8, num_warps=8, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[None], range_unroll_factors=[2], range_warp_specializes=[False]) -helion.Config(atomic_indexing=[], block_sizes=[1, 32, 16], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[8], load_eviction_policies=['first', ''], loop_orders=[[2, 1, 0]], num_stages=5, num_warps=2, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[1, 16, 16], flatten_loops=[False], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[32], load_eviction_policies=['last', 'first'], loop_orders=[[0, 1, 2]], maxnreg=128, num_sm_multiplier=64, num_stages=4, num_warps=1, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[None], range_unroll_factors=[2], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[4, 16, 128], flatten_loops=[True], indexing=['pointer', 'pointer', 'tensor_descriptor'], l2_groupings=[1], load_eviction_policies=['', 'last'], loop_orders=[[1, 0, 2]], num_stages=2, num_warps=16, pid_type='flat', 
range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[2, 128, 64], flatten_loops=[True], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[64], load_eviction_policies=['', ''], loop_orders=[[2, 1, 0]], maxnreg=256, num_sm_multiplier=32, num_stages=7, num_warps=1, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 256, 64], flatten_loops=[True], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[4], load_eviction_policies=['', 'first'], loop_orders=[[2, 1, 0]], maxnreg=128, num_sm_multiplier=128, num_stages=2, num_warps=8, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 32, 128], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[16], load_eviction_policies=['last', ''], loop_orders=[[1, 0, 2]], maxnreg=256, num_sm_multiplier=2, num_stages=7, num_warps=8, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 16, 16], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[1], load_eviction_policies=['last', 'first'], loop_orders=[[1, 0, 2]], num_sm_multiplier=8, num_stages=5, num_warps=4, pid_type='persistent_interleaved', range_flattens=[False], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 64, 16], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[8], load_eviction_policies=['first', 
''], loop_orders=[[1, 0, 2]], maxnreg=256, num_sm_multiplier=32, num_stages=7, num_warps=4, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[4, 16, 256], flatten_loops=[True], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[64], load_eviction_policies=['first', ''], loop_orders=[[1, 2, 0]], maxnreg=256, num_sm_multiplier=4, num_stages=8, num_warps=4, pid_type='persistent_blocked', range_flattens=[True], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 16, 16], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[2], load_eviction_policies=['last', 'last'], loop_orders=[[1, 2, 0]], maxnreg=64, num_sm_multiplier=1, num_stages=8, num_warps=1, pid_type='persistent_interleaved', range_flattens=[False], range_multi_buffers=[False], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 32, 16], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[32], load_eviction_policies=['last', ''], loop_orders=[[2, 0, 1]], maxnreg=256, num_sm_multiplier=128, num_stages=7, num_warps=8, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[False], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[1, 64, 16], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[2], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=6, num_warps=32, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) --- 
assertExpectedJournal(TestAutotuner.test_config_warp_specialize_unroll) helion.Config(atomic_indexing=[], block_sizes=[8, 16, 16], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[4, 256, 128], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'tensor_descriptor'], l2_groupings=[64], load_eviction_policies=['first', ''], loop_orders=[[2, 1, 0]], maxnreg=32, num_sm_multiplier=16, num_stages=1, num_warps=8, pid_type='persistent_blocked', range_flattens=[True], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[1, 128, 32], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[4], load_eviction_policies=['first', 'last'], loop_orders=[[2, 1, 0]], num_stages=5, num_warps=2, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[1, 64, 512], flatten_loops=[True], indexing=['pointer', 'pointer', 'tensor_descriptor'], l2_groupings=[2], load_eviction_policies=['last', 'last'], loop_orders=[[1, 0, 2]], maxnreg=32, num_sm_multiplier=8, num_stages=5, num_warps=2, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[1, 32, 256], flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['last', 'first'], loop_orders=[[2, 0, 1]], maxnreg=64, num_sm_multiplier=1, num_stages=3, num_warps=16, 
pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[False], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[4, 64, 32], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[1], load_eviction_policies=['first', 'first'], loop_orders=[[2, 0, 1]], maxnreg=128, num_sm_multiplier=8, num_stages=2, num_warps=8, pid_type='persistent_interleaved', range_flattens=[False], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[2, 16, 128], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[16], load_eviction_policies=['', 'last'], loop_orders=[[2, 1, 0]], maxnreg=64, num_sm_multiplier=16, num_stages=8, num_warps=8, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[1, 32, 16], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[8], load_eviction_policies=['first', ''], loop_orders=[[2, 1, 0]], num_stages=5, num_warps=2, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) -helion.Config(atomic_indexing=[], block_sizes=[1, 16, 16], flatten_loops=[False], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[32], load_eviction_policies=['last', 'first'], loop_orders=[[0, 1, 2]], maxnreg=128, num_sm_multiplier=64, num_stages=4, num_warps=1, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[True]) -helion.Config(atomic_indexing=[], block_sizes=[4, 16, 128], flatten_loops=[True], indexing=['pointer', 'pointer', 'tensor_descriptor'], l2_groupings=[1], 
load_eviction_policies=['', 'last'], loop_orders=[[1, 0, 2]], num_stages=2, num_warps=16, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None]) +helion.Config(atomic_indexing=[], block_sizes=[2, 128, 64], flatten_loops=[True], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[64], load_eviction_policies=['', ''], loop_orders=[[2, 1, 0]], maxnreg=256, num_sm_multiplier=32, num_stages=7, num_warps=1, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 256, 64], flatten_loops=[True], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[4], load_eviction_policies=['', 'first'], loop_orders=[[2, 1, 0]], maxnreg=128, num_sm_multiplier=128, num_stages=2, num_warps=8, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 32, 128], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[16], load_eviction_policies=['last', ''], loop_orders=[[1, 0, 2]], maxnreg=256, num_sm_multiplier=2, num_stages=7, num_warps=8, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 16, 16], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[1], load_eviction_policies=['last', 'first'], loop_orders=[[1, 0, 2]], maxnreg=None, num_sm_multiplier=8, num_stages=5, num_warps=4, pid_type='persistent_interleaved', range_flattens=[False], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 64, 16], 
flatten_loops=[True], indexing=['tensor_descriptor', 'pointer', 'pointer'], l2_groupings=[8], load_eviction_policies=['first', ''], loop_orders=[[1, 0, 2]], maxnreg=256, num_sm_multiplier=32, num_stages=7, num_warps=4, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[4, 16, 256], flatten_loops=[True], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[64], load_eviction_policies=['first', ''], loop_orders=[[1, 2, 0]], maxnreg=256, num_sm_multiplier=4, num_stages=8, num_warps=4, pid_type='persistent_blocked', range_flattens=[True], range_multi_buffers=[None], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 16, 16], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[2], load_eviction_policies=['last', 'last'], loop_orders=[[1, 2, 0]], maxnreg=64, num_sm_multiplier=1, num_stages=8, num_warps=1, pid_type='persistent_interleaved', range_flattens=[False], range_multi_buffers=[False], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 32, 16], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[32], load_eviction_policies=['last', ''], loop_orders=[[2, 0, 1]], maxnreg=256, num_sm_multiplier=128, num_stages=7, num_warps=8, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[False], range_unroll_factors=[0], range_warp_specializes=[True]) +helion.Config(atomic_indexing=[], block_sizes=[1, 64, 16], flatten_loops=[False], indexing=['pointer', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[2], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=6, num_warps=32, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], 
range_unroll_factors=[0], range_warp_specializes=[None])
 
 --- assertExpectedJournal(TestAutotuner.test_save_load_config)
 {
diff --git a/test/test_autotuner.py b/test/test_autotuner.py
index de7ba4cca..40e6fdc35 100644
--- a/test/test_autotuner.py
+++ b/test/test_autotuner.py
@@ -52,6 +52,8 @@
 from helion.autotuner.base_search import PopulationMember
 from helion.autotuner.benchmark_provider import LocalBenchmarkProvider
 from helion.autotuner.config_fragment import BooleanFragment
+from helion.autotuner.config_fragment import DefaultBiasedEnumFragment
+from helion.autotuner.config_fragment import DefaultBiasedIntegerFragment
 from helion.autotuner.config_fragment import EnumFragment
 from helion.autotuner.config_fragment import IntegerFragment
 from helion.autotuner.config_fragment import ListOf
@@ -1106,6 +1108,21 @@ def test_pattern_search_neighbor_values(self):
             ["a", "c"],
         )
 
+    def test_default_biased_fragments_keep_non_default_neighbors(self):
+        # With probability 1.0, random() must always return the default, while
+        # pattern_neighbors() must still expose the non-default values.
+        biased_int = DefaultBiasedIntegerFragment(
+            low=0, high=4, default_val=0, default_probability=1.0
+        )
+        biased_enum = DefaultBiasedEnumFragment(
+            choices=(None, False, True), default_probability=1.0
+        )
+
+        self.assertEqual(biased_int.random(), 0)
+        self.assertEqual(biased_int.pattern_neighbors(0), [1])
+        self.assertIsNone(biased_enum.random())
+        self.assertEqual(biased_enum.pattern_neighbors(None), [False, True])
+
     def test_pattern_search_neighbor_values_radius(self):
         # PowerOfTwoFragment: radius=2 should return 2 steps in exponent space
         self.assertEqual(