Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion helion/_compiler/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,18 @@ def autotune(
if bound_kernel.settings.autotune_effort == "none" and (
force or not bound_kernel.kernel.configs
):
config = bound_kernel.config_spec.default_config()
from ..autotuner.matmul_heuristics import (
matmul_heuristic_default_config_for_kernel,
)

config = (
matmul_heuristic_default_config_for_kernel(
bound_kernel,
args,
config_spec=bound_kernel.config_spec,
)
or bound_kernel.config_spec.default_config()
)
elif not force and bound_kernel.kernel.configs:
if len(bound_kernel.kernel.configs) == 1:
(config,) = bound_kernel.kernel.configs
Expand Down
64 changes: 52 additions & 12 deletions helion/autotuner/base_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
from .benchmark_provider import _unset_fn
from .benchmarking import interleaved_bench
from .logger import AutotuningLogger
from .matmul_heuristics import matmul_heuristic_seed_configs_for_kernel
from .matmul_heuristics import matmul_heuristics_supported_on_args
from .metrics import AutotuneMetrics
from .metrics import _run_post_autotune_hooks
from .precompile_future import PrecompileFuture as PrecompileFuture
Expand Down Expand Up @@ -687,6 +689,26 @@ def get_kwargs_from_profile(
**super().get_kwargs_from_profile(profile, settings),
}

def _heuristic_seed_configs(self, max_configs: int = 1) -> list[Config]:
    """Return up to *max_configs* matmul-heuristic seed configs.

    Args:
        max_configs: Upper bound on the number of heuristic configs requested.

    Returns:
        Heuristic seed configs for this kernel, or an empty list when the
        matmul heuristics do not apply to ``self.args``.
    """
    if matmul_heuristics_supported_on_args(self.args):
        return matmul_heuristic_seed_configs_for_kernel(
            self.kernel,
            self.args,
            config_spec=self.config_gen.config_spec,
            max_configs=max_configs,
        )
    return []

def _autotune_seed_configs_with_heuristics(self) -> list[Config]:
    """Heuristic seed configs (when available) followed by the regular seeds."""
    seeds = list(self._heuristic_seed_configs())
    seeds.extend(self._autotune_seed_configs())
    return seeds

def _random_population_flat_with_heuristics(self, n: int) -> list[FlatConfig]:
    """Generate ``n`` random flat configs, seeding with heuristic + user configs.

    Args:
        n: Size of the random population to generate.

    Returns:
        Flat configs produced by the config generator, seeded with the
        combined heuristic and autotune seed configs.
    """
    seed_configs = self._autotune_seed_configs_with_heuristics()
    return self.config_gen.random_population_flat(
        n,
        user_seed_configs=seed_configs,
        log_func=self.log,
    )

@property
def best(self) -> PopulationMember:
"""
Expand Down Expand Up @@ -778,24 +800,43 @@ def _generate_best_available_population_flat(self) -> list[FlatConfig]:
Generate initial population using default config, explicit seed configs,
and cached configs.

Always starts with the default configuration, then adds up to
Starts with a matching heuristic config when available, otherwise starts
with the default configuration, then adds up to
MAX_BEST_AVAILABLE_CONFIGS matching cached configs from previous runs.
Explicit seed configs provided by the caller are added ahead of cached
configs and are not suppressed by cache-skip settings. No random configs
are added. Duplicate configs are discarded.

Returns:
A list of unique FlatConfig values for the initial population.
Minimum size is 1 (just default), plus any valid unique explicit
seed configs and up to autotune_best_available_max_configs cached
configs.
A list of unique FlatConfig values for the initial population. Minimum
size is 1 (heuristic or default), plus any valid unique explicit seed
configs and up to autotune_best_available_max_configs cached configs.
"""
# Always start with the default config
default_flat = self.config_gen.default_flat()
default_config = self.config_gen.unflatten(default_flat)
seen: set[Config] = {default_config}
result: list[FlatConfig] = [default_flat]
self.log("Starting with default config")
max_configs = self.settings.autotune_best_available_max_configs
matmul_heuristic_configs = self._heuristic_seed_configs(max_configs=max_configs)

seen: set[Config] = set()
result: list[FlatConfig] = []
for i, config in enumerate(matmul_heuristic_configs):
try:
flat = self.config_gen.flatten(config)
transferred_config = self.config_gen.unflatten(flat)
seen.add(transferred_config)
result.append(flat)
self.log(
f"Starting with matmul heuristic config {i + 1}: "
f"{transferred_config}"
)
break
except (ValueError, TypeError, KeyError, AssertionError) as e:
self.log(f"Failed to transfer matmul initial config: {e}")

if not result:
default_flat = self.config_gen.default_flat()
default_config = self.config_gen.unflatten(default_flat)
seen.add(default_config)
result.append(default_flat)
self.log("Starting with default config")

# User seed configs are explicit requests, so try them before compiler-owned
# seeds and cached configs while still deduplicating normalized configs.
Expand Down Expand Up @@ -825,7 +866,6 @@ def _generate_best_available_population_flat(self) -> list[FlatConfig]:
except (ValueError, TypeError, KeyError, AssertionError) as e:
self.log(f"Failed to transfer explicit seed config: {e}")

max_configs = self.settings.autotune_best_available_max_configs
cached_entries = self._find_similar_cached_configs(max_configs)

if cached_entries:
Expand Down
6 changes: 1 addition & 5 deletions helion/autotuner/differential_evolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,7 @@ def _generate_initial_population_flat(self) -> list[FlatConfig]:
return pop[:target]
return pop

return self.config_gen.random_population_flat(
self.population_size * 2,
user_seed_configs=self._autotune_seed_configs(),
log_func=self.log,
)
return self._random_population_flat_with_heuristics(self.population_size * 2)

def initial_two_generations(self) -> None:
# The initial population is 2x larger so we can throw out the slowest half and give the tuning process a head start
Expand Down
Loading
Loading