diff --git a/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp index f3d3c096d..76d5046c6 100644 --- a/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp @@ -140,4 +140,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Flush all dirty cache lines to HBM before kernel exit. dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // Invalidate our Handshake L1 line on exit so the next case on this core + // sees a fresh aicpu_ready=0 on its first load instead of an L1-resident 1 + // left over from this case (no rtDeviceReset between cases). + dcci(my_hank, SINGLE_CACHE_LINE); } diff --git a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp index aaf924361..c8c628664 100644 --- a/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -98,4 +98,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Flush all dirty cache lines to HBM before kernel exit. dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // Invalidate our Handshake L1 line on exit so the next case on this core + // sees a fresh aicpu_ready=0 on its first load instead of an L1-resident 1 + // left over from this case (no rtDeviceReset between cases).
+ dcci(my_hank, SINGLE_CACHE_LINE); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index e7be65fd0..ead0781e4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -145,4 +145,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Flush all dirty cache lines to HBM before kernel exit. dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // Invalidate our Handshake L1 line on exit so the next case on this core + // sees a fresh aicpu_ready=0 on its first load instead of an L1-resident 1 + // left over from this case (no rtDeviceReset between cases). + dcci(my_hank, SINGLE_CACHE_LINE); } diff --git a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp index 41fe1b48a..d2d4798af 100644 --- a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -98,4 +98,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Flush all dirty cache lines to HBM before kernel exit. dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // Invalidate our Handshake L1 line on exit so the next case on this core + // sees a fresh aicpu_ready=0 on its first load instead of an L1-resident 1 + // left over from this case (no rtDeviceReset between cases).
+ dcci(my_hank, SINGLE_CACHE_LINE); } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 019f9270e..4646f9107 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -145,4 +145,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Flush all dirty cache lines to HBM before kernel exit. dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // Invalidate our Handshake L1 line on exit so the next case on this core + // sees a fresh aicpu_ready=0 on its first load instead of an L1-resident 1 + // left over from this case (no rtDeviceReset between cases). + dcci(my_hank, SINGLE_CACHE_LINE); }