diff --git a/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
index b313dbe41..930c12a72 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
+++ b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
@@ -78,9 +78,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     // A/B layout: [num_groups, grid_k, incore_loop, tile_size, tile_size]
     // C layout: [incore_loop * num_groups, tile_size, tile_size]
-    for (int group_idx = 0; group_idx < num_groups; group_idx++) {
+    PTO2_PARALLEL_FOR(group_idx, num_groups) {
         PTO2_SCOPE_GUARD();
-
         uint32_t c_elem_offset = static_cast<uint32_t>(static_cast<uint64_t>(group_idx) * group_tile_elems);
         uint32_t c_view_offsets[1] = {c_elem_offset};
         Tensor C_view = ext_C.view(group_shapes, c_view_offsets);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index e025c91ba..a126f33ea 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -136,6 +136,11 @@ typedef struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 } PTO2RuntimeOps;
 
 /**
@@ -255,6 +260,21 @@ static inline void pto2_rt_scope_end() {
     rt->ops->scope_end(rt);
 }
 
+static inline void pto2_rt_parallel_for_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_begin(rt);
+}
+
+static inline void pto2_rt_parallel_iter_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_iter_begin(rt);
+}
+
+static inline void pto2_rt_parallel_for_end() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_end(rt);
+}
+
 static inline void pto2_rt_orchestration_done() {
     PTO2Runtime *rt = pto2_current_runtime();
     rt->ops->orchestration_done(rt);
@@ -381,6 +401,41 @@ class PTO2ScopeGuard {
  */
 #define PTO2_SCOPE() if (PTO2_SCOPE_GUARD(); true)
 
+/**
+ * Combined RAII guard + loop controller for PTO2_PARALLEL_FOR.
+ * Construction calls parallel_for_begin; destruction calls parallel_for_end.
+ * next() drives per-iteration parallel_iter_begin bookkeeping.
+ */
+class PTO2ParallelForLoop {
+public: // NOLINT(whitespace/indent)
+    explicit PTO2ParallelForLoop(int count) :
+        rt_(pto2_current_runtime()),
+        count_(count) {
+        rt_->ops->parallel_for_begin(rt_);
+    }
+    ~PTO2ParallelForLoop() { rt_->ops->parallel_for_end(rt_); }
+    bool next(int var) {
+        if (var >= count_) return false;
+        rt_->ops->parallel_iter_begin(rt_);
+        return true;
+    }
+
+private: // NOLINT(whitespace/indent)
+    PTO2Runtime *rt_;
+    int count_;
+};
+
+/**
+ * Parallel for loop with automatic iteration isolation:
+ *     PTO2_PARALLEL_FOR(i, N) {
+ *         submit_iter_tasks(i);
+ *     }
+ * Body is a genuine for-loop body; break/continue work naturally.
+ */
+#define PTO2_PARALLEL_FOR(var, count) \
+    if (PTO2ParallelForLoop _pfl_##var(count); true) \
+        for (int var = 0; _pfl_##var.next(var); ++var)
+
 // =============================================================================
 // Orchestration Config
 // =============================================================================
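For reference, `PTO2_PARALLEL_FOR(i, N) { ... }` expands to an if-with-initializer plus an ordinary for statement, so the body really is a plain loop body. A sketch of the expansion (`submit_iter_tasks` is a hypothetical stand-in for the body):

```cpp
// Expansion of: PTO2_PARALLEL_FOR(i, N) { submit_iter_tasks(i); }
if (PTO2ParallelForLoop _pfl_i(N); true)    // ctor runs parallel_for_begin
    for (int i = 0; _pfl_i.next(i); ++i) {  // next(): bounds check, then parallel_iter_begin
        submit_iter_tasks(i);               // hypothetical loop body
    }
// _pfl_i is destroyed at the end of the if statement, running parallel_for_end;
// a break leaves the for, ends the if-scope, and still triggers the destructor.
```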
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 3c9eee69b..94ae9ee77 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -486,6 +486,30 @@ void pto2_scope_end(PTO2OrchestratorState *orch) {
 #endif
 }
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.push_iter_frame(orch->current_ring_id());
+}
+
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    auto &tm = orch->tensor_map;
+    // If stack overflowed, skip filtering — run as a plain for loop.
+    if (tm.iter_stack_top < 0 || tm.iter_stack_top >= PTO2_MAX_PARALLEL_DEPTH) return;
+    uint8_t ring_id = orch->current_ring_id();
+    int32_t next_id = orch->rings[ring_id].task_allocator.next_local_id();
+    tm.set_iter_start(next_id);
+}
+
+void pto2_parallel_for_end(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.pop_iter_frame();
+}
+
 // =============================================================================
 // Task Submission
 // =============================================================================
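Concretely, a two-iteration parallel for on a single ring produces the following call sequence at the orchestrator (a sketch; the local_id values are illustrative):

```cpp
// PTO2_PARALLEL_FOR(i, 2), ring's next_local_id() currently 40:
pto2_parallel_for_begin(orch);   // push_iter_frame(ring): frame pushed, no threshold yet
pto2_parallel_iter_begin(orch);  // i == 0: set_iter_start(40)
/* submit tasks with local_id 40..44 */
pto2_parallel_iter_begin(orch);  // i == 1: set_iter_start(45); tasks 40..44 now filtered from lookup
/* submit tasks with local_id 45..49 */
pto2_parallel_for_end(orch);     // pop_iter_frame(): outer threshold restored
```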
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index e10ef2b08..b281a17e2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -168,6 +168,29 @@ void pto2_scope_begin(PTO2OrchestratorState *orch);
  */
 void pto2_scope_end(PTO2OrchestratorState *orch);
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+/**
+ * Begin a parallel for region.
+ * Pushes an iteration frame onto the iter_stack.
+ */
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch);
+
+/**
+ * Begin a parallel for iteration.
+ * Records the current ring's next local_id as the iteration boundary.
+ * Does NOT create a scope — scope management is fully explicit.
+ */
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch);
+
+/**
+ * End a parallel for region.
+ * Pops the iteration frame from the iter_stack.
+ */
+void pto2_parallel_for_end(PTO2OrchestratorState *orch);
+
 // =============================================================================
 // Task Submission
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 445b6c73a..1101dc10b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -187,6 +187,7 @@ class PTO2TaskAllocator {
     uint64_t heap_top() const { return heap_top_; }
     uint64_t heap_capacity() const { return heap_size_; }
+    int32_t next_local_id() const { return local_task_id_; }
 
 private:
     // --- Task Ring ---
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index 08cd7fabc..538670a5f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -50,6 +50,12 @@ void pto2_rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator);
 void pto2_rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); }
 
+static void pto2_rt_parallel_for_begin(PTO2Runtime *rt) { pto2_parallel_for_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_iter_begin(PTO2Runtime *rt) { pto2_parallel_iter_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_for_end(PTO2Runtime *rt) { pto2_parallel_for_end(&rt->orchestrator); }
+
 void pto2_rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->orchestrator); }
 
 static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
@@ -224,6 +230,9 @@ static const PTO2RuntimeOps s_runtime_ops = {
     .get_tensor_data = pto2_get_tensor_data,
     .set_tensor_data = pto2_set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,
+    .parallel_for_begin = pto2_rt_parallel_for_begin,
+    .parallel_iter_begin = pto2_rt_parallel_iter_begin,
+    .parallel_for_end = pto2_rt_parallel_for_end,
 };
 
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index aaaad5344..7c9a77cee 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -87,6 +87,11 @@ struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 };
 
 /**
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 3251f49e7..4e3ca8b6b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -116,6 +116,9 @@
 #define PTO2_MAX_SCOPE_DEPTH 64         // Maximum nesting depth
 #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer
 
+// Parallel for iteration isolation
+#define PTO2_MAX_PARALLEL_DEPTH 8 // Max nesting depth for iteration filtering; deeper levels degrade gracefully
+
 // Ready queue
 #define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size
 
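PTO2_MAX_PARALLEL_DEPTH is a soft cap: frames beyond depth 8 are not recorded, so push/set/pop become no-ops for the overflowing level and that loop behaves like a plain for with full dependency visibility. A hypothetical worst case:

```cpp
// Nesting 9 levels deep (hypothetical): levels 1..8 get iteration filtering,
// level 9 overflows iter_stack and sees all dependencies. Still correct,
// just more conservative; no error is raised.
PTO2_PARALLEL_FOR(lvl1, n1) {
    // ... levels 2 through 8 elided ...
    PTO2_PARALLEL_FOR(lvl9, n9) { // depth 9: runs as a plain for loop
        // submit tasks as usual
    }
}
```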
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
index 3c7447362..731db5dcd 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
@@ -129,7 +129,10 @@ bool PTO2TensorMap::init(
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         last_task_alives[r] = 0;
         last_cleanup[r] = 0;
+        active_iter_start[r] = -1;
     }
+    iter_stack_top = -1;
+    active_filter_mask = 0;
 
     return true;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 61524348a..1742904e7 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -216,6 +216,68 @@ struct PTO2TensorMap {
     // Per-ring validity threshold (for lazy invalidation)
     int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring
 
+    // Per-ring active iteration threshold (lookup hot-path cache).
+    //   active_iter_start[r] >= 0 : entries on ring r with local_id < this are filtered.
+    //   active_iter_start[r] == -1 : no active filter on ring r.
+    // active_filter_mask bit r mirrors (active_iter_start[r] >= 0) for a single branch test.
+    int32_t active_iter_start[PTO2_MAX_RING_DEPTH]{};
+    uint32_t active_filter_mask{0};
+
+    // Parallel for iteration isolation stack.
+    // Each PTO2_PARALLEL_FOR pushes a frame; each iteration updates the frame's
+    // iter_start. Lookup filters entries whose local_id < iter_start on the
+    // matching ring. Nesting beyond PTO2_MAX_PARALLEL_DEPTH degrades gracefully
+    // (no filtering for the overflow level, full dependency visibility).
+    //
+    // The stack itself is the source of truth for nesting/pop semantics; lookup,
+    // however, consumes a denormalized per-ring cache (active_iter_start +
+    // active_filter_mask) so the hot path is O(1) regardless of stack depth.
+    // For same-ring nesting, the inner frame's threshold dominates (it is always
+    // >= the outer's since next_local_id() is monotonic), so the cache simply
+    // tracks the innermost frame per ring; on pop we restore the saved outer.
+    struct PTO2IterFrame {
+        int32_t iter_start_local_id;   // -1 = before first iter; >= 0 = boundary
+        int32_t saved_prev_iter_start; // value of active_iter_start[ring_id] before this frame
+        uint8_t ring_id;               // ring this parallel for operates on
+    };
+    PTO2IterFrame iter_stack[PTO2_MAX_PARALLEL_DEPTH];
+    int32_t iter_stack_top{-1}; // -1 = no active parallel for
+
+    // =========================================================================
+    // Iter-stack helpers (maintain frames + per-ring cache atomically)
+    // =========================================================================
+
+    // Push a frame on parallel_for_begin. New frame has no active threshold yet
+    // (iter_start_local_id == -1); active_iter_start[ring] is unchanged. The
+    // previous value is saved in the frame so pop can restore it.
+    void push_iter_frame(uint8_t ring_id) {
+        int32_t top = ++iter_stack_top;
+        if (top >= PTO2_MAX_PARALLEL_DEPTH) return; // overflow: see class comment
+        iter_stack[top] = {-1, active_iter_start[ring_id], ring_id};
+    }
+
+    // Update the top frame's iter_start on parallel_iter_begin.
+    void set_iter_start(int32_t iter_start_local_id) {
+        int32_t top = iter_stack_top;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        uint8_t ring_id = iter_stack[top].ring_id;
+        iter_stack[top].iter_start_local_id = iter_start_local_id;
+        active_iter_start[ring_id] = iter_start_local_id;
+        active_filter_mask |= (1u << ring_id);
+    }
+
+    // Pop a frame on parallel_for_end, restoring the outer threshold.
+    void pop_iter_frame() {
+        int32_t top = iter_stack_top--;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        const PTO2IterFrame &frame = iter_stack[top];
+        uint8_t ring_id = frame.ring_id;
+        active_iter_start[ring_id] = frame.saved_prev_iter_start;
+        if (frame.saved_prev_iter_start < 0) {
+            active_filter_mask &= ~(1u << ring_id);
+        }
+    }
+
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
@@ -320,9 +382,9 @@ struct PTO2TensorMap {
 #if PTO2_TENSORMAP_PROFILING
             chain_len++;
 #endif
-            // Skip stale entries (no chain truncation — entries from different
-            // rings can be interleaved, so a stale entry from one ring does NOT
-            // imply subsequent entries from other rings are also stale)
+            // Skip entries that are either stale (producer retired) or from prior
+            // iterations of the current parallel-for. Both checks are unified in
+            // entry_valid() to avoid extracting ring/local twice.
             if (!entry_valid(*cur_entry)) {
                 cur_entry = next_entry;
                 continue;
@@ -450,10 +512,16 @@ struct PTO2TensorMap {
     }
 
     /**
-     * Check if entry is valid (producer has not retired)
+     * Check if entry is visible in the current execution context:
+     * 1. Producer has not retired (not stale).
+     * 2. Not from a prior iteration of the active parallel-for on the same ring.
      */
     bool entry_valid(const PTO2TensorMapEntry &entry) const {
-        return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
+        uint8_t ring = entry.producer_task_id.ring();
+        int32_t local = static_cast<int32_t>(entry.producer_task_id.local());
+        if (local < last_task_alives[ring]) return false;
+        if (active_filter_mask && ((active_filter_mask >> ring) & 1u) && local < active_iter_start[ring]) return false;
+        return true;
     }
 
     void remove_entry(PTO2TensorMapEntry &entry) {
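The same-ring nesting invariant is easiest to see in isolation. Below is a toy single-ring model of push_iter_frame/set_iter_start/pop_iter_frame (not part of the patch; the mask is omitted and the threshold values are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Toy model of the per-ring cache: one ring, thresholds only.
int32_t active_iter_start = -1; // cache: innermost active threshold
int32_t saved[8];               // stack of saved outer thresholds
int top = -1;

void push() { saved[++top] = active_iter_start; }      // parallel_for_begin
void set_start(int32_t id) { active_iter_start = id; } // parallel_iter_begin
void pop() { active_iter_start = saved[top--]; }       // parallel_for_end

int main() {
    push(); set_start(10);            // outer loop, iteration starts at local_id 10
    push(); set_start(12);            // inner loop, same ring: 12 >= 10, innermost wins
    assert(active_iter_start == 12);
    pop();                            // inner ends: outer threshold restored
    assert(active_iter_start == 10);
    pop();
    assert(active_iter_start == -1);  // no active parallel for
    return 0;
}
```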
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index e8b0a08b6..9136caadd 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -136,6 +136,11 @@ typedef struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 } PTO2RuntimeOps;
 
 /**
@@ -255,6 +260,21 @@ static inline void pto2_rt_scope_end() {
     rt->ops->scope_end(rt);
 }
 
+static inline void pto2_rt_parallel_for_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_begin(rt);
+}
+
+static inline void pto2_rt_parallel_iter_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_iter_begin(rt);
+}
+
+static inline void pto2_rt_parallel_for_end() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_end(rt);
+}
+
 static inline void pto2_rt_orchestration_done() {
     PTO2Runtime *rt = pto2_current_runtime();
     rt->ops->orchestration_done(rt);
@@ -381,6 +401,41 @@ class PTO2ScopeGuard {
  */
 #define PTO2_SCOPE() if (PTO2_SCOPE_GUARD(); true)
 
+/**
+ * Combined RAII guard + loop controller for PTO2_PARALLEL_FOR.
+ * Construction calls parallel_for_begin; destruction calls parallel_for_end.
+ * next() drives per-iteration parallel_iter_begin bookkeeping.
+ */
+class PTO2ParallelForLoop {
+public: // NOLINT(whitespace/indent)
+    explicit PTO2ParallelForLoop(int count) :
+        rt_(pto2_current_runtime()),
+        count_(count) {
+        rt_->ops->parallel_for_begin(rt_);
+    }
+    ~PTO2ParallelForLoop() { rt_->ops->parallel_for_end(rt_); }
+    bool next(int var) {
+        if (var >= count_) return false;
+        rt_->ops->parallel_iter_begin(rt_);
+        return true;
+    }
+
+private: // NOLINT(whitespace/indent)
+    PTO2Runtime *rt_;
+    int count_;
+};
+
+/**
+ * Parallel for loop with automatic iteration isolation:
+ *     PTO2_PARALLEL_FOR(i, N) {
+ *         submit_iter_tasks(i);
+ *     }
+ * Body is a genuine for-loop body; break/continue work naturally.
+ */
+#define PTO2_PARALLEL_FOR(var, count) \
+    if (PTO2ParallelForLoop _pfl_##var(count); true) \
+        for (int var = 0; _pfl_##var.next(var); ++var)
+
 // =============================================================================
 // Orchestration Config
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 79f7bd345..1f4ffe385 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -486,6 +486,30 @@ void pto2_scope_end(PTO2OrchestratorState *orch) {
 #endif
 }
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.push_iter_frame(orch->current_ring_id());
+}
+
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    auto &tm = orch->tensor_map;
+    // If stack overflowed, skip filtering — run as a plain for loop.
+    if (tm.iter_stack_top < 0 || tm.iter_stack_top >= PTO2_MAX_PARALLEL_DEPTH) return;
+    uint8_t ring_id = orch->current_ring_id();
+    int32_t next_id = orch->rings[ring_id].task_allocator.next_local_id();
+    tm.set_iter_start(next_id);
+}
+
+void pto2_parallel_for_end(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.pop_iter_frame();
+}
+
 // =============================================================================
 // Task Submission
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 9abcee2fa..2ba6d96fb 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -168,6 +168,29 @@ void pto2_scope_begin(PTO2OrchestratorState *orch);
  */
 void pto2_scope_end(PTO2OrchestratorState *orch);
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+/**
+ * Begin a parallel for region.
+ * Pushes an iteration frame onto the iter_stack.
+ */
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch);
+
+/**
+ * Begin a parallel for iteration.
+ * Records the current ring's next local_id as the iteration boundary.
+ * Does NOT create a scope — scope management is fully explicit.
+ */
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch);
+
+/**
+ * End a parallel for region.
+ * Pops the iteration frame from the iter_stack.
+ */
+void pto2_parallel_for_end(PTO2OrchestratorState *orch);
+
 // =============================================================================
 // Task Submission
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 445b6c73a..1101dc10b 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -187,6 +187,7 @@ class PTO2TaskAllocator {
     uint64_t heap_top() const { return heap_top_; }
     uint64_t heap_capacity() const { return heap_size_; }
+    int32_t next_local_id() const { return local_task_id_; }
 
 private:
     // --- Task Ring ---
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index 08cd7fabc..538670a5f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -50,6 +50,12 @@ void pto2_rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator);
 void pto2_rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); }
 
+static void pto2_rt_parallel_for_begin(PTO2Runtime *rt) { pto2_parallel_for_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_iter_begin(PTO2Runtime *rt) { pto2_parallel_iter_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_for_end(PTO2Runtime *rt) { pto2_parallel_for_end(&rt->orchestrator); }
+
 void pto2_rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->orchestrator); }
 
 static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
@@ -224,6 +230,9 @@ static const PTO2RuntimeOps s_runtime_ops = {
     .get_tensor_data = pto2_get_tensor_data,
     .set_tensor_data = pto2_set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,
+    .parallel_for_begin = pto2_rt_parallel_for_begin,
+    .parallel_iter_begin = pto2_rt_parallel_iter_begin,
+    .parallel_for_end = pto2_rt_parallel_for_end,
 };
 
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index aaaad5344..7c9a77cee 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -87,6 +87,11 @@ struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 };
 
 /**
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index e696a5784..c2375b0e4 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -107,6 +107,9 @@
 #define PTO2_MAX_SCOPE_DEPTH 64         // Maximum nesting depth
 #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer
 
+// Parallel for iteration isolation
+#define PTO2_MAX_PARALLEL_DEPTH 8 // Max nesting depth for iteration filtering; deeper levels degrade gracefully
+
 // Ready queue
 #define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
index 3c7447362..731db5dcd 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
@@ -129,7 +129,10 @@ bool PTO2TensorMap::init(
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         last_task_alives[r] = 0;
         last_cleanup[r] = 0;
+        active_iter_start[r] = -1;
     }
+    iter_stack_top = -1;
+    active_filter_mask = 0;
 
     return true;
 }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 61524348a..91c3914f8 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -216,6 +216,68 @@ struct PTO2TensorMap {
     // Per-ring validity threshold (for lazy invalidation)
     int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring
 
+    // Parallel for iteration isolation stack.
+    // Each PTO2_PARALLEL_FOR pushes a frame; each iteration updates the frame's
+    // iter_start. Lookup filters entries whose local_id < iter_start on the
+    // matching ring. Nesting beyond PTO2_MAX_PARALLEL_DEPTH degrades gracefully
+    // (no filtering for the overflow level, full dependency visibility).
+    //
+    // The stack itself is the source of truth for nesting/pop semantics; lookup,
+    // however, consumes a denormalized per-ring cache (active_iter_start +
+    // active_filter_mask) so the hot path is O(1) regardless of stack depth.
+    // For same-ring nesting, the inner frame's threshold dominates (it is always
+    // >= the outer's since next_local_id() is monotonic), so the cache simply
+    // tracks the innermost frame per ring; on pop we restore the saved outer.
+    struct PTO2IterFrame {
+        int32_t iter_start_local_id;   // -1 = before first iter; >= 0 = boundary
+        int32_t saved_prev_iter_start; // value of active_iter_start[ring_id] before this frame
+        uint8_t ring_id;               // ring this parallel for operates on
+    };
+    PTO2IterFrame iter_stack[PTO2_MAX_PARALLEL_DEPTH];
+    int32_t iter_stack_top{-1}; // -1 = no active parallel for
+
+    // Per-ring active iteration threshold (lookup hot-path cache).
+    //   active_iter_start[r] >= 0 : entries on ring r with local_id < this are filtered.
+    //   active_iter_start[r] == -1 : no active filter on ring r.
+    // active_filter_mask bit r mirrors (active_iter_start[r] >= 0) for a single branch test.
+    int32_t active_iter_start[PTO2_MAX_RING_DEPTH]{};
+    uint32_t active_filter_mask{0};
+
+    // =========================================================================
+    // Iter-stack helpers (maintain frames + per-ring cache atomically)
+    // =========================================================================
+
+    // Push a frame on parallel_for_begin. New frame has no active threshold yet
+    // (iter_start_local_id == -1); active_iter_start[ring] is unchanged. The
+    // previous value is saved in the frame so pop can restore it.
+    void push_iter_frame(uint8_t ring_id) {
+        int32_t top = ++iter_stack_top;
+        if (top >= PTO2_MAX_PARALLEL_DEPTH) return; // overflow: see class comment
+        iter_stack[top] = {-1, active_iter_start[ring_id], ring_id};
+    }
+
+    // Update the top frame's iter_start on parallel_iter_begin.
+    void set_iter_start(int32_t iter_start_local_id) {
+        int32_t top = iter_stack_top;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        uint8_t ring_id = iter_stack[top].ring_id;
+        iter_stack[top].iter_start_local_id = iter_start_local_id;
+        active_iter_start[ring_id] = iter_start_local_id;
+        active_filter_mask |= (1u << ring_id);
+    }
+
+    // Pop a frame on parallel_for_end, restoring the outer threshold.
+    void pop_iter_frame() {
+        int32_t top = iter_stack_top--;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        const PTO2IterFrame &frame = iter_stack[top];
+        uint8_t ring_id = frame.ring_id;
+        active_iter_start[ring_id] = frame.saved_prev_iter_start;
+        if (frame.saved_prev_iter_start < 0) {
+            active_filter_mask &= ~(1u << ring_id);
+        }
+    }
+
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
@@ -328,6 +390,23 @@ struct PTO2TensorMap {
                 continue;
             }
 
+            // Parallel for iteration isolation: skip entries from prior iterations.
+            // Fast path: active_filter_mask == 0 (no parallel_for is currently
+            // inside an iteration) collapses to a single branch. Otherwise a
+            // single per-ring compare replaces the full iter_stack scan; the
+            // stack's "innermost frame wins per ring" semantics are denormalized
+            // into active_iter_start[] on push/set/pop.
+            if (active_filter_mask) {
+                uint8_t entry_ring = cur_entry->producer_task_id.ring();
+                if ((active_filter_mask >> entry_ring) & 1u) {
+                    int32_t entry_local = static_cast<int32_t>(cur_entry->producer_task_id.local());
+                    if (entry_local < active_iter_start[entry_ring]) {
+                        cur_entry = next_entry;
+                        continue;
+                    }
+                }
+            }
+
             // Entry is valid - check if regions OVERLAP (not just exact match)
             // Since we hash only by base_ptr, all entries in this bucket have
             // potential to overlap. We must check actual byte-range overlap.
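To make the combined visibility predicate concrete (entry_valid() in the a2a3 tree, the inline check above in the a5 tree), here is a minimal standalone restatement of the same two checks for one ring, with illustrative values:

```cpp
#include <cassert>
#include <cstdint>

// Standalone restatement of the visibility checks for a single ring.
bool visible(int32_t local, int32_t last_task_alive, int32_t iter_start /* -1 = no filter */) {
    if (local < last_task_alive) return false;                // stale: producer retired
    if (iter_start >= 0 && local < iter_start) return false;  // prior iteration of active parallel-for
    return true;
}

int main() {
    // last_task_alive = 30, current iteration started at local_id 45:
    assert(!visible(20, 30, 45)); // retired
    assert(!visible(40, 30, 45)); // alive, but from a previous iteration: filtered
    assert(visible(45, 30, 45));  // first task of the current iteration
    assert(visible(40, 30, -1));  // no active parallel for: only staleness applies
    return 0;
}
```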
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
index 308a1d66e..e7ff178c8 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
@@ -79,7 +79,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     int max_groups = num_matmul_groups > num_add_groups ? num_matmul_groups : num_add_groups;
 
     // Interleaved submit: matmul and add groups alternate
-    for (int group_idx = 0; group_idx < max_groups; group_idx++) {
+    PTO2_PARALLEL_FOR(group_idx, max_groups) {
         if (group_idx < num_matmul_groups) {
             int start_task_idx = group_idx * matmul_batch;
             uint64_t offset = static_cast<uint64_t>(start_task_idx) * MATMUL_ELEMS;
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 68b794f6b..2e2fd7fb8 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -113,10 +113,10 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     constexpr uint64_t IN_CORE_BATCH = 16;
     uint64_t num_chunks = (batch + IN_CORE_BATCH - 1) / IN_CORE_BATCH;
 
-    for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+    PTO2_PARALLEL_FOR(q_idx, (int)q_loop) {
         uint64_t q_offset = q_idx * q_tile;
 
-        for (uint64_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) {
+        PTO2_PARALLEL_FOR(chunk_idx, (int)num_chunks) {
             uint64_t chunk_bc = batch - chunk_idx * IN_CORE_BATCH;
             if (chunk_bc > IN_CORE_BATCH) chunk_bc = IN_CORE_BATCH;
             uint64_t batch_start = chunk_idx * IN_CORE_BATCH;
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
index 5c9b8c3ad..68d6091ff 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
@@ -74,7 +74,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
 
     LOG_INFO("[mixed_orch] num_iters=%d", num_iters);
 
-    for (int i = 0; i < num_iters; i++) {
+    PTO2_PARALLEL_FOR(i, num_iters) {
         PTO2_SCOPE() {
             uint32_t view_shapes[1] = {TILE_ELEMS};
             uint32_t view_offsets[1] = {static_cast<uint32_t>(i) * TILE_ELEMS};
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
index f4eba8a5f..f34508894 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -151,12 +151,12 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     CYCLE_COUNT_LAP(prof_make_tensor);
 #endif
 
-    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+    PTO2_PARALLEL_FOR(b_idx, (int)batch) {
         uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
         uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data(context_lens, 1, cl_idx));
         uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
 
-        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+        PTO2_PARALLEL_FOR(q_idx, (int)q_loop) {
             CYCLE_COUNT_LAP(prof_scope_and_loop);
             PTO2_SCOPE() {
                 uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
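Note on the `(int)` casts in these call sites: the macro declares an `int` induction variable, so 64-bit trip counts are narrowed at the call site and widened back inside the body. A sketch of that pattern (assumes the count fits in int; the patch itself adds no range check):

```cpp
// Hypothetical body following the pattern used in the tests above:
PTO2_PARALLEL_FOR(b_idx, (int)batch) {                // narrow the uint64_t trip count
    uint64_t b = static_cast<uint64_t>(b_idx);        // widen again for 64-bit offset math
    uint64_t cur_offset = b * q_head_num;             // names as in the surrounding tests
}
```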
diff --git a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
index 5c9b8c3ad..68d6091ff 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
@@ -74,7 +74,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
 
     LOG_INFO("[mixed_orch] num_iters=%d", num_iters);
 
-    for (int i = 0; i < num_iters; i++) {
+    PTO2_PARALLEL_FOR(i, num_iters) {
         PTO2_SCOPE() {
             uint32_t view_shapes[1] = {TILE_ELEMS};
             uint32_t view_offsets[1] = {static_cast<uint32_t>(i) * TILE_ELEMS};
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
index fba81681a..2fe072039 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -160,11 +160,11 @@ __attribute__((visibility("default"))) void build_paged_attention_graph(const Ch
     CYCLE_COUNT_LAP(prof_make_tensor);
 #endif
-    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+    PTO2_PARALLEL_FOR(b_idx, (int)batch) {
         uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
         uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data(context_lens, 1, cl_idx));
         uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
 
-        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+        PTO2_PARALLEL_FOR(q_idx, (int)q_loop) {
             CYCLE_COUNT_LAP(prof_scope_and_loop);
             PTO2_SCOPE() {
                 uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;