diff --git a/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
index b313dbe41..930c12a72 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
+++ b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
@@ -78,9 +78,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     // A/B layout: [num_groups, grid_k, incore_loop, tile_size, tile_size]
     // C layout: [incore_loop * num_groups, tile_size, tile_size]
-    for (int group_idx = 0; group_idx < num_groups; group_idx++) {
+    PTO2_PARALLEL_FOR(group_idx, num_groups) {
         PTO2_SCOPE_GUARD();
-
         uint32_t c_elem_offset = static_cast<uint32_t>(static_cast<uint64_t>(group_idx) * group_tile_elems);
         uint32_t c_view_offsets[1] = {c_elem_offset};
         Tensor C_view = ext_C.view(group_shapes, c_view_offsets);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index e025c91ba..a126f33ea 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -136,6 +136,11 @@ typedef struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 } PTO2RuntimeOps;
 
 /**
@@ -255,6 +260,21 @@ static inline void pto2_rt_scope_end() {
     rt->ops->scope_end(rt);
 }
 
+static inline void pto2_rt_parallel_for_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_begin(rt);
+}
+
+static inline void pto2_rt_parallel_iter_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_iter_begin(rt);
+}
+
+static inline void pto2_rt_parallel_for_end() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_end(rt);
+}
+
 static inline void pto2_rt_orchestration_done() {
     PTO2Runtime *rt = pto2_current_runtime();
     rt->ops->orchestration_done(rt);
@@ -381,6 +401,41 @@ class PTO2ScopeGuard {
  */
 #define PTO2_SCOPE() if (PTO2_SCOPE_GUARD(); true)
 
+/**
+ * Combined RAII guard + loop controller for PTO2_PARALLEL_FOR.
+ * Construction calls parallel_for_begin; destruction calls parallel_for_end.
+ * next() drives per-iteration parallel_iter_begin bookkeeping.
+ */
+class PTO2ParallelForLoop {
+public: // NOLINT(whitespace/indent)
+    explicit PTO2ParallelForLoop(int count) :
+        rt_(pto2_current_runtime()),
+        count_(count) {
+        rt_->ops->parallel_for_begin(rt_);
+    }
+    ~PTO2ParallelForLoop() { rt_->ops->parallel_for_end(rt_); }
+    bool next(int var) {
+        if (var >= count_) return false;
+        rt_->ops->parallel_iter_begin(rt_);
+        return true;
+    }
+
+private: // NOLINT(whitespace/indent)
+    PTO2Runtime *rt_;
+    int count_;
+};
+
+/**
+ * Parallel for loop with automatic iteration isolation:
+ *     PTO2_PARALLEL_FOR(i, N) {
+ *         submit_iter_tasks(i);
+ *     }
+ * Body is a genuine for-loop body; break/continue work naturally.
+ */
+#define PTO2_PARALLEL_FOR(var, count) \
+    if (PTO2ParallelForLoop _pfl_##var(count); true) \
+        for (int var = 0; _pfl_##var.next(var); ++var)
+
 // =============================================================================
 // Orchestration Config
 // =============================================================================
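For reference, `PTO2_PARALLEL_FOR(i, N) { ... }` expands to an if-with-initializer plus an ordinary for statement, so the body really is a plain loop body. A sketch of the expansion (`submit_iter_tasks` is a hypothetical stand-in for the body):

```cpp
// Expansion of: PTO2_PARALLEL_FOR(i, N) { submit_iter_tasks(i); }
if (PTO2ParallelForLoop _pfl_i(N); true)    // ctor runs parallel_for_begin
    for (int i = 0; _pfl_i.next(i); ++i) {  // next(): bounds check, then parallel_iter_begin
        submit_iter_tasks(i);               // hypothetical loop body
    }
// _pfl_i is destroyed at the end of the if statement, running parallel_for_end;
// a break leaves the for, ends the if-scope, and still triggers the destructor.
```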
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 3c9eee69b..94ae9ee77 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -486,6 +486,30 @@ void pto2_scope_end(PTO2OrchestratorState *orch) {
 #endif
 }
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.push_iter_frame(orch->current_ring_id());
+}
+
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    auto &tm = orch->tensor_map;
+    // If stack overflowed, skip filtering — run as a plain for loop.
+    if (tm.iter_stack_top < 0 || tm.iter_stack_top >= PTO2_MAX_PARALLEL_DEPTH) return;
+    uint8_t ring_id = orch->current_ring_id();
+    int32_t next_id = orch->rings[ring_id].task_allocator.next_local_id();
+    tm.set_iter_start(next_id);
+}
+
+void pto2_parallel_for_end(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.pop_iter_frame();
+}
+
 // =============================================================================
 // Task Submission
 // =============================================================================
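Concretely, a two-iteration parallel for on a single ring produces the following call sequence at the orchestrator (a sketch; the local_id values are illustrative):

```cpp
// PTO2_PARALLEL_FOR(i, 2), ring's next_local_id() currently 40:
pto2_parallel_for_begin(orch);   // push_iter_frame(ring): frame pushed, no threshold yet
pto2_parallel_iter_begin(orch);  // i == 0: set_iter_start(40)
/* submit tasks with local_id 40..44 */
pto2_parallel_iter_begin(orch);  // i == 1: set_iter_start(45); tasks 40..44 now filtered from lookup
/* submit tasks with local_id 45..49 */
pto2_parallel_for_end(orch);     // pop_iter_frame(): outer threshold restored
```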
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index e10ef2b08..b281a17e2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -168,6 +168,29 @@ void pto2_scope_begin(PTO2OrchestratorState *orch);
  */
 void pto2_scope_end(PTO2OrchestratorState *orch);
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+/**
+ * Begin a parallel for region.
+ * Pushes an iteration frame onto the iter_stack.
+ */
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch);
+
+/**
+ * Begin a parallel for iteration.
+ * Records the current ring's next local_id as the iteration boundary.
+ * Does NOT create a scope — scope management is fully explicit.
+ */
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch);
+
+/**
+ * End a parallel for region.
+ * Pops the iteration frame from the iter_stack.
+ */
+void pto2_parallel_for_end(PTO2OrchestratorState *orch);
+
 // =============================================================================
 // Task Submission
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 445b6c73a..1101dc10b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -187,6 +187,7 @@ class PTO2TaskAllocator {
     uint64_t heap_top() const { return heap_top_; }
     uint64_t heap_capacity() const { return heap_size_; }
+    int32_t next_local_id() const { return local_task_id_; }
 
 private:
     // --- Task Ring ---
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index 08cd7fabc..538670a5f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -50,6 +50,12 @@ void pto2_rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator);
 void pto2_rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); }
 
+static void pto2_rt_parallel_for_begin(PTO2Runtime *rt) { pto2_parallel_for_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_iter_begin(PTO2Runtime *rt) { pto2_parallel_iter_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_for_end(PTO2Runtime *rt) { pto2_parallel_for_end(&rt->orchestrator); }
+
 void pto2_rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->orchestrator); }
 
 static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
@@ -224,6 +230,9 @@ static const PTO2RuntimeOps s_runtime_ops = {
     .get_tensor_data = pto2_get_tensor_data,
     .set_tensor_data = pto2_set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,
+    .parallel_for_begin = pto2_rt_parallel_for_begin,
+    .parallel_iter_begin = pto2_rt_parallel_iter_begin,
+    .parallel_for_end = pto2_rt_parallel_for_end,
 };
 
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index aaaad5344..7c9a77cee 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -87,6 +87,11 @@ struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 };
 
 /**
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 3251f49e7..4e3ca8b6b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -116,6 +116,9 @@
 #define PTO2_MAX_SCOPE_DEPTH 64         // Maximum nesting depth
 #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer
 
+// Parallel for iteration isolation
+#define PTO2_MAX_PARALLEL_DEPTH 8 // Max nesting depth for iteration filtering; deeper levels degrade gracefully
+
 // Ready queue
 #define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size
 
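PTO2_MAX_PARALLEL_DEPTH is a soft cap: frames beyond depth 8 are not recorded, so push/set/pop become no-ops for the overflowing level and that loop behaves like a plain for with full dependency visibility. A hypothetical worst case:

```cpp
// Nesting 9 levels deep (hypothetical): levels 1..8 get iteration filtering,
// level 9 overflows iter_stack and sees all dependencies. Still correct,
// just more conservative; no error is raised.
PTO2_PARALLEL_FOR(lvl1, n1) {
    // ... levels 2 through 8 elided ...
    PTO2_PARALLEL_FOR(lvl9, n9) { // depth 9: runs as a plain for loop
        // submit tasks as usual
    }
}
```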
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
index 3c7447362..731db5dcd 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
@@ -129,7 +129,10 @@ bool PTO2TensorMap::init(
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         last_task_alives[r] = 0;
         last_cleanup[r] = 0;
+        active_iter_start[r] = -1;
     }
+    iter_stack_top = -1;
+    active_filter_mask = 0;
 
     return true;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 61524348a..1742904e7 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -216,6 +216,68 @@ struct PTO2TensorMap {
     // Per-ring validity threshold (for lazy invalidation)
     int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring
 
+    // Per-ring active iteration threshold (lookup hot-path cache).
+    //   active_iter_start[r] >= 0 : entries on ring r with local_id < this are filtered.
+    //   active_iter_start[r] == -1 : no active filter on ring r.
+    // active_filter_mask bit r mirrors (active_iter_start[r] >= 0) for a single branch test.
+    int32_t active_iter_start[PTO2_MAX_RING_DEPTH]{};
+    uint32_t active_filter_mask{0};
+
+    // Parallel for iteration isolation stack.
+    // Each PTO2_PARALLEL_FOR pushes a frame; each iteration updates the frame's
+    // iter_start. Lookup filters entries whose local_id < iter_start on the
+    // matching ring. Nesting beyond PTO2_MAX_PARALLEL_DEPTH degrades gracefully
+    // (no filtering for the overflow level, full dependency visibility).
+    //
+    // The stack itself is the source of truth for nesting/pop semantics; lookup,
+    // however, consumes a denormalized per-ring cache (active_iter_start +
+    // active_filter_mask) so the hot path is O(1) regardless of stack depth.
+    // For same-ring nesting, the inner frame's threshold dominates (it is always
+    // >= the outer's since next_local_id() is monotonic), so the cache simply
+    // tracks the innermost frame per ring; on pop we restore the saved outer.
+    struct PTO2IterFrame {
+        int32_t iter_start_local_id;   // -1 = before first iter; >= 0 = boundary
+        int32_t saved_prev_iter_start; // value of active_iter_start[ring_id] before this frame
+        uint8_t ring_id;               // ring this parallel for operates on
+    };
+    PTO2IterFrame iter_stack[PTO2_MAX_PARALLEL_DEPTH];
+    int32_t iter_stack_top{-1}; // -1 = no active parallel for
+
+    // =========================================================================
+    // Iter-stack helpers (maintain frames + per-ring cache atomically)
+    // =========================================================================
+
+    // Push a frame on parallel_for_begin. New frame has no active threshold yet
+    // (iter_start_local_id == -1); active_iter_start[ring] is unchanged. The
+    // previous value is saved in the frame so pop can restore it.
+    void push_iter_frame(uint8_t ring_id) {
+        int32_t top = ++iter_stack_top;
+        if (top >= PTO2_MAX_PARALLEL_DEPTH) return; // overflow: see class comment
+        iter_stack[top] = {-1, active_iter_start[ring_id], ring_id};
+    }
+
+    // Update the top frame's iter_start on parallel_iter_begin.
+    void set_iter_start(int32_t iter_start_local_id) {
+        int32_t top = iter_stack_top;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        uint8_t ring_id = iter_stack[top].ring_id;
+        iter_stack[top].iter_start_local_id = iter_start_local_id;
+        active_iter_start[ring_id] = iter_start_local_id;
+        active_filter_mask |= (1u << ring_id);
+    }
+
+    // Pop a frame on parallel_for_end, restoring the outer threshold.
+    void pop_iter_frame() {
+        int32_t top = iter_stack_top--;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        const PTO2IterFrame &frame = iter_stack[top];
+        uint8_t ring_id = frame.ring_id;
+        active_iter_start[ring_id] = frame.saved_prev_iter_start;
+        if (frame.saved_prev_iter_start < 0) {
+            active_filter_mask &= ~(1u << ring_id);
+        }
+    }
+
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
@@ -320,9 +382,9 @@ struct PTO2TensorMap {
 #if PTO2_TENSORMAP_PROFILING
             chain_len++;
 #endif
-            // Skip stale entries (no chain truncation — entries from different
-            // rings can be interleaved, so a stale entry from one ring does NOT
-            // imply subsequent entries from other rings are also stale)
+            // Skip entries that are either stale (producer retired) or from prior
+            // iterations of the current parallel-for. Both checks are unified in
+            // entry_valid() to avoid extracting ring/local twice.
             if (!entry_valid(*cur_entry)) {
                 cur_entry = next_entry;
                 continue;
@@ -450,10 +512,16 @@ struct PTO2TensorMap {
     }
 
     /**
-     * Check if entry is valid (producer has not retired)
+     * Check if entry is visible in the current execution context:
+     * 1. Producer has not retired (not stale).
+     * 2. Not from a prior iteration of the active parallel-for on the same ring.
      */
     bool entry_valid(const PTO2TensorMapEntry &entry) const {
-        return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
+        uint8_t ring = entry.producer_task_id.ring();
+        int32_t local = static_cast<int32_t>(entry.producer_task_id.local());
+        if (local < last_task_alives[ring]) return false;
+        if (active_filter_mask && ((active_filter_mask >> ring) & 1u) && local < active_iter_start[ring]) return false;
+        return true;
     }
 
     void remove_entry(PTO2TensorMapEntry &entry) {
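The same-ring nesting invariant is easiest to see in isolation. Below is a toy single-ring model of push_iter_frame/set_iter_start/pop_iter_frame (not part of the patch; the mask is omitted and the threshold values are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Toy model of the per-ring cache: one ring, thresholds only.
int32_t active_iter_start = -1; // cache: innermost active threshold
int32_t saved[8];               // stack of saved outer thresholds
int top = -1;

void push() { saved[++top] = active_iter_start; }      // parallel_for_begin
void set_start(int32_t id) { active_iter_start = id; } // parallel_iter_begin
void pop() { active_iter_start = saved[top--]; }       // parallel_for_end

int main() {
    push(); set_start(10);            // outer loop, iteration starts at local_id 10
    push(); set_start(12);            // inner loop, same ring: 12 >= 10, innermost wins
    assert(active_iter_start == 12);
    pop();                            // inner ends: outer threshold restored
    assert(active_iter_start == 10);
    pop();
    assert(active_iter_start == -1);  // no active parallel for
    return 0;
}
```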
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index e8b0a08b6..9136caadd 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -136,6 +136,11 @@ typedef struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 } PTO2RuntimeOps;
 
 /**
@@ -255,6 +260,21 @@ static inline void pto2_rt_scope_end() {
     rt->ops->scope_end(rt);
 }
 
+static inline void pto2_rt_parallel_for_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_begin(rt);
+}
+
+static inline void pto2_rt_parallel_iter_begin() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_iter_begin(rt);
+}
+
+static inline void pto2_rt_parallel_for_end() {
+    PTO2Runtime *rt = pto2_current_runtime();
+    rt->ops->parallel_for_end(rt);
+}
+
 static inline void pto2_rt_orchestration_done() {
     PTO2Runtime *rt = pto2_current_runtime();
     rt->ops->orchestration_done(rt);
@@ -381,6 +401,41 @@ class PTO2ScopeGuard {
  */
 #define PTO2_SCOPE() if (PTO2_SCOPE_GUARD(); true)
 
+/**
+ * Combined RAII guard + loop controller for PTO2_PARALLEL_FOR.
+ * Construction calls parallel_for_begin; destruction calls parallel_for_end.
+ * next() drives per-iteration parallel_iter_begin bookkeeping.
+ */
+class PTO2ParallelForLoop {
+public: // NOLINT(whitespace/indent)
+    explicit PTO2ParallelForLoop(int count) :
+        rt_(pto2_current_runtime()),
+        count_(count) {
+        rt_->ops->parallel_for_begin(rt_);
+    }
+    ~PTO2ParallelForLoop() { rt_->ops->parallel_for_end(rt_); }
+    bool next(int var) {
+        if (var >= count_) return false;
+        rt_->ops->parallel_iter_begin(rt_);
+        return true;
+    }
+
+private: // NOLINT(whitespace/indent)
+    PTO2Runtime *rt_;
+    int count_;
+};
+
+/**
+ * Parallel for loop with automatic iteration isolation:
+ *     PTO2_PARALLEL_FOR(i, N) {
+ *         submit_iter_tasks(i);
+ *     }
+ * Body is a genuine for-loop body; break/continue work naturally.
+ */
+#define PTO2_PARALLEL_FOR(var, count) \
+    if (PTO2ParallelForLoop _pfl_##var(count); true) \
+        for (int var = 0; _pfl_##var.next(var); ++var)
+
 // =============================================================================
 // Orchestration Config
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 79f7bd345..1f4ffe385 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -486,6 +486,30 @@ void pto2_scope_end(PTO2OrchestratorState *orch) {
 #endif
 }
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.push_iter_frame(orch->current_ring_id());
+}
+
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    auto &tm = orch->tensor_map;
+    // If stack overflowed, skip filtering — run as a plain for loop.
+    if (tm.iter_stack_top < 0 || tm.iter_stack_top >= PTO2_MAX_PARALLEL_DEPTH) return;
+    uint8_t ring_id = orch->current_ring_id();
+    int32_t next_id = orch->rings[ring_id].task_allocator.next_local_id();
+    tm.set_iter_start(next_id);
+}
+
+void pto2_parallel_for_end(PTO2OrchestratorState *orch) {
+    if (orch->fatal) return;
+    orch->tensor_map.pop_iter_frame();
+}
+
 // =============================================================================
 // Task Submission
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 9abcee2fa..2ba6d96fb 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -168,6 +168,29 @@ void pto2_scope_begin(PTO2OrchestratorState *orch);
  */
 void pto2_scope_end(PTO2OrchestratorState *orch);
 
+// =============================================================================
+// Parallel For Iteration Isolation
+// =============================================================================
+
+/**
+ * Begin a parallel for region.
+ * Pushes an iteration frame onto the iter_stack.
+ */
+void pto2_parallel_for_begin(PTO2OrchestratorState *orch);
+
+/**
+ * Begin a parallel for iteration.
+ * Records the current ring's next local_id as the iteration boundary.
+ * Does NOT create a scope — scope management is fully explicit.
+ */
+void pto2_parallel_iter_begin(PTO2OrchestratorState *orch);
+
+/**
+ * End a parallel for region.
+ * Pops the iteration frame from the iter_stack.
+ */
+void pto2_parallel_for_end(PTO2OrchestratorState *orch);
+
 // =============================================================================
 // Task Submission
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 445b6c73a..1101dc10b 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -187,6 +187,7 @@ class PTO2TaskAllocator {
     uint64_t heap_top() const { return heap_top_; }
     uint64_t heap_capacity() const { return heap_size_; }
+    int32_t next_local_id() const { return local_task_id_; }
 
 private:
     // --- Task Ring ---
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index 08cd7fabc..538670a5f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -50,6 +50,12 @@ void pto2_rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator);
 void pto2_rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); }
 
+static void pto2_rt_parallel_for_begin(PTO2Runtime *rt) { pto2_parallel_for_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_iter_begin(PTO2Runtime *rt) { pto2_parallel_iter_begin(&rt->orchestrator); }
+
+static void pto2_rt_parallel_for_end(PTO2Runtime *rt) { pto2_parallel_for_end(&rt->orchestrator); }
+
 void pto2_rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->orchestrator); }
 
 static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
@@ -224,6 +230,9 @@ static const PTO2RuntimeOps s_runtime_ops = {
     .get_tensor_data = pto2_get_tensor_data,
     .set_tensor_data = pto2_set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,
+    .parallel_for_begin = pto2_rt_parallel_for_begin,
+    .parallel_iter_begin = pto2_rt_parallel_iter_begin,
+    .parallel_for_end = pto2_rt_parallel_for_end,
 };
 
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index aaaad5344..7c9a77cee 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -87,6 +87,11 @@ struct PTO2RuntimeOps {
         PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
+
+    // Parallel for iteration isolation
+    void (*parallel_for_begin)(PTO2Runtime *rt);
+    void (*parallel_iter_begin)(PTO2Runtime *rt);
+    void (*parallel_for_end)(PTO2Runtime *rt);
 };
 
 /**
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index e696a5784..c2375b0e4 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -107,6 +107,9 @@
 #define PTO2_MAX_SCOPE_DEPTH 64         // Maximum nesting depth
 #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer
 
+// Parallel for iteration isolation
+#define PTO2_MAX_PARALLEL_DEPTH 8 // Max nesting depth for iteration filtering; deeper levels degrade gracefully
+
 // Ready queue
 #define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
index 3c7447362..731db5dcd 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
@@ -129,7 +129,10 @@ bool PTO2TensorMap::init(
     for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
         last_task_alives[r] = 0;
         last_cleanup[r] = 0;
+        active_iter_start[r] = -1;
     }
+    iter_stack_top = -1;
+    active_filter_mask = 0;
 
     return true;
 }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 61524348a..91c3914f8 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -216,6 +216,68 @@ struct PTO2TensorMap {
     // Per-ring validity threshold (for lazy invalidation)
     int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring
 
+    // Parallel for iteration isolation stack.
+    // Each PTO2_PARALLEL_FOR pushes a frame; each iteration updates the frame's
+    // iter_start. Lookup filters entries whose local_id < iter_start on the
+    // matching ring. Nesting beyond PTO2_MAX_PARALLEL_DEPTH degrades gracefully
+    // (no filtering for the overflow level, full dependency visibility).
+    //
+    // The stack itself is the source of truth for nesting/pop semantics; lookup,
+    // however, consumes a denormalized per-ring cache (active_iter_start +
+    // active_filter_mask) so the hot path is O(1) regardless of stack depth.
+    // For same-ring nesting, the inner frame's threshold dominates (it is always
+    // >= the outer's since next_local_id() is monotonic), so the cache simply
+    // tracks the innermost frame per ring; on pop we restore the saved outer.
+    struct PTO2IterFrame {
+        int32_t iter_start_local_id;   // -1 = before first iter; >= 0 = boundary
+        int32_t saved_prev_iter_start; // value of active_iter_start[ring_id] before this frame
+        uint8_t ring_id;               // ring this parallel for operates on
+    };
+    PTO2IterFrame iter_stack[PTO2_MAX_PARALLEL_DEPTH];
+    int32_t iter_stack_top{-1}; // -1 = no active parallel for
+
+    // Per-ring active iteration threshold (lookup hot-path cache).
+    //   active_iter_start[r] >= 0 : entries on ring r with local_id < this are filtered.
+    //   active_iter_start[r] == -1 : no active filter on ring r.
+    // active_filter_mask bit r mirrors (active_iter_start[r] >= 0) for a single branch test.
+    int32_t active_iter_start[PTO2_MAX_RING_DEPTH]{};
+    uint32_t active_filter_mask{0};
+
+    // =========================================================================
+    // Iter-stack helpers (maintain frames + per-ring cache atomically)
+    // =========================================================================
+
+    // Push a frame on parallel_for_begin. New frame has no active threshold yet
+    // (iter_start_local_id == -1); active_iter_start[ring] is unchanged. The
+    // previous value is saved in the frame so pop can restore it.
+    void push_iter_frame(uint8_t ring_id) {
+        int32_t top = ++iter_stack_top;
+        if (top >= PTO2_MAX_PARALLEL_DEPTH) return; // overflow: see class comment
+        iter_stack[top] = {-1, active_iter_start[ring_id], ring_id};
+    }
+
+    // Update the top frame's iter_start on parallel_iter_begin.
+    void set_iter_start(int32_t iter_start_local_id) {
+        int32_t top = iter_stack_top;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        uint8_t ring_id = iter_stack[top].ring_id;
+        iter_stack[top].iter_start_local_id = iter_start_local_id;
+        active_iter_start[ring_id] = iter_start_local_id;
+        active_filter_mask |= (1u << ring_id);
+    }
+
+    // Pop a frame on parallel_for_end, restoring the outer threshold.
+    void pop_iter_frame() {
+        int32_t top = iter_stack_top--;
+        if (top < 0 || top >= PTO2_MAX_PARALLEL_DEPTH) return;
+        const PTO2IterFrame &frame = iter_stack[top];
+        uint8_t ring_id = frame.ring_id;
+        active_iter_start[ring_id] = frame.saved_prev_iter_start;
+        if (frame.saved_prev_iter_start < 0) {
+            active_filter_mask &= ~(1u << ring_id);
+        }
+    }
+
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
@@ -328,6 +390,23 @@ struct PTO2TensorMap {
                 continue;
             }
 
+            // Parallel for iteration isolation: skip entries from prior iterations.
+            // Fast path: active_filter_mask == 0 (no parallel_for is currently
+            // inside an iteration) collapses to a single branch. Otherwise a
+            // single per-ring compare replaces the full iter_stack scan; the
+            // stack's "innermost frame wins per ring" semantics are denormalized
+            // into active_iter_start[] on push/set/pop.
+            if (active_filter_mask) {
+                uint8_t entry_ring = cur_entry->producer_task_id.ring();
+                if ((active_filter_mask >> entry_ring) & 1u) {
+                    int32_t entry_local = static_cast<int32_t>(cur_entry->producer_task_id.local());
+                    if (entry_local < active_iter_start[entry_ring]) {
+                        cur_entry = next_entry;
+                        continue;
+                    }
+                }
+            }
+
             // Entry is valid - check if regions OVERLAP (not just exact match)
             // Since we hash only by base_ptr, all entries in this bucket have
             // potential to overlap. We must check actual byte-range overlap.
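To make the combined visibility predicate concrete (entry_valid() in the a2a3 tree, the inline check above in the a5 tree), here is a minimal standalone restatement of the same two checks for one ring, with illustrative values:

```cpp
#include <cassert>
#include <cstdint>

// Standalone restatement of the visibility checks for a single ring.
bool visible(int32_t local, int32_t last_task_alive, int32_t iter_start /* -1 = no filter */) {
    if (local < last_task_alive) return false;                // stale: producer retired
    if (iter_start >= 0 && local < iter_start) return false;  // prior iteration of active parallel-for
    return true;
}

int main() {
    // last_task_alive = 30, current iteration started at local_id 45:
    assert(!visible(20, 30, 45)); // retired
    assert(!visible(40, 30, 45)); // alive, but from a previous iteration: filtered
    assert(visible(45, 30, 45));  // first task of the current iteration
    assert(visible(40, 30, -1));  // no active parallel for: only staleness applies
    return 0;
}
```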
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
index 308a1d66e..e7ff178c8 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
@@ -79,7 +79,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     int max_groups = num_matmul_groups > num_add_groups ? num_matmul_groups : num_add_groups;
 
     // Interleaved submit: matmul and add groups alternate
-    for (int group_idx = 0; group_idx < max_groups; group_idx++) {
+    PTO2_PARALLEL_FOR(group_idx, max_groups) {
         if (group_idx < num_matmul_groups) {
             int start_task_idx = group_idx * matmul_batch;
             uint64_t offset = static_cast<uint64_t>(start_task_idx) * MATMUL_ELEMS;
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 68b794f6b..2e2fd7fb8 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -113,10 +113,10 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     constexpr uint64_t IN_CORE_BATCH = 16;
     uint64_t num_chunks = (batch + IN_CORE_BATCH - 1) / IN_CORE_BATCH;
 
-    for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+    PTO2_PARALLEL_FOR(q_idx, (int)q_loop) {
         uint64_t q_offset = q_idx * q_tile;
 
-        for (uint64_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) {
+        PTO2_PARALLEL_FOR(chunk_idx, (int)num_chunks) {
             uint64_t chunk_bc = batch - chunk_idx * IN_CORE_BATCH;
             if (chunk_bc > IN_CORE_BATCH) chunk_bc = IN_CORE_BATCH;
             uint64_t batch_start = chunk_idx * IN_CORE_BATCH;
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
index 5c9b8c3ad..68d6091ff 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
@@ -74,7 +74,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
 
     LOG_INFO("[mixed_orch] num_iters=%d", num_iters);
 
-    for (int i = 0; i < num_iters; i++) {
+    PTO2_PARALLEL_FOR(i, num_iters) {
         PTO2_SCOPE() {
             uint32_t view_shapes[1] = {TILE_ELEMS};
             uint32_t view_offsets[1] = {static_cast<uint32_t>(i) * TILE_ELEMS};
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
index f4eba8a5f..f34508894 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -151,12 +151,12 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     CYCLE_COUNT_LAP(prof_make_tensor);
 #endif
 
-    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+    PTO2_PARALLEL_FOR(b_idx, (int)batch) {
         uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
         uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data(context_lens, 1, cl_idx));
         uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
 
-        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+        PTO2_PARALLEL_FOR(q_idx, (int)q_loop) {
             CYCLE_COUNT_LAP(prof_scope_and_loop);
             PTO2_SCOPE() {
                 uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
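Note on the `(int)` casts in these call sites: the macro declares an `int` induction variable, so 64-bit trip counts are narrowed at the call site and widened back inside the body. A sketch of that pattern (assumes the count fits in int; the patch itself adds no range check):

```cpp
// Hypothetical body following the pattern used in the tests above:
PTO2_PARALLEL_FOR(b_idx, (int)batch) {                // narrow the uint64_t trip count
    uint64_t b = static_cast<uint64_t>(b_idx);        // widen again for 64-bit offset math
    uint64_t cur_offset = b * q_head_num;             // names as in the surrounding tests
}
```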
diff --git a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
index 5c9b8c3ad..68d6091ff 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
@@ -74,7 +74,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
 
     LOG_INFO("[mixed_orch] num_iters=%d", num_iters);
 
-    for (int i = 0; i < num_iters; i++) {
+    PTO2_PARALLEL_FOR(i, num_iters) {
         PTO2_SCOPE() {
             uint32_t view_shapes[1] = {TILE_ELEMS};
             uint32_t view_offsets[1] = {static_cast<uint32_t>(i) * TILE_ELEMS};
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
index fba81681a..2fe072039 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -160,11 +160,11 @@ __attribute__((visibility("default"))) void build_paged_attention_graph(const Ch
     CYCLE_COUNT_LAP(prof_make_tensor);
 #endif
-    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
+    PTO2_PARALLEL_FOR(b_idx, (int)batch) {
         uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
         uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data(context_lens, 1, cl_idx));
         uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
 
-        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+        PTO2_PARALLEL_FOR(q_idx, (int)q_loop) {
             CYCLE_COUNT_LAP(prof_scope_and_loop);
             PTO2_SCOPE() {
                 uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;