diff --git a/ceno_recursion/src/transcript/mod.rs b/ceno_recursion/src/transcript/mod.rs
index 5792dc3b2..c23f395c3 100644
--- a/ceno_recursion/src/transcript/mod.rs
+++ b/ceno_recursion/src/transcript/mod.rs
@@ -53,32 +53,3 @@ pub fn transcript_check_pow_witness<C: Config>(
         builder.assert_eq::<Var<C::N>>(bit, Usize::from(0));
     });
 }
-
-pub fn clone_challenger_state<C: Config>(
-    builder: &mut Builder<C>,
-    src: &DuplexChallengerVariable<C>,
-) -> DuplexChallengerVariable<C> {
-    let dst = DuplexChallengerVariable::new(builder);
-    builder
-        .range(0, dst.sponge_state.len())
-        .for_each(|idx_vec, builder| {
-            let value = builder.get(&src.sponge_state, idx_vec[0]);
-            builder.set(&dst.sponge_state, idx_vec[0], value);
-        });
-
-    let input_offset = src.input_ptr - src.io_empty_ptr;
-    builder.assign(&dst.input_ptr, input_offset + dst.io_empty_ptr);
-
-    let output_offset = src.output_ptr - src.io_empty_ptr;
-    builder.assign(&dst.output_ptr, output_offset + dst.io_empty_ptr);
-    dst
-}
-
-pub fn challenger_add_forked_index<C: Config>(
-    builder: &mut Builder<C>,
-    challenger: &mut DuplexChallengerVariable<C>,
-    index: &Usize<C::N>,
-) {
-    let felt = builder.unsafe_cast_var_to_felt(index.get_var());
-    challenger.observe(builder, felt);
-}
diff --git a/ceno_recursion/src/zkvm_verifier/verifier.rs b/ceno_recursion/src/zkvm_verifier/verifier.rs
index 676d2bde2..8e6ffdca9 100644
--- a/ceno_recursion/src/zkvm_verifier/verifier.rs
+++ b/ceno_recursion/src/zkvm_verifier/verifier.rs
@@ -29,7 +29,6 @@ use crate::{
 use ceno_zkvm::structs::{ComposedConstrainSystem, VerifyingKey, ZKVMVerifyingKey};
 use ff_ext::BabyBearExt4;
 
-use crate::transcript::{challenger_add_forked_index, clone_challenger_state};
 use gkr_iop::{
     evaluation::EvalExpression,
     gkr::{
@@ -167,22 +166,6 @@ pub fn verify_zkvm_proof<C: Config<F = F>>(
     );
     challenger.observe(builder, log2_max_codeword_size_felt);
 
-    iter_zip!(builder, zkvm_proof_input.chip_proofs).for_each(|ptr_vec, builder| {
-        let chip_proofs = builder.iter_ptr_get(&zkvm_proof_input.chip_proofs, ptr_vec[0]);
-        let chip_idx = builder.get(&chip_proofs, 0).idx_felt;
-        challenger.observe(builder, chip_idx);
-
-        iter_zip!(builder, chip_proofs).for_each(|ptr_vec, builder| {
-            let chip_proof = builder.iter_ptr_get(&chip_proofs, ptr_vec[0]);
-
-            iter_zip!(builder, chip_proof.num_instances).for_each(|ptr_vec, builder| {
-                let num_instance = builder.iter_ptr_get(&chip_proof.num_instances, ptr_vec[0]);
-                let num_instance = builder.unsafe_cast_var_to_felt(num_instance);
-                challenger.observe(builder, num_instance);
-            });
-        });
-    });
-
     let alpha = challenger.sample_ext(builder);
     let beta = challenger.sample_ext(builder);
 
@@ -253,9 +236,15 @@ pub fn verify_zkvm_proof<C: Config<F = F>>(
 
             iter_zip!(builder, chip_proofs).for_each(|ptr_vec, builder| {
                 let chip_proof = builder.iter_ptr_get(&chip_proofs, ptr_vec[0]);
-                // fork transcript to support chip concurrently proved
-                let mut chip_challenger = clone_challenger_state(builder, &challenger);
-                challenger_add_forked_index(builder, &mut chip_challenger, &forked_sample_index);
+                // Fork chip transcript independently and bind challenges/metadata in verifier order.
+                let mut chip_challenger = DuplexChallengerVariable::new(builder);
+                transcript_observe_label(builder, &mut chip_challenger, b"fork");
+                let alpha_felts = builder.ext2felt(alpha);
+                chip_challenger.observe_slice(builder, alpha_felts);
+                let beta_felts = builder.ext2felt(beta);
+                chip_challenger.observe_slice(builder, beta_felts);
+                let fork_id_felt = builder.unsafe_cast_var_to_felt(forked_sample_index.get_var());
+                chip_challenger.observe(builder, fork_id_felt);
                 builder.assert_usize_eq(
                     chip_proof.rw_out_evals.length.clone(),
                     Usize::from(
@@ -267,6 +256,11 @@ pub fn verify_zkvm_proof<C: Config<F = F>>(
                     Usize::from(circuit_vk.get_cs().num_lks() * 4),
                 );
                 chip_challenger.observe(builder, chip_proof.idx_felt);
+                iter_zip!(builder, chip_proof.num_instances).for_each(|ptr_vec, builder| {
+                    let num_instance = builder.iter_ptr_get(&chip_proof.num_instances, ptr_vec[0]);
+                    let num_instance = builder.unsafe_cast_var_to_felt(num_instance);
+                    chip_challenger.observe(builder, num_instance);
+                });
 
                 // getting the number of dummy padding item that we used in this opcode circuit
                 let num_lks: Var<C::N> =
diff --git a/ceno_recursion_v2/skills/ceno-recursion-principles/SKILL.md b/ceno_recursion_v2/skills/ceno-recursion-principles/SKILL.md
index e27f7e4b6..790c50fb4 100644
--- a/ceno_recursion_v2/skills/ceno-recursion-principles/SKILL.md
+++ b/ceno_recursion_v2/skills/ceno-recursion-principles/SKILL.md
@@ -72,50 +72,6 @@ When forking, keep:
 - Keep target regression test command handy:
   - `RUST_MIN_STACK=33554432 RUST_BACKTRACE=1 cargo test -p ceno_recursion_v2 leaf_app_proof_round_trip_placeholder -- --nocapture`
 
-## AIR Refactor Execution Protocol (Reusable)
-
-Use this protocol for any AIR/module refactor (not tied to a specific module such as vm_pvs).
-
-### A. Diff-First Task Intake
-- Start from user edits, not assumptions: inspect `git diff` (staged + unstaged).
-- Extract added comments/TODOs and convert them into a condensed checklist with:
-  - behavior to implement,
-  - exact file/symbol target,
-  - acceptance condition.
-- Keep comments short and actionable; remove or resolve implementation TODOs during the patch.
-
-### B. Source-of-Truth and Pattern References
-- For transcript/order semantics and real data filling, use:
-  - `../ceno_zkvm/src/scheme/verifier.rs` (`verify_proof_validity`).
-- For constraint/bus message skeleton patterns, refer to upstream OpenVM continuation/recursion crates and mirror local style.
-- Prefer adapting existing local AIR/bus patterns over inventing new message shapes.
-
-### C. Transcript and Bus Invariants
-- Preserve transcript-visible operation order before challenge sampling (`alpha`, `beta`) exactly.
-- Keep producer/consumer bus keys aligned (`proof_idx`, `air_idx`, `pv_idx`, `tidx`, gating flags).
-- If preflight ownership moves across modules, ensure order is preserved via explicit orchestrator steps.
-- Do not silently change AIR count/order: `airs()` ordering must match proving-context ordering.
-
-### D. Refactor Steps (Generic)
-1. Read `git diff` and publish condensed implementation plan.
-2. Implement real data population from source-of-truth fields (avoid placeholder drift).
-3. Wire constraints/bus message skeletons using upstream-compatible patterns.
-4. Reconcile preflight/transcript ordering with verifier semantics.
-5. Remove stale comments and dead wiring introduced by prior placeholders.
-
-### E. Required Validation Gate
-Run both commands before declaring completion:
-
-```bash
-cargo fmt --all
-RUST_LOG=debug RUST_MIN_STACK=33554432 RUST_BACKTRACE=1 cargo test --release -p ceno_recursion_v2 leaf_app_proof_round_trip_placeholder -- --nocapture
-```
-
-Acceptance criteria:
-- Both commands pass.
-- No order mismatch against source-of-truth transcript sequence.
-- No unresolved implementation TODOs from the task diff.
-
 ## Acceptance Checklist for Migration PRs
 
 Before continuing to next module, verify:
diff --git a/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/SKILL.md b/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/SKILL.md
index afc2b5fb8..d6e4a010b 100644
--- a/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/SKILL.md
+++ b/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: recursion-v2-git-diff-driven-task
-description: Diff-driven execution protocol for ceno_recursion_v2 tasks. Harvest newly added code comments from git diff, condense into an implementation checklist, implement against upstream OpenVM skeleton patterns, and fill real data from ceno_zkvm verifier source-of-truth.
+description: Diff-driven execution protocol for ceno_recursion_v2 tasks. Harvest TODO/task comments from current git diff, align strictly with the user's change request, implement against upstream OpenVM skeleton patterns, and fill real data from ceno_zkvm verifier source-of-truth.
 ---
 
 # Recursion V2 Git Diff Driven Task
@@ -10,7 +10,8 @@ description: Diff-driven execution protocol for ceno_recursion_v2 tasks. Harvest
 Use this skill when task intent is written into newly added source comments and implementation should be driven directly by `git diff`.
 
 This protocol enforces:
-- comment-first task intake from staged and unstaged diffs,
+- TODO/comment-first task intake from staged and unstaged diffs,
+- explicit alignment to the user's current change request,
 - condensed, reviewable implementation plans,
 - upstream-compatible constraint/bus skeleton wiring,
 - source-of-truth data fill from the verifier path,
@@ -27,21 +28,22 @@ Use this skill when any of the following applies:
 ## Diff Intake Protocol (Required)
 
 1. Read `git diff` from both unstaged and staged changes.
-2. Extract only newly added task comments that describe behavior changes.
-3. Convert comments into a condensed checklist item format:
+2. Extract only newly added TODO/task comments that describe behavior changes.
+3. Restate the explicit user change request and filter out out-of-scope TODOs/comments.
+4. Convert in-scope items into a condensed checklist format:
    - `target`: file and symbol,
    - `behavior`: what must be implemented,
    - `acceptance`: concrete condition proving completion.
-4. Remove duplicate or ambiguous items and keep only executable work items.
-5. Present the condensed plan for review before editing logic.
+5. Remove duplicate or ambiguous items and keep only executable work items.
+6. Present the condensed plan for review before editing logic.
 
 ## Implementation Protocol (Required)
 
 1. Implement checklist items in priority order.
-2. Keep implementation scoped to the comment-defined behavior.
-3. Resolve temporary task comments by either:
-   - replacing them with finished code, or
-   - retaining a minimal explicit TODO with ownership if blocked.
+2. Keep implementation scoped to the user-approved request and in-scope diff TODOs.
+3. Resolve every in-scope diff TODO by either:
+   - replacing it with finished code, or
+   - retaining a minimal explicit TODO with ownership and blocker rationale.
 4. Preserve existing AIR ordering and bus key invariants unless checklist items explicitly request changes.
 
 ## Reference Hierarchy
@@ -59,7 +61,7 @@ Use references in this order:
 ## Plan Quality Bar
 
 A valid plan must:
-- map every actionable diff comment to exactly one checklist item,
+- map every actionable in-scope diff TODO/comment to exactly one checklist item,
 - state file/symbol targets explicitly,
 - include acceptance conditions that can be verified by code reading or test execution,
 - and call out any blocker assumptions before implementation starts.
@@ -77,6 +79,7 @@ cargo fmt --all
 
 Task is done only when all are true:
 - All condensed checklist items are implemented or explicitly marked blocked with rationale.
+- All in-scope TODOs introduced by the current `git diff` are resolved or explicitly owned/blocked.
 - No unresolved implementation comments remain from the task diff without ownership.
 - Constraint/bus skeleton usage follows upstream-compatible patterns.
 - Real data fill aligns with `ceno_zkvm/src/scheme/verifier.rs` semantics.
@@ -85,12 +88,12 @@ Task is done only when all are true:
 ## Quick Reusable Checklist Template
 
 ```markdown
-- [ ] Collect unstaged + staged git diff comments
+- [ ] Collect unstaged + staged git diff TODO/task comments
+- [ ] Restate user request and mark in-scope items only
 - [ ] Condense into target/behavior/acceptance checklist
 - [ ] Share plan and get implementation go-ahead
 - [ ] Implement using upstream continuation/recursion skeleton patterns
 - [ ] Fill real data per ceno_zkvm verifier source-of-truth
-- [ ] Remove or resolve task comments/TODOs introduced in this diff
+- [ ] Resolve in-scope TODOs introduced in this diff
 - [ ] Run required completion gate commands
 ```
-
diff --git a/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/agents/openai.yaml b/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/agents/openai.yaml
index f589037ea..b0e6609dd 100644
--- a/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/agents/openai.yaml
+++ b/ceno_recursion_v2/skills/recursion-v2-git-diff-driven-task/agents/openai.yaml
@@ -1,5 +1,4 @@
 interface:
   display_name: "Recursion V2 Diff-Driven Task"
-  short_description: "Implement recursion-v2 work from newly added git-diff comments with upstream skeleton and verifier-truth alignment"
-  default_prompt: "Follow recursion-v2-git-diff-driven-task: harvest staged+unstaged diff comments, condense into target/behavior/acceptance checklist, use upstream openvm continuation/recursion constraint-bus skeletons, fill real data per ceno_zkvm/src/scheme/verifier.rs, then run required completion gate commands in order."
-
+  short_description: "Implement recursion-v2 work from current git-diff TODO/task comments while strictly honoring explicit user scope"
+  default_prompt: "Follow recursion-v2-git-diff-driven-task: read staged+unstaged git diff, extract newly added TODO/task comments, restate and enforce explicit user request scope, condense in-scope work into target/behavior/acceptance checklist, implement against upstream openvm continuation/recursion constraint-bus skeletons, fill real data per ceno_zkvm/src/scheme/verifier.rs, resolve all in-scope diff TODOs (or mark owner+blocker), then run required completion gate commands in order."
diff --git a/ceno_recursion_v2/src/bus.rs b/ceno_recursion_v2/src/bus.rs
index ad45b7c4f..21562d770 100644
--- a/ceno_recursion_v2/src/bus.rs
+++ b/ceno_recursion_v2/src/bus.rs
@@ -1,6 +1,8 @@
 use openvm_poseidon2_air::POSEIDON2_WIDTH;
 use openvm_stark_sdk::config::baby_bear_poseidon2::D_EF;
-use recursion_circuit::{bus as upstream, define_typed_per_proof_permutation_bus};
+use recursion_circuit::{
+    bus as upstream, define_typed_per_proof_lookup_bus, define_typed_per_proof_permutation_bus,
+};
 pub use upstream::{
     AirPresenceBus, AirPresenceBusMessage, AirShapeBus, AirShapeBusMessage, CachedCommitBus,
     CachedCommitBusMessage, ColumnClaimsBus, ColumnClaimsMessage, ExpressionClaimNMaxBus,
@@ -11,7 +13,7 @@ pub use upstream::{
 };
 
 // ── Forked transcript bus ─────────────────────────────────────────────────────
-// Carries per-chip fork transcript state, addressed by (fork_id, fork_tidx)
+// Carries per-chip fork transcript state, addressed by (fork_id, tidx)
 // instead of the absolute tidx used by the trunk TranscriptBus.
 
 #[repr(C)]
@@ -19,8 +21,8 @@ pub use upstream::{
 pub struct ForkedTranscriptBusMessage<T> {
     /// Fork identifier (1-based). Matches TranscriptAir's fork_id column.
     pub fork_id: T,
-    /// Position within the fork state (0-based).
-    pub fork_tidx: T,
+    /// Position within the fork transcript namespace.
+    pub tidx: T,
     /// Sponge state element being communicated.
     pub value: T,
     /// 1 if this is a sample operation, 0 if observe.
@@ -29,6 +31,29 @@ pub struct ForkedTranscriptBusMessage<T> {
 
 define_typed_per_proof_permutation_bus!(ForkedTranscriptBus, ForkedTranscriptBusMessage);
 
+#[repr(u8)]
+#[derive(Debug, Clone, Copy)]
+pub enum LookupChallengeKind {
+    Alpha = 0,
+    Beta = 1,
+}
+
+impl LookupChallengeKind {
+    pub const fn as_usize(self) -> usize {
+        self as usize
+    }
+}
+
+#[repr(C)]
+#[derive(stark_recursion_circuit_derive::AlignedBorrow, Debug, Clone, Copy)]
+pub struct LookupChallengeMessage<T> {
+    pub kind: T,
+    pub word_idx: T,
+    pub value: T,
+}
+
+define_typed_per_proof_lookup_bus!(LookupChallengeBus, LookupChallengeMessage);
+
 #[repr(C)]
 #[derive(stark_recursion_circuit_derive::AlignedBorrow, Debug, Clone, Copy)]
 pub struct TowerModuleMessage<T> {
diff --git a/ceno_recursion_v2/src/circuit/inner/mod.rs b/ceno_recursion_v2/src/circuit/inner/mod.rs
index b59d5a324..2d66e353d 100644
--- a/ceno_recursion_v2/src/circuit/inner/mod.rs
+++ b/ceno_recursion_v2/src/circuit/inner/mod.rs
@@ -38,6 +38,7 @@ impl<SC: StarkProtocolConfig<F = F>, S: AggregationSubCircuit> Circuit<SC> for I
         let transcript_bus = bus_inventory.transcript_bus;
         let public_values_bus = bus_inventory.public_values_bus;
         let cached_commit_bus = bus_inventory.cached_commit_bus;
+        let lookup_challenge_bus = bus_inventory.lookup_challenge_bus;
         let poseidon2_compress_bus = bus_inventory.poseidon2_compress_bus;
         let pvs_air_consistency_bus =
             PvsAirConsistencyBus::new(self.verifier_circuit.next_bus_idx());
@@ -63,6 +64,7 @@ impl<SC: StarkProtocolConfig<F = F>, S: AggregationSubCircuit> Circuit<SC> for I
             transcript_bus,
             public_values_bus,
             cached_commit_bus,
+            lookup_challenge_bus,
             pvs_air_consistency_bus,
             deferral_enabled,
             instance_public_value_indices: self.instance_public_value_indices.clone(),
diff --git a/ceno_recursion_v2/src/circuit/inner/trace.rs b/ceno_recursion_v2/src/circuit/inner/trace.rs
index 135285bfa..79d0c579e 100644
--- a/ceno_recursion_v2/src/circuit/inner/trace.rs
+++ b/ceno_recursion_v2/src/circuit/inner/trace.rs
@@ -83,9 +83,20 @@ impl InnerTraceGen<CpuBackend<BabyBearPoseidon2Config>> for InnerTraceGenImpl {
             child_dag_commit,
             self.deferral_enabled,
         );
+        let (preflights, per_proof_initial_transcripts): (Vec<Preflight>, Vec<TS>) = proofs
+            .iter()
+            .map(|proof| {
+                let mut sponge = initial_transcript.clone();
+                let mut preflight = Preflight::default();
+                super::verifier::run_preflight(child_vk, proof, &mut preflight, &mut sponge);
+                super::vm_pvs::run_preflight(child_vk, proof, &mut preflight, &mut sponge);
+                (preflight, sponge)
+            })
+            .unzip();
         let vm_ctx = super::vm_pvs::generate_proving_ctx(
             child_vk,
             proofs,
+            &preflights,
             proofs_type,
             child_is_app,
             self.deferral_enabled,
@@ -105,17 +116,6 @@ impl InnerTraceGen<CpuBackend<BabyBearPoseidon2Config>> for InnerTraceGenImpl {
             super::unset::generate_proving_ctx(&[], child_is_app)
         };
 
-        let per_proof_initial_transcripts = proofs
-            .iter()
-            .map(|proof| {
-                let mut sponge = initial_transcript.clone();
-                let mut preflight = Preflight::default();
-                super::verifier::run_preflight(child_vk, proof, &mut preflight, &mut sponge);
-                super::vm_pvs::run_preflight(child_vk, proof, &mut preflight, &mut sponge);
-                sponge
-            })
-            .collect();
-
         (
             vec![verifier_ctx, vm_ctx, idx2_ctx],
             poseidon2_inputs,
diff --git a/ceno_recursion_v2/src/circuit/inner/vm_pvs/air.rs b/ceno_recursion_v2/src/circuit/inner/vm_pvs/air.rs
index 5079be775..63ae7727d 100644
--- a/ceno_recursion_v2/src/circuit/inner/vm_pvs/air.rs
+++ b/ceno_recursion_v2/src/circuit/inner/vm_pvs/air.rs
@@ -10,7 +10,7 @@ use openvm_circuit_primitives::utils::{and, assert_array_eq, not};
 use openvm_stark_backend::{
     BaseAirWithPublicValues, PartitionedBaseAir, interaction::InteractionBuilder,
 };
-use openvm_stark_sdk::config::baby_bear_poseidon2::DIGEST_SIZE;
+use openvm_stark_sdk::config::baby_bear_poseidon2::{D_EF, DIGEST_SIZE};
 use p3_air::{Air, AirBuilder, AirBuilderWithPublicValues, BaseAir};
 use p3_field::PrimeCharacteristicRing;
 use p3_matrix::Matrix;
@@ -19,7 +19,10 @@ use recursion_circuit::bus::{
 };
 use stark_recursion_circuit_derive::AlignedBorrow;
 
-use crate::circuit::inner::{bus::PvsAirConsistencyBus, vm_pvs::VmPvs};
+use crate::{
+    bus::{LookupChallengeBus, LookupChallengeKind, LookupChallengeMessage},
+    circuit::inner::{bus::PvsAirConsistencyBus, vm_pvs::VmPvs},
+};
 
 #[repr(C)]
 #[derive(AlignedBorrow)]
@@ -28,6 +31,10 @@ pub struct VmPvsCols<F> {
     pub is_valid: F,
     pub is_last: F,
     pub has_verifier_pvs: F,
+    pub lookup_challenge_alpha: [F; D_EF],
+    pub lookup_challenge_beta: [F; D_EF],
+    pub lookup_challenge_alpha_lookup_count: F,
+    pub lookup_challenge_beta_lookup_count: F,
     pub child_pvs: VmPvs<F>,
 }
 
@@ -35,6 +42,7 @@ pub struct VmPvsAir {
     pub transcript_bus: TranscriptBus,
     pub public_values_bus: PublicValuesBus,
     pub cached_commit_bus: CachedCommitBus,
+    pub lookup_challenge_bus: LookupChallengeBus,
     pub pvs_air_consistency_bus: PvsAirConsistencyBus,
     pub deferral_enabled: bool,
     pub instance_public_value_indices: Arc<Vec<Vec<usize>>>,
@@ -181,13 +189,13 @@ impl<AB: AirBuilder + InteractionBuilder + AirBuilderWithPublicValues> Air<AB> f
         // );
 
         // Commitments are observed after transcript-visible public values in preflight.
-        let start_tidx_after_commitment = VmPvs::<u8>::width() - 3 * DIGEST_SIZE;
+        let start_tidx_after_public_value = VmPvs::<u8>::width() - 3 * DIGEST_SIZE;
         for (didx, value) in local.child_pvs.fixed_commit.iter().enumerate() {
             self.transcript_bus.receive(
                 builder,
                 local.proof_idx,
                 TranscriptBusMessage {
-                    tidx: AB::Expr::from_usize(start_tidx_after_commitment + didx),
+                    tidx: AB::Expr::from_usize(start_tidx_after_public_value + didx),
                     value: (*value).into(),
                     is_sample: AB::Expr::ZERO,
                 },
@@ -199,7 +207,7 @@ impl<AB: AirBuilder + InteractionBuilder + AirBuilderWithPublicValues> Air<AB> f
                 builder,
                 local.proof_idx,
                 TranscriptBusMessage {
-                    tidx: AB::Expr::from_usize(start_tidx_after_commitment + DIGEST_SIZE + didx),
+                    tidx: AB::Expr::from_usize(start_tidx_after_public_value + DIGEST_SIZE + didx),
                     value: (*value).into(),
                     is_sample: AB::Expr::ZERO,
                 },
@@ -212,7 +220,7 @@ impl<AB: AirBuilder + InteractionBuilder + AirBuilderWithPublicValues> Air<AB> f
                 local.proof_idx,
                 TranscriptBusMessage {
                     tidx: AB::Expr::from_usize(
-                        start_tidx_after_commitment + 2 * DIGEST_SIZE + didx,
+                        start_tidx_after_public_value + 2 * DIGEST_SIZE + didx,
                     ),
                     value: (*value).into(),
                     is_sample: AB::Expr::ZERO,
@@ -221,6 +229,29 @@ impl<AB: AirBuilder + InteractionBuilder + AirBuilderWithPublicValues> Air<AB> f
             );
         }
 
+        for i in 0..D_EF {
+            self.lookup_challenge_bus.add_key_with_lookups(
+                builder,
+                local.proof_idx,
+                LookupChallengeMessage {
+                    kind: AB::Expr::from_usize(LookupChallengeKind::Alpha.as_usize()),
+                    word_idx: AB::Expr::from_usize(i),
+                    value: local.lookup_challenge_alpha[i].into(),
+                },
+                local.is_valid * local.lookup_challenge_alpha_lookup_count,
+            );
+            self.lookup_challenge_bus.add_key_with_lookups(
+                builder,
+                local.proof_idx,
+                LookupChallengeMessage {
+                    kind: AB::Expr::from_usize(LookupChallengeKind::Beta.as_usize()),
+                    word_idx: AB::Expr::from_usize(i),
+                    value: local.lookup_challenge_beta[i].into(),
+                },
+                local.is_valid * local.lookup_challenge_beta_lookup_count,
+            );
+        }
+
         // We look up proof metadata from VerifierPvsAir here to ensure consistency on each row.
         // self.pvs_air_consistency_bus.lookup_key(
         //     builder,
diff --git a/ceno_recursion_v2/src/circuit/inner/vm_pvs/mod.rs b/ceno_recursion_v2/src/circuit/inner/vm_pvs/mod.rs
index 31e5236d1..102876658 100644
--- a/ceno_recursion_v2/src/circuit/inner/vm_pvs/mod.rs
+++ b/ceno_recursion_v2/src/circuit/inner/vm_pvs/mod.rs
@@ -39,7 +39,7 @@ pub use trace::*;
 pub fn run_preflight<TS>(
     child_vk: &RecursionVk,
     proof: &RecursionProof,
-    _preflight: &mut Preflight,
+    preflight: &mut Preflight,
     ts: &mut TS,
 ) where
     TS: FiatShamirTranscript<BabyBearPoseidon2Config> + TranscriptHistory,
@@ -74,4 +74,12 @@ pub fn run_preflight<TS>(
         ts.observe(elem);
     }
     ts.observe(F::from_u64(witin.log2_max_codeword_size as u64));
+
+    let alpha_ext = ts.sample_ext();
+    let beta_ext = ts.sample_ext();
+    eprintln!("vm_pvs alpha {} beta {}", alpha_ext, beta_ext);
+    preflight.vm_pvs.lookup_challenge_alpha = alpha_ext;
+    preflight.vm_pvs.lookup_challenge_beta = beta_ext;
+    preflight.vm_pvs.lookup_challenge_alpha_lookup_count = 0;
+    preflight.vm_pvs.lookup_challenge_beta_lookup_count = 0;
 }
diff --git a/ceno_recursion_v2/src/circuit/inner/vm_pvs/trace.rs b/ceno_recursion_v2/src/circuit/inner/vm_pvs/trace.rs
index 7cee554c3..a1262f04b 100644
--- a/ceno_recursion_v2/src/circuit/inner/vm_pvs/trace.rs
+++ b/ceno_recursion_v2/src/circuit/inner/vm_pvs/trace.rs
@@ -1,7 +1,9 @@
 use openvm_cpu_backend::CpuBackend;
 use openvm_stark_backend::prover::AirProvingContext;
-use openvm_stark_sdk::config::baby_bear_poseidon2::{BabyBearPoseidon2Config, DIGEST_SIZE, F};
-use p3_field::PrimeCharacteristicRing;
+use openvm_stark_sdk::config::baby_bear_poseidon2::{
+    BabyBearPoseidon2Config, D_EF, DIGEST_SIZE, EF, F,
+};
+use p3_field::{BasedVectorSpace, PrimeCharacteristicRing};
 use p3_matrix::dense::RowMajorMatrix;
 use std::borrow::BorrowMut;
 
@@ -10,16 +12,18 @@ use crate::{
         ProofsType,
         vm_pvs::{VmPvs, air::VmPvsCols},
     },
-    system::{RecursionProof, RecursionVk},
+    system::{Preflight, RecursionProof, RecursionVk},
 };
 
 pub fn generate_proving_ctx(
     child_vk: &RecursionVk,
     proofs: &[RecursionProof],
+    preflights: &[Preflight],
     proofs_type: ProofsType,
     child_is_app: bool,
     deferral_enabled: bool,
 ) -> AirProvingContext<CpuBackend<BabyBearPoseidon2Config>> {
+    debug_assert_eq!(proofs.len(), preflights.len());
     let _ = (proofs_type, child_is_app);
     assert!(
         !deferral_enabled,
@@ -50,9 +54,16 @@ pub fn generate_proving_ctx(
 
         if row_idx < proofs.len() {
             let proof = &proofs[row_idx];
+            let preflight = &preflights[row_idx];
             cols.is_valid = F::ONE;
             cols.is_last = F::from_bool(row_idx + 1 == proofs.len());
             cols.has_verifier_pvs = F::ZERO;
+            cols.lookup_challenge_alpha = ef_to_limbs(preflight.vm_pvs.lookup_challenge_alpha);
+            cols.lookup_challenge_beta = ef_to_limbs(preflight.vm_pvs.lookup_challenge_beta);
+            cols.lookup_challenge_alpha_lookup_count =
+                F::from_usize(preflight.vm_pvs.lookup_challenge_alpha_lookup_count);
+            cols.lookup_challenge_beta_lookup_count =
+                F::from_usize(preflight.vm_pvs.lookup_challenge_beta_lookup_count);
             cols.child_pvs = build_vm_pvs(fixed_commit, fixed_no_omc_init_commit, proof);
         }
 
@@ -119,3 +130,9 @@ fn split_u32_lo_hi(value: u32) -> [F; 2] {
         F::from_u32((value >> 16) & 0xffff),
     ]
 }
+
+fn ef_to_limbs(value: EF) -> [F; D_EF] {
+    let mut out = [F::ZERO; D_EF];
+    out.copy_from_slice(value.as_basis_coefficients_slice());
+    out
+}
diff --git a/ceno_recursion_v2/src/cuda/types.rs b/ceno_recursion_v2/src/cuda/types.rs
index d5c7502e6..d6c3d1436 100644
--- a/ceno_recursion_v2/src/cuda/types.rs
+++ b/ceno_recursion_v2/src/cuda/types.rs
@@ -11,7 +11,6 @@ pub struct TraceHeight {
 #[derive(Debug, Default)]
 pub struct TraceMetadata {
     pub cached_idx: usize,
-    pub starting_cidx: usize,
     pub total_interactions: usize,
     pub num_air_id_lookups: usize,
 }
diff --git a/ceno_recursion_v2/src/proof_shape/mod.rs b/ceno_recursion_v2/src/proof_shape/mod.rs
index e9c797f5d..cf931b2b4 100644
--- a/ceno_recursion_v2/src/proof_shape/mod.rs
+++ b/ceno_recursion_v2/src/proof_shape/mod.rs
@@ -5,12 +5,9 @@ use openvm_circuit_primitives::encoder::Encoder;
 use openvm_cpu_backend::CpuBackend;
 use openvm_stark_backend::{
     AirRef, FiatShamirTranscript, StarkProtocolConfig, TranscriptHistory,
-    keygen::types::VerifierSinglePreprocessedData, p3_maybe_rayon::prelude::*,
-    prover::AirProvingContext,
-};
-use openvm_stark_sdk::config::baby_bear_poseidon2::{
-    BabyBearPoseidon2Config, DIGEST_SIZE, Digest, F,
+    p3_maybe_rayon::prelude::*, prover::AirProvingContext,
 };
+use openvm_stark_sdk::config::baby_bear_poseidon2::{BabyBearPoseidon2Config, F};
 use p3_field::PrimeCharacteristicRing;
 use p3_matrix::dense::RowMajorMatrix;
 
@@ -44,13 +41,12 @@ mod cuda_abi;
 pub struct AirMetadata {
     is_required: bool,
     num_public_values: usize,
-    main_width: usize,
-    cached_widths: Vec<usize>,
+    num_witin: usize,
+    num_structural_witin: usize,
+    num_fixed: usize,
     num_read_count: usize,
     num_write_count: usize,
     num_logup_count: usize,
-    preprocessed_width: Option<usize>,
-    preprocessed_data: Option<VerifierSinglePreprocessedData<Digest>>,
 }
 
 pub struct ProofShapeModule {
@@ -66,9 +62,6 @@ pub struct ProofShapeModule {
 
     // Required for ProofShapeAir tracegen + constraints
     idx_encoder: Arc<Encoder>,
-    min_cached_idx: usize,
-    max_cached: usize,
-    commit_mult: usize,
 
     // Module sends extra public values message for use outside of verifier
     // sub-circuit if true
@@ -84,12 +77,7 @@ impl ProofShapeModule {
     ) -> Self {
         let num_airs = child_vk.circuit_vks.len();
         let idx_encoder = Arc::new(Encoder::new(num_airs, 2, true));
-
-        let min_cached_idx = 0;
-        let _min_cached = 1;
-        let max_cached = 2;
-
-        let per_air = extract_air_metadata_from_vk(child_vk, max_cached);
+        let per_air = extract_air_metadata_from_vk(child_vk);
 
         let range_bus = bus_inventory.range_checker_bus;
         Self {
@@ -100,9 +88,6 @@ impl ProofShapeModule {
             starting_tidx_bus: StartingTidxBus::new(b.new_bus_idx()),
             num_pvs_bus: NumPublicValuesBus::new(b.new_bus_idx()),
             idx_encoder,
-            min_cached_idx,
-            max_cached,
-            commit_mult: 100,
             continuations_enabled,
         }
     }
@@ -119,6 +104,9 @@ impl ProofShapeModule {
     {
         let _ = self;
 
+        let transcript_start_tidx = ts.len();
+        preflight.proof_shape.fork_start_tidx = ts.len();
+
         // Build per-air shape metadata from present chip proofs.
         let mut sorted_trace_vdata = proof
             .chip_proofs
@@ -136,13 +124,11 @@ impl ProofShapeModule {
             .collect_vec();
         sorted_trace_vdata.sort_by_key(|(air_idx, v)| (usize::MAX - v.log_height, *air_idx));
         preflight.proof_shape.sorted_trace_vdata = sorted_trace_vdata;
+        // TODO remove l_skip
         preflight.proof_shape.l_skip = 0;
 
-        let mut current_tidx = crate::utils::TranscriptLabel::Riscv.field_len();
-        let mut current_cidx = 1usize;
+        let mut current_tidx = transcript_start_tidx;
         let mut starting_tidx = vec![0usize; child_vk.circuit_vks.len()];
-        let mut starting_cidx = vec![0usize; child_vk.circuit_vks.len()];
-        let mut pvs_tidx = Vec::new();
         let n_max = preflight
             .proof_shape
             .sorted_trace_vdata
@@ -155,76 +141,40 @@ impl ProofShapeModule {
             let metadata = &self.per_air[air_idx];
             let is_present = proof.chip_proofs.contains_key(&air_idx);
             starting_tidx[air_idx] = current_tidx;
-            starting_cidx[air_idx] = current_cidx;
-
-            current_cidx += usize::from(metadata.preprocessed_data.is_some());
-            current_cidx += metadata.cached_widths.len();
 
             if !metadata.is_required {
                 current_tidx += 1;
             }
 
             if is_present {
-                if metadata.preprocessed_data.is_some() {
-                    current_tidx += DIGEST_SIZE;
-                } else {
-                    current_tidx += 1;
-                }
-
-                for cached_width in &metadata.cached_widths {
-                    if *cached_width != 0 {
-                        current_tidx += DIGEST_SIZE;
-                    }
-                }
+                current_tidx += 1;
 
                 if metadata.num_public_values != 0 {
-                    pvs_tidx.push(current_tidx);
                     current_tidx += metadata.num_public_values;
                 }
             }
         }
 
         preflight.proof_shape.starting_tidx = starting_tidx;
-        preflight.proof_shape.starting_cidx = starting_cidx;
-        preflight.proof_shape.pvs_tidx = pvs_tidx;
         preflight.proof_shape.post_tidx = current_tidx;
         preflight.proof_shape.n_max = n_max;
-        preflight.proof_shape.n_logup = preflight
-            .gkr
-            .chips
-            .iter()
-            .filter(|entry| proof.chip_proofs.contains_key(&entry.chip_idx))
-            .map(|entry| entry.tower_replay.layers.len())
+        // n_logup is the per-proof max number of tower layers (num_layers).
+        // Keep this independent from tower replay side effects.
+        preflight.proof_shape.n_logup = proof
+            .chip_proofs
+            .values()
+            .flat_map(|instances| instances.iter())
+            .map(|chip_proof| chip_proof.tower_proof.proofs.len())
             .max()
             .unwrap_or(0);
 
-        // Verifier preprocess: absorb (circuit_idx, num_instance...) for all chip proofs.
-        for (&chip_idx, chip_instances) in &proof.chip_proofs {
-            ts.observe(F::from_usize(chip_idx));
-            for num_instance in chip_instances
-                .iter()
-                .flat_map(|instance| &instance.num_instances)
-            {
-                ts.observe(F::from_usize(*num_instance));
-            }
-        }
-
-        preflight.proof_shape.alpha_tidx = ts.len();
-        let _alpha = FiatShamirTranscript::<BabyBearPoseidon2Config>::sample_ext(ts);
-        preflight.proof_shape.beta_tidx = ts.len();
-        let _beta = FiatShamirTranscript::<BabyBearPoseidon2Config>::sample_ext(ts);
-        preflight.proof_shape.fork_start_tidx = ts.len();
-
-        eprintln!("_alpha {} _beta {}", _alpha, _beta);
-
         let _ = child_vk;
     }
 
     #[allow(dead_code)]
     fn placeholder_air_widths(&self) -> Vec<usize> {
-        let proof_shape_width = proof_shape::ProofShapeCols::<u8, 4>::width()
-            + self.idx_encoder.width()
-            + self.max_cached * DIGEST_SIZE;
+        let proof_shape_width =
+            proof_shape::ProofShapeCols::<u8, 4>::width() + self.idx_encoder.width();
         let pvs_width = pvs::PublicValuesCols::<u8>::width();
         let range_width = RangeCheckerCols::<u8>::width();
         // TODO(recursion-proof-bridge): replace proof-shape module placeholder contexts with
@@ -252,30 +202,37 @@ fn extract_rwlk_counts(child_vk: &RecursionVk, expected_len: usize) -> Vec<(usiz
         .collect()
 }
 
-fn extract_air_metadata_from_vk(child_vk: &RecursionVk, max_cached: usize) -> Vec<AirMetadata> {
+fn extract_air_metadata_from_vk(child_vk: &RecursionVk) -> Vec<AirMetadata> {
     let rwlk_counts = extract_rwlk_counts(child_vk, child_vk.circuit_vks.len());
     (0..child_vk.circuit_vks.len())
         .map(|idx| {
             let (num_read_count, num_write_count, num_logup_count) =
                 rwlk_counts.get(idx).copied().unwrap_or((0, 0, 0));
 
-            let num_public_values = child_vk
+            let (num_public_values, num_witin, num_structural_witin, num_fixed) = child_vk
                 .circuit_index_to_name
                 .get(&idx)
                 .and_then(|name| child_vk.circuit_vks.get(name))
-                .map(|circuit_vk| circuit_vk.get_cs().zkvm_v1_css.instance.len())
-                .unwrap_or(0);
+                .map(|circuit_vk| {
+                    let css = &circuit_vk.get_cs().zkvm_v1_css;
+                    (
+                        css.instance.len(),
+                        css.num_witin as usize,
+                        css.num_structural_witin as usize,
+                        css.num_fixed,
+                    )
+                })
+                .unwrap_or((0, 0, 0, 0));
 
             AirMetadata {
                 is_required: false,
                 num_public_values,
-                main_width: 0,
-                cached_widths: vec![0; max_cached],
+                num_witin,
+                num_structural_witin,
+                num_fixed,
                 num_read_count,
                 num_write_count,
                 num_logup_count,
-                preprocessed_width: None,
-                preprocessed_data: None,
             }
         })
         .collect_vec()
@@ -289,14 +246,11 @@ impl AirModule for ProofShapeModule {
     fn airs<SC: StarkProtocolConfig<F = F>>(&self) -> Vec<AirRef<SC>> {
         let proof_shape_air = ProofShapeAir::<4, 8> {
             per_air: self.per_air.clone(),
-            min_cached_idx: self.min_cached_idx,
-            max_cached: self.max_cached,
-            commit_mult: self.commit_mult,
             idx_encoder: self.idx_encoder.clone(),
             range_bus: self.range_bus,
             permutation_bus: self.permutation_bus,
             starting_tidx_bus: self.starting_tidx_bus,
-            num_pvs_bus: self.num_pvs_bus,
+            lookup_challenge_bus: self.bus_inventory.lookup_challenge_bus,
             fraction_folder_input_bus: self.bus_inventory.fraction_folder_input_bus,
             expression_claim_n_max_bus: self.bus_inventory.expression_claim_n_max_bus,
             tower_module_bus: self.bus_inventory.tower_module_bus,
@@ -306,8 +260,6 @@ impl AirModule for ProofShapeModule {
             transcript_bus: self.bus_inventory.transcript_bus,
             forked_transcript_bus: self.bus_inventory.forked_transcript_bus,
             n_lift_bus: self.bus_inventory.n_lift_bus,
-            cached_commit_bus: self.bus_inventory.cached_commit_bus,
-            continuations_enabled: self.continuations_enabled,
         };
         let pvs_air = PublicValuesAir {
             public_values_bus: self.bus_inventory.public_values_bus,
@@ -346,20 +298,10 @@ impl<SC: StarkProtocolConfig<F = F>> TraceGenModule<GlobalCtxCpu, CpuBackend<SC>
     ) -> Option<Vec<AirProvingContext<CpuBackend<SC>>>> {
         let pow_checker = &ctx.0;
         let external_range_checks = ctx.1;
-        let cidx_deltas = self
-            .per_air
-            .iter()
-            .map(|metadata| {
-                usize::from(metadata.preprocessed_data.is_some()) + metadata.cached_widths.len()
-            })
-            .collect();
 
         let range_checker = Arc::new(RangeCheckerCpuTraceGenerator::<8>::default());
         let proof_shape = proof_shape::ProofShapeChip::<4, 8>::new(
             self.idx_encoder.clone(),
-            self.min_cached_idx,
-            self.max_cached,
-            cidx_deltas,
             range_checker.clone(),
             pow_checker.clone(),
         );
diff --git a/ceno_recursion_v2/src/proof_shape/proof_shape/air.rs b/ceno_recursion_v2/src/proof_shape/proof_shape/air.rs
index c9aa5ca43..853848d02 100644
--- a/ceno_recursion_v2/src/proof_shape/proof_shape/air.rs
+++ b/ceno_recursion_v2/src/proof_shape/proof_shape/air.rs
@@ -6,11 +6,10 @@ use openvm_circuit_primitives::{
     encoder::Encoder,
     utils::{and, not, or},
 };
-use openvm_poseidon2_air::POSEIDON2_WIDTH;
 use openvm_stark_backend::{
     BaseAirWithPublicValues, PartitionedBaseAir, interaction::InteractionBuilder,
 };
-use openvm_stark_sdk::config::baby_bear_poseidon2::DIGEST_SIZE;
+use openvm_stark_sdk::config::baby_bear_poseidon2::D_EF;
 use p3_air::{Air, AirBuilder, BaseAir};
 use p3_field::{PrimeCharacteristicRing, PrimeField32};
 use p3_matrix::Matrix;
@@ -18,21 +17,24 @@ use stark_recursion_circuit_derive::AlignedBorrow;
 
 use crate::{
     bus::{
-        AirShapeBus, AirShapeBusMessage, CachedCommitBus, ExpressionClaimNMaxBus,
-        ExpressionClaimNMaxMessage, ForkedTranscriptBus, ForkedTranscriptBusMessage,
-        FractionFolderInputBus, FractionFolderInputMessage, HyperdimBus, HyperdimBusMessage,
-        LiftedHeightsBus, LiftedHeightsBusMessage, NLiftBus, NLiftMessage, TowerModuleBus,
-        TowerModuleMessage, TranscriptBus, TranscriptBusMessage,
+        AirShapeBus, AirShapeBusMessage, ExpressionClaimNMaxBus, ExpressionClaimNMaxMessage,
+        ForkedTranscriptBus, ForkedTranscriptBusMessage, FractionFolderInputBus,
+        FractionFolderInputMessage, HyperdimBus, HyperdimBusMessage, LiftedHeightsBus,
+        LiftedHeightsBusMessage, LookupChallengeBus, LookupChallengeKind, LookupChallengeMessage,
+        NLiftBus, NLiftMessage, TowerModuleBus, TowerModuleMessage, TranscriptBus,
+        TranscriptBusMessage,
     },
+    circuit::inner::vm_pvs::VmPvs,
     primitives::bus::{RangeCheckerBus, RangeCheckerBusMessage},
     proof_shape::{
         AirMetadata,
         bus::{
-            AirShapeProperty, NumPublicValuesBus, NumPublicValuesMessage, ProofShapePermutationBus,
-            ProofShapePermutationMessage, StartingTidxBus, StartingTidxMessage,
+            AirShapeProperty, ProofShapePermutationBus, ProofShapePermutationMessage,
+            StartingTidxBus, StartingTidxMessage,
         },
     },
     subairs::nested_for_loop::{NestedForLoopIoCols, NestedForLoopSubAir},
+    tower::tower_transcript_len,
     utils::TranscriptLabel,
 };
 
@@ -44,8 +46,6 @@ pub struct ProofShapeCols<F, const NUM_LIMBS: usize> {
     pub is_first: F,
     pub is_last: F,
 
-    // loop: proof_idx -> idx (air idx)
-    pub idx: F,
     pub sorted_idx: F,
     /// Represents `log2(next_pow_2(height))` when `is_present`.
     ///
@@ -54,24 +54,24 @@ pub struct ProofShapeCols<F, const NUM_LIMBS: usize> {
     /// Whether this AIR needs rotation openings.
     pub need_rot: F,
 
-    // First possible tidx and non-main cidx of the current AIR
+    // First possible transcript index of the current AIR.
     pub starting_tidx: F,
-    // pub starting_cidx: F,
 
-    // Columns that may be read from the transcript. Note that cached_commits is also read
-    // from the transcript.
+    // Columns that may be read from the transcript.
     pub is_present: F,
 
     /// Lifted trace height (`2^log_height`) used in downstream lookups when `is_present`.
     ///
     /// Has a special use on summary row (when `is_last`).
-    pub height: F,
+    pub height_1: F,
+    pub height_2: F,
 
     // Number of present AIRs so far
     pub num_present: F,
 
-    /// Limb decomposition of `height` used for range/decomposition checks.
-    pub height_limbs: [F; NUM_LIMBS],
+    /// Limb decomposition of per-instance heights used for range/decomposition checks.
+    pub height_1_limbs: [F; NUM_LIMBS],
+    pub height_2_limbs: [F; NUM_LIMBS],
 
     /// The maximum hypercube dimension across all present AIR traces, or zero.
     /// Computed as max(0, n0, n1, ...) where ni = log_height_i for each present trace.
@@ -83,13 +83,15 @@ pub struct ProofShapeCols<F, const NUM_LIMBS: usize> {
 
     /// The Poseidon2 sponge state at the fork point (trunk state just before
     /// forking). Constrained to be identical across all rows within a proof.
-    pub current_snapshot_state: [F; POSEIDON2_WIDTH],
+    pub lookup_challenge_alpha: [F; D_EF],
+    pub lookup_challenge_beta: [F; D_EF],
+    pub after_forked_challenge_1: [F; D_EF],
+    pub after_forked_challenge_2: [F; D_EF],
 }
 
 // Variable-length columns are stored at the end
 pub struct ProofShapeVarCols<'a, F> {
-    pub idx_flags: &'a [F],                     // [F; IDX_FLAGS]
-    pub cached_commits: &'a [[F; DIGEST_SIZE]], // [[F; DIGEST_SIZE]; MAX_CACHED]
+    pub idx_flags: &'a [F], // [F; IDX_FLAGS]
 }
 
 /// AIR for verifying the proof shape (trace heights, widths, commitments) of a child proof
@@ -99,9 +101,6 @@ pub struct ProofShapeVarCols<'a, F> {
 pub struct ProofShapeAir<const NUM_LIMBS: usize, const LIMB_BITS: usize> {
     // Parameters derived from vk
     pub per_air: Vec<AirMetadata>,
-    pub min_cached_idx: usize,
-    pub max_cached: usize,
-    pub commit_mult: usize,
 
     // Primitives
     pub idx_encoder: Arc<Encoder>,
@@ -110,7 +109,7 @@ pub struct ProofShapeAir<const NUM_LIMBS: usize, const LIMB_BITS: usize> {
     // Internal buses
     pub permutation_bus: ProofShapePermutationBus,
     pub starting_tidx_bus: StartingTidxBus,
-    pub num_pvs_bus: NumPublicValuesBus,
+    pub lookup_challenge_bus: LookupChallengeBus,
 
     // Inter-module buses
     pub tower_module_bus: TowerModuleBus,
@@ -122,19 +121,13 @@ pub struct ProofShapeAir<const NUM_LIMBS: usize, const LIMB_BITS: usize> {
     pub transcript_bus: TranscriptBus,
     pub forked_transcript_bus: ForkedTranscriptBus,
     pub n_lift_bus: NLiftBus,
-
-    // For continuations
-    pub cached_commit_bus: CachedCommitBus,
-    pub continuations_enabled: bool,
 }
 
 impl<F, const NUM_LIMBS: usize, const LIMB_BITS: usize> BaseAir<F>
     for ProofShapeAir<NUM_LIMBS, LIMB_BITS>
 {
     fn width(&self) -> usize {
-        ProofShapeCols::<F, NUM_LIMBS>::width()
-            + self.idx_encoder.width()
-            + self.max_cached * DIGEST_SIZE
+        ProofShapeCols::<F, NUM_LIMBS>::width() + self.idx_encoder.width()
     }
 }
 impl<F, const NUM_LIMBS: usize, const LIMB_BITS: usize> BaseAirWithPublicValues<F>
@@ -160,11 +153,7 @@ where
         );
         let const_width = ProofShapeCols::<AB::Var, NUM_LIMBS>::width();
 
-        let localv = borrow_var_cols::<AB::Var>(
-            &local[const_width..],
-            self.idx_encoder.width(),
-            self.max_cached,
-        );
+        let localv = borrow_var_cols::<AB::Var>(&local[const_width..], self.idx_encoder.width());
         let local: &ProofShapeCols<AB::Var, NUM_LIMBS> = (*local)[..const_width].borrow();
         let next: &ProofShapeCols<AB::Var, NUM_LIMBS> = (*next)[..const_width].borrow();
         let n = local.log_height.into();
@@ -201,6 +190,63 @@ where
             next.num_present,
         );
 
+        ///////////////////////////////////////////////////////////////////////////////////////////
+        // VK FIELD SELECTION
+        ///////////////////////////////////////////////////////////////////////////////////////////
+        // Select values for TranscriptBus
+        let mut is_required = AB::Expr::ZERO;
+        let mut air_idx = AB::Expr::ZERO;
+
+        // Select values for LiftedHeightsBus
+        let mut num_witin = AB::Expr::ZERO;
+        let mut num_structural_witin = AB::Expr::ZERO;
+        let mut num_fixed = AB::Expr::ZERO;
+
+        // Select values for NumPublicValuesBus
+        let mut num_pvs = AB::Expr::ZERO;
+        let mut num_read_count = AB::Expr::ZERO;
+        let mut num_write_count = AB::Expr::ZERO;
+        let mut num_logup_count = AB::Expr::ZERO;
+        // Per-selected-air tower transcript span (used for fork challenge tidx bump).
+        let mut tower_tidx_bump = AB::Expr::ZERO;
+
+        for (i, air_data) in self.per_air.iter().enumerate() {
+            // We keep a running tally of how many transcript reads there should be up to any
+            // given point, and use that to constrain initial_tidx
+            let is_current_air = self.idx_encoder.get_flag_expr::<AB>(i, localv.idx_flags);
+            let mut when_current = builder.when(is_current_air.clone());
+            air_idx += is_current_air.clone() * AB::F::from_usize(i);
+            num_witin += is_current_air.clone() * AB::F::from_usize(air_data.num_witin);
+            num_structural_witin +=
+                is_current_air.clone() * AB::F::from_usize(air_data.num_structural_witin);
+            num_fixed += is_current_air.clone() * AB::F::from_usize(air_data.num_fixed);
+
+            num_pvs += is_current_air.clone() * AB::F::from_usize(air_data.num_public_values);
+
+            if air_data.is_required {
+                is_required += is_current_air.clone();
+                when_current.assert_one(local.is_present);
+            }
+
+            num_read_count +=
+                is_current_air.clone() * AB::Expr::from_usize(air_data.num_read_count);
+            num_write_count +=
+                is_current_air.clone() * AB::Expr::from_usize(air_data.num_write_count);
+            num_logup_count +=
+                is_current_air.clone() * AB::Expr::from_usize(air_data.num_logup_count);
+
+            // Keep this aligned with TowerInputAir's `tidx_after_gkr_layers`
+            // arithmetic so fork challenge placement and tower buses share one
+            // transcript span model.
+            tower_tidx_bump += is_current_air
+                * per_air_tower_span::<AB>(
+                    n.clone(),
+                    air_data.num_read_count,
+                    air_data.num_write_count,
+                    air_data.num_logup_count,
+                );
+        }
+
         ///////////////////////////////////////////////////////////////////////////////////////////
         // PERMUTATION AND SORTING
         ///////////////////////////////////////////////////////////////////////////////////////////
@@ -221,13 +267,18 @@ where
         self.permutation_bus.receive(
             builder,
             local.proof_idx,
-            ProofShapePermutationMessage { idx: local.idx },
+            ProofShapePermutationMessage {
+                idx: air_idx.clone(),
+            },
             local.is_valid,
         );
 
         builder
             .when(and(not(local.is_present), local.is_valid))
-            .assert_zero(local.height);
+            .assert_zero(local.height_1);
+        builder
+            .when(and(not(local.is_present), local.is_valid))
+            .assert_zero(local.height_2);
         builder
             .when(and(not(local.is_present), local.is_valid))
             .assert_zero(local.log_height);
@@ -243,96 +294,51 @@ where
         );
 
         ///////////////////////////////////////////////////////////////////////////////////////////
-        // VK FIELD SELECTION
+        // LOOKUP CHALLENGE
         ///////////////////////////////////////////////////////////////////////////////////////////
-        // Select values for TranscriptBus
-        let mut is_required = AB::Expr::ZERO;
-        let mut is_min_cached = AB::Expr::ZERO;
-        let mut has_preprocessed = AB::Expr::ZERO;
-        let mut cached_present = vec![AB::Expr::ZERO; self.max_cached];
-
-        // Select values for LiftedHeightsBus
-        let mut main_common_width = AB::Expr::ZERO;
-        let mut preprocessed_stacked_width = AB::Expr::ZERO;
-        let mut cached_widths = vec![AB::Expr::ZERO; self.max_cached];
-
-        // Select values for transcript commitment material.
-        let mut preprocessed_commit = [AB::Expr::ZERO; DIGEST_SIZE];
-
-        // Select values for NumPublicValuesBus
-        let mut num_pvs = AB::Expr::ZERO;
-        let mut has_pvs = AB::Expr::ZERO;
-        let mut num_read_count = AB::Expr::ZERO;
-        let mut num_write_count = AB::Expr::ZERO;
-        let mut num_logup_count = AB::Expr::ZERO;
-
-        for (i, air_data) in self.per_air.iter().enumerate() {
-            // We keep a running tally of how many transcript reads there should be up to any
-            // given point, and use that to constrain initial_tidx
-            let is_current_air = self.idx_encoder.get_flag_expr::<AB>(i, localv.idx_flags);
-            let mut when_current = builder.when(is_current_air.clone());
-
-            when_current.assert_eq(local.idx, AB::F::from_usize(i));
-
-            main_common_width += is_current_air.clone() * AB::F::from_usize(air_data.main_width);
-
-            if air_data.num_public_values != 0 {
-                has_pvs += is_current_air.clone();
-            }
-            num_pvs += is_current_air.clone() * AB::F::from_usize(air_data.num_public_values);
-
-            if air_data.is_required {
-                is_required += is_current_air.clone();
-                when_current.assert_one(local.is_present);
-            }
-
-            if i == self.min_cached_idx {
-                is_min_cached += is_current_air.clone();
-            }
-
-            assert!(air_data.preprocessed_data.is_none());
-            if let Some(preprocessed) = &air_data.preprocessed_data {
-                when_current.assert_eq(
-                    local.log_height,
-                    AB::Expr::from_usize(0usize.wrapping_add_signed(preprocessed.hypercube_dim)),
-                );
-                has_preprocessed += is_current_air.clone();
-
-                preprocessed_stacked_width += is_current_air.clone()
-                    * AB::F::from_usize(air_data.preprocessed_width.unwrap());
-                (0..DIGEST_SIZE).for_each(|didx| {
-                    preprocessed_commit[didx] += is_current_air.clone()
-                        * AB::F::from_u32(preprocessed.commit[didx].as_canonical_u32());
-                });
-            }
-
-            for (cached_idx, width) in air_data.cached_widths.iter().enumerate() {
-                cached_present[cached_idx] += is_current_air.clone();
-                cached_widths[cached_idx] += is_current_air.clone() * AB::Expr::from_usize(*width);
-            }
-
-            num_read_count +=
-                is_current_air.clone() * AB::Expr::from_usize(air_data.num_read_count);
-            num_write_count +=
-                is_current_air.clone() * AB::Expr::from_usize(air_data.num_write_count);
-            num_logup_count +=
-                is_current_air.clone() * AB::Expr::from_usize(air_data.num_logup_count);
+        for i in 0..D_EF {
+            self.lookup_challenge_bus.lookup_key(
+                builder,
+                local.proof_idx,
+                LookupChallengeMessage {
+                    kind: AB::Expr::from_usize(LookupChallengeKind::Alpha.as_usize()),
+                    word_idx: AB::Expr::from_usize(i),
+                    value: local.lookup_challenge_alpha[i].into(),
+                },
+                local.is_present * local.is_valid,
+            );
+        }
+        for i in 0..D_EF {
+            self.lookup_challenge_bus.lookup_key(
+                builder,
+                local.proof_idx,
+                LookupChallengeMessage {
+                    kind: AB::Expr::from_usize(LookupChallengeKind::Beta.as_usize()),
+                    word_idx: AB::Expr::from_usize(i),
+                    value: local.lookup_challenge_beta[i].into(),
+                },
+                local.is_present * local.is_valid,
+            );
         }
 
         ///////////////////////////////////////////////////////////////////////////////////////////
         // TRANSCRIPT OBSERVATIONS
         ///////////////////////////////////////////////////////////////////////////////////////////
+
         let is_first_idx = self.idx_encoder.get_flag_expr::<AB>(0, localv.idx_flags);
+
+        // The first AIR starts immediately after the fixed trunk transcript prefix.
         builder.when(is_first_idx.clone()).assert_eq(
             local.starting_tidx,
-            AB::Expr::from_usize(TranscriptLabel::Riscv.field_len()),
+            AB::Expr::from_usize(TranscriptLabel::Riscv.field_len() + VmPvs::<u8>::width())
+                + AB::Expr::from_usize(2 * D_EF),
         );
 
         self.starting_tidx_bus.receive(
             builder,
             local.proof_idx,
             StartingTidxMessage {
-                air_idx: local.idx * local.is_valid
+                air_idx: air_idx.clone() * local.is_valid
                     + AB::Expr::from_usize(self.per_air.len()) * local.is_last,
                 tidx: local.starting_tidx.into(),
             },
@@ -342,131 +348,150 @@ where
             ),
         );
 
-        let mut tidx = local.starting_tidx.into();
-        self.transcript_bus.receive(
-            builder,
-            local.proof_idx,
-            TranscriptBusMessage {
-                tidx: tidx.clone(),
-                value: local.is_present.into(),
-                is_sample: AB::Expr::ZERO,
-            },
-            not::<AB::Expr>(is_required.clone()) * local.is_valid,
-        );
-        tidx += not::<AB::Expr>(is_required) * local.is_valid;
+        // Challenges are laid out in trunk transcript as contiguous EF limbs per present AIR.
+        // We jump directly to this AIR's segment using num_present (1-based among present AIRs).
+        let mut tidx =
+            local.starting_tidx.into() + local.num_present * AB::Expr::from_usize(2 * D_EF);
 
-        for (didx, commit_val) in preprocessed_commit.iter().enumerate() {
+        for i in 0..D_EF {
             self.transcript_bus.receive(
                 builder,
                 local.proof_idx,
                 TranscriptBusMessage {
-                    tidx: tidx.clone() + AB::Expr::from_usize(didx),
-                    value: commit_val.clone(),
+                    tidx: tidx.clone() + AB::Expr::from_usize(i),
+                    value: local.after_forked_challenge_1[i].into(),
                     is_sample: AB::Expr::ZERO,
                 },
-                has_preprocessed.clone() * local.is_present,
+                local.is_present,
             );
         }
-        tidx += has_preprocessed.clone() * AB::Expr::from_usize(DIGEST_SIZE) * local.is_present;
-
-        self.transcript_bus.receive(
-            builder,
-            local.proof_idx,
-            TranscriptBusMessage {
-                tidx: tidx.clone(),
-                value: local.log_height.into(),
-                is_sample: AB::Expr::ZERO,
-            },
-            not::<AB::Expr>(has_preprocessed.clone()) * local.is_present,
-        );
-        tidx += not::<AB::Expr>(has_preprocessed.clone()) * local.is_present;
-
-        (0..self.max_cached).for_each(|i| {
-            for didx in 0..DIGEST_SIZE {
-                self.transcript_bus.receive(
-                    builder,
-                    local.proof_idx,
-                    TranscriptBusMessage {
-                        tidx: tidx.clone(),
-                        value: localv.cached_commits[i][didx].into(),
-                        is_sample: AB::Expr::ZERO,
-                    },
-                    cached_present[i].clone() * local.is_present,
-                );
-                tidx += cached_present[i].clone() * local.is_present;
-            }
-        });
+        tidx += AB::Expr::from_usize(D_EF) * local.is_present;
 
-        let num_pvs_tidx = tidx.clone();
-        tidx += num_pvs.clone() * local.is_present;
+        for i in 0..D_EF {
+            self.transcript_bus.receive(
+                builder,
+                local.proof_idx,
+                TranscriptBusMessage {
+                    tidx: tidx.clone() + AB::Expr::from_usize(i),
+                    value: local.after_forked_challenge_2[i].into(),
+                    is_sample: AB::Expr::ZERO,
+                },
+                local.is_present,
+            );
+        }
+        tidx += AB::Expr::from_usize(D_EF) * local.is_present;
 
         // constrain next air tid
         self.starting_tidx_bus.send(
             builder,
             local.proof_idx,
             StartingTidxMessage {
-                air_idx: local.idx + AB::F::ONE,
+                air_idx: air_idx.clone() + AB::F::ONE,
                 tidx,
             },
             local.is_valid,
         );
 
-        for didx in 0..DIGEST_SIZE {
-            self.transcript_bus.receive(
+        ///////////////////////////////////////////////////////////////////////////////////////////
+        // SNAPSHOT STATE CONTINUITY (forked transcript)
+        ///////////////////////////////////////////////////////////////////////////////////////////
+        // Each present AIR corresponds to one fork whose fork_id equals
+        // num_present - 1 (0-based position among present AIRs in sorted order).
+        // This assumes a 1:1 mapping between present AIRs and forks, which
+        // holds when each chip has exactly one proof instance. Multi-instance
+        // chips would require a separate fork_id column.
+        // Receive fork transcript words after the fork label prefix.
+        let fork_tidx_base = TranscriptLabel::Fork.field_len();
+        let fork_id = local.num_present - AB::F::ONE;
+        // observe lookup alpha/beta
+        for i in 0..D_EF {
+            self.forked_transcript_bus.receive(
                 builder,
                 local.proof_idx,
-                TranscriptBusMessage {
-                    tidx: AB::Expr::from_usize(didx),
-                    value: localv.cached_commits[self.max_cached - 1][didx].into(),
+                ForkedTranscriptBusMessage {
+                    fork_id: fork_id.clone().into(),
+                    tidx: AB::Expr::from_usize(fork_tidx_base + i),
+                    value: local.lookup_challenge_alpha[i].into(),
                     is_sample: AB::Expr::ZERO,
                 },
-                local.is_last,
+                local.is_present * local.is_valid,
             );
-
-            self.transcript_bus.receive(
+            self.forked_transcript_bus.receive(
                 builder,
                 local.proof_idx,
-                TranscriptBusMessage {
-                    tidx: AB::Expr::from_usize(didx + DIGEST_SIZE),
-                    value: localv.cached_commits[self.max_cached - 1][didx].into(),
+                ForkedTranscriptBusMessage {
+                    fork_id: fork_id.clone().into(),
+                    tidx: AB::Expr::from_usize(fork_tidx_base + D_EF + i),
+                    value: local.lookup_challenge_beta[i].into(),
                     is_sample: AB::Expr::ZERO,
                 },
-                is_min_cached.clone() * local.is_valid,
+                local.is_present * local.is_valid,
             );
         }
+        self.forked_transcript_bus.receive(
+            builder,
+            local.proof_idx,
+            ForkedTranscriptBusMessage {
+                fork_id: fork_id.clone().into(),
+                tidx: AB::Expr::from_usize(fork_tidx_base + 2 * D_EF),
+                value: fork_id.clone(),
+                is_sample: AB::Expr::ZERO,
+            },
+            local.is_present * local.is_valid,
+        );
+        // Fork transcript metadata order is fixed: num_present, air_idx, then log_height.
+        self.forked_transcript_bus.receive(
+            builder,
+            local.proof_idx,
+            ForkedTranscriptBusMessage {
+                fork_id: fork_id.clone().into(),
+                tidx: AB::Expr::from_usize(fork_tidx_base + 2 * D_EF + 1),
+                value: air_idx.clone(),
+                is_sample: AB::Expr::ZERO,
+            },
+            local.is_present * local.is_valid,
+        );
+        self.forked_transcript_bus.receive(
+            builder,
+            local.proof_idx,
+            ForkedTranscriptBusMessage {
+                fork_id: fork_id.clone().into(),
+                tidx: AB::Expr::from_usize(fork_tidx_base + 2 * D_EF + 2),
+                value: local.log_height.into(),
+                is_sample: AB::Expr::ZERO,
+            },
+            local.is_present * local.is_valid,
+        );
 
-        ///////////////////////////////////////////////////////////////////////////////////////////
-        // SNAPSHOT STATE CONTINUITY (forked transcript)
-        ///////////////////////////////////////////////////////////////////////////////////////////
-        // The sponge state at the fork point is identical for all forks.
-        // Constrain: local.current_snapshot_state == next.current_snapshot_state
-        // for all valid rows within the same proof.
-        for i in 0..POSEIDON2_WIDTH {
-            builder
-                .when(and(local.is_valid, not(next.is_last)))
-                .assert_eq(
-                    local.current_snapshot_state[i],
-                    next.current_snapshot_state[i],
-                );
-        }
+        // Skip the full per-air tower transcript span (out-evals, alpha/beta,
+        // and all GKR/sumcheck layer transcript activity) before binding the
+        // post-fork sampled challenges.
+        let forked_challenge_1_tidx =
+            AB::Expr::from_usize(fork_tidx_base + 2 * D_EF + 3) + tower_tidx_bump;
+        // Challenge 2 starts after challenge 1 plus the product_sum label span.
+        let forked_challenge_2_tidx =
+            forked_challenge_1_tidx.clone() + AB::Expr::from_usize(tower_transcript_len::BETA_LEN);
 
-        // Each present AIR corresponds to one fork whose fork_id equals
-        // num_present (1-based position among present AIRs in sorted order).
-        // This assumes a 1:1 mapping between present AIRs and forks, which
-        // holds when each chip has exactly one proof instance. Multi-instance
-        // chips would require a separate fork_id column.
-        // Receive the trunk's sponge state at the fork point from the
-        // ForkedTranscriptBus, cross-checking current_snapshot_state against
-        // TranscriptAir's trunk_fork_state.
-        for i in 0..POSEIDON2_WIDTH {
+        for i in 0..D_EF {
             self.forked_transcript_bus.receive(
                 builder,
                 local.proof_idx,
                 ForkedTranscriptBusMessage {
-                    fork_id: local.num_present.into(),
-                    fork_tidx: AB::Expr::from_usize(i),
-                    value: local.current_snapshot_state[i].into(),
-                    is_sample: AB::Expr::ZERO,
+                    fork_id: fork_id.clone().into(),
+                    tidx: forked_challenge_1_tidx.clone() + AB::Expr::from_usize(i),
+                    value: local.after_forked_challenge_1[i].into(),
+                    is_sample: AB::Expr::ONE,
+                },
+                local.is_present * local.is_valid,
+            );
+            self.forked_transcript_bus.receive(
+                builder,
+                local.proof_idx,
+                ForkedTranscriptBusMessage {
+                    fork_id: fork_id.clone().into(),
+                    tidx: forked_challenge_2_tidx.clone() + AB::Expr::from_usize(i),
+                    value: local.after_forked_challenge_2[i].into(),
+                    is_sample: AB::Expr::ONE,
                 },
                 local.is_present * local.is_valid,
             );
@@ -481,7 +506,7 @@ where
             AirShapeBusMessage {
                 sort_idx: local.sorted_idx.into(),
                 property_idx: AirShapeProperty::AirId.to_field(),
-                value: local.idx.into(),
+                value: air_idx.clone(),
             },
             local.is_present * local.num_air_id_lookups,
         );
@@ -573,11 +598,18 @@ where
         ///////////////////////////////////////////////////////////////////////////////////////////
         // LIFTED HEIGHTS LOOKUP + STACKING COMMITMENTS
         ///////////////////////////////////////////////////////////////////////////////////////////
-        let _raw_height = fold(
-            local.height_limbs.iter().enumerate(),
+
+        let _raw_height_1 = fold(
+            local.height_1_limbs.iter().enumerate(),
+            AB::Expr::ZERO,
+            |acc, (i, limb)| acc + (AB::Expr::from_u32(1 << (i * LIMB_BITS)) * *limb),
+        );
+        let _raw_height_2 = fold(
+            local.height_2_limbs.iter().enumerate(),
             AB::Expr::ZERO,
             |acc, (i, limb)| acc + (AB::Expr::from_u32(1 << (i * LIMB_BITS)) * *limb),
         );
+        let combined_height = local.height_1 + local.height_2;
 
         self.lifted_heights_bus.add_key_with_lookups(
             builder,
@@ -587,122 +619,16 @@ where
                 part_idx: AB::Expr::ZERO,
                 commit_idx: AB::Expr::ZERO,
                 hypercube_dim: n.clone(),
-                lifted_height: local.height.into(),
+                lifted_height: combined_height.into(),
                 log_lifted_height: local.log_height.into(),
             },
-            local.is_present * main_common_width,
+            local.is_present * (num_witin + num_structural_witin + num_fixed),
         );
 
-        // cidx start from 1
-        // TODO(starting-cidx): starting_cidx flow is intentionally disabled in local fork.
-        // builder
-        //     .when(and(local.is_first, local.is_valid))
-        //     .assert_one(local.starting_cidx);
-        // let mut cidx_offset = AB::Expr::ZERO;
-
-        // NOTE: this is non used if preprocessed_stacked_width == 0
-        // self.lifted_heights_bus.add_key_with_lookups(
-        //     builder,
-        //     local.proof_idx,
-        //     LiftedHeightsBusMessage {
-        //         sort_idx: local.sorted_idx.into(),
-        //         part_idx: cidx_offset.clone() + AB::F::ONE,
-        //         commit_idx: cidx_offset.clone() + local.starting_cidx,
-        //         hypercube_dim: n.clone(),
-        //         lifted_height: local.height.into(),
-        //         log_lifted_height: local.log_height.into(),
-        //     },
-        //     local.is_present * preprocessed_stacked_width,
-        // );
-
-        // NOTE: historical commit-index bus path intentionally removed.
-        // legacy_lookup(
-        //     builder,
-        //     local.proof_idx,
-        //     commitment message
-        //         major_idx: AB::Expr::ZERO,
-        //         minor_idx: cidx_offset.clone() + local.starting_cidx,
-        //         commitment: preprocessed_commit,
-        //     },
-        //     has_preprocessed.clone() * local.is_present * AB::Expr::from_usize(self.commit_mult),
-        // );
-        // cidx_offset still be 0
-        // cidx_offset += has_preprocessed.clone();
-        // TODO(starting-cidx): re-enable if/when commit-index stacking flow is restored.
-        // let mut cidx_offset = AB::Expr::ZERO;
-
-        // (0..self.max_cached).for_each(|cached_idx| {
-        //     self.lifted_heights_bus.add_key_with_lookups(
-        //         builder,
-        //         local.proof_idx,
-        //         LiftedHeightsBusMessage {
-        //             sort_idx: local.sorted_idx.into(),
-        //             part_idx: cidx_offset.clone() + AB::F::ONE,
-        //             commit_idx: cidx_offset.clone() + local.starting_cidx,
-        //             hypercube_dim: n.clone(),
-        //             lifted_height: local.height.into(),
-        //             log_lifted_height: local.log_height.into(),
-        //         },
-        //         local.is_present * cached_widths[cached_idx].clone(),
-        //     );
-        //
-        //     // legacy_lookup(
-        //     //     builder,
-        //     //     local.proof_idx,
-        //     //     commitment message
-        //     //         major_idx: AB::Expr::ZERO,
-        //     //         minor_idx: cidx_offset.clone() + local.starting_cidx,
-        //     //         commitment: localv.cached_commits[cached_idx].map(Into::into),
-        //     //     },
-        //     //     cached_present[cached_idx].clone()
-        //     //         * local.is_present
-        //     //         * AB::Expr::from_usize(self.commit_mult),
-        //     // );
-        //     // cidx_offset += cached_present[cached_idx].clone();
-        //
-        //     // self.cached_commit_bus.send(
-        //     //     builder,
-        //     //     local.proof_idx,
-        //     //     CachedCommitBusMessage {
-        //     //         air_idx: local.idx.into(),
-        //     //         cached_idx: AB::Expr::from_usize(cached_idx),
-        //     //         cached_commit: localv.cached_commits[cached_idx].map(Into::into),
-        //     //     },
-        //     //     cached_present[cached_idx].clone()
-        //     //         * local.is_valid
-        //     //         * AB::Expr::from_bool(self.continuations_enabled),
-        //     // );
-        // });
-
-        // TODO(starting-cidx): disabled alongside local starting_cidx removal.
-        // builder
-        //     .when(and(local.is_valid, not(next.is_last)))
-        //     .assert_eq(local.starting_cidx + cidx_offset, next.starting_cidx);
-        //
-        // legacy_lookup(
-        //     builder,
-        //     local.proof_idx,
-        //     commitment message
-        //         major_idx: AB::Expr::ZERO,
-        //         minor_idx: AB::Expr::ZERO,
-        //         commitment: localv.cached_commits[self.max_cached - 1].map(Into::into),
-        //     },
-        //     is_min_cached.clone() * local.is_valid * AB::Expr::from_usize(self.commit_mult),
-        // );
-
         ///////////////////////////////////////////////////////////////////////////////////////////
         // NUM PUBLIC VALUES
         ///////////////////////////////////////////////////////////////////////////////////////////
-        // self.num_pvs_bus.send(
-        //     builder,
-        //     local.proof_idx,
-        //     NumPublicValuesMessage {
-        //         air_idx: local.idx.into(),
-        //         tidx: num_pvs_tidx,
-        //         num_pvs,
-        //     },
-        //     local.is_present * has_pvs,
-        // );
+        let _ = num_pvs;
 
         ///////////////////////////////////////////////////////////////////////////////////////////
         // HEIGHT + GKR MESSAGE
@@ -711,7 +637,15 @@ where
             self.range_bus.lookup_key(
                 builder,
                 RangeCheckerBusMessage {
-                    value: local.height_limbs[i].into(),
+                    value: local.height_1_limbs[i].into(),
+                    max_bits: AB::Expr::from_usize(LIMB_BITS),
+                },
+                local.is_valid,
+            );
+            self.range_bus.lookup_key(
+                builder,
+                RangeCheckerBusMessage {
+                    value: local.height_2_limbs[i].into(),
                     max_bits: AB::Expr::from_usize(LIMB_BITS),
                 },
                 local.is_valid,
@@ -759,7 +693,7 @@ where
             builder,
             local.proof_idx,
             TowerModuleMessage {
-                idx: local.idx.into(),
+                idx: air_idx.clone(),
                 tidx: local.starting_tidx.into(),
                 n_logup: n,
             },
@@ -781,7 +715,7 @@ where
             builder,
             local.proof_idx,
             NLiftMessage {
-                air_idx: local.idx.into(),
+                air_idx: air_idx,
                 n_lift: local.log_height.into(),
             },
             local.is_present,
@@ -799,24 +733,43 @@ where
     }
 }
 
-pub(super) fn borrow_var_cols<F>(
-    slice: &[F],
-    idx_flags: usize,
-    max_cached: usize,
-) -> ProofShapeVarCols<'_, F> {
-    let flags_idx = 0;
-    let cached_commits_idx = flags_idx + idx_flags;
-
-    let cached_commits = &slice[cached_commits_idx..cached_commits_idx + max_cached * DIGEST_SIZE];
-    let cached_commits: &[[F; DIGEST_SIZE]] = unsafe {
-        std::slice::from_raw_parts(
-            cached_commits.as_ptr() as *const [F; DIGEST_SIZE],
-            max_cached,
-        )
+fn per_air_tower_span<AB: AirBuilder>(
+    n_logup: AB::Expr,
+    num_read_count: usize,
+    num_write_count: usize,
+    num_logup_count: usize,
+) -> AB::Expr {
+    use tower_transcript_len::{
+        ALPHA_BETA_LEN, ALPHA_LEN, POST_SUMCHECK_LEN, ROUND_LEN, SUMCHECK_INIT_LEN,
+    };
+
+    // Derivation notes (matches tower transcript replay order used by verifier):
+    // 1) Out-evals before alpha/beta:
+    //    - read spec contributes 2 EF evals, write spec contributes 2 EF evals,
+    //      logup spec contributes 4 EF evals.
+    //    - each EF observe_ext contributes D_EF base-field transcript words.
+    // 2) Always sample alpha/beta next (ALPHA_BETA_LEN words).
+    // 3) If this air has interactions, add full GKR layer transcript span:
+    //    this is identical to TowerInputAir's closed-form tidx advancement
+    //    from `tidx_after_alpha_beta` to `tidx_after_gkr_layers`.
+    let out_eval_words = 2 * num_read_count + 2 * num_write_count + 4 * num_logup_count;
+    let out_eval_span = AB::Expr::from_usize(out_eval_words * D_EF);
+
+    let gkr_span = if out_eval_words == 0 {
+        AB::Expr::ZERO
+    } else {
+        let gkr_inner = n_logup.clone() * AB::Expr::from_usize(ROUND_LEN / 2)
+            + AB::Expr::from_usize(
+                ALPHA_LEN + SUMCHECK_INIT_LEN + POST_SUMCHECK_LEN - ROUND_LEN / 2,
+            );
+        n_logup * gkr_inner - AB::Expr::from_usize(ALPHA_LEN + SUMCHECK_INIT_LEN)
     };
 
+    out_eval_span + AB::Expr::from_usize(ALPHA_BETA_LEN) + gkr_span
+}
+
+pub(super) fn borrow_var_cols<F>(slice: &[F], idx_flags: usize) -> ProofShapeVarCols<'_, F> {
     ProofShapeVarCols {
-        idx_flags: &slice[flags_idx..cached_commits_idx],
-        cached_commits,
+        idx_flags: &slice[..idx_flags],
     }
 }
diff --git a/ceno_recursion_v2/src/proof_shape/proof_shape/trace.rs b/ceno_recursion_v2/src/proof_shape/proof_shape/trace.rs
index 7b09f9344..9ab35f8af 100644
--- a/ceno_recursion_v2/src/proof_shape/proof_shape/trace.rs
+++ b/ceno_recursion_v2/src/proof_shape/proof_shape/trace.rs
@@ -1,8 +1,8 @@
 use std::{borrow::BorrowMut, sync::Arc};
 
 use openvm_circuit_primitives::encoder::Encoder;
-use openvm_stark_sdk::config::baby_bear_poseidon2::{DIGEST_SIZE, F};
-use p3_field::PrimeCharacteristicRing;
+use openvm_stark_sdk::config::baby_bear_poseidon2::{D_EF, EF, F};
+use p3_field::{BasedVectorSpace, PrimeCharacteristicRing};
 use p3_matrix::dense::RowMajorMatrix;
 
 use super::air::ProofShapeCols;
@@ -14,24 +14,11 @@ use crate::{
 
 pub(in crate::proof_shape) struct ProofShapeVarColsMut<'a, F> {
     pub idx_flags: &'a mut [F],
-    pub cached_commits: &'a mut [[F; DIGEST_SIZE]],
 }
 
-fn borrow_var_cols_mut<F>(
-    slice: &mut [F],
-    idx_flags: usize,
-    max_cached: usize,
-) -> ProofShapeVarColsMut<'_, F> {
-    let (idx_flags_slice, cached_flat) = slice.split_at_mut(idx_flags);
-    let cached_commits: &mut [[F; DIGEST_SIZE]] = unsafe {
-        std::slice::from_raw_parts_mut(
-            cached_flat.as_mut_ptr() as *mut [F; DIGEST_SIZE],
-            max_cached,
-        )
-    };
+fn borrow_var_cols_mut<F>(slice: &mut [F], idx_flags: usize) -> ProofShapeVarColsMut<'_, F> {
     ProofShapeVarColsMut {
-        idx_flags: idx_flags_slice,
-        cached_commits,
+        idx_flags: &mut slice[..idx_flags],
     }
 }
 
@@ -46,22 +33,37 @@ fn decompose_usize<const NUM_LIMBS: usize, const LIMB_BITS: usize>(
     })
 }
 
+fn two_instance_heights_from_chip_instances(chip_instances: &[impl BorrowNumInstances]) -> (usize, usize) {
+    chip_instances.iter().fold((0usize, 0usize), |(h1, h2), instance| {
+        let num_instances = instance.borrow_num_instances();
+        (
+            h1 + num_instances.first().copied().unwrap_or(0),
+            h2 + num_instances.get(1).copied().unwrap_or(0),
+        )
+    })
+}
+
+trait BorrowNumInstances {
+    fn borrow_num_instances(&self) -> &[usize];
+}
+
+impl BorrowNumInstances for ceno_zkvm::scheme::ZKVMChipProof<crate::system::RecursionField> {
+    fn borrow_num_instances(&self) -> &[usize] {
+        &self.num_instances
+    }
+}
+
 #[derive(derive_new::new)]
 #[allow(dead_code)]
 pub(in crate::proof_shape) struct ProofShapeChip<const NUM_LIMBS: usize, const LIMB_BITS: usize> {
     idx_encoder: Arc<Encoder>,
-    min_cached_idx: usize,
-    max_cached: usize,
-    cidx_deltas: Vec<usize>,
     range_checker: Arc<RangeCheckerCpuTraceGenerator<LIMB_BITS>>,
     pow_checker: Arc<PowerCheckerCpuTraceGenerator<2, POW_CHECKER_HEIGHT>>,
 }
 
 impl<const NUM_LIMBS: usize, const LIMB_BITS: usize> ProofShapeChip<NUM_LIMBS, LIMB_BITS> {
     pub(in crate::proof_shape) fn placeholder_width(&self) -> usize {
-        ProofShapeCols::<u8, NUM_LIMBS>::width()
-            + self.idx_encoder.width()
-            + self.max_cached * DIGEST_SIZE
+        ProofShapeCols::<u8, NUM_LIMBS>::width() + self.idx_encoder.width()
     }
 }
 
@@ -96,41 +98,47 @@ impl<const NUM_LIMBS: usize, const LIMB_BITS: usize> RowMajorChip<F>
         for (proof_idx, (proof, preflight)) in proofs.iter().zip(preflights.iter()).enumerate() {
             let mut sorted_idx = 0usize;
             let mut num_present = 0usize;
-            let mut _current_cidx = 1usize;
 
             for (air_idx, vdata) in &preflight.proof_shape.sorted_trace_vdata {
                 let chunk = chunks.next().unwrap();
                 let (fixed_cols, variable_cols) = chunk.split_at_mut(cols_width);
                 let cols: &mut ProofShapeCols<F, NUM_LIMBS> = fixed_cols.borrow_mut();
-                let var_cols = &mut borrow_var_cols_mut(
-                    variable_cols,
-                    self.idx_encoder.width(),
-                    self.max_cached,
-                );
+                let var_cols = &mut borrow_var_cols_mut(variable_cols, self.idx_encoder.width());
 
                 let log_height = vdata.log_height;
-                let trace_height = 1usize << log_height;
-                num_present += 1;
+                let (height_1, height_2) = proof
+                    .chip_proofs
+                    .get(air_idx)
+                    .map(|instances| two_instance_heights_from_chip_instances(instances))
+                    .unwrap_or((0, 0));
+                 num_present += 1;
 
                 cols.proof_idx = F::from_usize(proof_idx);
                 cols.is_valid = F::ONE;
                 cols.is_first = F::from_bool(sorted_idx == 0);
                 cols.is_last = F::ZERO;
-                cols.idx = F::from_usize(*air_idx);
                 cols.sorted_idx = F::from_usize(sorted_idx);
                 cols.log_height = F::from_usize(log_height);
                 cols.need_rot = F::ZERO;
                 cols.starting_tidx = F::from_usize(preflight.proof_shape.starting_tidx[*air_idx]);
                 cols.is_present = F::ONE;
-                cols.height = F::from_usize(trace_height);
+                cols.height_1 = F::from_usize(height_1);
+                cols.height_2 = F::from_usize(height_2);
                 cols.num_present = F::from_usize(num_present);
-                cols.height_limbs =
-                    decompose_usize::<NUM_LIMBS, LIMB_BITS>(trace_height).map(F::from_usize);
+                cols.height_1_limbs =
+                    decompose_usize::<NUM_LIMBS, LIMB_BITS>(height_1).map(F::from_usize);
+                cols.height_2_limbs =
+                    decompose_usize::<NUM_LIMBS, LIMB_BITS>(height_2).map(F::from_usize);
                 cols.n_max = F::from_usize(preflight.proof_shape.n_max);
                 cols.is_n_max_greater = F::ZERO;
                 cols.num_air_id_lookups = F::ZERO;
                 cols.num_columns = F::ZERO;
-                cols.current_snapshot_state = preflight.proof_shape.fork_start_state;
+                cols.lookup_challenge_alpha = preflight.proof_shape.lookup_challenge_alpha;
+                cols.lookup_challenge_beta = preflight.proof_shape.lookup_challenge_beta;
+                cols.after_forked_challenge_1 =
+                    ef_to_limbs(preflight.proof_shape.after_forked_challenge_1);
+                cols.after_forked_challenge_2 =
+                    ef_to_limbs(preflight.proof_shape.after_forked_challenge_2);
 
                 for (dst, src) in var_cols
                     .idx_flags
@@ -140,11 +148,6 @@ impl<const NUM_LIMBS: usize, const LIMB_BITS: usize> RowMajorChip<F>
                     *dst = F::from_u32(*src);
                 }
 
-                if *air_idx == self.min_cached_idx {
-                    var_cols.cached_commits[self.max_cached - 1] = [F::ZERO; DIGEST_SIZE];
-                }
-
-                _current_cidx += self.cidx_deltas.get(*air_idx).copied().unwrap_or(0);
                 self.pow_checker.add_pow(log_height);
                 sorted_idx += 1;
             }
@@ -156,30 +159,32 @@ impl<const NUM_LIMBS: usize, const LIMB_BITS: usize> RowMajorChip<F>
                 let chunk = chunks.next().unwrap();
                 let (fixed_cols, variable_cols) = chunk.split_at_mut(cols_width);
                 let cols: &mut ProofShapeCols<F, NUM_LIMBS> = fixed_cols.borrow_mut();
-                let var_cols = &mut borrow_var_cols_mut(
-                    variable_cols,
-                    self.idx_encoder.width(),
-                    self.max_cached,
-                );
+                let var_cols = &mut borrow_var_cols_mut(variable_cols, self.idx_encoder.width());
 
                 cols.proof_idx = F::from_usize(proof_idx);
                 cols.is_valid = F::ONE;
                 cols.is_first = F::from_bool(sorted_idx == 0);
                 cols.is_last = F::ZERO;
-                cols.idx = F::from_usize(air_idx);
                 cols.sorted_idx = F::from_usize(sorted_idx);
                 cols.log_height = F::ZERO;
                 cols.need_rot = F::ZERO;
                 cols.starting_tidx = F::from_usize(preflight.proof_shape.starting_tidx[air_idx]);
                 cols.is_present = F::ZERO;
-                cols.height = F::ZERO;
+                cols.height_1 = F::ZERO;
+                cols.height_2 = F::ZERO;
                 cols.num_present = F::from_usize(num_present);
-                cols.height_limbs = [F::ZERO; NUM_LIMBS];
+                cols.height_1_limbs = [F::ZERO; NUM_LIMBS];
+                cols.height_2_limbs = [F::ZERO; NUM_LIMBS];
                 cols.n_max = F::from_usize(preflight.proof_shape.n_max);
                 cols.is_n_max_greater = F::ZERO;
                 cols.num_air_id_lookups = F::ZERO;
                 cols.num_columns = F::ZERO;
-                cols.current_snapshot_state = preflight.proof_shape.fork_start_state;
+                cols.lookup_challenge_alpha = preflight.proof_shape.lookup_challenge_alpha;
+                cols.lookup_challenge_beta = preflight.proof_shape.lookup_challenge_beta;
+                cols.after_forked_challenge_1 =
+                    ef_to_limbs(preflight.proof_shape.after_forked_challenge_1);
+                cols.after_forked_challenge_2 =
+                    ef_to_limbs(preflight.proof_shape.after_forked_challenge_2);
 
                 for (dst, src) in var_cols
                     .idx_flags
@@ -189,41 +194,38 @@ impl<const NUM_LIMBS: usize, const LIMB_BITS: usize> RowMajorChip<F>
                     *dst = F::from_u32(*src);
                 }
 
-                if air_idx == self.min_cached_idx {
-                    var_cols.cached_commits[self.max_cached - 1] = [F::ZERO; DIGEST_SIZE];
-                }
-
-                _current_cidx += self.cidx_deltas.get(air_idx).copied().unwrap_or(0);
                 sorted_idx += 1;
             }
 
             let chunk = chunks.next().unwrap();
             let (fixed_cols, variable_cols) = chunk.split_at_mut(cols_width);
             let cols: &mut ProofShapeCols<F, NUM_LIMBS> = fixed_cols.borrow_mut();
-            let var_cols =
-                &mut borrow_var_cols_mut(variable_cols, self.idx_encoder.width(), self.max_cached);
+            let _var_cols = &mut borrow_var_cols_mut(variable_cols, self.idx_encoder.width());
             cols.proof_idx = F::from_usize(proof_idx);
             cols.is_valid = F::ZERO;
             cols.is_first = F::ZERO;
             cols.is_last = F::ONE;
-            cols.idx = F::ZERO;
             cols.sorted_idx = F::ZERO;
             cols.log_height = F::from_usize(preflight.proof_shape.n_logup);
             cols.need_rot = F::ZERO;
             cols.starting_tidx = F::from_usize(preflight.proof_shape.post_tidx);
             cols.is_present = F::ZERO;
-            cols.height = F::ZERO;
+            cols.height_1 = F::ZERO;
+            cols.height_2 = F::ZERO;
             cols.num_present = F::from_usize(num_present);
-            cols.height_limbs = [F::ZERO; NUM_LIMBS];
+            cols.height_1_limbs = [F::ZERO; NUM_LIMBS];
+            cols.height_2_limbs = [F::ZERO; NUM_LIMBS];
             cols.n_max = F::from_usize(preflight.proof_shape.n_max);
             cols.is_n_max_greater =
                 F::from_bool(preflight.proof_shape.n_max > preflight.proof_shape.n_logup);
             cols.num_air_id_lookups = F::ZERO;
             cols.num_columns = F::ZERO;
-            cols.current_snapshot_state = preflight.proof_shape.fork_start_state;
-            if self.max_cached != 0 {
-                var_cols.cached_commits[self.max_cached - 1] = [F::ZERO; DIGEST_SIZE];
-            }
+            cols.lookup_challenge_alpha = preflight.proof_shape.lookup_challenge_alpha;
+            cols.lookup_challenge_beta = preflight.proof_shape.lookup_challenge_beta;
+            cols.after_forked_challenge_1 =
+                ef_to_limbs(preflight.proof_shape.after_forked_challenge_1);
+            cols.after_forked_challenge_2 =
+                ef_to_limbs(preflight.proof_shape.after_forked_challenge_2);
         }
 
         for chunk in chunks {
@@ -234,3 +236,9 @@ impl<const NUM_LIMBS: usize, const LIMB_BITS: usize> RowMajorChip<F>
         Some(RowMajorMatrix::new(trace, width))
     }
 }
+
+fn ef_to_limbs(value: EF) -> [F; D_EF] {
+    let mut out = [F::ZERO; D_EF];
+    out.copy_from_slice(value.as_basis_coefficients_slice());
+    out
+}
diff --git a/ceno_recursion_v2/src/proof_shape/pvs/air.rs b/ceno_recursion_v2/src/proof_shape/pvs/air.rs
index cbc4897ab..7626001b4 100644
--- a/ceno_recursion_v2/src/proof_shape/pvs/air.rs
+++ b/ceno_recursion_v2/src/proof_shape/pvs/air.rs
@@ -12,6 +12,7 @@ use crate::{
     bus::{PublicValuesBus, PublicValuesBusMessage, TranscriptBus, TranscriptBusMessage},
     proof_shape::bus::{NumPublicValuesBus, NumPublicValuesMessage},
     subairs::nested_for_loop::{NestedForLoopIoCols, NestedForLoopSubAir},
+    utils::TranscriptLabel,
 };
 
 #[repr(C)]
@@ -89,7 +90,8 @@ where
             .assert_one(next.is_first_in_air);
 
         let is_same_air = local.is_valid * next.is_valid * not(next.is_first_in_air);
-
+        // TODO fix first tidx to be TranscriptLabel::Riscv.field_len()
+        // TODO fix comment as well
         // first tidx happened here
         builder
             .when(local.is_valid * local.is_first_in_proof * local.is_first_in_air)
diff --git a/ceno_recursion_v2/src/proof_shape/pvs/trace.rs b/ceno_recursion_v2/src/proof_shape/pvs/trace.rs
index 3cf720b3c..13e6b645b 100644
--- a/ceno_recursion_v2/src/proof_shape/pvs/trace.rs
+++ b/ceno_recursion_v2/src/proof_shape/pvs/trace.rs
@@ -43,6 +43,7 @@ impl RowMajorChip<F> for PublicValuesTraceGenerator {
 
         for (proof_idx, proof) in proofs.iter().enumerate() {
             let mut is_first_in_proof = true;
+            // TODO first tidx start from TranscriptLabel::Riscv.field_len()
             let mut tidx = 0usize;
 
             for (air_idx, (_, circuit_vk)) in child_vk.circuit_vks.iter().enumerate() {
diff --git a/ceno_recursion_v2/src/system/bus_inventory.rs b/ceno_recursion_v2/src/system/bus_inventory.rs
index 35cffa58e..4c304d542 100644
--- a/ceno_recursion_v2/src/system/bus_inventory.rs
+++ b/ceno_recursion_v2/src/system/bus_inventory.rs
@@ -11,7 +11,7 @@ use crate::bus::{
     CachedCommitBus as LocalCachedCommitBus, ExpressionClaimNMaxBus as LocalExpressionClaimNMaxBus,
     ForkedTranscriptBus as LocalForkedTranscriptBus,
     FractionFolderInputBus as LocalFractionFolderInputBus, HyperdimBus as LocalHyperdimBus,
-    LiftedHeightsBus as LocalLiftedHeightsBus, MainBus, MainExpressionClaimBus,
+    LiftedHeightsBus as LocalLiftedHeightsBus, LookupChallengeBus, MainBus, MainExpressionClaimBus,
     MainSumcheckInputBus, MainSumcheckOutputBus, NLiftBus as LocalNLiftBus,
     PublicValuesBus as LocalPublicValuesBus, TowerModuleBus, TranscriptBus as LocalTranscriptBus,
 };
@@ -42,6 +42,7 @@ pub struct BusInventory {
     pub xi_randomness_bus: XiRandomnessBus,
     pub final_state_bus: FinalTranscriptStateBus,
     pub forked_transcript_bus: LocalForkedTranscriptBus,
+    pub lookup_challenge_bus: LookupChallengeBus,
 }
 
 impl BusInventory {
@@ -76,6 +77,7 @@ impl BusInventory {
         let cached_commit_bus = LocalCachedCommitBus::new(b.new_bus_idx());
         let final_state_bus = FinalTranscriptStateBus::new(b.new_bus_idx());
         let forked_transcript_bus = LocalForkedTranscriptBus::new(b.new_bus_idx());
+        let lookup_challenge_bus = LookupChallengeBus::new(b.new_bus_idx());
 
         Self {
             transcript_bus,
@@ -102,6 +104,7 @@ impl BusInventory {
             xi_randomness_bus,
             final_state_bus,
             forked_transcript_bus,
+            lookup_challenge_bus,
         }
     }
 }
diff --git a/ceno_recursion_v2/src/system/mod.rs b/ceno_recursion_v2/src/system/mod.rs
index e8120a604..2c3e2382c 100644
--- a/ceno_recursion_v2/src/system/mod.rs
+++ b/ceno_recursion_v2/src/system/mod.rs
@@ -27,6 +27,7 @@ use crate::{
     main::MainModule,
     tower::TowerModule,
     transcript::TranscriptModule,
+    utils::{TranscriptLabel, transcript_observe_label},
 };
 use openvm_cpu_backend::CpuBackend;
 use openvm_poseidon2_air::POSEIDON2_WIDTH;
@@ -36,8 +37,11 @@ use openvm_stark_backend::{
     p3_maybe_rayon::prelude::*,
     prover::{AirProvingContext, CommittedTraceData, ProverBackend},
 };
-use openvm_stark_sdk::config::baby_bear_poseidon2::{BabyBearPoseidon2Config, F};
-use p3_field::PrimeCharacteristicRing;
+use openvm_stark_sdk::{
+    config::baby_bear_poseidon2::{BabyBearPoseidon2Config, D_EF, EF, F, poseidon2_perm},
+    p3_baby_bear::Poseidon2BabyBear,
+};
+use p3_field::{BasedVectorSpace, PrimeCharacteristicRing};
 use p3_matrix::Matrix;
 use recursion_circuit::primitives::{
     exp_bits_len::{ExpBitsLenAir, ExpBitsLenTraceGenerator},
@@ -79,7 +83,8 @@ pub trait VerifierTraceGen<PB: ProverBackend, SC: StarkProtocolConfig<F = F>> {
     #[allow(clippy::ptr_arg)]
     fn generate_proving_ctxs<
         TS: FiatShamirTranscript<BabyBearPoseidon2Config>
-            + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>,
+            + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>
+            + From<Poseidon2BabyBear<POSEIDON2_WIDTH>>,
     >(
         &self,
         child_vk: &RecursionVk,
@@ -92,6 +97,7 @@ pub trait VerifierTraceGen<PB: ProverBackend, SC: StarkProtocolConfig<F = F>> {
     fn generate_proving_ctxs_base<
         TS: FiatShamirTranscript<BabyBearPoseidon2Config>
             + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>
+            + From<Poseidon2BabyBear<POSEIDON2_WIDTH>>
             + Clone,
     >(
         &self,
@@ -295,11 +301,38 @@ impl<const MAX_NUM_PROOFS: usize> VerifierSubCircuit<MAX_NUM_PROOFS> {
         }
     }
 
+    #[allow(clippy::type_complexity)]
+    fn build_chip_proof_list<'a>(
+        proof: &'a RecursionProof,
+    ) -> Vec<(
+        usize,
+        usize,
+        &'a ceno_zkvm::scheme::ZKVMChipProof<RecursionField>,
+    )> {
+        // Keep current deterministic ordering: iterate chip map order, then
+        // instance order within each chip. Fork IDs are assigned in this order.
+        let chip_proof_list: Vec<(
+            usize,
+            usize,
+            &ceno_zkvm::scheme::ZKVMChipProof<RecursionField>,
+        )> = proof
+            .chip_proofs
+            .iter()
+            .flat_map(|(&chip_idx, instances)| {
+                instances
+                    .iter()
+                    .enumerate()
+                    .map(move |(instance_idx, chip_proof)| (chip_idx, instance_idx, chip_proof))
+            })
+            .collect();
+        chip_proof_list
+    }
+
     /// Runs preflight for a single proof, with proper transcript forking.
     ///
     /// This mirrors the native verifier's `verify_proof_validity` fork protocol:
-    /// 1. Trunk: proof-shape (per-AIR shape + α/β)
-    /// 2. Fork: clone sponge per chip, observe fork index, run Tower + Main
+    /// 1. Trunk: proof-shape metadata only (VmPvs preflight already observed/sampled α/β)
+    /// 2. Fork: fresh transcript per chip, observe fork prelude, run Tower + Main
     /// 3. Merge: each fork samples 1 ext element → observe into trunk
     #[tracing::instrument(name = "execute_preflight", skip_all)]
     fn run_preflight<TS>(
@@ -310,7 +343,9 @@ impl<const MAX_NUM_PROOFS: usize> VerifierSubCircuit<MAX_NUM_PROOFS> {
     ) -> Preflight
     where
         TS: FiatShamirTranscript<BabyBearPoseidon2Config>
-            + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>,
+            + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>
+            + From<Poseidon2BabyBear<POSEIDON2_WIDTH>>
+            + Clone,
     {
         let mut preflight = Preflight::default();
 
@@ -319,85 +354,76 @@ impl<const MAX_NUM_PROOFS: usize> VerifierSubCircuit<MAX_NUM_PROOFS> {
         self.proof_shape
             .run_preflight(child_vk, proof, &mut preflight, &mut sponge);
 
-        // Phase 2: Fork — clone sponge for each chip proof instance.
-        let chip_proof_list: Vec<(
-            usize,
-            usize,
-            &ceno_zkvm::scheme::ZKVMChipProof<RecursionField>,
-        )> = proof
-            .chip_proofs
-            .iter()
-            .flat_map(|(&chip_idx, instances)| {
-                instances
-                    .iter()
-                    .enumerate()
-                    .map(move |(instance_idx, chip_proof)| (chip_idx, instance_idx, chip_proof))
-            })
-            .collect();
-
-        let num_forks = chip_proof_list.len();
-        let fork_offset = sponge.len(); // tidx of the fork point in the trunk
-
-        let mut fork_sponges: Vec<TS> = (0..num_forks)
-            .map(|i| {
-                let mut forked = sponge.clone();
-                forked.observe(F::from_u64(i as u64));
-                forked
-            })
+        // VmPvs is owned by pre-system preflight. Consume vm_pvs challenge
+        // fields directly here.
+        let alpha_ext = preflight.vm_pvs.lookup_challenge_alpha;
+        let beta_ext = preflight.vm_pvs.lookup_challenge_beta;
+        preflight.proof_shape.lookup_challenge_alpha = ef_to_limbs(alpha_ext);
+        preflight.proof_shape.lookup_challenge_beta = ef_to_limbs(beta_ext);
+
+        // Mark where merge observations begin in the trunk transcript.
+        let fork_offset = sponge.len();
+
+        // Phase 2: Fork — fresh transcript per chip proof instance.
+        let chip_proof_list = Self::build_chip_proof_list(proof);
+        // `TS::from(poseidon2_perm())` is the generic equivalent of
+        // `default_duplex_sponge_recorder()` used by the inner prover.
+        let mut fork_sponges: Vec<TS> = (0..chip_proof_list.len())
+            .map(|_| TS::from(poseidon2_perm().clone()))
             .collect();
 
         // Phase 3: Run Tower + Main on each fork.
-        // Each fork sponge processes independently. We record fork-local tidx
-        // directly — global offsets are computed on the fly at trace gen time.
-
-        for (fork_idx, &(chip_idx, instance_idx, chip_proof)) in chip_proof_list.iter().enumerate()
-        {
-            let fs = &mut fork_sponges[fork_idx];
-            let fork_start_len = fs.len();
-            // Observe circuit_idx into the fork transcript.
-            // Mirrors v1 verifier: transcript.append_field_element(circuit_idx)
+        // Fork-local tidx values are recorded directly; global offsets are
+        // computed on demand during trace generation.
+
+        for (fork_id, &(chip_idx, instance_idx, chip_proof)) in chip_proof_list.iter().enumerate() {
+            let fs = &mut fork_sponges[fork_id];
+
+            // Strict upstream semantics: each fork transcript starts from a
+            // fresh domain-separated transcript (label "fork").
+            transcript_observe_label(fs, TranscriptLabel::Fork.as_bytes());
+
+            // Proof-shape-owned fork prelude:
+            // alpha, beta, fork_index, circuit_index, num_instances...
+            fs.observe_ext(alpha_ext);
+            fs.observe_ext(beta_ext);
+            // Fork IDs intentionally follow current chip/instance iteration
+            // order from `build_chip_proof_list` (not proof-shape sorted order).
+            // Fork IDs are normalized to 0-based indexing for forked
+            // transcripts (fork_id in 0..num_forks).
+            fs.observe(F::from_usize(fork_id));
             fs.observe(F::from_u64(chip_idx as u64));
+            for num_instance in chip_proof.num_instances {
+                fs.observe(F::from_usize(num_instance));
+            }
+
+            let tower_tidx = fs.len();
+            let tower_replay =
+                crate::tower::record_and_replay_tower_preflight(fs, child_vk, chip_idx, chip_proof);
 
-            // Fork-local tidx: position within the fork's extracted log.
-            let tower_local = fs.len() - fork_start_len;
-            let _ = crate::tower::record_gkr_transcript(fs, chip_idx, chip_proof);
-
-            let tower_replay = match crate::tower::circuit_vk_for_idx(child_vk, chip_idx) {
-                Some(circuit_vk) => {
-                    match crate::tower::replay_tower_proof(chip_proof, circuit_vk) {
-                        Ok(replay) => replay,
-                        Err(_err) => crate::tower::TowerReplayResult::default(),
-                    }
-                }
-                None => crate::tower::TowerReplayResult::default(),
-            };
-
-            // Record tower with fork-local tidx.
-            // Fork log layout: [0]=fork_id observe, [1..]=circuit_idx + tower + main.
-            // tower_local is relative to fork_start_len (after fork_id), so
-            // fork-log-local position = 1 (fork_id) + tower_local.
+            // Record tower entry with fork-local tidx at tower stage start.
             preflight.gkr.chips.push(TowerChipTranscriptRange {
                 chip_idx,
                 instance_idx,
-                tidx: 1 + tower_local,
-                fork_idx,
+                tidx: tower_tidx,
+                fork_idx: fork_id,
                 tower_replay,
             });
 
             // Main preflight for this chip.
-            let main_local = fs.len() - fork_start_len;
+            let main_tidx = fs.len();
             crate::main::record_main_transcript(fs, chip_idx, chip_proof);
 
             preflight.main.chips.push(ChipTranscriptRange {
                 chip_idx,
                 instance_idx,
-                tidx: 1 + main_local,
-                fork_idx,
+                tidx: main_tidx,
+                fork_idx: fork_id,
             });
         }
 
         // Phase 4: Merge — sample 1 ext element from each fork, observe into trunk.
-        for (i, fork_sponge) in fork_sponges.iter_mut().enumerate() {
+        for fork_sponge in &mut fork_sponges {
             let sample: <BabyBearPoseidon2Config as StarkProtocolConfig>::EF =
                 FiatShamirTranscript::<BabyBearPoseidon2Config>::sample_ext(fork_sponge);
             sponge.observe_ext(sample);
@@ -411,33 +437,38 @@ impl<const MAX_NUM_PROOFS: usize> VerifierSubCircuit<MAX_NUM_PROOFS> {
 
         preflight.proof_shape.fork_start_tidx = fork_offset;
 
-        // Compute the sponge state at the fork point by replaying the trunk log.
-        preflight.proof_shape.fork_start_state =
-            crate::utils::replay_sponge_state_from_log(&trunk_log, fork_offset);
-
-        // Store fork transcript logs. Each fork sponge log contains the
-        // trunk's history (0..fork_offset) followed by fork-only operations
-        // (fork_id observe + phase 3 ops + phase 4 sample_ext). Extract the
-        // fork portion. Global offsets are NOT stored — they are computed on
-        // the fly via Preflight::fork_global_offset().
-        for (_fork_idx, fork_sponge) in fork_sponges.into_iter().enumerate() {
-            let full_fork_log = fork_sponge.into_log();
-            let fork_values = full_fork_log.values()[fork_offset..].to_vec();
-            let fork_samples = full_fork_log.samples()[fork_offset..].to_vec();
-            let fork_log = openvm_stark_backend::TranscriptLog::new(fork_values, fork_samples);
-
-            // Compute the fork's initial sponge state by replaying the
-            // full log up to fork_offset + 1 (trunk ops + fork_id observe).
-            let initial_state =
-                crate::utils::replay_sponge_state_from_log(&full_fork_log, fork_offset + 1);
+        // Store fork transcript logs directly. Fork transcripts are fresh
+        // domain-separated sponges, so no trunk slicing or inherited state.
+        for (fork_id, fork_sponge) in fork_sponges.into_iter().enumerate() {
+            let fork_log = fork_sponge.into_log();
 
             preflight.fork_transcripts.push(ForkTranscriptLog {
                 log: fork_log,
-                initial_state,
-                fork_id: _fork_idx + 1, // 1-based fork IDs (0 = trunk)
+                fork_id,
             });
         }
 
+        preflight.proof_shape.after_forked_challenge_1 = preflight
+            .fork_transcripts
+            .first()
+            .and_then(|f| {
+                f.log
+                    .values()
+                    .get(f.log.len().saturating_sub(D_EF)..)
+                    .and_then(EF::from_basis_coefficients_slice)
+            })
+            .unwrap_or(EF::ZERO);
+        preflight.proof_shape.after_forked_challenge_2 = preflight
+            .fork_transcripts
+            .get(1)
+            .and_then(|f| {
+                f.log
+                    .values()
+                    .get(f.log.len().saturating_sub(D_EF)..)
+                    .and_then(EF::from_basis_coefficients_slice)
+            })
+            .unwrap_or(preflight.proof_shape.after_forked_challenge_1);
+
         preflight.transcript = trunk_log;
         preflight
     }
@@ -473,6 +504,12 @@ impl<const MAX_NUM_PROOFS: usize> VerifierSubCircuit<MAX_NUM_PROOFS> {
     }
 }
 
+fn ef_to_limbs(value: EF) -> [F; D_EF] {
+    let mut out = [F::ZERO; D_EF];
+    out.copy_from_slice(value.as_basis_coefficients_slice());
+    out
+}
+
 impl<SC: StarkProtocolConfig<F = F>, const MAX_NUM_PROOFS: usize>
     VerifierTraceGen<CpuBackend<SC>, SC> for VerifierSubCircuit<MAX_NUM_PROOFS>
 {
@@ -491,7 +528,8 @@ impl<SC: StarkProtocolConfig<F = F>, const MAX_NUM_PROOFS: usize>
     #[tracing::instrument(name = "subcircuit_generate_proving_ctxs", skip_all)]
     fn generate_proving_ctxs<
         TS: FiatShamirTranscript<BabyBearPoseidon2Config>
-            + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>,
+            + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>
+            + From<Poseidon2BabyBear<POSEIDON2_WIDTH>>,
     >(
         &self,
         child_vk: &RecursionVk,
diff --git a/ceno_recursion_v2/src/system/preflight/mod.rs b/ceno_recursion_v2/src/system/preflight/mod.rs
index f539cb0f6..ae05071b0 100644
--- a/ceno_recursion_v2/src/system/preflight/mod.rs
+++ b/ceno_recursion_v2/src/system/preflight/mod.rs
@@ -1,6 +1,6 @@
 use openvm_poseidon2_air::POSEIDON2_WIDTH;
 use openvm_stark_backend::TranscriptLog;
-use openvm_stark_sdk::config::baby_bear_poseidon2::{EF, F};
+use openvm_stark_sdk::config::baby_bear_poseidon2::{D_EF, EF, F};
 
 use crate::tower::TowerReplayResult;
 
@@ -18,6 +18,7 @@ pub struct Preflight {
     pub main: MainPreflight,
     pub gkr: TowerPreflight,
     pub batch_constraint: BatchConstraintPreflight,
+    pub vm_pvs: VmPvsPreflight,
 }
 
 impl Preflight {
@@ -40,15 +41,12 @@ impl Preflight {
     }
 }
 
-/// A single forked transcript chain with its initial sponge state.
+/// A single forked transcript chain.
 #[derive(Clone, Debug)]
 pub struct ForkTranscriptLog {
     /// The log of observe/sample operations in this fork.
     pub log: TranscriptLog<F, PoseidonWord>,
-    /// The sponge state this fork inherits from the trunk (after observing
-    /// the fork index into it).
-    pub initial_state: PoseidonWord,
-    /// The fork identifier (1-based: fork 0 = trunk, 1..N = chip forks).
+    /// The fork identifier (0-based across forked chip transcripts).
     pub fork_id: usize,
 }
 
@@ -56,18 +54,24 @@ pub struct ForkTranscriptLog {
 pub struct ProofShapePreflight {
     pub sorted_trace_vdata: Vec<(usize, TraceVData)>,
     pub starting_tidx: Vec<usize>,
-    pub starting_cidx: Vec<usize>,
-    pub pvs_tidx: Vec<usize>,
     pub post_tidx: usize,
     pub n_max: usize,
     pub n_logup: usize,
+    // TODO remove l_skip
     pub l_skip: usize,
     pub fork_start_tidx: usize,
-    pub alpha_tidx: usize,
-    pub beta_tidx: usize,
-    /// The Poseidon2 sponge state at the fork point (trunk state just before
-    /// forking). All forks clone this state as their starting point.
-    pub fork_start_state: PoseidonWord,
+    pub lookup_challenge_alpha: [F; D_EF],
+    pub lookup_challenge_beta: [F; D_EF],
+    pub after_forked_challenge_1: EF,
+    pub after_forked_challenge_2: EF,
+}
+
+#[derive(Clone, Debug, Default)]
+pub struct VmPvsPreflight {
+    pub lookup_challenge_alpha: EF,
+    pub lookup_challenge_beta: EF,
+    pub lookup_challenge_alpha_lookup_count: usize,
+    pub lookup_challenge_beta_lookup_count: usize,
 }
 
 #[derive(Clone, Debug, Default)]
diff --git a/ceno_recursion_v2/src/tower/mod.rs b/ceno_recursion_v2/src/tower/mod.rs
index 0c13755d7..15f48dbb8 100644
--- a/ceno_recursion_v2/src/tower/mod.rs
+++ b/ceno_recursion_v2/src/tower/mod.rs
@@ -254,25 +254,12 @@ impl TowerModule {
         TS: FiatShamirTranscript<BabyBearPoseidon2Config>
             + TranscriptHistory<F = F, State = [F; POSEIDON2_WIDTH]>,
     {
-        let _ = (self, child_vk);
+        let _ = self;
         for (&chip_idx, chip_instances) in &proof.chip_proofs {
             for (instance_idx, chip_proof) in chip_instances.iter().enumerate() {
                 let tidx = ts.len();
-                let _ = record_gkr_transcript(ts, chip_idx, chip_proof);
-
-                let tower_replay = match circuit_vk_for_idx(child_vk, chip_idx) {
-                    Some(circuit_vk) => match replay_tower_proof(chip_proof, circuit_vk) {
-                        Ok(replay) => replay,
-                        Err(err) => {
-                            error!(
-                                ?err,
-                                chip_idx, "failed to replay tower proof during preflight"
-                            );
-                            TowerReplayResult::default()
-                        }
-                    },
-                    None => TowerReplayResult::default(),
-                };
+                let tower_replay =
+                    record_and_replay_tower_preflight(ts, child_vk, chip_idx, chip_proof);
 
                 preflight.gkr.chips.push(TowerChipTranscriptRange {
                     chip_idx,
@@ -366,6 +353,34 @@ pub(crate) fn circuit_vk_for_idx(
         .and_then(|name| vk.circuit_vks.get(name))
 }
 
+/// Record all tower transcript events for one chip proof, then replay tower proof.
+/// Keeping this in the tower module avoids preflight callsites duplicating
+/// transcript/replay wiring logic.
+pub(crate) fn record_and_replay_tower_preflight<TS>(
+    ts: &mut TS,
+    child_vk: &RecursionVk,
+    chip_idx: usize,
+    chip_proof: &ZKVMChipProof<RecursionField>,
+) -> TowerReplayResult
+where
+    TS: FiatShamirTranscript<BabyBearPoseidon2Config>,
+{
+    let _ = record_gkr_transcript(ts, chip_idx, chip_proof);
+    match circuit_vk_for_idx(child_vk, chip_idx) {
+        Some(circuit_vk) => match replay_tower_proof(chip_proof, circuit_vk) {
+            Ok(replay) => replay,
+            Err(err) => {
+                error!(
+                    ?err,
+                    chip_idx, "failed to replay tower proof during preflight"
+                );
+                TowerReplayResult::default()
+            }
+        },
+        None => TowerReplayResult::default(),
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
 fn build_chip_records(
     proof_idx: usize,
diff --git a/ceno_recursion_v2/src/tower/tower.rs b/ceno_recursion_v2/src/tower/tower.rs
index 49fb85293..0d288adf9 100644
--- a/ceno_recursion_v2/src/tower/tower.rs
+++ b/ceno_recursion_v2/src/tower/tower.rs
@@ -45,7 +45,7 @@ pub fn replay_tower_proof(
     vk: &VerifyingKey<RecursionField>,
 ) -> Result<TowerReplayResult> {
     let mut transcript = BasicTranscript::<RecursionField>::new(b"ceno-recursion-tower-tower");
-    replay_tower_proof_inner(chip_proof, vk, &mut transcript)
+    build_tower_replay_result(chip_proof, vk, &mut transcript)
 }
 
 // ---------------------------------------------------------------------------
@@ -150,11 +150,11 @@ pub fn replay_tower_proof_poseidon(
     schedule: &TowerTranscriptSchedule,
 ) -> Result<TowerReplayResult> {
     let mut transcript = PrecomputedTranscript::from_schedule(schedule);
-    replay_tower_proof_inner(chip_proof, vk, &mut transcript)
+    build_tower_replay_result(chip_proof, vk, &mut transcript)
 }
 
-/// Core replay logic, generic over transcript type.
-fn replay_tower_proof_inner(
+/// Core tower replay builder, generic over transcript type.
+fn build_tower_replay_result(
     chip_proof: &ZKVMChipProof<RecursionField>,
     vk: &VerifyingKey<RecursionField>,
     transcript: &mut impl Transcript<RecursionField>,
diff --git a/ceno_recursion_v2/src/transcript/mod.rs b/ceno_recursion_v2/src/transcript/mod.rs
index 6a41ca0da..1588acd78 100644
--- a/ceno_recursion_v2/src/transcript/mod.rs
+++ b/ceno_recursion_v2/src/transcript/mod.rs
@@ -81,48 +81,9 @@ impl TranscriptModule {
         num_valid_rows
     }
 
-    /// Replay the sponge for the first `up_to` entries of a transcript log,
-    /// returning the sponge state at that point.
-    ///
-    /// This matches the DuplexSponge's overwrite-mode behavior.
-    fn replay_sponge_state(
-        &self,
-        log: &openvm_stark_backend::TranscriptLog<F, [F; POSEIDON2_WIDTH]>,
-        up_to: usize,
-    ) -> [F; POSEIDON2_WIDTH] {
-        let mut state = [F::ZERO; POSEIDON2_WIDTH];
-        let mut absorb_idx = 0usize;
-        let mut sample_idx = 0usize;
-
-        for i in 0..up_to {
-            if log.samples()[i] {
-                // Matches DuplexSponge::sample()
-                let needs_perm = absorb_idx != 0 || sample_idx == 0;
-                if needs_perm {
-                    self.perm.permute_mut(&mut state);
-                    absorb_idx = 0;
-                    sample_idx = CHUNK; // RATE = CHUNK
-                }
-                sample_idx -= 1;
-                // sampled value = state[sample_idx]; we don't need it
-            } else {
-                // Matches DuplexSponge::observe()
-                state[absorb_idx] = log.values()[i];
-                absorb_idx += 1;
-                if absorb_idx == CHUNK {
-                    self.perm.permute_mut(&mut state);
-                    absorb_idx = 0;
-                    sample_idx = CHUNK;
-                }
-            }
-        }
-
-        state
-    }
-
     /// Fill transcript trace rows from a single transcript log.
     ///
-    /// `tidx_offset` is added to the local tidx so fork logs can use global tidx.
+    /// `tidx_offset` is added to the local tidx.
     ///
     /// Returns the final sponge state after processing the log.
     #[allow(clippy::too_many_arguments)]
@@ -273,50 +234,28 @@ impl TranscriptModule {
             );
             offset = trunk_end;
 
-            // Compute the sponge state at the fork point by replaying
-            // the trunk's pre-fork operations.
-            let fork_point_state = self
-                .replay_sponge_state(&preflight.transcript, preflight.proof_shape.fork_start_tidx);
-
-            // Fill fork rows. Global tidx offsets for each fork are computed
-            // on the fly (trunk_len + sum of preceding fork lengths).
-            let trunk_len = preflight.transcript.len();
-            let mut fork_tidx_cursor = trunk_len;
+            // Fill fork rows with fork-local tidx offsets.
             for (fi, fork_log) in preflight.fork_transcripts.iter().enumerate() {
                 let fork_rows = info.fork_rows[fi];
                 let fork_end = offset + fork_rows;
                 let fork_slice =
                     &mut transcript_trace[offset * transcript_width..fork_end * transcript_width];
-                // Each fork starts from the trunk's sponge state at the fork
-                // point (NOT the trunk's final state, which includes merge ops).
+                // Fresh fork semantics: every fork transcript starts from a
+                // clean sponge state and is domain-separated by "fork".
                 let _ = self.fill_log_rows(
                     fork_slice,
                     transcript_width,
                     &fork_log.log,
                     pidx,
                     fork_log.fork_id,
-                    false,            // is_proof_start
-                    true,             // is_fork_start
-                    fork_point_state, // trunk state at fork point
-                    fork_tidx_cursor, // computed global tidx offset
+                    false, // is_proof_start
+                    true,  // is_fork_start
+                    [F::ZERO; POSEIDON2_WIDTH],
+                    0,
                     &mut poseidon2_perm_inputs,
                 );
-                fork_tidx_cursor += fork_log.log.len();
                 offset = fork_end;
             }
-
-            // Fill trunk_fork_state on all rows for this proof.
-            // This is the trunk's sponge state at the fork point (before any
-            // fork-specific operations, but after all pre-fork trunk ops).
-            let proof_start = trunk_end - info.trunk_rows;
-            let proof_end = offset;
-            for row in transcript_trace
-                [proof_start * transcript_width..proof_end * transcript_width]
-                .chunks_exact_mut(transcript_width)
-            {
-                let cols: &mut ForkedTranscriptCols<F> = row.borrow_mut();
-                cols.trunk_fork_state = fork_point_state;
-            }
         }
 
         Some((
diff --git a/ceno_recursion_v2/src/transcript/transcript_air.rs b/ceno_recursion_v2/src/transcript/transcript_air.rs
index af4d080de..ca12a2c28 100644
--- a/ceno_recursion_v2/src/transcript/transcript_air.rs
+++ b/ceno_recursion_v2/src/transcript/transcript_air.rs
@@ -4,16 +4,12 @@
 //! fork support.  The key additions:
 //!
 //! * **`is_fork_start`** column — marks the first row of a forked transcript chain.
-//!   When set, the sponge state is initialised from the trunk's fork-point state
-//!   (propagated through `trunk_fork_state` auxiliary columns) instead of
-//!   being constrained to all-zeros.
+//!   When set, the sponge state is initialised from the provided initial fork state.
 //!
 //! * **`fork_id`** column — identifies which fork this row belongs to.
-//!   `fork_id = 0` is the trunk (pre-fork), `fork_id ≥ 1` are the per-chip forks.
+//!   Fork rows use 0-based identifiers (0..N-1) across per-chip forks.
 //!
-//! * **`trunk_fork_state`** auxiliary columns — carry the trunk's sponge state at
-//!   the fork point. These are propagated unchanged across all rows within a proof
-//!   and used to constrain each fork's initial sponge state.
+//! Fork rows otherwise follow upstream transcript constraints.
 
 use core::borrow::Borrow;
 
@@ -24,6 +20,7 @@ use openvm_circuit_primitives::{
 use openvm_stark_backend::{
     BaseAirWithPublicValues, PartitionedBaseAir, interaction::InteractionBuilder,
 };
+use openvm_stark_sdk::config::baby_bear_poseidon2::D_EF;
 use p3_air::{Air, AirBuilder, BaseAir};
 use p3_field::{Field, PrimeCharacteristicRing};
 use p3_matrix::Matrix;
@@ -62,11 +59,8 @@ pub struct ForkedTranscriptCols<T> {
     // --- fork extensions ---
     /// 1 on the first row of a forked transcript chain.
     pub is_fork_start: T,
-    /// Fork identifier. 0 = trunk, 1..N = per-chip forks.
+    /// Fork identifier (0-based across forked chip transcripts).
     pub fork_id: T,
-    /// The trunk's sponge state at the fork point, propagated to all rows within
-    /// a proof so fork starts can constrain their initial sponge state.
-    pub trunk_fork_state: [T; POSEIDON2_WIDTH],
 }
 
 impl<T: Copy> ForkedTranscriptCols<T> {
@@ -76,8 +70,7 @@ impl<T: Copy> ForkedTranscriptCols<T> {
         // prev_state = POSEIDON2_WIDTH
         // post_state = POSEIDON2_WIDTH
         // is_fork_start, fork_id = 2
-        // trunk_fork_state = POSEIDON2_WIDTH
-        4 + CHUNK + 3 * POSEIDON2_WIDTH + 2
+        4 + CHUNK + 2 * POSEIDON2_WIDTH + 2
     }
 }
 
@@ -184,44 +177,6 @@ impl<AB: AirBuilder + InteractionBuilder> Air<AB> for ForkedTranscriptAir {
                 .assert_eq(local.prev_state[i], AB::Expr::ZERO);
         }
 
-        ///////////////////////////////////////////////////////////////////////
-        // Fork state: trunk_fork_state propagation and anchoring
-        ///////////////////////////////////////////////////////////////////////
-
-        // Propagation: trunk_fork_state is constant within a proof.
-        // When next is valid and not a new proof, propagate.
-        for i in 0..POSEIDON2_WIDTH {
-            builder
-                .when(next_valid)
-                .when(not::<AB::Expr>(next.is_proof_start))
-                .assert_eq(local.trunk_fork_state[i], next.trunk_fork_state[i]);
-        }
-
-        // Anchor: the trace generator fills trunk_fork_state with the trunk's
-        // sponge state at the fork point. Because the trunk includes merge
-        // operations after the fork point, the trunk→fork row transition does
-        // NOT have the fork-point state in post_state. Instead, we rely on:
-        //   (a) trunk_fork_state is propagated/constant within a proof, and
-        //   (b) the TranscriptBus checks that fork operations produce values
-        //       consistent with the consumer AIRs.
-        // A direct AIR anchoring constraint would require a marker column at
-        // the exact fork-point row within the trunk, which we omit for now.
-
-        // Fork start: the fork's initial sponge state must match
-        // trunk_fork_state for capacity positions and unmasked rate positions.
-        for i in 0..CHUNK {
-            // Capacity: always constrained
-            builder.when(local.is_fork_start).assert_eq(
-                local.prev_state[i + CHUNK],
-                local.trunk_fork_state[i + CHUNK],
-            );
-            // Rate: constrained where mask[i] == 0 (position not overwritten by absorption)
-            builder
-                .when(local.is_fork_start)
-                .when(not::<AB::Expr>(local.mask[i]))
-                .assert_eq(local.prev_state[i], local.trunk_fork_state[i]);
-        }
-
         ///////////////////////////////////////////////////////////////////////
         // Intra-chain continuity (sponge state propagation)
         ///////////////////////////////////////////////////////////////////////
@@ -301,21 +256,52 @@ impl<AB: AirBuilder + InteractionBuilder> Air<AB> for ForkedTranscriptAir {
         ///////////////////////////////////////////////////////////////////////
         // Forked transcript bus interactions (send fork state)
         ///////////////////////////////////////////////////////////////////////
-        // On is_fork_start rows, send the trunk_fork_state (the sponge state
-        // at the fork point) on the ForkedTranscriptBus. ProofShapeAir
-        // receives these to cross-check its current_snapshot_state.
-        for i in 0..POSEIDON2_WIDTH {
+        // On is_fork_start rows, send fork-local transcript words with fork_id.
+        for i in 0..D_EF {
+            self.forked_transcript_bus.send(
+                builder,
+                local.proof_idx,
+                ForkedTranscriptBusMessage {
+                    fork_id: local.fork_id.into(),
+                    tidx: local.tidx + AB::Expr::from_usize(i),
+                    value: local.prev_state[i].into(),
+                    is_sample: AB::Expr::ZERO,
+                },
+                local.is_fork_start,
+            );
             self.forked_transcript_bus.send(
                 builder,
                 local.proof_idx,
                 ForkedTranscriptBusMessage {
                     fork_id: local.fork_id.into(),
-                    fork_tidx: AB::Expr::from_usize(i),
-                    value: local.trunk_fork_state[i].into(),
+                    tidx: local.tidx + AB::Expr::from_usize(D_EF + i),
+                    value: local.prev_state[i].into(),
                     is_sample: AB::Expr::ZERO,
                 },
                 local.is_fork_start,
             );
+            self.forked_transcript_bus.send(
+                builder,
+                local.proof_idx,
+                ForkedTranscriptBusMessage {
+                    fork_id: local.fork_id.into(),
+                    tidx: local.tidx + AB::Expr::from_usize(2 * D_EF + i),
+                    value: local.prev_state[i].into(),
+                    is_sample: AB::Expr::ONE,
+                },
+                local.is_fork_start,
+            );
+            self.forked_transcript_bus.send(
+                builder,
+                local.proof_idx,
+                ForkedTranscriptBusMessage {
+                    fork_id: local.fork_id.into(),
+                    tidx: local.tidx + AB::Expr::from_usize(3 * D_EF + i),
+                    value: local.prev_state[i].into(),
+                    is_sample: AB::Expr::ONE,
+                },
+                local.is_fork_start,
+            );
         }
 
         ///////////////////////////////////////////////////////////////////////
diff --git a/ceno_recursion_v2/src/utils.rs b/ceno_recursion_v2/src/utils.rs
index ff64e8d4d..e486ab544 100644
--- a/ceno_recursion_v2/src/utils.rs
+++ b/ceno_recursion_v2/src/utils.rs
@@ -13,12 +13,14 @@ use p3_symmetric::Permutation;
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum TranscriptLabel {
     Riscv,
+    Fork,
 }
 
 impl TranscriptLabel {
     pub fn as_bytes(self) -> &'static [u8] {
         match self {
             Self::Riscv => b"riscv",
+            Self::Fork => b"fork",
         }
     }
 
diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs
index 87aa989d2..2c6283765 100644
--- a/ceno_zkvm/src/scheme/prover.rs
+++ b/ceno_zkvm/src/scheme/prover.rs
@@ -23,7 +23,7 @@ use sumcheck::{
     structs::IOPProverMessage,
 };
 use tracing::info_span;
-use transcript::{ForkableTranscript, Transcript};
+use transcript::{BasicTranscript, ForkableTranscript, Transcript};
 
 use super::{PublicValues, ZKVMChipProof, ZKVMProof, hal::ProverDevice};
 #[cfg(feature = "gpu")]
@@ -177,33 +177,6 @@ impl<
             // Extract chip metadata before consuming witnesses.
             // We reuse this for both transcript appends and task construction.
             let name_and_instances = witnesses.get_witnesses_name_instance();
-            let transcript_instances = name_and_instances.iter().try_fold(
-                Vec::new(),
-                |mut acc, (name, num_instances)| -> Result<_, ZKVMError> {
-                    let pk = self.pk.circuit_pks.get(name).ok_or(ZKVMError::VKNotFound(
-                        format!("proving key for circuit {} not found", name).into(),
-                    ))?;
-
-                    // Skip OMC-init-only circuits in non-first shards.
-                    if !shard_ctx.is_first_shard() && pk.get_cs().with_omc_init_only() {
-                        return Ok(acc);
-                    }
-
-                    if num_instances.iter().sum::<usize>() == 0 {
-                        return Ok(acc);
-                    }
-
-                    let circuit_idx = self.pk.circuit_name_to_index.get(name).copied().ok_or(
-                        ZKVMError::VKNotFound(
-                            format!("circuit index for {} not found", name).into(),
-                        ),
-                    )?;
-
-                    acc.push((circuit_idx, *num_instances));
-                    Ok(acc)
-                },
-            )?;
-
             let mut structural_rmms = Vec::with_capacity(name_and_instances.len());
             // commit to opcode circuits first and then commit to table circuits, sorted by name
             for (i, chip_input) in witnesses.into_iter_sorted().enumerate() {
@@ -248,14 +221,6 @@ impl<
             PCS::write_commitment(&witin_commit, &mut transcript).map_err(ZKVMError::PCSError)?;
             exit_span!(commit_to_traces_span);
 
-            // Write (circuit_idx, num_instances) after all commitments are absorbed.
-            for (circuit_idx, num_instances) in &transcript_instances {
-                transcript.append_field_element(&E::BaseField::from_usize(*circuit_idx));
-                for num_instance in num_instances {
-                    transcript.append_field_element(&E::BaseField::from_usize(*num_instance));
-                }
-            }
-
             // Use pre-loaded fixed_mles (extracted before in_scope to avoid lifetime issues)
             let fixed_mles = fixed_mles_preload;
 
@@ -286,8 +251,9 @@ impl<
             // GPU concurrent: memory-aware backfilling with standalone impl.
             // Sequential (GPU + CPU): unified path via self.create_chip_proof.
             let execute_tasks_span = entered_span!("execute_chip_tasks", profiling_1 = true);
+            let fork_transcript = BasicTranscript::<E>::new(b"fork");
             let (results, forked_samples) =
-                self.run_chip_proofs(tasks, &transcript, &witness_data)?;
+                self.run_chip_proofs(tasks, &fork_transcript, &witness_data)?;
             exit_span!(execute_tasks_span);
 
             // Phase 3: Collect results
@@ -355,9 +321,16 @@ impl<
                 let gpu_wd = SyncRef(gpu_witness_data);
 
                 return scheduler.execute(tasks, transcript, |task, transcript| {
+                    // Bind global challenges and metadata in the same order as verifier.
+                    transcript.append_field_element_ext(&task.challenges[0]);
+                    transcript.append_field_element_ext(&task.challenges[1]);
+                    transcript.append_field_element(&E::BaseField::from_usize(task.task_id));
                     // Append circuit_idx to per-task forked transcript (matching verifier)
                     transcript
                         .append_field_element(&E::BaseField::from_u64(task.circuit_idx as u64));
+                    for num_instance in task.input.num_instances {
+                        transcript.append_field_element(&E::BaseField::from_usize(num_instance));
+                    }
 
                     // SAFETY: TypeId check above (before closure) guarantees PB = GpuBackend<E, PCS>.
                     let gpu_input: ProofInput<'static, gkr_iop::gpu::GpuBackend<E, PCS>> =
@@ -391,8 +364,15 @@ impl<
         // Sequential path (GPU + CPU unified):
         // Uses execute_sequentially directly to avoid Send+Sync requirement on the closure.
         scheduler.execute_sequentially(tasks, transcript, |mut task, transcript| {
+            // Bind global challenges and metadata in the same order as verifier.
+            transcript.append_field_element_ext(&task.challenges[0]);
+            transcript.append_field_element_ext(&task.challenges[1]);
+            transcript.append_field_element(&E::BaseField::from_usize(task.task_id));
             // Append circuit_idx to per-task forked transcript (matching verifier)
             transcript.append_field_element(&E::BaseField::from_u64(task.circuit_idx as u64));
+            for num_instance in task.input.num_instances {
+                transcript.append_field_element(&E::BaseField::from_usize(num_instance));
+            }
 
             // Prepare: deferred extraction for GPU, no-op for CPU
             self.device.prepare_chip_input(&mut task, witness_data);
diff --git a/ceno_zkvm/src/scheme/scheduler.rs b/ceno_zkvm/src/scheme/scheduler.rs
index 4c6a7620e..f39152ea3 100644
--- a/ceno_zkvm/src/scheme/scheduler.rs
+++ b/ceno_zkvm/src/scheme/scheduler.rs
@@ -21,7 +21,6 @@ use crate::{
 use ff_ext::ExtensionField;
 use gkr_iop::hal::ProverBackend;
 use mpcs::Point;
-use p3::field::PrimeCharacteristicRing;
 use std::sync::OnceLock;
 use transcript::Transcript;
 static CHIP_PROVING_MODE: OnceLock<ChipProvingMode> = OnceLock::new();
@@ -152,8 +151,8 @@ impl ChipScheduler {
 
     /// Execute tasks sequentially with automatic transcript forking and sampling.
     ///
-    /// Each task gets a transcript cloned from `parent_transcript` with `task_id`
-    /// appended (identical to `ForkableTranscript::fork` default impl).
+    /// Each task gets a transcript cloned from `parent_transcript`.
+    /// Task-specific transcript appends are performed by the task closure.
     /// Returns `(results, forked_samples)` both sorted by task_id.
     #[allow(clippy::type_complexity)]
     pub(crate) fn execute_sequentially<'a, PB, T, F>(
@@ -186,12 +185,8 @@ impl ChipScheduler {
 
         for task in tasks {
             let task_id = task.task_id;
-            // Fork: clone parent + append task_id
-            // (identical to ForkableTranscript::fork default impl)
+            // Fork: clone parent transcript template.
             let mut forked = parent_transcript.clone();
-            forked.append_field_element(&<PB::E as ExtensionField>::BaseField::from_u64(
-                task_id as u64,
-            ));
 
             let result = execute_task(task, &mut forked)?;
             results.push(result);
@@ -213,8 +208,7 @@ impl ChipScheduler {
     /// Tasks are sorted by memory requirement (descending) and scheduled to
     /// maximize GPU utilization while respecting memory constraints.
     ///
-    /// Each worker thread clones the parent `transcript` and appends its task_id
-    /// (reproducing `ForkableTranscript::fork` locally). After proving, the worker
+    /// Each worker thread clones the parent `transcript`. After proving, the worker
     /// samples one extension-field element from its local transcript and returns it.
     /// This avoids sending non-`Send` transcript objects across threads.
     ///
@@ -250,9 +244,6 @@ impl ChipScheduler {
         if tasks.len() == 1 {
             let task = tasks.remove(0);
             let mut fork = transcript.clone();
-            fork.append_field_element(&<PB::E as ExtensionField>::BaseField::from_u64(
-                task.task_id as u64,
-            ));
             let result = execute_task(task, &mut fork)?;
             let sample = fork.sample_vec(1)[0];
             return Ok((vec![result], vec![sample]));
@@ -347,13 +338,8 @@ impl ChipScheduler {
                         // waiting for a CompletionMessage that never arrives).
                         let outcome =
                             std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
-                                // Fork locally: clone parent transcript + append task_id
-                                // (identical to ForkableTranscript::fork default impl)
+                                // Fork locally: clone parent transcript template.
                                 let mut local_transcript = tr.0.clone();
-                                local_transcript.append_field_element(
-                                    &<PB::E as ExtensionField>::BaseField::from_u64(task_id as u64),
-                                );
-
                                 let result = execute_fn(task, &mut local_transcript);
 
                                 // Sample from the forked transcript for gather phase
diff --git a/ceno_zkvm/src/scheme/verifier.rs b/ceno_zkvm/src/scheme/verifier.rs
index 73c16a331..7a47c44fe 100644
--- a/ceno_zkvm/src/scheme/verifier.rs
+++ b/ceno_zkvm/src/scheme/verifier.rs
@@ -43,7 +43,7 @@ use sumcheck::{
     structs::{IOPProof, IOPVerifierState},
     util::get_challenge_pows,
 };
-use transcript::{ForkableTranscript, Transcript};
+use transcript::{BasicTranscript, ForkableTranscript, Transcript};
 use witness::next_pow2_instance_padding;
 
 pub struct ZKVMVerifier<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> {
@@ -253,15 +253,6 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZKVMVerifier<E, PCS>
         PCS::write_commitment(&vm_proof.witin_commit, &mut transcript)
             .map_err(ZKVMError::PCSError)?;
 
-        // write (circuit_idx, num_instance) to transcript
-        for (circuit_idx, proofs) in vm_proof.chip_proofs.iter() {
-            transcript.append_field_element(&E::BaseField::from_u32(*circuit_idx as u32));
-            // length of proof.num_instances will be constrained in verify_chip_proof
-            for num_instance in proofs.iter().flat_map(|proof| &proof.num_instances) {
-                transcript.append_field_element(&E::BaseField::from_usize(*num_instance));
-            }
-        }
-
         #[cfg(debug_assertions)]
         {
             Instrumented::<<<E as ExtensionField>::BaseField as PoseidonField>::P>::log_label(
@@ -307,16 +298,17 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZKVMVerifier<E, PCS>
         }
 
         // fork transcript to support chip concurrently proved
-        let mut forked_transcripts = transcript.fork(num_proofs);
-        for ((index, proof), transcript) in vm_proof
+        let mut forked_transcripts = vec![BasicTranscript::new(b"fork"); num_proofs];
+        for (index, ((circuit_index, proof), transcript)) in vm_proof
             .chip_proofs
             .iter()
             .flat_map(|(index, proofs)| iter::repeat_n(index, proofs.len()).zip(proofs))
             .zip_eq(forked_transcripts.iter_mut())
+            .enumerate()
         {
             let num_instance: usize = proof.num_instances.iter().sum();
             assert!(num_instance > 0);
-            let circuit_name = &self.vk.circuit_index_to_name[index];
+            let circuit_name = &self.vk.circuit_index_to_name[circuit_index];
             let circuit_vk = &self.vk.circuit_vks[circuit_name];
 
             if proof.r_out_evals.len() != circuit_vk.get_cs().num_reads()
@@ -353,7 +345,13 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZKVMVerifier<E, PCS>
                 })
                 .sum::<E>();
 
-            transcript.append_field_element(&E::BaseField::from_u64(*index as u64));
+            transcript.append_field_element_ext(&challenges[0]);
+            transcript.append_field_element_ext(&challenges[1]);
+            transcript.append_field_element(&E::BaseField::from_usize(index));
+            transcript.append_field_element(&E::BaseField::from_u64(*circuit_index as u64));
+            for num_instance in &proof.num_instances {
+                transcript.append_field_element(&E::BaseField::from_usize(*num_instance));
+            }
 
             // compute logup_sum padding
             // getting the number of dummy padding item that we used in this opcode circuit