THUDM · nanjiangwill · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/slime/ray/rollout.py b/slime/ray/rollout.py
@@ -691,6 +691,32 @@ def _convert_samples_to_train_data(self, samples: list[Sample] | list[list[Sampl
         assert len(raw_rewards) == len(samples)
         assert len(rewards) == len(samples)
 
+        # Neutralize zero-advantage samples: their forward pass is wasted compute
+        # since loss_mask=0 gives zero gradient. 0 matches the pad token used in
+        # slime/backends/megatron_utils/data.py. Limited to advantage estimators
+        # whose per-token advantage is a scalar broadcast of `rewards` (r==0 ⇒
+        # advantage==0); ppo/reinforce_plus_plus mix in values/kl/GAE so r==0 does
+        # not imply zero gradient.
+        if self.args.advantage_estimator in ["grpo", "gspo"]:
+            pad_token_id = 0
+            num_neutralized = 0
+            for sample, r in zip(samples, rewards, strict=True):
+                if r == 0.0:
+                    sample.tokens = [pad_token_id, pad_token_id]
+                    sample.response_length = 1
+                    sample.loss_mask = [0]
+                    sample.remove_sample = True
+                    num_neutralized += 1
+            logger.info(f"neutralized {num_neutralized}/{len(samples)} zero-advantage samples")
+            logging_utils.log(
+                self.args,
+                {
+                    "rollout/neutralized_ratio": num_neutralized / len(samples),
+                    "rollout/step": compute_rollout_step(self.args, self.rollout_id),
+                },
+                step_key="rollout/step",
+            )
+
         train_data = {
             "tokens": [sample.tokens for sample in samples],
             "response_lengths": [sample.response_length for sample in samples],