From bab83c0a40822dc57b2d44eec9e3c090c11e581b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 17 Apr 2026 07:16:13 +0000
Subject: [PATCH 1/6] Initial plan


From 86966c5083506c012790658c49ccfd1b0cef0ba5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 17 Apr 2026 07:46:52 +0000
Subject: [PATCH 2/6] Support output_logits='generation' and
 output_last_hidden_state in PyTorch engine

Agent-Logs-Url: https://github.com/InternLM/lmdeploy/sessions/84914fd8-47c1-4c72-80f4-88255925953e

Co-authored-by: CUHKSZzxy <46674730+CUHKSZzxy@users.noreply.github.com>
---
 lmdeploy/pytorch/engine/engine.py            |  1 +
 lmdeploy/pytorch/engine/engine_instance.py   |  3 +
 lmdeploy/pytorch/engine/engine_loop.py       | 49 +++++++++-
 lmdeploy/pytorch/engine/inputs_maker.py      | 12 +++
 lmdeploy/pytorch/engine/model_agent/agent.py | 40 ++++++++
 lmdeploy/pytorch/messages.py                 | 96 ++++++++++++++++++--
 6 files changed, 190 insertions(+), 11 deletions(-)

diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
index e2a7495624..87a801bb2c 100644
--- a/lmdeploy/pytorch/engine/engine.py
+++ b/lmdeploy/pytorch/engine/engine.py
@@ -44,6 +44,7 @@ class InferOutput:
     meta: Any = None
     finish: bool = False
     logits: torch.Tensor = None
+    last_hidden_state: torch.Tensor = None
     logprobs: torch.Tensor = None
 
     # send cache blocks back for migration in Disaggregated LLM Serving
diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py
index 217e1d4609..0062bd711a 100644
--- a/lmdeploy/pytorch/engine/engine_instance.py
+++ b/lmdeploy/pytorch/engine/engine_instance.py
@@ -230,14 +230,17 @@ async def async_stream_infer(self,
                     # request might be cancelled before any output
                     token_ids = []
                     logits = None
+                    last_hidden_state = None
                 else:
                     token_ids = resp_data['token_ids'][output_offset:].tolist()
                     logits = resp_data.get('logits', None)
+                    last_hidden_state = resp_data.get('last_hidden_state', None)
                 num_ids = len(token_ids) - output_offset
                 logger.debug(f'session[{session_id}] finish: num_out_ids={num_ids}.')
                 yield EngineOutput(resp.type,
                                    token_ids,
                                    logits=logits,
+                                   last_hidden_state=last_hidden_state,
                                    cache_block_ids=cache_block_ids,
                                    req_metrics=req_metrics,
                                    routed_experts=routed_experts,
diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py
index 2584b18e00..722a1ce66d 100644
--- a/lmdeploy/pytorch/engine/engine_loop.py
+++ b/lmdeploy/pytorch/engine/engine_loop.py
@@ -160,6 +160,7 @@ def _send_resp(self, out: InferOutput):
                       resp_type,
                       data=dict(token_ids=out.token_ids,
                                 logits=out.logits,
+                                last_hidden_state=out.last_hidden_state,
                                 cache_block_ids=out.cache_block_ids,
                                 req_metrics=out.req_metrics,
                                 routed_experts=out.routed_experts,
@@ -225,6 +226,16 @@ def __get_logit(msg, logits: torch.Tensor, seq_length: list[int], idx: int):
 
             return logit
 
+        def __get_hidden_state(msg, hidden_states: torch.Tensor, seq_length: list[int], idx: int):
+            hs = hidden_states.split(seq_length)[idx]
+            if len(msg.all_hidden_states) > 0:
+                # for chunked long context
+                msg.append_hidden_states(hs)
+                hs = msg.hidden_states
+                msg.all_hidden_states.resize(0)
+
+            return hs
+
         def __get_logprobs(batched_outputs: 'BatchedOutputs'):
             """Get valid logprobs."""
             batch_size = batched_outputs.stop_pos.size(0)
@@ -249,13 +260,18 @@ def __get_logprobs(batched_outputs: 'BatchedOutputs'):
             return results
 
         logits = batched_outputs.logits
+        hidden_states = batched_outputs.hidden_states
         all_routed_experts = batched_outputs.all_routed_experts
 
         if model_inputs is not None and (model_inputs.is_chunk and not model_inputs.is_last_chunk):
             # chunk long context does not need to update seqs and outputs
             seq = running[0]
             seq.append_routed_experts(all_routed_experts)
-            seq.append_logits(logits)
+            # For 'all' mode, accumulate chunk logits/hidden_states; for 'generation' mode skip
+            if seq.return_logits and not seq.logits_generation_mode:
+                seq.append_logits(logits)
+            if seq.return_hidden_states and not seq.hidden_states_generation_mode:
+                seq.append_hidden_states(hidden_states)
             return dict()
 
         new_token_timestamp = batched_outputs.new_token_timestamp
@@ -314,10 +330,37 @@ def __get_logprobs(batched_outputs: 'BatchedOutputs'):
             outputs[session_id] = out
 
             if msg.return_logits:
-                logit = __get_logit(msg, logits, seq_length, idx)
-                outputs[session_id].logits = logit
+                if msg.logits_generation_mode:
+                    # Accumulate last-position logit for each generation step
+                    if logits is not None:
+                        last_logit = logits.split(seq_length)[idx][-1:].detach().cpu()
+                        msg.append_logits(last_logit)
+                    if finish:
+                        outputs[session_id].logits = msg.logits
+                        msg.all_logits.resize(0)
+                else:
+                    # 'all' mode: return full sequence logits (existing behavior)
+                    logit = __get_logit(msg, logits, seq_length, idx)
+                    outputs[session_id].logits = logit
+
+            if msg.return_hidden_states:
+                if msg.hidden_states_generation_mode:
+                    # 'generation' mode: accumulate last-position hidden state at each step
+                    if hidden_states is not None:
+                        last_hs = hidden_states[idx:idx + 1].detach().cpu()
+                        msg.append_hidden_states(last_hs)
+                    if finish:
+                        outputs[session_id].last_hidden_state = msg.hidden_states
+                        msg.all_hidden_states.resize(0)
+                else:
+                    # 'all' mode: return full sequence hidden states
+                    if hidden_states is not None:
+                        hs = __get_hidden_state(msg, hidden_states, seq_length, idx)
+                        outputs[session_id].last_hidden_state = hs
+
         return outputs
 
+
     async def _main_loop_try_send_next_inputs(self):
         """Try send next inputs."""
         scheduler = self.scheduler
diff --git a/lmdeploy/pytorch/engine/inputs_maker.py b/lmdeploy/pytorch/engine/inputs_maker.py
index 72759d3cd6..0815846a27 100644
--- a/lmdeploy/pytorch/engine/inputs_maker.py
+++ b/lmdeploy/pytorch/engine/inputs_maker.py
@@ -613,6 +613,14 @@ def __need_logits(seqs: 'SeqList'):
                 return True
             return any(seq.return_logits for seq in seqs)
 
+        def __need_hidden_states(seqs: 'SeqList'):
+            """Need hidden states."""
+            return any(seq.return_hidden_states for seq in seqs)
+
+        def __hidden_states_all_mode(seqs: 'SeqList'):
+            """Check if any sequence uses hidden states 'all' mode."""
+            return any(seq.return_hidden_states and not seq.hidden_states_generation_mode for seq in seqs)
+
         def __need_routed_experts(seqs: 'SeqList'):
             """Need routed experts."""
             return any(seq.return_routed_experts for seq in seqs)
@@ -711,6 +719,8 @@ def __create_inputs_prefill():
             stopping_criteria = None
 
         return_logits = __need_logits(running)
+        return_hidden_states = __need_hidden_states(running)
+        hidden_states_all_mode = __hidden_states_all_mode(running)
         return_routed_experts = __need_routed_experts(running)
 
         return dict(
@@ -722,6 +732,8 @@ def __create_inputs_prefill():
             sampling_inputs=sampling_inputs,
             stopping_criteria=stopping_criteria,
             return_logits=return_logits,
+            return_hidden_states=return_hidden_states,
+            hidden_states_all_mode=hidden_states_all_mode,
             extra_inputs=extra_inputs,
             return_routed_experts=return_routed_experts,
         )
diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
index 55fb6e807b..0f8617c2f8 100644
--- a/lmdeploy/pytorch/engine/model_agent/agent.py
+++ b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -77,6 +77,7 @@ class BatchedOutputs:
     stopped: torch.Tensor
     stop_pos: torch.Tensor | None = None
     logits: torch.Tensor | None = None
+    hidden_states: torch.Tensor | None = None
     model_metas: list[dict[str, Any]] = None
     logprobs: BatchedLogProbs | None = None
     new_token_timestamp: int = 0
@@ -439,16 +440,45 @@ async def _async_model_forward(
         self,
         inputs: ModelInputs,
         return_logits: bool,
+        return_hidden_states: bool = False,
+        hidden_states_all_mode: bool = False,
     ):
         """Model forward."""
         origin_inputs = inputs
         ret = await self.async_forward(inputs)
 
+        # For 'all' mode hidden states, save the full hidden states before postprocessing
+        if return_hidden_states and hidden_states_all_mode and not return_logits:
+            full_hs = ret['hidden_states'][0]  # [total_tokens, hidden_dim]
+            seq_length = ret.get('seq_length', inputs.seq_length)
+
         if not return_logits:
             ret = self._postprocess_forward_output(ret, origin_inputs)
 
         hidden_states, ret = self.spec_agent.update_main_model_outputs(ret, origin_inputs)
 
+        if return_hidden_states:
+            # Extract hidden states to return to the user
+            hs = ret['hidden_states']
+            if hidden_states_all_mode:
+                if return_logits:
+                    # Full hidden states available; split by sequence
+                    seq_length = ret.get('seq_length', inputs.seq_length)
+                    full_hs = hs[0]  # [total_tokens, hidden_dim]
+                # else: full_hs was saved before _postprocess_forward_output above
+                ret['last_hidden_states'] = full_hs
+                ret['hidden_states_seq_length'] = seq_length
+            else:
+                # 'generation' mode: last-position hidden state per sequence
+                if return_logits:
+                    # hidden_states is full sequence, need to slice to last position
+                    seq_length = ret.get('seq_length', inputs.seq_length)
+                    last_hs = self._slice_outs(hs[0], seq_length)
+                else:
+                    # _postprocess_forward_output already sliced to last position
+                    last_hs = hs[0]
+                ret['last_hidden_states'] = last_hs
+
         logits = self.get_logits(hidden_states)
         ret['logits'] = logits
         return ret
@@ -601,6 +631,8 @@ async def _step_postprocess_with_output(self,
                                             model_metas: Any,
                                             need_broadcast_next: bool,
                                             return_logits: bool = False,
+                                            return_hidden_states: bool = False,
+                                            last_hidden_states: torch.Tensor = None,
                                             all_routed_experts: Any = None,
                                             extra_inputs: ExtraInputs = None):
         """Step postprocess with output."""
@@ -639,6 +671,7 @@ async def _step_postprocess_with_output(self,
         self._push_output(
             BatchedOutputs(next_token_ids=output_token_ids,
                            logits=logits if return_logits else None,
+                           hidden_states=last_hidden_states if return_hidden_states else None,
                            stopped=stopped,
                            stop_pos=stop_pos,
                            model_metas=model_metas,
@@ -677,6 +710,8 @@ async def _async_step(
         sampling_inputs: SamplingInputs = None,
         stopping_criteria: StoppingCriteria = None,
         return_logits: bool = False,
+        return_hidden_states: bool = False,
+        hidden_states_all_mode: bool = False,
         return_routed_experts: bool = False,
         extra_inputs: ExtraInputs = None,
     ):
@@ -738,6 +773,8 @@ async def _async_step(
         output = await self._async_model_forward(
             inputs,
             return_logits=return_logits,
+            return_hidden_states=return_hidden_states,
+            hidden_states_all_mode=hidden_states_all_mode,
         )
         # recovery is_decoding
         inputs.is_decoding = is_decoding
@@ -751,6 +788,7 @@ async def _async_step(
         last_logits = self._slice_outs(logits, seq_length)  # [bs, 1, prob] -> [bs, prob]
         extra_inputs = self.agent_strategy.slice_extra_inputs(extra_inputs, inputs, output)
         model_metas = output.get('model_metas')
+        last_hidden_states = output.get('last_hidden_states', None)
 
         if self.need_output:
             logger.debug(f'<ForwardTask> rank[{rank}]: Sampling.')
@@ -776,6 +814,8 @@ async def _async_step(
                     model_metas,
                     need_broadcast_next,
                     return_logits=return_logits,
+                    return_hidden_states=return_hidden_states,
+                    last_hidden_states=last_hidden_states,
                     all_routed_experts=all_routed_experts,
                     extra_inputs=extra_inputs,
                 ))
diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 1ef5caba83..aa830f5c71 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -59,7 +59,9 @@ class SamplingParam:
     response_format: None | str = None
     logits_processors: None | list[LogitsProcessor] = None
     out_logits: bool = False
+    out_logits_mode: str = None
     out_last_hidden_states: bool = False
+    out_last_hidden_states_mode: str = None
     num_logprobs: int = -1
     return_routed_experts: bool = False
 
@@ -87,13 +89,15 @@ def from_gen_config(cls, gen_config: GenerationConfig):
         response_format = gen_config.response_format
 
         output_logits = gen_config.output_logits
-        if output_logits:
-            if (output_logits != 'all' or gen_config.max_new_tokens > 0):
-                output_logits = None
-                logger.warning('Pytorch Engine only support output_logits="all"'
-                               ' with max_new_tokens=0')
-        if gen_config.output_last_hidden_state is not None:
-            logger.warning('Pytorch Engine does not support output last hidden states.')
+        if output_logits == 'all' and gen_config.max_new_tokens > 0:
+            output_logits = None
+            logger.warning('Pytorch Engine only support output_logits="all"'
+                           ' with max_new_tokens=0')
+        output_last_hidden_state = gen_config.output_last_hidden_state
+        if output_last_hidden_state == 'all' and gen_config.max_new_tokens > 0:
+            output_last_hidden_state = None
+            logger.warning('Pytorch Engine only support output_last_hidden_state="all"'
+                           ' with max_new_tokens=0')
         if top_p < 0 or top_p > 1.0:
             logger.warning('`top_p` has to be a float > 0 and < 1'
                            f' but is {top_p}')
@@ -156,6 +160,9 @@ def from_gen_config(cls, gen_config: GenerationConfig):
             min_new_tokens=min_new_tokens,
             logits_processors=gen_config.logits_processors,
             out_logits=(output_logits is not None),
+            out_logits_mode=output_logits,
+            out_last_hidden_states=(output_last_hidden_state is not None),
+            out_last_hidden_states_mode=output_last_hidden_state,
             num_logprobs=logprobs,
             return_routed_experts=gen_config.return_routed_experts,
             repetition_ngram_size=repetition_ngram_size,
@@ -549,7 +556,47 @@ def clone(self):
         return ret
 
 
-class HistoryMropePosIds(_HistoryDataBase):
+class HistoryHiddenStates(_HistoryDataBase):
+    """History hidden states."""
+    ALLOC_SIZE = 64
+    COPY_ON_RESIZE = True
+
+    def __init__(self, hidden_states: np.ndarray = None, dtype: np.dtype = np.int16):
+        super().__init__(hidden_states, dtype)
+        self._torch_dtype = None
+
+    def _create_empty_array(self, dtype):
+        """Create empty array.
+
+        Override in subclass for different shapes.
+        """
+        return None
+
+    def _get_pad_width(self, reserve_size: int):
+        """Get pad width for multi-dimensional array."""
+        return ((0, reserve_size), (0, 0))
+
+    def set_torch_dtype(self, torch_dtype):
+        """Set torch dtype."""
+        self._torch_dtype = torch_dtype
+
+    def get_hidden_states(self):
+        """Get hidden states as torch tensor."""
+        if self._data is None:
+            return None
+        if self._torch_dtype is None:
+            return None
+
+        hs_np = self.get_real()
+        return torch.frombuffer(hs_np, dtype=self._torch_dtype).view(hs_np.shape)
+
+    def clone(self):
+        """clone."""
+        ret = super().clone()
+        ret.set_torch_dtype(self._torch_dtype)
+        return ret
+
+
     """History mrope position ids."""
     ALLOC_SIZE = 64
 
@@ -653,6 +700,9 @@ class SchedulerSequence:
     # logits
     all_logits: HistoryLogits = field(default_factory=HistoryLogits)
 
+    # hidden states
+    all_hidden_states: HistoryHiddenStates = field(default_factory=HistoryHiddenStates)
+
     # mrope
     history_mrope_pos_ids: HistoryMropePosIds = field(default_factory=HistoryMropePosIds)
 
@@ -790,11 +840,30 @@ def status(self):
     def return_logits(self):
         return self.sampling_param.out_logits
 
+    @property
+    def logits_generation_mode(self):
+        """Check if logits are in generation mode."""
+        return self.sampling_param.out_logits_mode == 'generation'
+
     @property
     def logits(self):
         """Get logits."""
         return self.all_logits.get_logits()
 
+    @property
+    def return_hidden_states(self):
+        return self.sampling_param.out_last_hidden_states
+
+    @property
+    def hidden_states_generation_mode(self):
+        """Check if hidden states are in generation mode."""
+        return self.sampling_param.out_last_hidden_states_mode == 'generation'
+
+    @property
+    def hidden_states(self):
+        """Get hidden states."""
+        return self.all_hidden_states.get_hidden_states()
+
     @property
     def mrope_pos_ids(self):
         """Get mrope pos ids."""
@@ -813,6 +882,17 @@ def append_logits(self, logits: Tensor | np.ndarray):
             logits = logits.view(torch.int16).numpy()
         self.all_logits.append(logits)
 
+    def append_hidden_states(self, hidden_states: Tensor | np.ndarray):
+        """Append hidden states."""
+        if not self.return_hidden_states:
+            return
+        if hidden_states is None:
+            return
+        if isinstance(hidden_states, Tensor):
+            self.all_hidden_states.set_torch_dtype(hidden_states.dtype)
+            hidden_states = hidden_states.view(torch.int16).numpy()
+        self.all_hidden_states.append(hidden_states)
+
     def get_input_multimodals(self):
         """Get input multimodals."""
         start = self.num_history_ids

From 1db4500bda320e5fa1282e56a9f9e5ba82f3e05e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 17 Apr 2026 07:49:58 +0000
Subject: [PATCH 3/6] Fix missing HistoryMropePosIds class declaration and
 improve code clarity

Agent-Logs-Url: https://github.com/InternLM/lmdeploy/sessions/84914fd8-47c1-4c72-80f4-88255925953e

Co-authored-by: CUHKSZzxy <46674730+CUHKSZzxy@users.noreply.github.com>
---
 lmdeploy/pytorch/engine/model_agent/agent.py | 16 +++++++++-------
 lmdeploy/pytorch/messages.py                 |  1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
index 0f8617c2f8..ab503a5853 100644
--- a/lmdeploy/pytorch/engine/model_agent/agent.py
+++ b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -447,10 +447,11 @@ async def _async_model_forward(
         origin_inputs = inputs
         ret = await self.async_forward(inputs)
 
-        # For 'all' mode hidden states, save the full hidden states before postprocessing
+        # For 'all' mode hidden states without return_logits, save the full hidden
+        # states before _postprocess_forward_output slices them to last position.
+        pre_postprocess_full_hs = None
         if return_hidden_states and hidden_states_all_mode and not return_logits:
-            full_hs = ret['hidden_states'][0]  # [total_tokens, hidden_dim]
-            seq_length = ret.get('seq_length', inputs.seq_length)
+            pre_postprocess_full_hs = ret['hidden_states'][0]  # [total_tokens, hidden_dim]
 
         if not return_logits:
             ret = self._postprocess_forward_output(ret, origin_inputs)
@@ -460,19 +461,20 @@ async def _async_model_forward(
         if return_hidden_states:
             # Extract hidden states to return to the user
             hs = ret['hidden_states']
+            seq_length = ret.get('seq_length', inputs.seq_length)
             if hidden_states_all_mode:
                 if return_logits:
-                    # Full hidden states available; split by sequence
-                    seq_length = ret.get('seq_length', inputs.seq_length)
+                    # Full hidden states still available (postprocessing was skipped)
                     full_hs = hs[0]  # [total_tokens, hidden_dim]
-                # else: full_hs was saved before _postprocess_forward_output above
+                else:
+                    # Use the saved full hidden states from before postprocessing
+                    full_hs = pre_postprocess_full_hs
                 ret['last_hidden_states'] = full_hs
                 ret['hidden_states_seq_length'] = seq_length
             else:
                 # 'generation' mode: last-position hidden state per sequence
                 if return_logits:
                     # hidden_states is full sequence, need to slice to last position
-                    seq_length = ret.get('seq_length', inputs.seq_length)
                     last_hs = self._slice_outs(hs[0], seq_length)
                 else:
                     # _postprocess_forward_output already sliced to last position
diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index aa830f5c71..f3b3488b81 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -597,6 +597,7 @@ def clone(self):
         return ret
 
 
+class HistoryMropePosIds(_HistoryDataBase):
     """History mrope position ids."""
     ALLOC_SIZE = 64
 

From fb7c77731a11510a9a9e9ed508865275a300b576 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 17 Apr 2026 07:52:46 +0000
Subject: [PATCH 4/6] Add clarifying comment to HistoryHiddenStates class

Agent-Logs-Url: https://github.com/InternLM/lmdeploy/sessions/84914fd8-47c1-4c72-80f4-88255925953e

Co-authored-by: CUHKSZzxy <46674730+CUHKSZzxy@users.noreply.github.com>
---
 lmdeploy/pytorch/messages.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index f3b3488b81..619e2ed263 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -557,7 +557,13 @@ def clone(self):
 
 
 class HistoryHiddenStates(_HistoryDataBase):
-    """History hidden states."""
+    """History hidden states.
+
+    Hidden states are stored as int16 numpy arrays (same bit-level storage
+    as HistoryLogits), reinterpreting float16/bfloat16 tensors byte-for-byte.
+    _create_empty_array returns None so that the shape (hidden_dim) is inferred
+    dynamically from the first append call, matching the HistoryLogits pattern.
+    """
     ALLOC_SIZE = 64
     COPY_ON_RESIZE = True
 
@@ -566,10 +572,7 @@ def __init__(self, hidden_states: np.ndarray = None, dtype: np.dtype = np.int16)
         self._torch_dtype = None
 
     def _create_empty_array(self, dtype):
-        """Create empty array.
-
-        Override in subclass for different shapes.
-        """
+        """Return None; shape is determined on first append (see HistoryLogits)."""
         return None
 
     def _get_pad_width(self, reserve_size: int):

From 05b4c2bda08d50fa9b502bf294593a04cd1115db Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 20 Apr 2026 12:37:17 +0800
Subject: [PATCH 5/6] Fix bugs in output_logits/output_last_hidden_state
 generation mode

Three bugs fixed:

1. agent.py: Fix KeyError when extracting hidden_states after
   update_main_model_outputs() pops 'hidden_states' from ret dict.
   Use the local 'hidden_states' variable directly instead of ret['hidden_states'].

2. engine_loop.py: Fix split_with_sizes mismatch in generation mode.
   seq.num_token_ids returns ALL accumulated tokens (wrong for decode steps).
   Use model_inputs.seq_length (prefill) or delta.seq_length (decode, always 1),
   which match the actual logits tensor shape; fall back to seq.num_token_ids
   only when neither is available.

3. lmdeploy/messages.py Response.extend(): Propagate logits and
   last_hidden_state from the FINISH response. Without this, the pipeline
   aggregation loop (res = res.extend(out)) discarded the accumulated tensors,
   leaving them as None.
---
 lmdeploy/messages.py                         | 4 ++++
 lmdeploy/pytorch/engine/engine_loop.py       | 7 ++++++-
 lmdeploy/pytorch/engine/model_agent/agent.py | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index eeb2f7a056..04b393517b 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -577,6 +577,10 @@ def extend(self, other: 'Response') -> 'Response':
             self.logprobs = self.logprobs or []
             self.logprobs += other.logprobs
         self.routed_experts = other.routed_experts
+        if other.logits is not None:
+            self.logits = other.logits
+        if other.last_hidden_state is not None:
+            self.last_hidden_state = other.last_hidden_state
         return self
 
 
diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py
index 722a1ce66d..107d138f4f 100644
--- a/lmdeploy/pytorch/engine/engine_loop.py
+++ b/lmdeploy/pytorch/engine/engine_loop.py
@@ -279,7 +279,12 @@ def __get_logprobs(batched_outputs: 'BatchedOutputs'):
 
         all_logprobs = __get_logprobs(batched_outputs)
 
-        seq_length = [seq.num_token_ids for seq in running]
+        if model_inputs is not None:
+            seq_length = model_inputs.seq_length.tolist()
+        elif delta is not None:
+            seq_length = delta.seq_length.tolist()
+        else:
+            seq_length = [seq.num_token_ids for seq in running]
         is_run = [seq.status == MessageStatus.RUNNING for seq in running]
         self.seq_strategy.update_running(running=running,
                                          batched_outputs=batched_outputs,
diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
index ab503a5853..28a51ce699 100644
--- a/lmdeploy/pytorch/engine/model_agent/agent.py
+++ b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -460,7 +460,7 @@ async def _async_model_forward(
 
         if return_hidden_states:
             # Extract hidden states to return to the user
-            hs = ret['hidden_states']
+            hs = hidden_states
             seq_length = ret.get('seq_length', inputs.seq_length)
             if hidden_states_all_mode:
                 if return_logits:

From afb030e45533594a2fddb6fd93ab9c4bec9c2031 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Wed, 22 Apr 2026 21:03:46 +0800
Subject: [PATCH 6/6] Fix docformatter lint in pytorch messages

---
 lmdeploy/pytorch/messages.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 619e2ed263..dd7f3757cf 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -559,9 +559,8 @@ def clone(self):
 class HistoryHiddenStates(_HistoryDataBase):
     """History hidden states.
 
-    Hidden states are stored as int16 numpy arrays (same bit-level storage
-    as HistoryLogits), reinterpreting float16/bfloat16 tensors byte-for-byte.
-    _create_empty_array returns None so that the shape (hidden_dim) is inferred
+    Hidden states are stored as int16 numpy arrays (same bit-level storage as HistoryLogits), reinterpreting
+    float16/bfloat16 tensors byte-for-byte. _create_empty_array returns None so that the shape (hidden_dim) is inferred
     dynamically from the first append call, matching the HistoryLogits pattern.
     """
     ALLOC_SIZE = 64
@@ -572,7 +571,8 @@ def __init__(self, hidden_states: np.ndarray = None, dtype: np.dtype = np.int16)
         self._torch_dtype = None
 
     def _create_empty_array(self, dtype):
-        """Return None; shape is determined on first append (see HistoryLogits)."""
+        """Return None; shape is determined on first append (see
+        HistoryLogits)."""
         return None
 
     def _get_pad_width(self, reserve_size: int):