diff --git a/autotest/interface/pipeline/test_pipeline_sleep_wakeup.py b/autotest/interface/pipeline/test_pipeline_sleep_wakeup.py new file mode 100644 index 0000000000..9cea30bdf4 --- /dev/null +++ b/autotest/interface/pipeline/test_pipeline_sleep_wakeup.py @@ -0,0 +1,441 @@ +from __future__ import annotations + +import inspect +import os +import time +from pathlib import Path + +import pytest +import torch +from utils.config_utils import get_parallel_config +from utils.constant import SLEEP_WAKEUP_BACKENDS, SLEEP_WAKEUP_MODEL_LIST +from utils.sleep_utils import ( + LEVEL2_BASELINE_RUNS, + LEVEL2_GREEDY_MESSAGES, + LEVEL2_MAX_TOKENS, + apply_serialized_hf_segments_for_level2_weights, + apply_serialized_hf_segments_for_turbomind_level2_weights, + assert_assistant_not_degenerate, + assert_chat_decode_unchanged, + level2_update_weights_request_dict, + resolve_hf_checkpoint_dir, +) + +from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline +from lmdeploy.messages import Response +from lmdeploy.serve.openai.protocol import UpdateParamsRequest +from lmdeploy.utils import is_bf16_supported + +_SLEEP_PIPELINE_BACKEND_CLASS = { + 'pytorch': PytorchEngineConfig, + 'turbomind': TurbomindEngineConfig, +} + + +def _pipeline_sleep_backend_classes(): + out: list[type[PytorchEngineConfig] | type[TurbomindEngineConfig]] = [] + for name in SLEEP_WAKEUP_BACKENDS: + cls = _SLEEP_PIPELINE_BACKEND_CLASS.get(name) + if cls is None: + allowed = set(_SLEEP_PIPELINE_BACKEND_CLASS) + raise ValueError( + f'unknown SLEEP_WAKEUP_BACKENDS entry {name!r}; expected one of {allowed}', + ) + out.append(cls) + return out + + +def _force_pipeline_sleep_under_llm_dist() -> bool: + v = os.environ.get('LMDEPLOY_FORCE_PIPELINE_SLEEP', '').strip().lower() + return v in ('1', 'true', 'yes', 'on') + + +@pytest.fixture(scope='module', autouse=True) +def _skip_module_if_rest_runner_gpu_conflict(): + if os.environ.get('LLM_DIST_PORT') and not _force_pipeline_sleep_under_llm_dist(): + pytest.skip( + 'pipeline sleep/wakeup: skipped when LLM_DIST_PORT is set (REST api_server already uses GPUs). 
' + 'Run this file standalone from lmdeploy_sleep root, or set LMDEPLOY_FORCE_PIPELINE_SLEEP=1 ' + 'if you allocated extra GPUs for pytest.') + + +def _pipeline_tp_for_model(config: dict, model: str) -> int: + tp = 1 + for item in get_parallel_config(config, model): + if isinstance(item, dict) and 'tp' in item: + tp = max(tp, int(item['tp'])) + return max(1, tp) + + +def _make_backend_config( + backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig], + config: dict, + model: str, +): + tp = _pipeline_tp_for_model(config, model) + cfg = backend(tp=tp) + if backend is TurbomindEngineConfig: + cfg.empty_init = True + if backend is PytorchEngineConfig and not is_bf16_supported(): + cfg.dtype = 'float16' + return cfg + + +def _model_path(config: dict, model: str) -> str: + if os.environ.get('LMDEPLOY_USE_MODELSCOPE', 'False') == 'True': + return model + return str(Path(config['model_path']) / model) + + +def _open_pipeline(config: dict, model: str, backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig]): + return pipeline( + _model_path(config, model), + backend_config=_make_backend_config(backend, config, model), + ) + + +def _pipeline_resp_to_chat_dict(resp: Response) -> dict: + return { + 'choices': [{ + 'message': {'content': (resp.text or '').strip()}, + 'finish_reason': getattr(resp, 'finish_reason', None), + }], + 'usage': {'completion_tokens': resp.generate_token_len}, + } + + +def _infer_level2_greedy(pipe, gen_cfg: GenerationConfig) -> Response: + prompt = LEVEL2_GREEDY_MESSAGES[0]['content'] + return pipe.infer(prompt, gen_config=gen_cfg) + + +def _assert_level2_pipeline_baseline_stable(pipe, gen_cfg: GenerationConfig, *, label: str) -> Response: + contents: list[str] = [] + refs: list[Response] = [] + for i in range(LEVEL2_BASELINE_RUNS): + out = _infer_level2_greedy(pipe, gen_cfg) + assert_assistant_not_degenerate( + (out.text or '').strip(), label=f'{label} baseline run {i + 1}') + refs.append(out) + contents.append((out.text or '').strip()) + assert len(set(contents)) == 1, ( + f'{label}: greedy pipeline baseline not stable:\n' + + '\n'.join(f' run{j + 1}={c!r}' for j, c in enumerate(contents))) + return refs[0] + + +def _should_enforce_level2_greedy_checks( + backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig]) -> bool: + # Known issue: TurboMind may not be deterministic for temperature=0 runs. + # Keep validating sleep/wakeup/update_params behavior, but do not fail on + # strict greedy-stability checks for this backend. 
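+    # Callers gate only the determinism assertions on this guard, e.g.:
+    #     baseline = None
+    #     if _should_enforce_level2_greedy_checks(backend):
+    #         baseline = _assert_level2_pipeline_baseline_stable(pipe, gen, label=...)
+    #     ...and later compare outputs only when ``baseline`` is not None.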
+ return backend is not TurbomindEngineConfig + + +def _apply_sleep(pipe, level: int = 1) -> None: + eng = pipe.async_engine + out = eng.sleep(level) + if inspect.isawaitable(out): + pipe._run(coro=out).result() + + +def _pipeline_wakeup(pipe, tags: list[str] | None = None) -> None: + pipe.async_engine.wakeup(tags) + + +def _pipeline_is_sleeping(pipe) -> bool: + return bool(pipe.async_engine.is_sleeping) + + +def _ensure_awake_pipeline(pipe, max_attempts: int = 8) -> None: + for _ in range(max_attempts): + _pipeline_wakeup(pipe, None) + if not _pipeline_is_sleeping(pipe): + return + time.sleep(0.25) + raise AssertionError( + f'pipeline engine still is_sleeping=true after {max_attempts} wakeup attempts') + + +def _level2_reload_weights_if_supported_pipeline( + pipe, + backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig], + config: dict, + model: str, +) -> None: + if backend is not PytorchEngineConfig and backend is not TurbomindEngineConfig: + return + if not torch.cuda.is_available(): + pytest.skip('level-2 reload needs CUDA for serialize_state_dict / weight upload') + model_dir = resolve_hf_checkpoint_dir(config, model) + if not model_dir.is_dir(): + pytest.skip(f'HF checkpoint not found for update_weights: {model_dir}') + eng = pipe.async_engine.engine + + def _emit(serialized_data: object, finished: bool) -> None: + eng.update_params(UpdateParamsRequest(**level2_update_weights_request_dict( + serialized_data, finished))) + + try: + if backend is PytorchEngineConfig: + apply_serialized_hf_segments_for_level2_weights(model_dir, _emit) + else: + apply_serialized_hf_segments_for_turbomind_level2_weights(model_dir, _emit) + except FileNotFoundError as e: + pytest.skip(str(e)) + except RuntimeError as e: + pytest.skip(str(e)) + + +@pytest.mark.order(8) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('model', SLEEP_WAKEUP_MODEL_LIST) +@pytest.mark.parametrize('backend', _pipeline_sleep_backend_classes()) +class TestPipelineSleepWakeup: + + def test_pipeline_sleep_wakeup_roundtrip(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_sleep_level1_wakeup_and_infer(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + gen = GenerationConfig(max_new_tokens=32, temperature=0.01) + r = pipe([[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}]], gen_config=gen) + out = r[0] if isinstance(r, list) else r + assert (out.text or '').strip() + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_partial_wakeup_with_tags(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + 
_ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_unknown_tags_noop_then_full(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['not_a_valid_tag']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_mixed_valid_invalid_tags_noop(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights', 'not_a_valid_tag']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['not_a_valid_tag', 'weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_both_tags_one_call(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights', 'kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + gen = GenerationConfig(max_new_tokens=32, temperature=0.01) + r = pipe([[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}]], gen_config=gen) + out = r[0] if isinstance(r, list) else r + assert (out.text or '').strip() + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_redundant_weights_wakeup_noop(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_empty_string_tag_noop(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_full_wakeup_when_already_awake(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + assert _pipeline_is_sleeping(pipe) is False + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_second_sleep_while_sleeping_ok(self, model, backend, 
config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_sleep_level2_staged_wakeup_and_infer(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + gen = GenerationConfig( + max_new_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + do_sample=False, + ) + baseline = None + if _should_enforce_level2_greedy_checks(backend): + baseline_r = _assert_level2_pipeline_baseline_stable(pipe, gen, label='level2 pipeline') + baseline = _pipeline_resp_to_chat_dict(baseline_r) + + _apply_sleep(pipe, 2) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _level2_reload_weights_if_supported_pipeline(pipe, backend, config, model) + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + + after = _infer_level2_greedy(pipe, gen) + assert_assistant_not_degenerate( + (after.text or '').strip(), label='level2 pipeline after staged wakeup (1st infer)') + if baseline is not None: + assert_chat_decode_unchanged(baseline, _pipeline_resp_to_chat_dict(after), + label='level2 pipeline 1st infer after staged wakeup') + + after2 = _infer_level2_greedy(pipe, gen) + if baseline is not None: + assert_chat_decode_unchanged(baseline, _pipeline_resp_to_chat_dict(after2), + label='level2 pipeline 2nd infer after staged wakeup') + + _apply_sleep(pipe, 2) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _level2_reload_weights_if_supported_pipeline(pipe, backend, config, model) + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + + after_full = _infer_level2_greedy(pipe, gen) + if baseline is not None: + assert_chat_decode_unchanged( + baseline, _pipeline_resp_to_chat_dict(after_full), + label='level2 pipeline infer after 2nd sleep cycle (staged wakeup)') + + gen2 = GenerationConfig(max_new_tokens=32, temperature=0.01) + r = pipe([[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}]], gen_config=gen2) + out = r[0] if isinstance(r, list) else r + assert (out.text or '').strip() + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() diff --git a/autotest/interface/restful/test_restful_abort_request.py b/autotest/interface/restful/test_restful_abort_request.py new file mode 100644 index 0000000000..ff3d1b4dd3 --- /dev/null +++ b/autotest/interface/restful/test_restful_abort_request.py @@ -0,0 +1,426 @@ +import json +import random +import threading +import time + +import pytest +import requests +from utils.constant import BACKEND_LIST, DEFAULT_PORT, DEFAULT_SERVER, RESTFUL_MODEL_LIST +from utils.restful_return_check import assert_chat_completions_batch_return + +from lmdeploy.serve.openai.api_client import APIClient + +BASE_URL = f'http://{DEFAULT_SERVER}:{DEFAULT_PORT}' +JSON_HEADERS = {'Content-Type': 'application/json'} +_REQUEST_TIMEOUT = 300 +_ABORT_TIMEOUT = 60 +_SESSION_RETRY = 25 +_SESSION_RETRY_INTERVAL = 0.3 +_NONSTREAM_ABORT_LEAD_S = 2.0 +_THREAD_JOIN_EXTRA_S = 30 
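+# Non-stream worker threads below are joined with a budget of
+# _REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S, so either a successful abort or the
+# request's own HTTP timeout unblocks the join before the test itself hangs.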
+_POST_ABORT_LOGPROBS_NUM = 10 + + +def _post_abort_request(payload: dict) -> requests.Response: + return requests.post( + f'{BASE_URL}/abort_request', + headers=JSON_HEADERS, + json=payload, + timeout=_ABORT_TIMEOUT, + ) + + +def _chat_non_stream( + model_name: str, + session_id: int, + *, + max_tokens: int = 32, + logprobs: bool = False, + top_logprobs: int = _POST_ABORT_LOGPROBS_NUM, +) -> requests.Response: + body: dict = { + 'model': model_name, + 'messages': [{'role': 'user', 'content': 'Say OK in one word.'}], + 'max_tokens': max_tokens, + 'temperature': 0.01, + 'stream': False, + 'session_id': session_id, + } + if logprobs: + body['logprobs'] = True + body['top_logprobs'] = top_logprobs + return requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json=body, + timeout=_REQUEST_TIMEOUT, + ) + + +def _consume_first_nonempty_sse_data_line(resp: requests.Response) -> None: + for raw in resp.iter_lines(decode_unicode=True): + if not raw or not raw.startswith('data:'): + continue + chunk = raw[5:].strip() + if chunk == '[DONE]': + break + if not chunk: + continue + try: + json.loads(chunk) + except json.JSONDecodeError: + continue + return + assert False, 'expected at least one parsable SSE data line before abort' + + +def _post_abort_explicit_session_or_skip(session_id: int) -> None: + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + + +def _post_abort_all_or_skip() -> None: + abort_r = _post_abort_request({'abort_all': True}) + if abort_r.status_code == 501: + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request abort_all: {abort_r.status_code} {abort_r.text!r}' + + +def _assert_session_reusable_after_abort(model_name: str, session_id: int) -> None: + last = None + for _ in range(_SESSION_RETRY): + last = _chat_non_stream( + model_name, + session_id, + max_tokens=16, + logprobs=True, + top_logprobs=_POST_ABORT_LOGPROBS_NUM, + ) + if last.status_code == 200: + data = last.json() + assert 'choices' in data and data['choices'], last.text + assert_chat_completions_batch_return( + data, + model_name, + check_logprobs=True, + logprobs_num=_POST_ABORT_LOGPROBS_NUM, + ) + return + if last.status_code == 400 and 'occupied' in (last.text or '').lower(): + time.sleep(_SESSION_RETRY_INTERVAL) + continue + break + assert False, f'session {session_id} not reusable after abort: last={last.status_code} {last.text!r}' + + +def _long_user_prompt() -> str: + return 'Write a long numbered list from 1 to 500, one number per line, no other text.' 
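+
+
+# The two POST /abort_request payload shapes exercised below (a summary of what
+# this file sends, not a full API description):
+#     {'session_id': <id>, 'abort_all': False}   -> abort one explicit session
+#     {'abort_all': True}                        -> abort every active session
+# A 501 response means the server was started without --enable-abort-handling,
+# in which case these tests skip instead of failing.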
+ + +@pytest.mark.order(9) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('backend', BACKEND_LIST) +@pytest.mark.parametrize('model_case', RESTFUL_MODEL_LIST) +class TestRestfulAbortRequest: + + def test_abort_request_releases_explicit_session_mid_stream(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 8_000_000 + random.randint(0, 99_999) + + stream_payload = { + 'model': model_name, + 'messages': [{'role': 'user', 'content': _long_user_prompt()}], + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + } + resp = requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json=stream_payload, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + + try: + _consume_first_nonempty_sse_data_line(resp) + _post_abort_explicit_session_or_skip(session_id) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_mid_stream_generate(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 7_000_000 + random.randint(0, 99_999) + + stream_payload = { + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + } + resp = requests.post( + f'{BASE_URL}/generate', + headers=JSON_HEADERS, + json=stream_payload, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + + try: + _consume_first_nonempty_sse_data_line(resp) + _post_abort_explicit_session_or_skip(session_id) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_mid_stream_completions(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 6_000_000 + random.randint(0, 99_999) + + stream_payload = { + 'model': model_name, + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + } + resp = requests.post( + f'{BASE_URL}/v1/completions', + headers=JSON_HEADERS, + json=stream_payload, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + + try: + _consume_first_nonempty_sse_data_line(resp) + _post_abort_explicit_session_or_skip(session_id) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_non_stream_chat_thread(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 5_000_000 + random.randint(0, 99_999) + + def worker(out: dict) -> None: + try: + out['resp'] = requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'messages': [{'role': 'user', 'content': _long_user_prompt()}], + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': False, + 'session_id': session_id, + }, + timeout=_REQUEST_TIMEOUT, + ) + except Exception as e: + out['exc'] = e + + holder: dict = {} + t = threading.Thread(target=worker, args=(holder,), daemon=True) + t.start() + time.sleep(_NONSTREAM_ABORT_LEAD_S) + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + pytest.skip('api_server started without 
--enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + assert not t.is_alive(), 'non-stream chat thread should finish after abort' + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_non_stream_generate_thread(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 4_000_000 + random.randint(0, 99_999) + + def worker(out: dict) -> None: + try: + out['resp'] = requests.post( + f'{BASE_URL}/generate', + headers=JSON_HEADERS, + json={ + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': False, + 'session_id': session_id, + }, + timeout=_REQUEST_TIMEOUT, + ) + except Exception as e: + out['exc'] = e + + holder: dict = {} + t = threading.Thread(target=worker, args=(holder,), daemon=True) + t.start() + time.sleep(_NONSTREAM_ABORT_LEAD_S) + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + assert not t.is_alive(), 'non-stream generate thread should finish after abort' + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_non_stream_completions_thread(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 3_000_000 + random.randint(0, 99_999) + + def worker(out: dict) -> None: + try: + out['resp'] = requests.post( + f'{BASE_URL}/v1/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': False, + 'session_id': session_id, + }, + timeout=_REQUEST_TIMEOUT, + ) + except Exception as e: + out['exc'] = e + + holder: dict = {} + t = threading.Thread(target=worker, args=(holder,), daemon=True) + t.start() + time.sleep(_NONSTREAM_ABORT_LEAD_S) + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + assert not t.is_alive(), 'non-stream completions thread should finish after abort' + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_streaming_client_close_releases_session_without_abort_request(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 2_000_000 + random.randint(0, 99_999) + + resp = requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'messages': [{'role': 'user', 'content': _long_user_prompt()}], + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + }, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + try: + _consume_first_nonempty_sse_data_line(resp) + finally: + resp.close() + + 
_assert_session_reusable_after_abort(model_name, session_id) + + def test_streaming_client_close_completions_releases_session(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 1_000_000 + random.randint(0, 99_999) + + resp = requests.post( + f'{BASE_URL}/v1/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + }, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + try: + _consume_first_nonempty_sse_data_line(resp) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_streaming_client_close_generate_releases_session(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 500_000 + random.randint(0, 99_999) + + resp = requests.post( + f'{BASE_URL}/generate', + headers=JSON_HEADERS, + json={ + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + }, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + try: + _consume_first_nonempty_sse_data_line(resp) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + +@pytest.mark.order(10) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('backend', BACKEND_LIST) +@pytest.mark.parametrize('model_case', RESTFUL_MODEL_LIST) +class TestRestfulAbortRequestAbortAll: + def test_abort_request_abort_all_then_chat_ok(self, backend, model_case): + _post_abort_all_or_skip() + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + last = None + for out in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 'user', 'content': 'Reply with one word: OK'}], + max_tokens=16, + temperature=0.01, + stream=False): + last = out + assert last is not None + assert_chat_completions_batch_return(last, model_name) diff --git a/autotest/interface/restful/test_restful_sleep_wakeup.py b/autotest/interface/restful/test_restful_sleep_wakeup.py new file mode 100644 index 0000000000..0f417a603d --- /dev/null +++ b/autotest/interface/restful/test_restful_sleep_wakeup.py @@ -0,0 +1,432 @@ +import time +from pathlib import Path + +import pytest +import requests +import torch +from utils.constant import ( + DEFAULT_PORT, + DEFAULT_SERVER, + SLEEP_WAKEUP_BACKENDS, + SLEEP_WAKEUP_MODEL_LIST, +) +from utils.restful_return_check import assert_chat_completions_batch_return +from utils.sleep_utils import ( + LEVEL2_BASELINE_RUNS, + LEVEL2_GREEDY_MESSAGES, + LEVEL2_MAX_TOKENS, + apply_serialized_hf_segments_for_level2_weights, + apply_serialized_hf_segments_for_turbomind_level2_weights, + assert_assistant_not_degenerate, + assert_chat_decode_unchanged, + assistant_content_from_openai_completion_dict, + level2_update_weights_request_dict, + resolve_hf_checkpoint_dir, +) + +from lmdeploy.serve.openai.api_client import APIClient + +BASE_URL = f'http://{DEFAULT_SERVER}:{DEFAULT_PORT}' +JSON_HEADERS = {'Content-Type': 'application/json'} +_REQUEST_TIMEOUT = 120 +_UPDATE_WEIGHTS_TIMEOUT = 600 + + +def _assert_status_200(resp: requests.Response) -> None: + assert resp.status_code == 200, f'status={resp.status_code} body={resp.text!r}' + + +def _post_sleep(*, level: int | None = None) -> requests.Response: + url = f'{BASE_URL}/sleep' + if level is not None: + url = 
f'{url}?level={level}' + return requests.post(url, headers=JSON_HEADERS, json={}, timeout=_REQUEST_TIMEOUT) + + +def _post_sleep_level2() -> requests.Response: + return requests.post( + f'{BASE_URL}/sleep', + headers=JSON_HEADERS, + json={}, + params=[('tags', 'weights'), ('tags', 'kv_cache'), ('level', 2)], + timeout=_REQUEST_TIMEOUT, + ) + + +def _post_sleep_query_raw(query: str) -> requests.Response: + q = query.lstrip('?') + url = f'{BASE_URL}/sleep?{q}' if q else f'{BASE_URL}/sleep' + return requests.post(url, headers=JSON_HEADERS, json={}, timeout=_REQUEST_TIMEOUT) + + +def _post_wakeup(*, tags: list[str] | None = None) -> requests.Response: + params = [('tags', t) for t in tags] if tags else None + return requests.post( + f'{BASE_URL}/wakeup', + headers=JSON_HEADERS, + json={}, + params=params, + timeout=_REQUEST_TIMEOUT, + ) + + +def _post_update_weights_from_hf_dir(model_dir: Path, *, engine: str) -> None: + def _emit(serialized_data: object, finished: bool) -> None: + data = level2_update_weights_request_dict(serialized_data, finished) + r = requests.post( + f'{BASE_URL}/update_weights', + headers=JSON_HEADERS, + json=data, + timeout=_UPDATE_WEIGHTS_TIMEOUT, + ) + _assert_status_200(r) + + if engine == 'pytorch': + apply_serialized_hf_segments_for_level2_weights(model_dir, _emit) + elif engine == 'turbomind': + apply_serialized_hf_segments_for_turbomind_level2_weights(model_dir, _emit) + else: + pytest.skip(f'unsupported engine for update_weights: {engine!r}') + + +def _level2_reload_hf_weights(backend: str, config: dict, model_case: str) -> None: + if not torch.cuda.is_available(): + pytest.skip('level-2 reload needs CUDA for serialize_state_dict / weight upload') + model_dir = resolve_hf_checkpoint_dir(config, model_case) + if not model_dir.is_dir(): + pytest.skip(f'HF checkpoint not found for update_weights: {model_dir}') + try: + _post_update_weights_from_hf_dir(model_dir, engine=backend) + except FileNotFoundError as e: + pytest.skip(str(e)) + except RuntimeError as e: + pytest.skip(str(e)) + + +def _fetch_is_sleeping() -> bool: + r = requests.get(f'{BASE_URL}/is_sleeping', timeout=30) + _assert_status_200(r) + return bool(r.json().get('is_sleeping')) + + +def _ensure_awake(max_attempts: int = 8) -> None: + for _ in range(max_attempts): + _assert_status_200(_post_wakeup()) + if not _fetch_is_sleeping(): + return + time.sleep(0.25) + raise AssertionError( + f'engine still is_sleeping=true after {max_attempts} POST /wakeup attempts; ' + f'BASE_URL={BASE_URL!r}') + + +def _chat_completion_collect(api_client: APIClient, model_name: str, **kwargs) -> dict: + kw = dict(kwargs) + kw['stream'] = False + output = None + for output in api_client.chat_completions_v1(model=model_name, **kw): + continue + assert output is not None, 'chat_completions_v1 returned no chunk' + return output + + +def _assert_level2_greedy_baseline_stable(api_client: APIClient, model_name: str, *, label: str) -> dict: + kwargs = dict( + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + ) + refs: list[dict] = [] + contents: list[str] = [] + for i in range(LEVEL2_BASELINE_RUNS): + out = _chat_completion_collect(api_client, model_name, **kwargs) + assert_chat_completions_batch_return(out, model_name) + text = assistant_content_from_openai_completion_dict(out) + assert_assistant_not_degenerate(text, label=f'{label} baseline run {i + 1}') + refs.append(out) + contents.append(text) + assert len(set(contents)) == 1, ( + f'{label}: greedy REST baseline not 
stable (fix prompt/model for this case):\n' + + '\n'.join(f' run{j + 1}={c!r}' for j, c in enumerate(contents))) + return refs[0] + + +def _should_enforce_level2_greedy_checks(backend: str) -> bool: + # Known issue: TurboMind may produce non-stable outputs even in + # temperature=0 greedy-style requests. Keep the staged wakeup / reload + # flow coverage, but skip strict determinism assertions for this backend. + return backend != 'turbomind' + + +@pytest.mark.order(8) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('backend', SLEEP_WAKEUP_BACKENDS) +@pytest.mark.parametrize('model_case', SLEEP_WAKEUP_MODEL_LIST) +class TestRestfulSleepWakeup: + + def test_sleep_wakeup_is_sleeping_roundtrip(self, model_case, backend): + try: + _ensure_awake() + r_sleep = _post_sleep() + _assert_status_200(r_sleep) + + assert _fetch_is_sleeping() is True + + r_wake = _post_wakeup() + _assert_status_200(r_wake) + + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_sleep_with_level_query_wakeup_and_chat(self, model_case, backend): + try: + _ensure_awake() + r_sleep = _post_sleep(level=1) + _assert_status_200(r_sleep) + + assert _fetch_is_sleeping() is True + + r_wake = _post_wakeup() + _assert_status_200(r_wake) + assert _fetch_is_sleeping() is False + + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + output = None + for output in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}], + max_tokens=32, + temperature=0.01): + continue + assert output is not None + assert_chat_completions_batch_return(output, model_name) + finally: + _ensure_awake() + + def test_sleep_partial_wakeup_with_tags(self, model_case, backend): + try: + _ensure_awake() + r_sleep = _post_sleep(level=1) + _assert_status_200(r_sleep) + assert _fetch_is_sleeping() is True + + r_w = _post_wakeup(tags=['weights']) + _assert_status_200(r_w) + assert _fetch_is_sleeping() is True + + r_kv = _post_wakeup(tags=['kv_cache']) + _assert_status_200(r_kv) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_unknown_tags_is_noop_then_full_wakeup(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['not_a_valid_tag'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_mixed_valid_and_invalid_tags_entire_call_noop(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights', 'not_a_valid_tag'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['not_a_valid_tag', 'weights'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_both_valid_tags_in_one_request(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights', 'kv_cache'])) + assert _fetch_is_sleeping() is False + + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + output = None + for output in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 
'user', 'content': 'Hi, reply with one short sentence.'}], + max_tokens=32, + temperature=0.01): + continue + assert output is not None + assert_chat_completions_batch_return(output, model_name) + finally: + _ensure_awake() + + def test_wakeup_redundant_tag_after_partial_wake_is_noop(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['kv_cache'])) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_empty_string_tag_is_noop_when_sleeping(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + r = requests.post( + f'{BASE_URL}/wakeup', + headers=JSON_HEADERS, + json={}, + params=[('tags', '')], + timeout=_REQUEST_TIMEOUT, + ) + _assert_status_200(r) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_full_wakeup_when_already_awake(self, model_case, backend): + try: + _ensure_awake() + assert _fetch_is_sleeping() is False + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_sleep_second_call_while_sleeping_still_ok(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_sleep_non_integer_level_is_http_error(self, model_case, backend): + try: + _ensure_awake() + resp = _post_sleep_query_raw('level=not_an_int') + assert resp.status_code != 200, f'expected non-200, got {resp.status_code} body={resp.text!r}' + finally: + _ensure_awake() + + def test_sleep_level_2_full_wakeup_and_chat(self, model_case, backend, config): + try: + _ensure_awake() + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + + baseline = None + if _should_enforce_level2_greedy_checks(backend): + baseline = _assert_level2_greedy_baseline_stable( + api_client, model_name, label='level2 REST') + + _assert_status_200(_post_sleep_level2()) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + _level2_reload_hf_weights(backend, config, model_case) + + _assert_status_200(_post_wakeup(tags=['kv_cache'])) + assert _fetch_is_sleeping() is False + + after = _chat_completion_collect( + api_client, + model_name, + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + ) + assert_chat_completions_batch_return(after, model_name) + assert_assistant_not_degenerate( + assistant_content_from_openai_completion_dict(after), + label='level2 REST after staged wakeup (1st chat)') + if baseline is not None: + assert_chat_decode_unchanged(baseline, after, label='level2 REST 1st infer after staged wakeup') + + after2 = _chat_completion_collect( + api_client, + model_name, + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, 
+ top_p=1.0, + top_k=1, + ) + assert_chat_completions_batch_return(after2, model_name) + if baseline is not None: + assert_chat_decode_unchanged(baseline, after2, label='level2 REST 2nd infer after staged wakeup') + + _assert_status_200(_post_sleep_level2()) + assert _fetch_is_sleeping() is True + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + _level2_reload_hf_weights(backend, config, model_case) + _assert_status_200(_post_wakeup(tags=['kv_cache'])) + assert _fetch_is_sleeping() is False + + after_full = _chat_completion_collect( + api_client, + model_name, + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + ) + assert_chat_completions_batch_return(after_full, model_name) + label2 = 'level2 REST infer after 2nd sleep cycle (staged wakeup)' + if baseline is not None: + assert_chat_decode_unchanged(baseline, after_full, label=label2) + + output = None + for output in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}], + max_tokens=32, + temperature=0.01): + continue + assert output is not None + assert_chat_completions_batch_return(output, model_name) + finally: + _ensure_awake() diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 2ac134f440..88c9c2c2d2 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -1,12 +1,14 @@ import json +import os +from typing import Any -import fire -import numpy as np -from PIL import Image +import fire # noqa: E402 +import numpy as np # noqa: E402 +from PIL import Image # noqa: E402 -from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline -from lmdeploy.vl import encode_image_base64, load_image -from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline # noqa: E402 +from lmdeploy.vl import encode_image_base64, load_image, load_video # noqa: E402 +from lmdeploy.vl.constants import IMAGE_TOKEN # noqa: E402 gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) @@ -18,6 +20,35 @@ PIC_PANDA = 'panda.jpg' DESC = 'What are the similarities and differences between these two images.' DESC_ZH = '两张图有什么相同和不同的地方.' +_MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{ + "question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?", + "options": [ + "A. 4.", + "B. 9.", + "C. 5.", + "D. 13." + ] +}""" +MM_DEMO_TOMB_USER_PROMPT = ( + 'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). ' + 'Watch the entire video, pick the best option from what you see, then reply briefly with the letter ' + '(A, B, C, or D) first and at most one short sentence. 
Do not output long step-by-step reasoning; ' + 'keep the final reply concise.\n\n' + _MM_DEMO_TOMB_MCQ_JSON_BLOCK) + +DEFAULT_VIDEO_FILENAME = 'red-panda.mp4' +VIDEO_QWEN3_DEMO_FILENAME = 'N1cdUjctpG8.mp4' + + +def _numpy_video_to_pil_list(frames: np.ndarray) -> list[Image.Image]: + images: list[Image.Image] = [] + for i in range(int(frames.shape[0])): + images.append(Image.fromarray(frames[i].astype('uint8')).convert('RGB')) + return images + + +def load_video_sampled_pil(video_path: str, num_frames: int, **kwargs: Any) -> tuple[list[Image.Image], dict[str, Any]]: + frames, meta = load_video(video_path, num_frames=num_frames, **kwargs) + return _numpy_video_to_pil_list(frames), meta def run_pipeline_mllm_test(model_path, run_config, resource_path, is_pr_test: bool = False): @@ -169,44 +200,9 @@ def internvl_vl_testcase(pipe, resource_path, lang='en'): print(f'[caseresult internvl-separate-images2-{lang} start]' + json.dumps(response.text, ensure_ascii=False) + f'[caseresult internvl-separate-images2-{lang} end]\n') - # video multi-round conversation - def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): - if bound: - start, end = bound[0], bound[1] - else: - start, end = -100000, 100000 - start_idx = max(first_idx, round(start * fps)) - end_idx = min(round(end * fps), max_frame) - seg_size = float(end_idx - start_idx) / num_segments - frame_indices = np.array( - [int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]) - return frame_indices - - def load_video(video_path, bound=None, num_segments=32): - import cv2 - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise ValueError(f'Cannot open video file: {video_path}') - - max_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1 - fps = cap.get(cv2.CAP_PROP_FPS) - - frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) - imgs = [] - - for frame_index in frame_indices: - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index) - ret, frame = cap.read() - if ret: - rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - img = Image.fromarray(rgb_frame).convert('RGB') - imgs.append(img) - - cap.release() - return imgs - - video_path = resource_path + '/red-panda.mp4' - imgs = load_video(video_path, num_segments=8) + # video multi-round conversation (uniform ``num_frames`` via lmdeploy.vl.load_video) + video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}' + imgs, _ = load_video_sampled_pil(video_path, num_frames=8) question = '' for i in range(len(imgs)): @@ -287,43 +283,11 @@ def MiniCPM_vl_testcase(pipe, resource_path): print('[caseresult minicpm-fewshot start]' + json.dumps(response.text, ensure_ascii=False) + '[caseresult minicpm-fewshot end]\n') - # Chat with video - MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number - - def encode_video(video_path): - - def uniform_sample(length, n): - gap = len(length) / n - idxs = [int(i * gap + gap / 2) for i in range(n)] - return [length[i] for i in idxs] - - import cv2 - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise ValueError(f'Cannot open video file: {video_path}') - - fps = cap.get(cv2.CAP_PROP_FPS) - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - - sample_fps = round(fps / 1) # FPS - frame_idx = [i for i in range(0, total_frames, sample_fps)] - if len(frame_idx) > MAX_NUM_FRAMES: - frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) - - frames = [] - for idx in frame_idx: - cap.set(cv2.CAP_PROP_POS_FRAMES, idx) - ret, frame = cap.read() - if ret: - rgb_frame = 
cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frames.append(Image.fromarray(rgb_frame.astype('uint8')).convert('RGB')) - - cap.release() - print('num frames:', len(frames)) - return frames - - video_path = resource_path + '/red-panda.mp4' - frames = encode_video(video_path) + # Chat with video (fixed frame budget; same decoder as REST ``video_url``) + max_video_frames = 32 + video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}' + frames, video_meta = load_video_sampled_pil(video_path, num_frames=max_video_frames) + print('num frames:', len(frames), 'meta:', video_meta.get('frames_indices')) question = 'What animals are in the video, and what are they doing?' content = [dict(type='text', text=question)] @@ -386,6 +350,97 @@ def Qwen_vl_testcase(pipe, resource_path): print('[caseresult qwen-performance-images2 start]' + json.dumps(response.text, ensure_ascii=False) + '[caseresult qwen-performance-images2 end]\n') + # Qwen2.5/3-VL: native ``video`` + same knobs as REST ``extra_body`` (top_k / mm_processor_kwargs). + demo_path = os.path.join(resource_path, VIDEO_QWEN3_DEMO_FILENAME) + if not os.path.isfile(demo_path): + print('[caseresult qwen3-demo-video start]' + + json.dumps('SKIPPED_NO_DEMO_MP4', ensure_ascii=False) + '[caseresult qwen3-demo-video end]\n') + else: + try: + frames, vmeta = load_video(demo_path, num_frames=16, fps=2) + demo_q = MM_DEMO_TOMB_USER_PROMPT + vmsg = [{ + 'role': + 'user', + 'content': [ + { + 'type': 'video', + 'data': frames, + 'video_metadata': vmeta, + }, + { + 'type': 'text', + 'text': demo_q, + }, + ], + }] + mm_gen_config = GenerationConfig( + max_new_tokens=24576, + min_new_tokens=10, + top_k=20, + temperature=0.3, + top_p=0.95, + ) + response = pipe( + vmsg, + gen_config=mm_gen_config, + log_level='INFO', + max_log_len=10, + mm_processor_kwargs={ + 'fps': 2, + 'do_sample_frames': True, + }, + ) + print('[caseresult qwen3-demo-video start]' + json.dumps(response.text, ensure_ascii=False) + + '[caseresult qwen3-demo-video end]\n') + except Exception as exc: + err = json.dumps(f'PIPELINE_VIDEO_ERROR:{exc!s}', ensure_ascii=False) + print('[caseresult qwen3-demo-video start]' + err + '[caseresult qwen3-demo-video end]\n') + + rp_video = os.path.join(resource_path, DEFAULT_VIDEO_FILENAME) + if not os.path.isfile(rp_video): + print('[caseresult qwen-mixed-image-text-video start]' + + json.dumps('SKIPPED_NO_RED_PANDA_MP4', ensure_ascii=False) + + '[caseresult qwen-mixed-image-text-video end]\n') + else: + try: + frames_pil, _vmeta_m = load_video_sampled_pil(rp_video, num_frames=6, fps=1) + mixed_content = [ + { + 'type': + 'text', + 'text': ( + 'You are given one still image, then several frames from a short video in order. 
' + 'In 2-4 sentences: name one thing in the still image, and what animal or activity ' + 'you see in the video frames.'), + }, + { + 'type': 'image_url', + 'image_url': { + 'url': f'{resource_path}/{PIC1}', + }, + }, + ] + for frame in frames_pil: + mixed_content.append( + dict( + type='image_url', + image_url=dict(url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'), + )) + mixed_msg = [{'role': 'user', 'content': mixed_content}] + response = pipe( + mixed_msg, + gen_config=gen_config, + log_level='INFO', + max_log_len=10, + ) + print('[caseresult qwen-mixed-image-text-video start]' + + json.dumps(response.text, ensure_ascii=False) + '[caseresult qwen-mixed-image-text-video end]\n') + except Exception as exc: + err = json.dumps(f'PIPELINE_MIXED_MM_ERROR:{exc!s}', ensure_ascii=False) + print('[caseresult qwen-mixed-image-text-video start]' + err + + '[caseresult qwen-mixed-image-text-video end]\n') + if __name__ == '__main__': fire.Fire() diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index bc3ebb0ad5..85a2ea9d84 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -201,6 +201,14 @@ } } +SLEEP_WAKEUP_MODEL_LIST = [ + 'Qwen/Qwen3.5-35B-A3B', + 'Qwen/Qwen3.5-35B-A3B-FP8', + 'Qwen/Qwen3.5-122B-A10B', +] + +SLEEP_WAKEUP_BACKENDS = ['pytorch', 'turbomind'] + BACKEND_LIST = ['turbomind', 'pytorch'] RESTFUL_MODEL_LIST_LATEST = [ @@ -260,3 +268,19 @@ 'cache-max-entry-count': 0.7 } } + +# Qwen3-VL tomb demo (REST ``mm_processor`` + pipeline video): MCQ JSON without a labelled answer field. +MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{ + "question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?", + "options": [ + "A. 4.", + "B. 9.", + "C. 5.", + "D. 13." + ] +}""" +MM_DEMO_TOMB_USER_PROMPT = ( + 'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). ' + 'Watch the entire video, pick the best option from what you see, then reply briefly with the letter ' + '(A, B, C, or D) first and at most one short sentence. 
Do not output long step-by-step reasoning; ' + 'keep the final reply concise.\n\n' + MM_DEMO_TOMB_MCQ_JSON_BLOCK) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 1240227af4..470dca1f3d 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -8,6 +8,7 @@ from utils.common_utils import execute_command_with_logging from utils.config_utils import get_case_str_by_config, get_cuda_prefix_by_workerid, get_workerid, resolve_extra_params from utils.rule_condition_assert import assert_result +from utils.run_restful_chat import _mm_demo_thinking_wrapper_shape_assert, _mm_demo_tomb_answer_assert def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str = '', is_smoke: bool = False): @@ -337,6 +338,51 @@ def Qwen_vl_testcase(output_text, file): file.writelines(f'qwen-performance-images2 result: {case_result}, reason: buildings should in {response} \n') with assume: assert case_result, f'reason: performance images2: buildings should in {response}' + with allure.step('qwen3-demo-video'): + response = get_response_from_output(output_text, 'qwen3-demo-video') + rl = response.lower() + if 'skipped_no_demo_mp4' in rl: + file.writelines('qwen3-demo-video result: skipped (N1cdUjctpG8.mp4 not in resource_path)\n') + elif 'pipeline_video_error:' in rl: + file.writelines(f'qwen3-demo-video result: false, pipeline video error in {response} \n') + with assume: + assert False, f'qwen3-demo-video pipeline error: {response}' + else: + tomb_assert = _mm_demo_tomb_answer_assert(response) + shape_assert = _mm_demo_thinking_wrapper_shape_assert(response) + case_result = tomb_assert and shape_assert + reason = 'tomb/jar + bounded public tail' + file.writelines(f'qwen3-demo-video result: {case_result}, reason: {reason}: {response} \n') + with assume: + msg = 'reason: qwen3 demo video: expected tomb/jar-related bounded answer' + assert case_result, f'{msg}: {response}' + if '[caseresult qwen-mixed-image-text-video start]' in output_text: + with allure.step('qwen-mixed-image-text-video'): + response = get_response_from_output(output_text, 'qwen-mixed-image-text-video') + rl = response.lower() + if 'skipped_no_red_panda_mp4' in rl: + file.writelines( + 'qwen-mixed-image-text-video result: skipped (red-panda.mp4 not in resource_path)\n') + elif 'pipeline_mixed_mm_error:' in rl: + file.writelines(f'qwen-mixed-image-text-video result: false, mixed mm error in {response} \n') + with assume: + assert False, f'qwen-mixed-image-text-video pipeline error: {response}' + else: + img = ( + any(w in rl for w in ('tiger', 'ski')) + or '虎' in response + or '滑雪' in response + ) + vid = ( + any(w in rl for w in ('panda', 'red panda', 'lesser panda', 'ailurus')) + or any(w in response for w in ('小熊猫', '红熊猫')) + ) + case_result = bool(response.strip()) and img and vid + file.writelines( + f'qwen-mixed-image-text-video result: {case_result}, reason: image+tiger + video+panda cues\n') + with assume: + msg = 'reason: mixed image+video reply should mention tiger/ski and panda' + assert case_result, f'{msg}: {response}' def save_pipeline_common_log(config, log_name, result, content, msg: str = '', write_type: str = 'w'): diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py index b425b809da..537e5acac6 100644 --- a/autotest/utils/restful_return_check.py +++ b/autotest/utils/restful_return_check.py @@ -14,8 +14,15 @@ def assert_chat_completions_batch_return(output, model_name, check_logprobs: boo assert 
len(message.get('message').get('content')) > 0
        assert message.get('message').get('role') == 'assistant'
        if check_logprobs:
-            len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
-            for logprob in message.get('logprobs').get('content'):
+            lp = message.get('logprobs')
+            assert lp is not None, output
+            content_lp = lp.get('content')
+            assert content_lp is not None, output
+            n_tok = output.get('usage', {}).get('completion_tokens')
+            assert len(content_lp) == n_tok, (
+                f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
+            )
+            for logprob in content_lp:
                 assert_logprobs(logprob, logprobs_num)
@@ -31,8 +38,15 @@ def assert_completions_batch_return(output, model_name, check_logprobs: bool = F
         assert message.get('index') == 0
         assert len(message.get('text')) > 0
         if check_logprobs:
-            len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
-            for logprob in message.get('logprobs').get('content'):
+            lp = message.get('logprobs')
+            assert lp is not None, output
+            content_lp = lp.get('content')
+            assert content_lp is not None, output
+            n_tok = output.get('usage', {}).get('completion_tokens')
+            assert len(content_lp) == n_tok, (
+                f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
+            )
+            for logprob in content_lp:
                 assert_logprobs(logprob, logprobs_num)
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py
index 6e186c9328..de1e6d2689 100644
--- a/autotest/utils/run_restful_chat.py
+++ b/autotest/utils/run_restful_chat.py
@@ -1,12 +1,13 @@
 import json
 import os
+import re
 import subprocess
 import time
 
 import allure
 import psutil
 import requests
-from openai import OpenAI
+from openai import APIStatusError, BadRequestError, OpenAI
 from pytest_assume.plugin import assume
 from utils.config_utils import (
     get_case_str_by_config,
@@ -15,7 +16,7 @@
     get_workerid,
     resolve_extra_params,
 )
-from utils.constant import DEFAULT_PORT, DEFAULT_SERVER
+from utils.constant import DEFAULT_PORT, DEFAULT_SERVER, MM_DEMO_TOMB_USER_PROMPT
 from utils.restful_return_check import assert_chat_completions_batch_return
 from utils.rule_condition_assert import assert_result
@@ -244,6 +245,143 @@ def _run_logprobs_test(port: int = DEFAULT_PORT):
 PIC = 'tiger.jpeg'  # noqa E501
 PIC2 = 'human-pose.jpg'  # noqa E501
+VIDEO = 'red-panda.mp4'  # noqa E501
+VIDEO_QWEN3_DEMO = 'N1cdUjctpG8.mp4'  # noqa E501
+MM_DEMO_MAX_TOKENS = 24576
+MM_DEMO_MAX_TOKENS_STREAM = 24576
+VIDEO_SINGLE_FRAME_MAX_TOKENS = 512
+VIDEO_REDPANDA_STREAM_MAX_TOKENS = 2048
+
+
+def _vl_video_stream_finish_assert(finish: str | None, text: str) -> bool:
+    """Red-panda video must finish with ``stop`` or ``length`` and name the
+    species; a ``length`` finish additionally needs enough text."""
+    if finish not in ('stop', 'length'):
+        return False
+    t = (text or '').lower()
+    raw = text or ''
+    species_match = (
+        any(p in t for p in ('red panda', 'lesser panda'))
+        or 'ailurus' in t
+        or any(s in raw for s in ('小熊猫', '红熊猫'))
+    )
+    if not species_match:
+        return False
+    if finish == 'length':
+        return len(raw.strip()) >= 300
+    return True
+
+
+def _vl_openai_http_error_skippable(exc: BaseException) -> bool:
+    if isinstance(exc, BadRequestError):
+        return True
+    if isinstance(exc, APIStatusError):
+        code = getattr(exc, 'status_code', None)
+        return isinstance(code, int) and code < 500
+    return False
+
+
+# Closing tag of the model's thinking wrapper. '</think>' is assumed here
+# (Qwen-style reasoning block); adjust if the served model uses another tag.
+_REDACTED_THINKING_END = '</think>'
+
+
+def _mm_demo_public_answer_text(text: str) -> str:
+    """Optional JSON-string decode (pipeline logs); then tail after
+    ``</think>`` when present."""
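+    # Illustrative example (assuming the '</think>' wrapper above): a pipeline
+    # log line '"<think>why B</think>B. 9."' JSON-decodes to
+    # '<think>why B</think>B. 9.', and the visible tail returned here is 'B. 9.'.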
+
+
+_REDACTED_THINKING_END = '</think>'
+
+
+def _mm_demo_public_answer_text(text: str) -> str:
+    """Optionally decode a JSON string (pipeline logs), then return the tail
+    after ``</think>`` when present."""
+    s = (text or '').strip()
+    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
+        try:
+            s = str(json.loads(s))
+        except (json.JSONDecodeError, TypeError, ValueError):
+            pass
+    s = s.strip()
+    key = _REDACTED_THINKING_END
+    i = s.lower().rfind(key.lower())
+    if i == -1:
+        return s
+    return s[i + len(key):].strip()
+
+
+def _mm_demo_tomb_answer_assert(text: str) -> bool:
+    """Tomb/MCQ: the visible tail mentions the scene, a digit, or an
+    MCQ-style letter (A–D)."""
+    raw = _mm_demo_public_answer_text(text).strip()
+    if not raw:
+        return False
+    rl = raw.lower()
+    if any(w in rl for w in ('jar', 'porcelain', 'tomb', 'niche', 'chamber', '罐', '瓷', '墓', '龛')):
+        return True
+    if any(c.isdigit() for c in raw):
+        return True
+    s = raw.strip()
+    if re.search(r'(?i)\b(?:answer|choice|option|correct)\b\s*[::]?\s*[abcd]\b', s):
+        return True
+    if re.fullmatch(r'(?is)[`"\(\[]*[abcd][`"\)\]]*\.?\s*', s):
+        return True
+    if len(s) <= 120 and re.match(r'(?is)[`"\(\[]*[abcd][`"\)\]]*[\s\.\):,\-]', s):
+        return True
+    return False
+
+
+def _mm_demo_thinking_wrapper_shape_assert(text: str) -> bool:
+    """Bound the user-visible tail after ``</think>``, or the total size if
+    the wrapper never closes."""
+    s = (text or '').strip()
+    if not s:
+        return False
+    if _REDACTED_THINKING_END.lower() in s.lower():
+        public = _mm_demo_public_answer_text(s).strip()
+        return 0 < len(public) <= 2000
+    return len(s) <= 3200
+
+
+def _mm_demo_tomb_run_assert(finish: str | None, text: str) -> bool:
+    """Tomb + ``mm_processor`` runs: ``stop`` needs the thinking-wrapper
+    shape; ``length`` needs a closed thinking wrapper plus shape, or a long
+    jar/scene tail."""
+    t = (text or '').strip()
+    if not t or not _mm_demo_tomb_answer_assert(t):
+        return False
+    if finish == 'stop':
+        return _mm_demo_thinking_wrapper_shape_assert(t)
+    if finish == 'length':
+        if _REDACTED_THINKING_END.lower() in t.lower():
+            return _mm_demo_thinking_wrapper_shape_assert(t)
+        if len(t) < 1500:
+            return False
+        head_l = t[:8000].lower()
+        if 'jar' not in head_l:
+            return False
+        return any(w in head_l for w in ('niche', 'chamber', 'tomb', 'porcelain', 'primary', '罐', '墓', '龛', '瓷'))
+    return False
+
+
+def _mm_demo_single_frame_scene_assert(text: str) -> bool:
+    """Single-frame: a short visible tail plus chamber / niche / vessel hints."""
+    raw = _mm_demo_public_answer_text(text).strip()
+    if not raw or len(raw) < 20:
+        return False
+    if sum(1 for c in raw if c.isalpha()) < 12:
+        return False
+    rl = raw.lower()
+    if any(w in rl for w in ('chamber', 'niche', 'jar', 'porcelain', 'artifact', 'coffin', '墓室', '龛', '罐')):
+        return True
+    return False
+
+
+def _consume_chat_completion_stream(stream_iter) -> tuple[str | None, str]:
+    """Drain a chat-completion stream: ``(finish_reason, joined delta content)``."""
+    chunks: list[str] = []
+    last_fr: str | None = None
+    for ev in stream_iter:
+        if not getattr(ev, 'choices', None):
+            continue
+        choice = ev.choices[0]
+        fr = getattr(choice, 'finish_reason', None)
+        if fr:
+            last_fr = fr
+        delta = getattr(choice, 'delta', None)
+        if delta and getattr(delta, 'content', None):
+            chunks.append(delta.content)
+    return last_fr, ''.join(chunks)
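+
+
+# Editor's sketch: how the </think> helpers above compose on a hypothetical
+# doubly-encoded answer, as it may appear in pipeline logs. Not part of the
+# test flow.
+def _sketch_public_answer_examples() -> None:
+    wrapped = '"<think>hidden reasoning about the frames</think> B. porcelain jars"'
+    assert _mm_demo_public_answer_text(wrapped) == 'B. porcelain jars'
+    assert _mm_demo_tomb_answer_assert(wrapped)
+    assert _mm_demo_thinking_wrapper_shape_assert(wrapped)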
 
 
 def run_vl_testcase(log_path, resource_path, port: int = DEFAULT_PORT):
@@ -289,6 +427,355 @@ def run_vl_testcase(log_path, resource_path, port: int = DEFAULT_PORT):
     for item in api_client.chat_completions_v1(model=model_name, messages=prompt_messages):
         continue
     file.writelines(str(item) + '\n')
+
+    video_path = os.path.join(resource_path, VIDEO)
+    video_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': ('What animal appears in the clip? Give the common species name in one or two '
+                         'short sentences (avoid long step-by-step reasoning).'),
+            },
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_path,
+                },
+            },
+        ],
+    }]
+    video_messages_one_frame = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': ('The server decodes this clip to a single video frame only. What animal appears? '
+                         'Answer in one or two short sentences.'),
+            },
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_path,
+                },
+            },
+        ],
+    }]
+
+    if not os.path.isfile(video_path):
+        file.writelines(f'[video testcase skipped] missing file: {video_path}\n')
+    else:
+        try:
+            v_resp = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.2,
+                max_tokens=512,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 8,
+                        },
+                    },
+                },
+            )
+        except (BadRequestError, APIStatusError) as exc:
+            if not _vl_openai_http_error_skippable(exc):
+                raise
+            file.writelines(f'[video testcase skipped] model/server rejected video_url: {exc!r}\n')
+        else:
+            file.writelines('[video non-stream] ' + str(v_resp).lower() + '\n')
+            content = (v_resp.choices[0].message.content or '')
+            assert _vl_video_stream_finish_assert(getattr(v_resp.choices[0], 'finish_reason', None), content), v_resp
+
+            v_more = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.0,
+                max_tokens=1,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 16,
+                        },
+                    },
+                },
+            )
+            v_few = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.0,
+                max_tokens=1,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 4,
+                        },
+                    },
+                },
+            )
+            u_more = getattr(v_more, 'usage', None)
+            u_few = getattr(v_few, 'usage', None)
+            if u_more and u_few and getattr(u_few, 'prompt_tokens', None) and getattr(u_more, 'prompt_tokens', None):
+                if u_few.prompt_tokens < u_more.prompt_tokens:
+                    file.writelines('[video] fewer frames => fewer prompt_tokens (as expected)\n')
+                else:
+                    few_t, many_t = u_few.prompt_tokens, u_more.prompt_tokens
+                    file.writelines(
+                        f'[video] prompt_tokens not compared (few={few_t}, many={many_t})\n',
+                    )
+
+            stream = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.2,
+                max_tokens=VIDEO_REDPANDA_STREAM_MAX_TOKENS,
+                stream=True,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 8,
+                        },
+                    },
+                },
+            )
+            stream_fr, joined = _consume_chat_completion_stream(stream)
+            file.writelines('[video stream] ' + joined.lower() + '\n')
+            assert _vl_video_stream_finish_assert(stream_fr, joined), (stream_fr, joined[:1200])
+
+            video_payload = {
+                'model': model_name,
+                'messages': video_messages,
+                'temperature': 0.2,
+                'max_tokens': VIDEO_REDPANDA_STREAM_MAX_TOKENS,
+                'media_io_kwargs': {
+                    'video': {
+                        'num_frames': 8,
+                    },
+                },
+            }
+            raw = requests.post(f'{http_url}/v1/chat/completions',
+                                headers={'content-type': 'application/json'},
+                                json=video_payload,
+                                timeout=600)
+            file.writelines(f'[video raw http] status={raw.status_code}\n')
+            assert raw.ok, raw.text
+            raw_json = raw.json()
+            raw_ch0 = (raw_json.get('choices') or [{}])[0]
+            raw_text = raw_ch0.get('message', {}).get('content') or ''
+            file.writelines(raw_text.lower() + '\n')
+            raw_fr = raw_ch0.get('finish_reason')
+            assert _vl_video_stream_finish_assert(raw_fr, raw_text), (raw_fr, raw_text[:1200], raw_json)
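+            # Editor's note: with the OpenAI SDK, lmdeploy-specific fields such
+            # as media_io_kwargs must ride in extra_body, while in the raw HTTP
+            # payload above they sit at the top level of the JSON body. Record
+            # the fields the assertion just consumed:
+            file.writelines(f'[video raw http] finish={raw_fr!r} content_len={len(raw_text)}\n')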
+
+            v_one = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages_one_frame,
+                temperature=0.2,
+                max_tokens=VIDEO_SINGLE_FRAME_MAX_TOKENS,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 1,
+                        },
+                    },
+                },
+            )
+            file.writelines('[video single-frame] ' + str(v_one).lower() + '\n')
+            one_content = (v_one.choices[0].message.content or '')
+            assert _vl_video_stream_finish_assert(getattr(v_one.choices[0], 'finish_reason', None), one_content), v_one
+
+    # Qwen3-VL style: local demo mp4 + mm_processor_kwargs (fps / do_sample_frames), OpenAI-compatible body.
+    demo_video_path = os.path.join(resource_path, VIDEO_QWEN3_DEMO)
+    demo_question = MM_DEMO_TOMB_USER_PROMPT
+    # Single-frame sampling often lands on an aerial or intro shot, not the jar niche scene.
+    mm_one_question = (
+        'This is one frame from a short news-style clip about an ancient tomb. '
+        'If you see interior details, focus on chamber, niches, pottery or porcelain jars, '
+        'coffin, or furnishings, in one or two short sentences. '
+        'If the frame is only exterior or aerial, say that in one short sentence. '
+        'No long step-by-step reasoning.')
+    mm_demo_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': demo_video_path,
+                },
+            },
+            {
+                'type': 'text',
+                'text': demo_question,
+            },
+        ],
+    }]
+    mm_one_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': demo_video_path,
+                },
+            },
+            {
+                'type': 'text',
+                'text': mm_one_question,
+            },
+        ],
+    }]
+    if not os.path.isfile(demo_video_path):
+        file.writelines(f'[video mm_processor demo skipped] missing file: {demo_video_path}\n')
+    else:
+        try:
+            mm_resp = client.chat.completions.create(
+                model=model_name,
+                messages=mm_demo_messages,
+                max_tokens=MM_DEMO_MAX_TOKENS,
+                temperature=0.3,
+                top_p=0.95,
+                extra_body={
+                    'top_k': 20,
+                    'mm_processor_kwargs': {
+                        'fps': 2,
+                        'do_sample_frames': True,
+                    },
+                },
+            )
+        except (BadRequestError, APIStatusError) as exc:
+            if not _vl_openai_http_error_skippable(exc):
+                raise
+            file.writelines(f'[video mm_processor demo skipped] {exc!r}\n')
+        else:
+            file.writelines('[video mm_processor non-stream] ' + str(mm_resp).lower() + '\n')
+            mm_text = (mm_resp.choices[0].message.content or '').strip()
+            mm_fr = getattr(mm_resp.choices[0], 'finish_reason', None)
+            assert _mm_demo_tomb_run_assert(mm_fr, mm_text), (mm_fr, mm_text[:2000])
+
+            mm_stream = client.chat.completions.create(
+                model=model_name,
+                messages=mm_demo_messages,
+                max_tokens=MM_DEMO_MAX_TOKENS_STREAM,
+                temperature=0.2,
+                stream=True,
+                extra_body={
+                    'top_k': 20,
+                    'mm_processor_kwargs': {
+                        'fps': 2,
+                        'do_sample_frames': True,
+                    },
+                },
+            )
+            mm_finish, mm_joined = _consume_chat_completion_stream(mm_stream)
+            mm_joined = mm_joined.strip()
+            file.writelines('[video mm_processor stream] ' + mm_joined.lower() + '\n')
+            assert _mm_demo_tomb_run_assert(mm_finish, mm_joined), (mm_finish, mm_joined[:2000])
+
+            mm_raw_payload = {
+                'model': model_name,
+                'messages': mm_demo_messages,
+                'temperature': 0.3,
+                'max_tokens': MM_DEMO_MAX_TOKENS,
+                'top_k': 20,
+                'mm_processor_kwargs': {
+                    'fps': 2,
+                    'do_sample_frames': True,
+                },
+            }
+            mm_raw = requests.post(f'{http_url}/v1/chat/completions',
+                                   headers={'content-type': 'application/json'},
+                                   json=mm_raw_payload,
+                                   timeout=600)
+            file.writelines(f'[video mm_processor raw http] status={mm_raw.status_code}\n')
+            assert mm_raw.ok, mm_raw.text
+            mm_raw_json = mm_raw.json()
+            mm_raw_choice0 = (mm_raw_json.get('choices') or [{}])[0]
+            mm_raw_text = mm_raw_choice0.get('message', {}).get('content') or ''
+            file.writelines(mm_raw_text.lower() + '\n')
+            mm_raw_fr = mm_raw_choice0.get('finish_reason')
+            assert _mm_demo_tomb_run_assert(mm_raw_fr, mm_raw_text), (mm_raw_fr, mm_raw_text[:2000], mm_raw_json)
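+            # Editor's note (hedged): mm_processor_kwargs (fps /
+            # do_sample_frames) appear to be forwarded to the HF multimodal
+            # processor, while media_io_kwargs (num_frames) steers server-side
+            # video decoding; the two knobs are independent, which is why this
+            # demo exercises both request shapes.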
+
+            mm_one = client.chat.completions.create(
+                model=model_name,
+                messages=mm_one_messages,
+                max_tokens=2048,
+                temperature=0.3,
+                top_p=0.95,
+                extra_body={
+                    'top_k': 20,
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 1,
+                        },
+                    },
+                },
+            )
+            file.writelines('[video mm_processor single-frame] ' + str(mm_one).lower() + '\n')
+            assert getattr(mm_one.choices[0], 'finish_reason', None) == 'stop', mm_one
+            mm_one_text = (mm_one.choices[0].message.content or '').strip()
+            assert mm_one_text and _mm_demo_single_frame_scene_assert(mm_one_text), mm_one_text
+            assert _mm_demo_thinking_wrapper_shape_assert(mm_one_text), mm_one_text
+
+    mixed_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type':
+                'text',
+                'text': (
+                    'You receive one still image and one video clip in this message. In 2-4 short sentences: '
+                    '(1) name one clear subject from the image; '
+                    '(2) name the animal or main scene in the video.'),
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'{resource_path}/{PIC}',
+                },
+            },
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_path,
+                },
+            },
+        ],
+    }]
+    if not os.path.isfile(video_path):
+        file.writelines('[mixed image+text+video skipped] missing video file (same as video testcase)\n')
+    else:
+        try:
+            mix_resp = client.chat.completions.create(
+                model=model_name,
+                messages=mixed_messages,
+                temperature=0.3,
+                max_tokens=512,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 6,
+                        },
+                    },
+                },
+            )
+        except (BadRequestError, APIStatusError) as exc:
+            if not _vl_openai_http_error_skippable(exc):
+                raise
+            file.writelines(f'[mixed image+text+video skipped] server rejected: {exc!r}\n')
+        else:
+            file.writelines('[mixed image+text+video] ' + str(mix_resp).lower() + '\n')
+            mix_content = (mix_resp.choices[0].message.content or '').strip()
+            assert mix_content, mix_resp
+            assert ('tiger' in mix_content.lower() or '虎' in mix_content or 'ski' in mix_content.lower()
+                    or '滑雪' in mix_content), mix_resp
+            assert _vl_video_stream_finish_assert(
+                getattr(mix_resp.choices[0], 'finish_reason', None), mix_content), mix_resp
+
     file.close()
 
     allure.attach.file(restful_log, name=restful_log, attachment_type=allure.attachment_type.TEXT)
diff --git a/autotest/utils/sleep_utils.py b/autotest/utils/sleep_utils.py
new file mode 100644
index 0000000000..f289f3d558
--- /dev/null
+++ b/autotest/utils/sleep_utils.py
@@ -0,0 +1,176 @@
+from __future__ import annotations
+
+import json
+import os
+from collections import Counter
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+
+import torch
+from safetensors.torch import safe_open
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
+
+from lmdeploy.utils import serialize_state_dict
+
+UPDATE_WEIGHTS_CUDA_DEVICE_ENV = 'LMDEPLOY_UPDATE_WEIGHTS_CUDA_DEVICE'
+
+LEVEL2_GREEDY_MESSAGES = [{'role': 'user', 'content': '424242'}]
+LEVEL2_MAX_TOKENS = 64
+LEVEL2_BASELINE_RUNS = 3
+MAX_SINGLE_CHAR_FRACTION = 0.75
+
+
+def resolve_update_weights_cuda_device_index() -> int:
+    raw = os.environ.get(UPDATE_WEIGHTS_CUDA_DEVICE_ENV, '').strip()
+    if not raw:
+        return torch.cuda.current_device()
+    try:
+        idx = int(raw)
+    except ValueError as e:
+        raise AssertionError(
+            f'{UPDATE_WEIGHTS_CUDA_DEVICE_ENV} must be an int, got {raw!r}') from e
+    n = torch.cuda.device_count()
+    assert 0 <= idx < n, (
+        f'{UPDATE_WEIGHTS_CUDA_DEVICE_ENV}={idx} out of range for cuda.device_count()={n}')
+    return idx
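+
+
+# Editor's sketch (not used by the suite): pinning the update-weights device
+# via the env knob above. Assumes at least two visible CUDA devices and
+# mutates the process environment.
+def _sketch_resolve_device() -> int:
+    os.environ[UPDATE_WEIGHTS_CUDA_DEVICE_ENV] = '1'
+    idx = resolve_update_weights_cuda_device_index()
+    assert idx == 1
+    return idx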
+
+
+def resolve_hf_checkpoint_dir(config: dict, model_case: str) -> Path:
+    if os.environ.get('LMDEPLOY_USE_MODELSCOPE', 'False') == 'True':
+        return Path(model_case)
+    return Path(config['model_path']) / model_case
+
+
+def shard_paths(model_dir: Path) -> tuple[str, list[Path]]:
+    if (model_dir / SAFE_WEIGHTS_NAME).is_file():
+        return 'safetensors', [model_dir / SAFE_WEIGHTS_NAME]
+    if (model_dir / SAFE_WEIGHTS_INDEX_NAME).is_file():
+        with open(model_dir / SAFE_WEIGHTS_INDEX_NAME, encoding='utf-8') as f:
+            index = json.load(f)
+        paths = sorted(set(index['weight_map'].values()))
+        return 'safetensors', [model_dir / p for p in paths]
+    if (model_dir / WEIGHTS_NAME).is_file():
+        return 'pytorch', [model_dir / WEIGHTS_NAME]
+    if (model_dir / WEIGHTS_INDEX_NAME).is_file():
+        with open(model_dir / WEIGHTS_INDEX_NAME, encoding='utf-8') as f:
+            index = json.load(f)
+        paths = sorted(set(index['weight_map'].values()))
+        return 'pytorch', [model_dir / p for p in paths]
+    raise FileNotFoundError(f'No HF weights under {model_dir}')
+
+
+def load_shard_tensors(kind: str, path: Path) -> dict[str, torch.Tensor]:
+    out: dict[str, torch.Tensor] = {}
+    if kind == 'safetensors':
+        with safe_open(str(path), framework='pt') as f:
+            for key in f.keys():
+                out[key] = f.get_tensor(key)
+    else:
+        state = torch.load(str(path), weights_only=True, map_location='cpu')
+        try:
+            out.update(state)
+        finally:
+            del state
+    return out
+
+
+def assistant_content_from_openai_completion_dict(output: dict) -> str:
+    choices = output.get('choices') or []
+    assert len(choices) == 1, f'expected 1 choice, got {len(choices)}'
+    msg = choices[0].get('message') or {}
+    return (msg.get('content') or '').strip()
+
+
+def assert_assistant_not_degenerate(content: str, *, label: str) -> None:
+    assert content, f'{label}: empty assistant content'
+    compact = content.replace('\n', ' ').strip()
+    assert len(set(compact)) >= 4, (
+        f'{label}: degenerate assistant text (too few distinct chars): {content!r}')
+    top_cnt = Counter(compact).most_common(1)[0][1]
+    assert top_cnt / len(compact) <= MAX_SINGLE_CHAR_FRACTION, (
+        f'{label}: one token/char dominates assistant text: {content!r}')
+
+
+def level2_update_weights_request_dict(serialized_data: object, finished: bool) -> dict[str, Any]:
+    return {
+        'serialized_named_tensors': serialized_data,
+        'finished': finished,
+    }
+
+
+def assert_chat_decode_unchanged(ref: dict, cur: dict, *, label: str) -> None:
+    a, b = assistant_content_from_openai_completion_dict(ref), assistant_content_from_openai_completion_dict(cur)
+    assert a == b, f'{label}: assistant content changed\n before={a!r}\n after={b!r}'
+    rt = ref.get('usage', {}).get('completion_tokens')
+    ct = cur.get('usage', {}).get('completion_tokens')
+    assert rt == ct, f'{label}: completion_tokens changed {rt} -> {ct}'
+    rfr = ref['choices'][0].get('finish_reason')
+    cfr = cur['choices'][0].get('finish_reason')
+    if rfr is not None and cfr is not None:
+        assert rfr == cfr, f'{label}: finish_reason changed {rfr!r} -> {cfr!r}'
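+
+
+# Editor's sketch: the degeneracy guard above in action on hypothetical
+# assistant outputs (healthy text passes; a single repeated character fails).
+def _sketch_degenerate_examples() -> None:
+    assert_assistant_not_degenerate('The answer is 42, because 6 * 7 = 42.', label='sketch')
+    try:
+        assert_assistant_not_degenerate('!!!!!!!!!!!!', label='sketch')
+    except AssertionError:
+        pass  # expected: too few distinct characters
+    else:
+        raise RuntimeError('degeneracy guard unexpectedly passed')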
+
+
+def apply_serialized_hf_segments_for_level2_weights(
+    model_dir: Path,
+    emit_segment: Callable[[Any, bool], None],
+) -> None:
+    kind, shards = shard_paths(model_dir)
+    num_segment = len(shards)
+    dev_idx = resolve_update_weights_cuda_device_index()
+    device = torch.device('cuda', dev_idx)
+    with torch.cuda.device(dev_idx):
+        for seg_idx in range(num_segment):
+            cpu_dict = load_shard_tensors(kind, shards[seg_idx])
+            seg_gpu = {k: v.to(device, non_blocking=True) for k, v in cpu_dict.items()}
+            del cpu_dict
+            serialized_data = serialize_state_dict(seg_gpu)
+            del seg_gpu
+            torch.cuda.empty_cache()
+            emit_segment(serialized_data, seg_idx == num_segment - 1)
+
+
+def apply_serialized_hf_segments_for_turbomind_level2_weights(
+    model_dir: Path,
+    emit_segment: Callable[[Any, bool], None],
+) -> None:
+    from lmdeploy.turbomind.deploy.converter import get_input_model_registered_name
+    from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
+
+    root = str(model_dir.resolve())
+    try:
+        input_model_name = get_input_model_registered_name(root, 'hf')
+        if input_model_name == 'qwen3_5-moe':
+            raise RuntimeError(
+                'turbomind update_weights is unsupported for qwen3_5-moe in the current server build: '
+                'server-side StateDictLoader has no `index`, but Qwen3_5MoeModel.readers() accesses loader.index')
+        input_model_cls = INPUT_MODELS.get(input_model_name)
+        input_model = input_model_cls(model_path=root, tokenizer_path=root)
+    except Exception as e:
+        raise RuntimeError(
+            f'turbomind update_weights: failed to build input_model readers for {model_dir}: {e}') from e
+
+    dev_idx = resolve_update_weights_cuda_device_index()
+    device = torch.device('cuda', dev_idx)
+    with torch.cuda.device(dev_idx):
+        it = iter(dict(reader.params) for _, reader in input_model.readers())
+        try:
+            chunk = next(it)
+        except StopIteration:
+            raise RuntimeError(f'no turbomind weight chunks to emit under {model_dir}') from None
+
+        for cpu_dict_next in it:
+            seg_gpu = {k: v.to(device, non_blocking=True) for k, v in chunk.items()}
+            try:
+                emit_segment(serialize_state_dict(seg_gpu), False)
+            finally:
+                del seg_gpu
+                torch.cuda.empty_cache()
+            chunk = cpu_dict_next
+
+        seg_gpu = {k: v.to(device, non_blocking=True) for k, v in chunk.items()}
+        try:
+            emit_segment(serialize_state_dict(seg_gpu), True)
+        finally:
+            del seg_gpu
+            torch.cuda.empty_cache()
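+
+
+# Editor's sketch: wiring an emitter to an api_server weight-update endpoint.
+# The payload shape matches level2_update_weights_request_dict above; the
+# '/update_params' path, port, and requests usage are assumptions for
+# illustration, not part of this suite.
+def _sketch_emit_segments_over_http(model_dir: Path, base_url: str = 'http://localhost:23333') -> None:
+    import requests  # local import to keep the sketch self-contained
+
+    def emit_segment(serialized_data: Any, finished: bool) -> None:
+        payload = level2_update_weights_request_dict(serialized_data, finished)
+        resp = requests.post(f'{base_url}/update_params', json=payload, timeout=600)
+        assert resp.ok, resp.text
+
+    apply_serialized_hf_segments_for_level2_weights(model_dir, emit_segment)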