Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
441 changes: 441 additions & 0 deletions autotest/interface/pipeline/test_pipeline_sleep_wakeup.py

Large diffs are not rendered by default.

426 changes: 426 additions & 0 deletions autotest/interface/restful/test_restful_abort_request.py

Large diffs are not rendered by default.

432 changes: 432 additions & 0 deletions autotest/interface/restful/test_restful_sleep_wakeup.py

Large diffs are not rendered by default.

217 changes: 136 additions & 81 deletions autotest/tools/pipeline/mllm_case.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import json
import os
from typing import Any

import fire
import numpy as np
from PIL import Image
import fire # noqa: E402
import numpy as np # noqa: E402
from PIL import Image # noqa: E402

from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy.vl import encode_image_base64, load_image
from lmdeploy.vl.constants import IMAGE_TOKEN
from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline # noqa: E402
from lmdeploy.vl import encode_image_base64, load_image, load_video # noqa: E402
from lmdeploy.vl.constants import IMAGE_TOKEN # noqa: E402

gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)

Expand All @@ -18,6 +20,35 @@
PIC_PANDA = 'panda.jpg'
DESC = 'What are the similarities and differences between these two images.'
DESC_ZH = '两张图有什么相同和不同的地方.'
# MCQ payload for the Qwen3-VL "tomb" video demo. Deliberately contains only a
# question and options -- no answer field -- so the model must answer from the
# video content alone. NOTE(review): this JSON block and the prompt below are
# duplicated in autotest/utils/constant.py; consider importing from one place.
_MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{
"question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?",
"options": [
"A. 4.",
"B. 9.",
"C. 5.",
"D. 13."
]
}"""
# User-facing instruction wrapped around the MCQ JSON: ask for a short
# letter-first answer instead of long chain-of-thought output.
MM_DEMO_TOMB_USER_PROMPT = (
'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). '
'Watch the entire video, pick the best option from what you see, then reply briefly with the letter '
'(A, B, C, or D) first and at most one short sentence. Do not output long step-by-step reasoning; '
'keep the final reply concise.\n\n' + _MM_DEMO_TOMB_MCQ_JSON_BLOCK)

# Video fixtures expected under ``resource_path``; cases are skipped when absent.
DEFAULT_VIDEO_FILENAME = 'red-panda.mp4'
VIDEO_QWEN3_DEMO_FILENAME = 'N1cdUjctpG8.mp4'


def _numpy_video_to_pil_list(frames: np.ndarray) -> list[Image.Image]:
    """Convert a stacked ``(num_frames, H, W, C)`` array into RGB PIL images.

    Each frame is cast to ``uint8`` before conversion so float-valued decoder
    output is accepted as well.
    """
    frame_count = int(frames.shape[0])
    return [Image.fromarray(frames[idx].astype('uint8')).convert('RGB') for idx in range(frame_count)]


def load_video_sampled_pil(video_path: str, num_frames: int, **kwargs: Any) -> tuple[list[Image.Image], dict[str, Any]]:
    """Decode ``num_frames`` uniformly sampled frames from *video_path*.

    Thin wrapper over ``lmdeploy.vl.load_video`` that converts the returned
    frame array to a list of PIL images; the decoder metadata dict is passed
    through unchanged. Extra ``kwargs`` (e.g. ``fps``) go straight to
    ``load_video``.
    """
    raw_frames, metadata = load_video(video_path, num_frames=num_frames, **kwargs)
    pil_frames = _numpy_video_to_pil_list(raw_frames)
    return pil_frames, metadata


def run_pipeline_mllm_test(model_path, run_config, resource_path, is_pr_test: bool = False):
Expand Down Expand Up @@ -169,44 +200,9 @@ def internvl_vl_testcase(pipe, resource_path, lang='en'):
print(f'[caseresult internvl-separate-images2-{lang} start]' + json.dumps(response.text, ensure_ascii=False) +
f'[caseresult internvl-separate-images2-{lang} end]\n')

# video multi-round conversation
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array(
[int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
return frame_indices

def load_video(video_path, bound=None, num_segments=32):
import cv2
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
raise ValueError(f'Cannot open video file: {video_path}')

max_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
fps = cap.get(cv2.CAP_PROP_FPS)

frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
imgs = []

for frame_index in frame_indices:
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
ret, frame = cap.read()
if ret:
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
img = Image.fromarray(rgb_frame).convert('RGB')
imgs.append(img)

cap.release()
return imgs

video_path = resource_path + '/red-panda.mp4'
imgs = load_video(video_path, num_segments=8)
# video multi-round conversation (uniform ``num_frames`` via lmdeploy.vl.load_video)
video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}'
imgs, _ = load_video_sampled_pil(video_path, num_frames=8)

question = ''
for i in range(len(imgs)):
Expand Down Expand Up @@ -287,43 +283,11 @@ def MiniCPM_vl_testcase(pipe, resource_path):
print('[caseresult minicpm-fewshot start]' + json.dumps(response.text, ensure_ascii=False) +
'[caseresult minicpm-fewshot end]\n')

# Chat with video
MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number

def encode_video(video_path):

def uniform_sample(length, n):
gap = len(length) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [length[i] for i in idxs]

import cv2
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
raise ValueError(f'Cannot open video file: {video_path}')

fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

sample_fps = round(fps / 1) # FPS
frame_idx = [i for i in range(0, total_frames, sample_fps)]
if len(frame_idx) > MAX_NUM_FRAMES:
frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)

frames = []
for idx in frame_idx:
cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
ret, frame = cap.read()
if ret:
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frames.append(Image.fromarray(rgb_frame.astype('uint8')).convert('RGB'))

cap.release()
print('num frames:', len(frames))
return frames

video_path = resource_path + '/red-panda.mp4'
frames = encode_video(video_path)
# Chat with video (fixed frame budget; same decoder as REST ``video_url``)
max_video_frames = 32
video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}'
frames, video_meta = load_video_sampled_pil(video_path, num_frames=max_video_frames)
print('num frames:', len(frames), 'meta:', video_meta.get('frames_indices'))
question = 'What animals are in the video, and what are they doing?'

content = [dict(type='text', text=question)]
Expand Down Expand Up @@ -386,6 +350,97 @@ def Qwen_vl_testcase(pipe, resource_path):
print('[caseresult qwen-performance-images2 start]' + json.dumps(response.text, ensure_ascii=False) +
'[caseresult qwen-performance-images2 end]\n')

# Qwen2.5/3-VL: native ``video`` + same knobs as REST ``extra_body`` (top_k / mm_processor_kwargs).
demo_path = os.path.join(resource_path, VIDEO_QWEN3_DEMO_FILENAME)
if not os.path.isfile(demo_path):
print('[caseresult qwen3-demo-video start]' +
json.dumps('SKIPPED_NO_DEMO_MP4', ensure_ascii=False) + '[caseresult qwen3-demo-video end]\n')
else:
try:
frames, vmeta = load_video(demo_path, num_frames=16, fps=2)
demo_q = MM_DEMO_TOMB_USER_PROMPT
vmsg = [{
'role':
'user',
'content': [
{
'type': 'video',
'data': frames,
'video_metadata': vmeta,
},
{
'type': 'text',
'text': demo_q,
},
],
}]
mm_gen_config = GenerationConfig(
max_new_tokens=24576,
min_new_tokens=10,
top_k=20,
temperature=0.3,
top_p=0.95,
)
response = pipe(
vmsg,
gen_config=mm_gen_config,
log_level='INFO',
max_log_len=10,
mm_processor_kwargs={
'fps': 2,
'do_sample_frames': True,
},
)
print('[caseresult qwen3-demo-video start]' + json.dumps(response.text, ensure_ascii=False) +
'[caseresult qwen3-demo-video end]\n')
except Exception as exc:
err = json.dumps(f'PIPELINE_VIDEO_ERROR:{exc!s}', ensure_ascii=False)
print('[caseresult qwen3-demo-video start]' + err + '[caseresult qwen3-demo-video end]\n')

rp_video = os.path.join(resource_path, DEFAULT_VIDEO_FILENAME)
if not os.path.isfile(rp_video):
print('[caseresult qwen-mixed-image-text-video start]' +
json.dumps('SKIPPED_NO_RED_PANDA_MP4', ensure_ascii=False) +
'[caseresult qwen-mixed-image-text-video end]\n')
else:
try:
frames_pil, _vmeta_m = load_video_sampled_pil(rp_video, num_frames=6, fps=1)
mixed_content = [
{
'type':
'text',
'text': (
'You are given one still image, then several frames from a short video in order. '
'In 2-4 sentences: name one thing in the still image, and what animal or activity '
'you see in the video frames.'),
},
{
'type': 'image_url',
'image_url': {
'url': f'{resource_path}/{PIC1}',
},
},
]
for frame in frames_pil:
mixed_content.append(
dict(
type='image_url',
image_url=dict(url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'),
))
mixed_msg = [{'role': 'user', 'content': mixed_content}]
response = pipe(
mixed_msg,
gen_config=gen_config,
log_level='INFO',
max_log_len=10,
)
print('[caseresult qwen-mixed-image-text-video start]' +
json.dumps(response.text, ensure_ascii=False) + '[caseresult qwen-mixed-image-text-video end]\n')
except Exception as exc:
err = json.dumps(f'PIPELINE_MIXED_MM_ERROR:{exc!s}', ensure_ascii=False)
print('[caseresult qwen-mixed-image-text-video start]' + err +
'[caseresult qwen-mixed-image-text-video end]\n')


if __name__ == '__main__':
fire.Fire()
24 changes: 24 additions & 0 deletions autotest/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,14 @@
}
}

# Models exercised by the sleep/wakeup test suites
# (test_pipeline_sleep_wakeup.py / test_restful_sleep_wakeup.py).
# NOTE(review): verify these repo ids exist on the hub -- 'Qwen3.5-*' names
# could be typos for 'Qwen3-*'; confirm against the model zoo.
SLEEP_WAKEUP_MODEL_LIST = [
    'Qwen/Qwen3.5-35B-A3B',
    'Qwen/Qwen3.5-35B-A3B-FP8',
    'Qwen/Qwen3.5-122B-A10B',
]

# Both inference backends are covered by the sleep/wakeup cases.
SLEEP_WAKEUP_BACKENDS = ['pytorch', 'turbomind']

BACKEND_LIST = ['turbomind', 'pytorch']

RESTFUL_MODEL_LIST_LATEST = [
Expand Down Expand Up @@ -260,3 +268,19 @@
'cache-max-entry-count': 0.7
}
}

# Qwen3-VL tomb demo (REST ``mm_processor`` + pipeline video): MCQ JSON without a labelled answer field.
# NOTE(review): the same JSON block and prompt are duplicated (with a leading
# underscore) in autotest/tools/pipeline/mllm_case.py; keep the two in sync or
# import from a single definition.
MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{
"question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?",
"options": [
"A. 4.",
"B. 9.",
"C. 5.",
"D. 13."
]
}"""
# Instruction wrapper: request a concise letter-first reply, not long reasoning.
MM_DEMO_TOMB_USER_PROMPT = (
'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). '
'Watch the entire video, pick the best option from what you see, then reply briefly with the letter '
'(A, B, C, or D) first and at most one short sentence. Do not output long step-by-step reasoning; '
'keep the final reply concise.\n\n' + MM_DEMO_TOMB_MCQ_JSON_BLOCK)
46 changes: 46 additions & 0 deletions autotest/utils/pipeline_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from utils.common_utils import execute_command_with_logging
from utils.config_utils import get_case_str_by_config, get_cuda_prefix_by_workerid, get_workerid, resolve_extra_params
from utils.rule_condition_assert import assert_result
from utils.run_restful_chat import _mm_demo_thinking_wrapper_shape_assert, _mm_demo_tomb_answer_assert


def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str = '', is_smoke: bool = False):
Expand Down Expand Up @@ -337,6 +338,51 @@ def Qwen_vl_testcase(output_text, file):
file.writelines(f'qwen-performance-images2 result: {case_result}, reason: buildings should in {response} \n')
with assume:
assert case_result, f'reason: performance images2: buildings should in {response}'
with allure.step('qwen3-demo-video'):
response = get_response_from_output(output_text, 'qwen3-demo-video')
rl = response.lower()
if 'skipped_no_demo_mp4' in rl:
file.writelines('qwen3-demo-video result: skipped (N1cdUjctpG8.mp4 not in resource_path)\n')
elif 'pipeline_video_error:' in rl:
file.writelines(f'qwen3-demo-video result: false, pipeline video error in {response} \n')
with assume:
assert False, f'qwen3-demo-video pipeline error: {response}'
else:
tomb_assert = _mm_demo_tomb_answer_assert(response)
shape_assert = _mm_demo_thinking_wrapper_shape_assert(response)
case_result = tomb_assert and shape_assert
reason = 'tomb/jar + bounded public tail'
file.writelines(f'qwen3-demo-video result: {case_result}, reason: {reason}: {response} \n')
with assume:
msg = 'reason: qwen3 demo video: expected tomb/jar-related bounded answer'
assert case_result, f'{msg}: {response}'
if '[caseresult qwen-mixed-image-text-video start]' in output_text:
with allure.step('qwen-mixed-image-text-video'):
response = get_response_from_output(output_text, 'qwen-mixed-image-text-video')
rl = response.lower()
if 'skipped_no_red_panda_mp4' in rl:
file.writelines(
'qwen-mixed-image-text-video result: skipped (red-panda.mp4 not in resource_path)\n')
elif 'pipeline_mixed_mm_error:' in rl:
file.writelines(f'qwen-mixed-image-text-video result: false, mixed mm error in {response} \n')
with assume:
assert False, f'qwen-mixed-image-text-video pipeline error: {response}'
else:
img = (
any(w in rl for w in ('tiger', 'ski'))
or '虎' in response
or '滑雪' in response
)
vid = (
any(w in rl for w in ('panda', 'red panda', 'lesser panda', 'ailurus'))
or any(w in response for w in ('小熊猫', '红熊猫'))
)
case_result = bool(response.strip()) and img and vid
file.writelines(
f'qwen-mixed-image-text-video result: {case_result}, reason: image+tiger + video+panda cues\n')
with assume:
msg = 'reason: mixed image+video reply should mention tiger/ski and panda'
assert case_result, f'{msg}: {response}'


def save_pipeline_common_log(config, log_name, result, content, msg: str = '', write_type: str = 'w'):
Expand Down
22 changes: 18 additions & 4 deletions autotest/utils/restful_return_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,15 @@ def assert_chat_completions_batch_return(output, model_name, check_logprobs: boo
assert len(message.get('message').get('content')) > 0
assert message.get('message').get('role') == 'assistant'
if check_logprobs:
len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
for logprob in message.get('logprobs').get('content'):
lp = message.get('logprobs')
assert lp is not None, output
content_lp = lp.get('content')
assert content_lp is not None, output
n_tok = output.get('usage', {}).get('completion_tokens')
assert len(content_lp) == n_tok, (
f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
)
for logprob in content_lp:
assert_logprobs(logprob, logprobs_num)


Expand All @@ -31,8 +38,15 @@ def assert_completions_batch_return(output, model_name, check_logprobs: bool = F
assert message.get('index') == 0
assert len(message.get('text')) > 0
if check_logprobs:
len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
for logprob in message.get('logprobs').get('content'):
lp = message.get('logprobs')
assert lp is not None, output
content_lp = lp.get('content')
assert content_lp is not None, output
n_tok = output.get('usage', {}).get('completion_tokens')
assert len(content_lp) == n_tok, (
f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
)
for logprob in content_lp:
assert_logprobs(logprob, logprobs_num)


Expand Down
Loading
Loading