Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
441 changes: 441 additions & 0 deletions autotest/interface/pipeline/test_pipeline_sleep_wakeup.py

Large diffs are not rendered by default.

426 changes: 426 additions & 0 deletions autotest/interface/restful/test_restful_abort_request.py

Large diffs are not rendered by default.

432 changes: 432 additions & 0 deletions autotest/interface/restful/test_restful_sleep_wakeup.py

Large diffs are not rendered by default.

217 changes: 136 additions & 81 deletions autotest/tools/pipeline/mllm_case.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import json
import os
from typing import Any

import fire
import numpy as np
from PIL import Image
import fire # noqa: E402
import numpy as np # noqa: E402
from PIL import Image # noqa: E402

from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy.vl import encode_image_base64, load_image
from lmdeploy.vl.constants import IMAGE_TOKEN
from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline # noqa: E402
from lmdeploy.vl import encode_image_base64, load_image, load_video # noqa: E402
from lmdeploy.vl.constants import IMAGE_TOKEN # noqa: E402

gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)

Expand All @@ -18,6 +20,35 @@
PIC_PANDA = 'panda.jpg'
DESC = 'What are the similarities and differences between these two images.'
DESC_ZH = '两张图有什么相同和不同的地方.'
# MCQ payload for the Qwen3-VL "tomb" video demo. Deliberately contains only a
# question and options -- no answer field -- so the model must answer from the
# video content alone. NOTE(review): this JSON block and the prompt below are
# duplicated in autotest/utils/constant.py; consider importing from one place.
_MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{
"question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?",
"options": [
"A. 4.",
"B. 9.",
"C. 5.",
"D. 13."
]
}"""
# User-facing instruction wrapped around the MCQ JSON: ask for a short
# letter-first answer instead of long chain-of-thought output.
MM_DEMO_TOMB_USER_PROMPT = (
'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). '
'Watch the entire video, pick the best option from what you see, then reply briefly with the letter '
'(A, B, C, or D) first and at most one short sentence. Do not output long step-by-step reasoning; '
'keep the final reply concise.\n\n' + _MM_DEMO_TOMB_MCQ_JSON_BLOCK)

# Video fixtures expected under ``resource_path``; cases are skipped when absent.
DEFAULT_VIDEO_FILENAME = 'red-panda.mp4'
VIDEO_QWEN3_DEMO_FILENAME = 'N1cdUjctpG8.mp4'


def _numpy_video_to_pil_list(frames: np.ndarray) -> list[Image.Image]:
    """Convert a stacked ``(num_frames, H, W, C)`` array into RGB PIL images.

    Each frame is cast to ``uint8`` before conversion so float-valued decoder
    output is accepted as well.
    """
    frame_count = int(frames.shape[0])
    return [Image.fromarray(frames[idx].astype('uint8')).convert('RGB') for idx in range(frame_count)]


def load_video_sampled_pil(video_path: str, num_frames: int, **kwargs: Any) -> tuple[list[Image.Image], dict[str, Any]]:
    """Decode ``num_frames`` uniformly sampled frames from *video_path*.

    Thin wrapper over ``lmdeploy.vl.load_video`` that converts the returned
    frame array to a list of PIL images; the decoder metadata dict is passed
    through unchanged. Extra ``kwargs`` (e.g. ``fps``) go straight to
    ``load_video``.
    """
    raw_frames, metadata = load_video(video_path, num_frames=num_frames, **kwargs)
    pil_frames = _numpy_video_to_pil_list(raw_frames)
    return pil_frames, metadata


def run_pipeline_mllm_test(model_path, run_config, resource_path, is_pr_test: bool = False):
Expand Down Expand Up @@ -169,44 +200,9 @@ def internvl_vl_testcase(pipe, resource_path, lang='en'):
print(f'[caseresult internvl-separate-images2-{lang} start]' + json.dumps(response.text, ensure_ascii=False) +
f'[caseresult internvl-separate-images2-{lang} end]\n')

# video multi-round conversation
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array(
[int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)])
return frame_indices

def load_video(video_path, bound=None, num_segments=32):
import cv2
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
raise ValueError(f'Cannot open video file: {video_path}')

max_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
fps = cap.get(cv2.CAP_PROP_FPS)

frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
imgs = []

for frame_index in frame_indices:
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
ret, frame = cap.read()
if ret:
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
img = Image.fromarray(rgb_frame).convert('RGB')
imgs.append(img)

cap.release()
return imgs

video_path = resource_path + '/red-panda.mp4'
imgs = load_video(video_path, num_segments=8)
# video multi-round conversation (uniform ``num_frames`` via lmdeploy.vl.load_video)
video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}'
imgs, _ = load_video_sampled_pil(video_path, num_frames=8)

question = ''
for i in range(len(imgs)):
Expand Down Expand Up @@ -287,43 +283,11 @@ def MiniCPM_vl_testcase(pipe, resource_path):
print('[caseresult minicpm-fewshot start]' + json.dumps(response.text, ensure_ascii=False) +
'[caseresult minicpm-fewshot end]\n')

# Chat with video
MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number

def encode_video(video_path):

def uniform_sample(length, n):
gap = len(length) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [length[i] for i in idxs]

import cv2
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
raise ValueError(f'Cannot open video file: {video_path}')

fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

sample_fps = round(fps / 1) # FPS
frame_idx = [i for i in range(0, total_frames, sample_fps)]
if len(frame_idx) > MAX_NUM_FRAMES:
frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)

frames = []
for idx in frame_idx:
cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
ret, frame = cap.read()
if ret:
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frames.append(Image.fromarray(rgb_frame.astype('uint8')).convert('RGB'))

cap.release()
print('num frames:', len(frames))
return frames

video_path = resource_path + '/red-panda.mp4'
frames = encode_video(video_path)
# Chat with video (fixed frame budget; same decoder as REST ``video_url``)
max_video_frames = 32
video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}'
frames, video_meta = load_video_sampled_pil(video_path, num_frames=max_video_frames)
print('num frames:', len(frames), 'meta:', video_meta.get('frames_indices'))
question = 'What animals are in the video, and what are they doing?'

content = [dict(type='text', text=question)]
Expand Down Expand Up @@ -386,6 +350,97 @@ def Qwen_vl_testcase(pipe, resource_path):
print('[caseresult qwen-performance-images2 start]' + json.dumps(response.text, ensure_ascii=False) +
'[caseresult qwen-performance-images2 end]\n')

# Qwen2.5/3-VL: native ``video`` + same knobs as REST ``extra_body`` (top_k / mm_processor_kwargs).
demo_path = os.path.join(resource_path, VIDEO_QWEN3_DEMO_FILENAME)
if not os.path.isfile(demo_path):
print('[caseresult qwen3-demo-video start]' +
json.dumps('SKIPPED_NO_DEMO_MP4', ensure_ascii=False) + '[caseresult qwen3-demo-video end]\n')
else:
try:
frames, vmeta = load_video(demo_path, num_frames=16, fps=2)
demo_q = MM_DEMO_TOMB_USER_PROMPT
vmsg = [{
'role':
'user',
'content': [
{
'type': 'video',
'data': frames,
'video_metadata': vmeta,
},
{
'type': 'text',
'text': demo_q,
},
],
}]
mm_gen_config = GenerationConfig(
max_new_tokens=24576,
min_new_tokens=10,
top_k=20,
temperature=0.3,
top_p=0.95,
)
response = pipe(
vmsg,
gen_config=mm_gen_config,
log_level='INFO',
max_log_len=10,
mm_processor_kwargs={
'fps': 2,
'do_sample_frames': True,
},
)
print('[caseresult qwen3-demo-video start]' + json.dumps(response.text, ensure_ascii=False) +
'[caseresult qwen3-demo-video end]\n')
except Exception as exc:
err = json.dumps(f'PIPELINE_VIDEO_ERROR:{exc!s}', ensure_ascii=False)
print('[caseresult qwen3-demo-video start]' + err + '[caseresult qwen3-demo-video end]\n')

rp_video = os.path.join(resource_path, DEFAULT_VIDEO_FILENAME)
if not os.path.isfile(rp_video):
print('[caseresult qwen-mixed-image-text-video start]' +
json.dumps('SKIPPED_NO_RED_PANDA_MP4', ensure_ascii=False) +
'[caseresult qwen-mixed-image-text-video end]\n')
else:
try:
frames_pil, _vmeta_m = load_video_sampled_pil(rp_video, num_frames=6, fps=1)
mixed_content = [
{
'type':
'text',
'text': (
'You are given one still image, then several frames from a short video in order. '
'In 2-4 sentences: name one thing in the still image, and what animal or activity '
'you see in the video frames.'),
},
{
'type': 'image_url',
'image_url': {
'url': f'{resource_path}/{PIC1}',
},
},
]
for frame in frames_pil:
mixed_content.append(
dict(
type='image_url',
image_url=dict(url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'),
))
mixed_msg = [{'role': 'user', 'content': mixed_content}]
response = pipe(
mixed_msg,
gen_config=gen_config,
log_level='INFO',
max_log_len=10,
)
print('[caseresult qwen-mixed-image-text-video start]' +
json.dumps(response.text, ensure_ascii=False) + '[caseresult qwen-mixed-image-text-video end]\n')
except Exception as exc:
err = json.dumps(f'PIPELINE_MIXED_MM_ERROR:{exc!s}', ensure_ascii=False)
print('[caseresult qwen-mixed-image-text-video start]' + err +
'[caseresult qwen-mixed-image-text-video end]\n')


if __name__ == '__main__':
fire.Fire()
24 changes: 24 additions & 0 deletions autotest/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,14 @@
}
}

# Models exercised by the sleep/wakeup test suites
# (test_pipeline_sleep_wakeup.py / test_restful_sleep_wakeup.py).
# NOTE(review): verify these repo ids exist on the hub -- 'Qwen3.5-*' names
# could be typos for 'Qwen3-*'; confirm against the model zoo.
SLEEP_WAKEUP_MODEL_LIST = [
    'Qwen/Qwen3.5-35B-A3B',
    'Qwen/Qwen3.5-35B-A3B-FP8',
    'Qwen/Qwen3.5-122B-A10B',
]

# Both inference backends are covered by the sleep/wakeup cases.
SLEEP_WAKEUP_BACKENDS = ['pytorch', 'turbomind']

BACKEND_LIST = ['turbomind', 'pytorch']

RESTFUL_MODEL_LIST_LATEST = [
Expand Down Expand Up @@ -260,3 +268,19 @@
'cache-max-entry-count': 0.7
}
}

# Qwen3-VL tomb demo (REST ``mm_processor`` + pipeline video): MCQ JSON without a labelled answer field.
# NOTE(review): the same JSON block and prompt are duplicated (with a leading
# underscore) in autotest/tools/pipeline/mllm_case.py; keep the two in sync or
# import from a single definition.
MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{
"question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?",
"options": [
"A. 4.",
"B. 9.",
"C. 5.",
"D. 13."
]
}"""
# Instruction wrapper: request a concise letter-first reply, not long reasoning.
MM_DEMO_TOMB_USER_PROMPT = (
'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). '
'Watch the entire video, pick the best option from what you see, then reply briefly with the letter '
'(A, B, C, or D) first and at most one short sentence. Do not output long step-by-step reasoning; '
'keep the final reply concise.\n\n' + MM_DEMO_TOMB_MCQ_JSON_BLOCK)
46 changes: 46 additions & 0 deletions autotest/utils/pipeline_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from utils.common_utils import execute_command_with_logging
from utils.config_utils import get_case_str_by_config, get_cuda_prefix_by_workerid, get_workerid, resolve_extra_params
from utils.rule_condition_assert import assert_result
from utils.run_restful_chat import _mm_demo_thinking_wrapper_shape_assert, _mm_demo_tomb_answer_assert


def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str = '', is_smoke: bool = False):
Expand Down Expand Up @@ -337,6 +338,51 @@ def Qwen_vl_testcase(output_text, file):
file.writelines(f'qwen-performance-images2 result: {case_result}, reason: buildings should in {response} \n')
with assume:
assert case_result, f'reason: performance images2: buildings should in {response}'
with allure.step('qwen3-demo-video'):
response = get_response_from_output(output_text, 'qwen3-demo-video')
rl = response.lower()
if 'skipped_no_demo_mp4' in rl:
file.writelines('qwen3-demo-video result: skipped (N1cdUjctpG8.mp4 not in resource_path)\n')
elif 'pipeline_video_error:' in rl:
file.writelines(f'qwen3-demo-video result: false, pipeline video error in {response} \n')
with assume:
assert False, f'qwen3-demo-video pipeline error: {response}'
else:
tomb_assert = _mm_demo_tomb_answer_assert(response)
shape_assert = _mm_demo_thinking_wrapper_shape_assert(response)
case_result = tomb_assert and shape_assert
reason = 'tomb/jar + bounded public tail'
file.writelines(f'qwen3-demo-video result: {case_result}, reason: {reason}: {response} \n')
with assume:
msg = 'reason: qwen3 demo video: expected tomb/jar-related bounded answer'
assert case_result, f'{msg}: {response}'
if '[caseresult qwen-mixed-image-text-video start]' in output_text:
with allure.step('qwen-mixed-image-text-video'):
response = get_response_from_output(output_text, 'qwen-mixed-image-text-video')
rl = response.lower()
if 'skipped_no_red_panda_mp4' in rl:
file.writelines(
'qwen-mixed-image-text-video result: skipped (red-panda.mp4 not in resource_path)\n')
elif 'pipeline_mixed_mm_error:' in rl:
file.writelines(f'qwen-mixed-image-text-video result: false, mixed mm error in {response} \n')
with assume:
assert False, f'qwen-mixed-image-text-video pipeline error: {response}'
else:
img = (
any(w in rl for w in ('tiger', 'ski'))
or '虎' in response
or '滑雪' in response
)
vid = (
any(w in rl for w in ('panda', 'red panda', 'lesser panda', 'ailurus'))
or any(w in response for w in ('小熊猫', '红熊猫'))
)
case_result = bool(response.strip()) and img and vid
file.writelines(
f'qwen-mixed-image-text-video result: {case_result}, reason: image+tiger + video+panda cues\n')
with assume:
msg = 'reason: mixed image+video reply should mention tiger/ski and panda'
assert case_result, f'{msg}: {response}'


def save_pipeline_common_log(config, log_name, result, content, msg: str = '', write_type: str = 'w'):
Expand Down
22 changes: 18 additions & 4 deletions autotest/utils/restful_return_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,15 @@ def assert_chat_completions_batch_return(output, model_name, check_logprobs: boo
assert len(message.get('message').get('content')) > 0
assert message.get('message').get('role') == 'assistant'
if check_logprobs:
len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
for logprob in message.get('logprobs').get('content'):
lp = message.get('logprobs')
assert lp is not None, output
content_lp = lp.get('content')
assert content_lp is not None, output
n_tok = output.get('usage', {}).get('completion_tokens')
assert len(content_lp) == n_tok, (
f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
)
for logprob in content_lp:
assert_logprobs(logprob, logprobs_num)


Expand All @@ -31,8 +38,15 @@ def assert_completions_batch_return(output, model_name, check_logprobs: bool = F
assert message.get('index') == 0
assert len(message.get('text')) > 0
if check_logprobs:
len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
for logprob in message.get('logprobs').get('content'):
lp = message.get('logprobs')
assert lp is not None, output
content_lp = lp.get('content')
assert content_lp is not None, output
n_tok = output.get('usage', {}).get('completion_tokens')
assert len(content_lp) == n_tok, (
f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
)
for logprob in content_lp:
assert_logprobs(logprob, logprobs_num)


Expand Down
Loading
Loading