diff --git a/autotest/interface/pipeline/test_pipeline_sleep_wakeup.py b/autotest/interface/pipeline/test_pipeline_sleep_wakeup.py new file mode 100644 index 0000000000..9cea30bdf4 --- /dev/null +++ b/autotest/interface/pipeline/test_pipeline_sleep_wakeup.py @@ -0,0 +1,441 @@ +from __future__ import annotations + +import inspect +import os +import time +from pathlib import Path + +import pytest +import torch +from utils.config_utils import get_parallel_config +from utils.constant import SLEEP_WAKEUP_BACKENDS, SLEEP_WAKEUP_MODEL_LIST +from utils.sleep_utils import ( + LEVEL2_BASELINE_RUNS, + LEVEL2_GREEDY_MESSAGES, + LEVEL2_MAX_TOKENS, + apply_serialized_hf_segments_for_level2_weights, + apply_serialized_hf_segments_for_turbomind_level2_weights, + assert_assistant_not_degenerate, + assert_chat_decode_unchanged, + level2_update_weights_request_dict, + resolve_hf_checkpoint_dir, +) + +from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline +from lmdeploy.messages import Response +from lmdeploy.serve.openai.protocol import UpdateParamsRequest +from lmdeploy.utils import is_bf16_supported + +_SLEEP_PIPELINE_BACKEND_CLASS = { + 'pytorch': PytorchEngineConfig, + 'turbomind': TurbomindEngineConfig, +} + + +def _pipeline_sleep_backend_classes(): + out: list[type[PytorchEngineConfig] | type[TurbomindEngineConfig]] = [] + for name in SLEEP_WAKEUP_BACKENDS: + cls = _SLEEP_PIPELINE_BACKEND_CLASS.get(name) + if cls is None: + allowed = set(_SLEEP_PIPELINE_BACKEND_CLASS) + raise ValueError( + f'unknown SLEEP_WAKEUP_BACKENDS entry {name!r}; expected one of {allowed}', + ) + out.append(cls) + return out + + +def _force_pipeline_sleep_under_llm_dist() -> bool: + v = os.environ.get('LMDEPLOY_FORCE_PIPELINE_SLEEP', '').strip().lower() + return v in ('1', 'true', 'yes', 'on') + + +@pytest.fixture(scope='module', autouse=True) +def _skip_module_if_rest_runner_gpu_conflict(): + if os.environ.get('LLM_DIST_PORT') and not _force_pipeline_sleep_under_llm_dist(): + pytest.skip( + 'pipeline sleep/wakeup: skipped when LLM_DIST_PORT is set (REST api_server already uses GPUs). 
' + 'Run this file standalone from lmdeploy_sleep root, or set LMDEPLOY_FORCE_PIPELINE_SLEEP=1 ' + 'if you allocated extra GPUs for pytest.') + + +def _pipeline_tp_for_model(config: dict, model: str) -> int: + tp = 1 + for item in get_parallel_config(config, model): + if isinstance(item, dict) and 'tp' in item: + tp = max(tp, int(item['tp'])) + return max(1, tp) + + +def _make_backend_config( + backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig], + config: dict, + model: str, +): + tp = _pipeline_tp_for_model(config, model) + cfg = backend(tp=tp) + if backend is TurbomindEngineConfig: + cfg.empty_init = True + if backend is PytorchEngineConfig and not is_bf16_supported(): + cfg.dtype = 'float16' + return cfg + + +def _model_path(config: dict, model: str) -> str: + if os.environ.get('LMDEPLOY_USE_MODELSCOPE', 'False') == 'True': + return model + return str(Path(config['model_path']) / model) + + +def _open_pipeline(config: dict, model: str, backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig]): + return pipeline( + _model_path(config, model), + backend_config=_make_backend_config(backend, config, model), + ) + + +def _pipeline_resp_to_chat_dict(resp: Response) -> dict: + return { + 'choices': [{ + 'message': {'content': (resp.text or '').strip()}, + 'finish_reason': getattr(resp, 'finish_reason', None), + }], + 'usage': {'completion_tokens': resp.generate_token_len}, + } + + +def _infer_level2_greedy(pipe, gen_cfg: GenerationConfig) -> Response: + prompt = LEVEL2_GREEDY_MESSAGES[0]['content'] + return pipe.infer(prompt, gen_config=gen_cfg) + + +def _assert_level2_pipeline_baseline_stable(pipe, gen_cfg: GenerationConfig, *, label: str) -> Response: + contents: list[str] = [] + refs: list[Response] = [] + for i in range(LEVEL2_BASELINE_RUNS): + out = _infer_level2_greedy(pipe, gen_cfg) + assert_assistant_not_degenerate( + (out.text or '').strip(), label=f'{label} baseline run {i + 1}') + refs.append(out) + contents.append((out.text or '').strip()) + assert len(set(contents)) == 1, ( + f'{label}: greedy pipeline baseline not stable:\n' + + '\n'.join(f' run{j + 1}={c!r}' for j, c in enumerate(contents))) + return refs[0] + + +def _should_enforce_level2_greedy_checks( + backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig]) -> bool: + # Known issue: TurboMind may not be deterministic for temperature=0 runs. + # Keep validating sleep/wakeup/update_params behavior, but do not fail on + # strict greedy-stability checks for this backend. 
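+    # Callers gate only the determinism assertions on this guard, e.g.:
+    #     baseline = None
+    #     if _should_enforce_level2_greedy_checks(backend):
+    #         baseline = _assert_level2_pipeline_baseline_stable(pipe, gen, label=...)
+    #     ...and later compare outputs only when ``baseline`` is not None.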
+ return backend is not TurbomindEngineConfig + + +def _apply_sleep(pipe, level: int = 1) -> None: + eng = pipe.async_engine + out = eng.sleep(level) + if inspect.isawaitable(out): + pipe._run(coro=out).result() + + +def _pipeline_wakeup(pipe, tags: list[str] | None = None) -> None: + pipe.async_engine.wakeup(tags) + + +def _pipeline_is_sleeping(pipe) -> bool: + return bool(pipe.async_engine.is_sleeping) + + +def _ensure_awake_pipeline(pipe, max_attempts: int = 8) -> None: + for _ in range(max_attempts): + _pipeline_wakeup(pipe, None) + if not _pipeline_is_sleeping(pipe): + return + time.sleep(0.25) + raise AssertionError( + f'pipeline engine still is_sleeping=true after {max_attempts} wakeup attempts') + + +def _level2_reload_weights_if_supported_pipeline( + pipe, + backend: type[PytorchEngineConfig] | type[TurbomindEngineConfig], + config: dict, + model: str, +) -> None: + if backend is not PytorchEngineConfig and backend is not TurbomindEngineConfig: + return + if not torch.cuda.is_available(): + pytest.skip('level-2 reload needs CUDA for serialize_state_dict / weight upload') + model_dir = resolve_hf_checkpoint_dir(config, model) + if not model_dir.is_dir(): + pytest.skip(f'HF checkpoint not found for update_weights: {model_dir}') + eng = pipe.async_engine.engine + + def _emit(serialized_data: object, finished: bool) -> None: + eng.update_params(UpdateParamsRequest(**level2_update_weights_request_dict( + serialized_data, finished))) + + try: + if backend is PytorchEngineConfig: + apply_serialized_hf_segments_for_level2_weights(model_dir, _emit) + else: + apply_serialized_hf_segments_for_turbomind_level2_weights(model_dir, _emit) + except FileNotFoundError as e: + pytest.skip(str(e)) + except RuntimeError as e: + pytest.skip(str(e)) + + +@pytest.mark.order(8) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('model', SLEEP_WAKEUP_MODEL_LIST) +@pytest.mark.parametrize('backend', _pipeline_sleep_backend_classes()) +class TestPipelineSleepWakeup: + + def test_pipeline_sleep_wakeup_roundtrip(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_sleep_level1_wakeup_and_infer(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + gen = GenerationConfig(max_new_tokens=32, temperature=0.01) + r = pipe([[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}]], gen_config=gen) + out = r[0] if isinstance(r, list) else r + assert (out.text or '').strip() + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_partial_wakeup_with_tags(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + 
_ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_unknown_tags_noop_then_full(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['not_a_valid_tag']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_mixed_valid_invalid_tags_noop(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights', 'not_a_valid_tag']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['not_a_valid_tag', 'weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_both_tags_one_call(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights', 'kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + gen = GenerationConfig(max_new_tokens=32, temperature=0.01) + r = pipe([[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}]], gen_config=gen) + out = r[0] if isinstance(r, list) else r + assert (out.text or '').strip() + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_redundant_weights_wakeup_noop(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_wakeup_empty_string_tag_noop(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['']) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_full_wakeup_when_already_awake(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + assert _pipeline_is_sleeping(pipe) is False + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_second_sleep_while_sleeping_ok(self, model, backend, 
config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _apply_sleep(pipe, 1) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, None) + assert _pipeline_is_sleeping(pipe) is False + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() + + def test_pipeline_sleep_level2_staged_wakeup_and_infer(self, model, backend, config): + pipe = None + try: + pipe = _open_pipeline(config, model, backend) + _ensure_awake_pipeline(pipe) + gen = GenerationConfig( + max_new_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + do_sample=False, + ) + baseline = None + if _should_enforce_level2_greedy_checks(backend): + baseline_r = _assert_level2_pipeline_baseline_stable(pipe, gen, label='level2 pipeline') + baseline = _pipeline_resp_to_chat_dict(baseline_r) + + _apply_sleep(pipe, 2) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _level2_reload_weights_if_supported_pipeline(pipe, backend, config, model) + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + + after = _infer_level2_greedy(pipe, gen) + assert_assistant_not_degenerate( + (after.text or '').strip(), label='level2 pipeline after staged wakeup (1st infer)') + if baseline is not None: + assert_chat_decode_unchanged(baseline, _pipeline_resp_to_chat_dict(after), + label='level2 pipeline 1st infer after staged wakeup') + + after2 = _infer_level2_greedy(pipe, gen) + if baseline is not None: + assert_chat_decode_unchanged(baseline, _pipeline_resp_to_chat_dict(after2), + label='level2 pipeline 2nd infer after staged wakeup') + + _apply_sleep(pipe, 2) + assert _pipeline_is_sleeping(pipe) is True + _pipeline_wakeup(pipe, ['weights']) + assert _pipeline_is_sleeping(pipe) is True + _level2_reload_weights_if_supported_pipeline(pipe, backend, config, model) + _pipeline_wakeup(pipe, ['kv_cache']) + assert _pipeline_is_sleeping(pipe) is False + + after_full = _infer_level2_greedy(pipe, gen) + if baseline is not None: + assert_chat_decode_unchanged( + baseline, _pipeline_resp_to_chat_dict(after_full), + label='level2 pipeline infer after 2nd sleep cycle (staged wakeup)') + + gen2 = GenerationConfig(max_new_tokens=32, temperature=0.01) + r = pipe([[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}]], gen_config=gen2) + out = r[0] if isinstance(r, list) else r + assert (out.text or '').strip() + finally: + if pipe is not None: + try: + _ensure_awake_pipeline(pipe) + finally: + pipe.close() diff --git a/autotest/interface/restful/test_restful_abort_request.py b/autotest/interface/restful/test_restful_abort_request.py new file mode 100644 index 0000000000..ff3d1b4dd3 --- /dev/null +++ b/autotest/interface/restful/test_restful_abort_request.py @@ -0,0 +1,426 @@ +import json +import random +import threading +import time + +import pytest +import requests +from utils.constant import BACKEND_LIST, DEFAULT_PORT, DEFAULT_SERVER, RESTFUL_MODEL_LIST +from utils.restful_return_check import assert_chat_completions_batch_return + +from lmdeploy.serve.openai.api_client import APIClient + +BASE_URL = f'http://{DEFAULT_SERVER}:{DEFAULT_PORT}' +JSON_HEADERS = {'Content-Type': 'application/json'} +_REQUEST_TIMEOUT = 300 +_ABORT_TIMEOUT = 60 +_SESSION_RETRY = 25 +_SESSION_RETRY_INTERVAL = 0.3 +_NONSTREAM_ABORT_LEAD_S = 2.0 +_THREAD_JOIN_EXTRA_S = 30 
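+# Non-stream worker threads below are joined with a budget of
+# _REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S, so either a successful abort or the
+# request's own HTTP timeout unblocks the join before the test itself hangs.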
+_POST_ABORT_LOGPROBS_NUM = 10 + + +def _post_abort_request(payload: dict) -> requests.Response: + return requests.post( + f'{BASE_URL}/abort_request', + headers=JSON_HEADERS, + json=payload, + timeout=_ABORT_TIMEOUT, + ) + + +def _chat_non_stream( + model_name: str, + session_id: int, + *, + max_tokens: int = 32, + logprobs: bool = False, + top_logprobs: int = _POST_ABORT_LOGPROBS_NUM, +) -> requests.Response: + body: dict = { + 'model': model_name, + 'messages': [{'role': 'user', 'content': 'Say OK in one word.'}], + 'max_tokens': max_tokens, + 'temperature': 0.01, + 'stream': False, + 'session_id': session_id, + } + if logprobs: + body['logprobs'] = True + body['top_logprobs'] = top_logprobs + return requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json=body, + timeout=_REQUEST_TIMEOUT, + ) + + +def _consume_first_nonempty_sse_data_line(resp: requests.Response) -> None: + for raw in resp.iter_lines(decode_unicode=True): + if not raw or not raw.startswith('data:'): + continue + chunk = raw[5:].strip() + if chunk == '[DONE]': + break + if not chunk: + continue + try: + json.loads(chunk) + except json.JSONDecodeError: + continue + return + assert False, 'expected at least one parsable SSE data line before abort' + + +def _post_abort_explicit_session_or_skip(session_id: int) -> None: + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + + +def _post_abort_all_or_skip() -> None: + abort_r = _post_abort_request({'abort_all': True}) + if abort_r.status_code == 501: + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request abort_all: {abort_r.status_code} {abort_r.text!r}' + + +def _assert_session_reusable_after_abort(model_name: str, session_id: int) -> None: + last = None + for _ in range(_SESSION_RETRY): + last = _chat_non_stream( + model_name, + session_id, + max_tokens=16, + logprobs=True, + top_logprobs=_POST_ABORT_LOGPROBS_NUM, + ) + if last.status_code == 200: + data = last.json() + assert 'choices' in data and data['choices'], last.text + assert_chat_completions_batch_return( + data, + model_name, + check_logprobs=True, + logprobs_num=_POST_ABORT_LOGPROBS_NUM, + ) + return + if last.status_code == 400 and 'occupied' in (last.text or '').lower(): + time.sleep(_SESSION_RETRY_INTERVAL) + continue + break + assert False, f'session {session_id} not reusable after abort: last={last.status_code} {last.text!r}' + + +def _long_user_prompt() -> str: + return 'Write a long numbered list from 1 to 500, one number per line, no other text.' 
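+
+
+# The two POST /abort_request payload shapes exercised below (a summary of what
+# this file sends, not a full API description):
+#     {'session_id': <id>, 'abort_all': False}   -> abort one explicit session
+#     {'abort_all': True}                        -> abort every active session
+# A 501 response means the server was started without --enable-abort-handling,
+# in which case these tests skip instead of failing.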
+ + +@pytest.mark.order(9) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('backend', BACKEND_LIST) +@pytest.mark.parametrize('model_case', RESTFUL_MODEL_LIST) +class TestRestfulAbortRequest: + + def test_abort_request_releases_explicit_session_mid_stream(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 8_000_000 + random.randint(0, 99_999) + + stream_payload = { + 'model': model_name, + 'messages': [{'role': 'user', 'content': _long_user_prompt()}], + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + } + resp = requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json=stream_payload, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + + try: + _consume_first_nonempty_sse_data_line(resp) + _post_abort_explicit_session_or_skip(session_id) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_mid_stream_generate(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 7_000_000 + random.randint(0, 99_999) + + stream_payload = { + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + } + resp = requests.post( + f'{BASE_URL}/generate', + headers=JSON_HEADERS, + json=stream_payload, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + + try: + _consume_first_nonempty_sse_data_line(resp) + _post_abort_explicit_session_or_skip(session_id) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_mid_stream_completions(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 6_000_000 + random.randint(0, 99_999) + + stream_payload = { + 'model': model_name, + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + } + resp = requests.post( + f'{BASE_URL}/v1/completions', + headers=JSON_HEADERS, + json=stream_payload, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + + try: + _consume_first_nonempty_sse_data_line(resp) + _post_abort_explicit_session_or_skip(session_id) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_non_stream_chat_thread(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 5_000_000 + random.randint(0, 99_999) + + def worker(out: dict) -> None: + try: + out['resp'] = requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'messages': [{'role': 'user', 'content': _long_user_prompt()}], + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': False, + 'session_id': session_id, + }, + timeout=_REQUEST_TIMEOUT, + ) + except Exception as e: + out['exc'] = e + + holder: dict = {} + t = threading.Thread(target=worker, args=(holder,), daemon=True) + t.start() + time.sleep(_NONSTREAM_ABORT_LEAD_S) + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + pytest.skip('api_server started without 
--enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + assert not t.is_alive(), 'non-stream chat thread should finish after abort' + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_non_stream_generate_thread(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 4_000_000 + random.randint(0, 99_999) + + def worker(out: dict) -> None: + try: + out['resp'] = requests.post( + f'{BASE_URL}/generate', + headers=JSON_HEADERS, + json={ + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': False, + 'session_id': session_id, + }, + timeout=_REQUEST_TIMEOUT, + ) + except Exception as e: + out['exc'] = e + + holder: dict = {} + t = threading.Thread(target=worker, args=(holder,), daemon=True) + t.start() + time.sleep(_NONSTREAM_ABORT_LEAD_S) + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + assert not t.is_alive(), 'non-stream generate thread should finish after abort' + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_abort_request_releases_explicit_session_non_stream_completions_thread(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 3_000_000 + random.randint(0, 99_999) + + def worker(out: dict) -> None: + try: + out['resp'] = requests.post( + f'{BASE_URL}/v1/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': False, + 'session_id': session_id, + }, + timeout=_REQUEST_TIMEOUT, + ) + except Exception as e: + out['exc'] = e + + holder: dict = {} + t = threading.Thread(target=worker, args=(holder,), daemon=True) + t.start() + time.sleep(_NONSTREAM_ABORT_LEAD_S) + abort_r = _post_abort_request({'session_id': session_id, 'abort_all': False}) + if abort_r.status_code == 501: + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + pytest.skip('api_server started without --enable-abort-handling') + assert abort_r.status_code == 200, f'abort_request: {abort_r.status_code} {abort_r.text!r}' + t.join(timeout=_REQUEST_TIMEOUT + _THREAD_JOIN_EXTRA_S) + assert not t.is_alive(), 'non-stream completions thread should finish after abort' + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_streaming_client_close_releases_session_without_abort_request(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 2_000_000 + random.randint(0, 99_999) + + resp = requests.post( + f'{BASE_URL}/v1/chat/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'messages': [{'role': 'user', 'content': _long_user_prompt()}], + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + }, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + try: + _consume_first_nonempty_sse_data_line(resp) + finally: + resp.close() + + 
_assert_session_reusable_after_abort(model_name, session_id) + + def test_streaming_client_close_completions_releases_session(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 1_000_000 + random.randint(0, 99_999) + + resp = requests.post( + f'{BASE_URL}/v1/completions', + headers=JSON_HEADERS, + json={ + 'model': model_name, + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + }, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + try: + _consume_first_nonempty_sse_data_line(resp) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + def test_streaming_client_close_generate_releases_session(self, backend, model_case): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + session_id = 500_000 + random.randint(0, 99_999) + + resp = requests.post( + f'{BASE_URL}/generate', + headers=JSON_HEADERS, + json={ + 'prompt': _long_user_prompt(), + 'max_tokens': 2048, + 'temperature': 0.3, + 'stream': True, + 'session_id': session_id, + }, + stream=True, + timeout=_REQUEST_TIMEOUT, + ) + resp.raise_for_status() + try: + _consume_first_nonempty_sse_data_line(resp) + finally: + resp.close() + + _assert_session_reusable_after_abort(model_name, session_id) + + +@pytest.mark.order(10) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('backend', BACKEND_LIST) +@pytest.mark.parametrize('model_case', RESTFUL_MODEL_LIST) +class TestRestfulAbortRequestAbortAll: + def test_abort_request_abort_all_then_chat_ok(self, backend, model_case): + _post_abort_all_or_skip() + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + last = None + for out in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 'user', 'content': 'Reply with one word: OK'}], + max_tokens=16, + temperature=0.01, + stream=False): + last = out + assert last is not None + assert_chat_completions_batch_return(last, model_name) diff --git a/autotest/interface/restful/test_restful_sleep_wakeup.py b/autotest/interface/restful/test_restful_sleep_wakeup.py new file mode 100644 index 0000000000..0f417a603d --- /dev/null +++ b/autotest/interface/restful/test_restful_sleep_wakeup.py @@ -0,0 +1,432 @@ +import time +from pathlib import Path + +import pytest +import requests +import torch +from utils.constant import ( + DEFAULT_PORT, + DEFAULT_SERVER, + SLEEP_WAKEUP_BACKENDS, + SLEEP_WAKEUP_MODEL_LIST, +) +from utils.restful_return_check import assert_chat_completions_batch_return +from utils.sleep_utils import ( + LEVEL2_BASELINE_RUNS, + LEVEL2_GREEDY_MESSAGES, + LEVEL2_MAX_TOKENS, + apply_serialized_hf_segments_for_level2_weights, + apply_serialized_hf_segments_for_turbomind_level2_weights, + assert_assistant_not_degenerate, + assert_chat_decode_unchanged, + assistant_content_from_openai_completion_dict, + level2_update_weights_request_dict, + resolve_hf_checkpoint_dir, +) + +from lmdeploy.serve.openai.api_client import APIClient + +BASE_URL = f'http://{DEFAULT_SERVER}:{DEFAULT_PORT}' +JSON_HEADERS = {'Content-Type': 'application/json'} +_REQUEST_TIMEOUT = 120 +_UPDATE_WEIGHTS_TIMEOUT = 600 + + +def _assert_status_200(resp: requests.Response) -> None: + assert resp.status_code == 200, f'status={resp.status_code} body={resp.text!r}' + + +def _post_sleep(*, level: int | None = None) -> requests.Response: + url = f'{BASE_URL}/sleep' + if level is not None: + url = 
f'{url}?level={level}' + return requests.post(url, headers=JSON_HEADERS, json={}, timeout=_REQUEST_TIMEOUT) + + +def _post_sleep_level2() -> requests.Response: + return requests.post( + f'{BASE_URL}/sleep', + headers=JSON_HEADERS, + json={}, + params=[('tags', 'weights'), ('tags', 'kv_cache'), ('level', 2)], + timeout=_REQUEST_TIMEOUT, + ) + + +def _post_sleep_query_raw(query: str) -> requests.Response: + q = query.lstrip('?') + url = f'{BASE_URL}/sleep?{q}' if q else f'{BASE_URL}/sleep' + return requests.post(url, headers=JSON_HEADERS, json={}, timeout=_REQUEST_TIMEOUT) + + +def _post_wakeup(*, tags: list[str] | None = None) -> requests.Response: + params = [('tags', t) for t in tags] if tags else None + return requests.post( + f'{BASE_URL}/wakeup', + headers=JSON_HEADERS, + json={}, + params=params, + timeout=_REQUEST_TIMEOUT, + ) + + +def _post_update_weights_from_hf_dir(model_dir: Path, *, engine: str) -> None: + def _emit(serialized_data: object, finished: bool) -> None: + data = level2_update_weights_request_dict(serialized_data, finished) + r = requests.post( + f'{BASE_URL}/update_weights', + headers=JSON_HEADERS, + json=data, + timeout=_UPDATE_WEIGHTS_TIMEOUT, + ) + _assert_status_200(r) + + if engine == 'pytorch': + apply_serialized_hf_segments_for_level2_weights(model_dir, _emit) + elif engine == 'turbomind': + apply_serialized_hf_segments_for_turbomind_level2_weights(model_dir, _emit) + else: + pytest.skip(f'unsupported engine for update_weights: {engine!r}') + + +def _level2_reload_hf_weights(backend: str, config: dict, model_case: str) -> None: + if not torch.cuda.is_available(): + pytest.skip('level-2 reload needs CUDA for serialize_state_dict / weight upload') + model_dir = resolve_hf_checkpoint_dir(config, model_case) + if not model_dir.is_dir(): + pytest.skip(f'HF checkpoint not found for update_weights: {model_dir}') + try: + _post_update_weights_from_hf_dir(model_dir, engine=backend) + except FileNotFoundError as e: + pytest.skip(str(e)) + except RuntimeError as e: + pytest.skip(str(e)) + + +def _fetch_is_sleeping() -> bool: + r = requests.get(f'{BASE_URL}/is_sleeping', timeout=30) + _assert_status_200(r) + return bool(r.json().get('is_sleeping')) + + +def _ensure_awake(max_attempts: int = 8) -> None: + for _ in range(max_attempts): + _assert_status_200(_post_wakeup()) + if not _fetch_is_sleeping(): + return + time.sleep(0.25) + raise AssertionError( + f'engine still is_sleeping=true after {max_attempts} POST /wakeup attempts; ' + f'BASE_URL={BASE_URL!r}') + + +def _chat_completion_collect(api_client: APIClient, model_name: str, **kwargs) -> dict: + kw = dict(kwargs) + kw['stream'] = False + output = None + for output in api_client.chat_completions_v1(model=model_name, **kw): + continue + assert output is not None, 'chat_completions_v1 returned no chunk' + return output + + +def _assert_level2_greedy_baseline_stable(api_client: APIClient, model_name: str, *, label: str) -> dict: + kwargs = dict( + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + ) + refs: list[dict] = [] + contents: list[str] = [] + for i in range(LEVEL2_BASELINE_RUNS): + out = _chat_completion_collect(api_client, model_name, **kwargs) + assert_chat_completions_batch_return(out, model_name) + text = assistant_content_from_openai_completion_dict(out) + assert_assistant_not_degenerate(text, label=f'{label} baseline run {i + 1}') + refs.append(out) + contents.append(text) + assert len(set(contents)) == 1, ( + f'{label}: greedy REST baseline not 
stable (fix prompt/model for this case):\n' + + '\n'.join(f' run{j + 1}={c!r}' for j, c in enumerate(contents))) + return refs[0] + + +def _should_enforce_level2_greedy_checks(backend: str) -> bool: + # Known issue: TurboMind may produce non-stable outputs even in + # temperature=0 greedy-style requests. Keep the staged wakeup / reload + # flow coverage, but skip strict determinism assertions for this backend. + return backend != 'turbomind' + + +@pytest.mark.order(8) +@pytest.mark.flaky(reruns=2) +@pytest.mark.parametrize('backend', SLEEP_WAKEUP_BACKENDS) +@pytest.mark.parametrize('model_case', SLEEP_WAKEUP_MODEL_LIST) +class TestRestfulSleepWakeup: + + def test_sleep_wakeup_is_sleeping_roundtrip(self, model_case, backend): + try: + _ensure_awake() + r_sleep = _post_sleep() + _assert_status_200(r_sleep) + + assert _fetch_is_sleeping() is True + + r_wake = _post_wakeup() + _assert_status_200(r_wake) + + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_sleep_with_level_query_wakeup_and_chat(self, model_case, backend): + try: + _ensure_awake() + r_sleep = _post_sleep(level=1) + _assert_status_200(r_sleep) + + assert _fetch_is_sleeping() is True + + r_wake = _post_wakeup() + _assert_status_200(r_wake) + assert _fetch_is_sleeping() is False + + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + output = None + for output in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}], + max_tokens=32, + temperature=0.01): + continue + assert output is not None + assert_chat_completions_batch_return(output, model_name) + finally: + _ensure_awake() + + def test_sleep_partial_wakeup_with_tags(self, model_case, backend): + try: + _ensure_awake() + r_sleep = _post_sleep(level=1) + _assert_status_200(r_sleep) + assert _fetch_is_sleeping() is True + + r_w = _post_wakeup(tags=['weights']) + _assert_status_200(r_w) + assert _fetch_is_sleeping() is True + + r_kv = _post_wakeup(tags=['kv_cache']) + _assert_status_200(r_kv) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_unknown_tags_is_noop_then_full_wakeup(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['not_a_valid_tag'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_mixed_valid_and_invalid_tags_entire_call_noop(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights', 'not_a_valid_tag'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['not_a_valid_tag', 'weights'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_both_valid_tags_in_one_request(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights', 'kv_cache'])) + assert _fetch_is_sleeping() is False + + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + output = None + for output in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 
'user', 'content': 'Hi, reply with one short sentence.'}], + max_tokens=32, + temperature=0.01): + continue + assert output is not None + assert_chat_completions_batch_return(output, model_name) + finally: + _ensure_awake() + + def test_wakeup_redundant_tag_after_partial_wake_is_noop(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['kv_cache'])) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_wakeup_empty_string_tag_is_noop_when_sleeping(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + + r = requests.post( + f'{BASE_URL}/wakeup', + headers=JSON_HEADERS, + json={}, + params=[('tags', '')], + timeout=_REQUEST_TIMEOUT, + ) + _assert_status_200(r) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_full_wakeup_when_already_awake(self, model_case, backend): + try: + _ensure_awake() + assert _fetch_is_sleeping() is False + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_sleep_second_call_while_sleeping_still_ok(self, model_case, backend): + try: + _ensure_awake() + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + _assert_status_200(_post_sleep(level=1)) + assert _fetch_is_sleeping() is True + _assert_status_200(_post_wakeup()) + assert _fetch_is_sleeping() is False + finally: + _ensure_awake() + + def test_sleep_non_integer_level_is_http_error(self, model_case, backend): + try: + _ensure_awake() + resp = _post_sleep_query_raw('level=not_an_int') + assert resp.status_code != 200, f'expected non-200, got {resp.status_code} body={resp.text!r}' + finally: + _ensure_awake() + + def test_sleep_level_2_full_wakeup_and_chat(self, model_case, backend, config): + try: + _ensure_awake() + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + + baseline = None + if _should_enforce_level2_greedy_checks(backend): + baseline = _assert_level2_greedy_baseline_stable( + api_client, model_name, label='level2 REST') + + _assert_status_200(_post_sleep_level2()) + assert _fetch_is_sleeping() is True + + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + _level2_reload_hf_weights(backend, config, model_case) + + _assert_status_200(_post_wakeup(tags=['kv_cache'])) + assert _fetch_is_sleeping() is False + + after = _chat_completion_collect( + api_client, + model_name, + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + ) + assert_chat_completions_batch_return(after, model_name) + assert_assistant_not_degenerate( + assistant_content_from_openai_completion_dict(after), + label='level2 REST after staged wakeup (1st chat)') + if baseline is not None: + assert_chat_decode_unchanged(baseline, after, label='level2 REST 1st infer after staged wakeup') + + after2 = _chat_completion_collect( + api_client, + model_name, + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, 
+ top_p=1.0, + top_k=1, + ) + assert_chat_completions_batch_return(after2, model_name) + if baseline is not None: + assert_chat_decode_unchanged(baseline, after2, label='level2 REST 2nd infer after staged wakeup') + + _assert_status_200(_post_sleep_level2()) + assert _fetch_is_sleeping() is True + _assert_status_200(_post_wakeup(tags=['weights'])) + assert _fetch_is_sleeping() is True + _level2_reload_hf_weights(backend, config, model_case) + _assert_status_200(_post_wakeup(tags=['kv_cache'])) + assert _fetch_is_sleeping() is False + + after_full = _chat_completion_collect( + api_client, + model_name, + messages=LEVEL2_GREEDY_MESSAGES, + max_tokens=LEVEL2_MAX_TOKENS, + temperature=0.0, + top_p=1.0, + top_k=1, + ) + assert_chat_completions_batch_return(after_full, model_name) + label2 = 'level2 REST infer after 2nd sleep cycle (staged wakeup)' + if baseline is not None: + assert_chat_decode_unchanged(baseline, after_full, label=label2) + + output = None + for output in api_client.chat_completions_v1( + model=model_name, + messages=[{'role': 'user', 'content': 'Hi, reply with one short sentence.'}], + max_tokens=32, + temperature=0.01): + continue + assert output is not None + assert_chat_completions_batch_return(output, model_name) + finally: + _ensure_awake() diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 2ac134f440..88c9c2c2d2 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -1,12 +1,14 @@ import json +import os +from typing import Any -import fire -import numpy as np -from PIL import Image +import fire # noqa: E402 +import numpy as np # noqa: E402 +from PIL import Image # noqa: E402 -from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline -from lmdeploy.vl import encode_image_base64, load_image -from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline # noqa: E402 +from lmdeploy.vl import encode_image_base64, load_image, load_video # noqa: E402 +from lmdeploy.vl.constants import IMAGE_TOKEN # noqa: E402 gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) @@ -18,6 +20,35 @@ PIC_PANDA = 'panda.jpg' DESC = 'What are the similarities and differences between these two images.' DESC_ZH = '两张图有什么相同和不同的地方.' +_MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{ + "question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?", + "options": [ + "A. 4.", + "B. 9.", + "C. 5.", + "D. 13." + ] +}""" +MM_DEMO_TOMB_USER_PROMPT = ( + 'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). ' + 'Watch the entire video, pick the best option from what you see, then reply briefly with the letter ' + '(A, B, C, or D) first and at most one short sentence. 
Do not output long step-by-step reasoning; ' + 'keep the final reply concise.\n\n' + _MM_DEMO_TOMB_MCQ_JSON_BLOCK) + +DEFAULT_VIDEO_FILENAME = 'red-panda.mp4' +VIDEO_QWEN3_DEMO_FILENAME = 'N1cdUjctpG8.mp4' + + +def _numpy_video_to_pil_list(frames: np.ndarray) -> list[Image.Image]: + images: list[Image.Image] = [] + for i in range(int(frames.shape[0])): + images.append(Image.fromarray(frames[i].astype('uint8')).convert('RGB')) + return images + + +def load_video_sampled_pil(video_path: str, num_frames: int, **kwargs: Any) -> tuple[list[Image.Image], dict[str, Any]]: + frames, meta = load_video(video_path, num_frames=num_frames, **kwargs) + return _numpy_video_to_pil_list(frames), meta def run_pipeline_mllm_test(model_path, run_config, resource_path, is_pr_test: bool = False): @@ -169,44 +200,9 @@ def internvl_vl_testcase(pipe, resource_path, lang='en'): print(f'[caseresult internvl-separate-images2-{lang} start]' + json.dumps(response.text, ensure_ascii=False) + f'[caseresult internvl-separate-images2-{lang} end]\n') - # video multi-round conversation - def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): - if bound: - start, end = bound[0], bound[1] - else: - start, end = -100000, 100000 - start_idx = max(first_idx, round(start * fps)) - end_idx = min(round(end * fps), max_frame) - seg_size = float(end_idx - start_idx) / num_segments - frame_indices = np.array( - [int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]) - return frame_indices - - def load_video(video_path, bound=None, num_segments=32): - import cv2 - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise ValueError(f'Cannot open video file: {video_path}') - - max_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1 - fps = cap.get(cv2.CAP_PROP_FPS) - - frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) - imgs = [] - - for frame_index in frame_indices: - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index) - ret, frame = cap.read() - if ret: - rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - img = Image.fromarray(rgb_frame).convert('RGB') - imgs.append(img) - - cap.release() - return imgs - - video_path = resource_path + '/red-panda.mp4' - imgs = load_video(video_path, num_segments=8) + # video multi-round conversation (uniform ``num_frames`` via lmdeploy.vl.load_video) + video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}' + imgs, _ = load_video_sampled_pil(video_path, num_frames=8) question = '' for i in range(len(imgs)): @@ -287,43 +283,11 @@ def MiniCPM_vl_testcase(pipe, resource_path): print('[caseresult minicpm-fewshot start]' + json.dumps(response.text, ensure_ascii=False) + '[caseresult minicpm-fewshot end]\n') - # Chat with video - MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number - - def encode_video(video_path): - - def uniform_sample(length, n): - gap = len(length) / n - idxs = [int(i * gap + gap / 2) for i in range(n)] - return [length[i] for i in idxs] - - import cv2 - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise ValueError(f'Cannot open video file: {video_path}') - - fps = cap.get(cv2.CAP_PROP_FPS) - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - - sample_fps = round(fps / 1) # FPS - frame_idx = [i for i in range(0, total_frames, sample_fps)] - if len(frame_idx) > MAX_NUM_FRAMES: - frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) - - frames = [] - for idx in frame_idx: - cap.set(cv2.CAP_PROP_POS_FRAMES, idx) - ret, frame = cap.read() - if ret: - rgb_frame = 
cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frames.append(Image.fromarray(rgb_frame.astype('uint8')).convert('RGB')) - - cap.release() - print('num frames:', len(frames)) - return frames - - video_path = resource_path + '/red-panda.mp4' - frames = encode_video(video_path) + # Chat with video (fixed frame budget; same decoder as REST ``video_url``) + max_video_frames = 32 + video_path = f'{resource_path}/{DEFAULT_VIDEO_FILENAME}' + frames, video_meta = load_video_sampled_pil(video_path, num_frames=max_video_frames) + print('num frames:', len(frames), 'meta:', video_meta.get('frames_indices')) question = 'What animals are in the video, and what are they doing?' content = [dict(type='text', text=question)] @@ -386,6 +350,97 @@ def Qwen_vl_testcase(pipe, resource_path): print('[caseresult qwen-performance-images2 start]' + json.dumps(response.text, ensure_ascii=False) + '[caseresult qwen-performance-images2 end]\n') + # Qwen2.5/3-VL: native ``video`` + same knobs as REST ``extra_body`` (top_k / mm_processor_kwargs). + demo_path = os.path.join(resource_path, VIDEO_QWEN3_DEMO_FILENAME) + if not os.path.isfile(demo_path): + print('[caseresult qwen3-demo-video start]' + + json.dumps('SKIPPED_NO_DEMO_MP4', ensure_ascii=False) + '[caseresult qwen3-demo-video end]\n') + else: + try: + frames, vmeta = load_video(demo_path, num_frames=16, fps=2) + demo_q = MM_DEMO_TOMB_USER_PROMPT + vmsg = [{ + 'role': + 'user', + 'content': [ + { + 'type': 'video', + 'data': frames, + 'video_metadata': vmeta, + }, + { + 'type': 'text', + 'text': demo_q, + }, + ], + }] + mm_gen_config = GenerationConfig( + max_new_tokens=24576, + min_new_tokens=10, + top_k=20, + temperature=0.3, + top_p=0.95, + ) + response = pipe( + vmsg, + gen_config=mm_gen_config, + log_level='INFO', + max_log_len=10, + mm_processor_kwargs={ + 'fps': 2, + 'do_sample_frames': True, + }, + ) + print('[caseresult qwen3-demo-video start]' + json.dumps(response.text, ensure_ascii=False) + + '[caseresult qwen3-demo-video end]\n') + except Exception as exc: + err = json.dumps(f'PIPELINE_VIDEO_ERROR:{exc!s}', ensure_ascii=False) + print('[caseresult qwen3-demo-video start]' + err + '[caseresult qwen3-demo-video end]\n') + + rp_video = os.path.join(resource_path, DEFAULT_VIDEO_FILENAME) + if not os.path.isfile(rp_video): + print('[caseresult qwen-mixed-image-text-video start]' + + json.dumps('SKIPPED_NO_RED_PANDA_MP4', ensure_ascii=False) + + '[caseresult qwen-mixed-image-text-video end]\n') + else: + try: + frames_pil, _vmeta_m = load_video_sampled_pil(rp_video, num_frames=6, fps=1) + mixed_content = [ + { + 'type': + 'text', + 'text': ( + 'You are given one still image, then several frames from a short video in order. 
' + 'In 2-4 sentences: name one thing in the still image, and what animal or activity ' + 'you see in the video frames.'), + }, + { + 'type': 'image_url', + 'image_url': { + 'url': f'{resource_path}/{PIC1}', + }, + }, + ] + for frame in frames_pil: + mixed_content.append( + dict( + type='image_url', + image_url=dict(url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'), + )) + mixed_msg = [{'role': 'user', 'content': mixed_content}] + response = pipe( + mixed_msg, + gen_config=gen_config, + log_level='INFO', + max_log_len=10, + ) + print('[caseresult qwen-mixed-image-text-video start]' + + json.dumps(response.text, ensure_ascii=False) + '[caseresult qwen-mixed-image-text-video end]\n') + except Exception as exc: + err = json.dumps(f'PIPELINE_MIXED_MM_ERROR:{exc!s}', ensure_ascii=False) + print('[caseresult qwen-mixed-image-text-video start]' + err + + '[caseresult qwen-mixed-image-text-video end]\n') + if __name__ == '__main__': fire.Fire() diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index bc3ebb0ad5..85a2ea9d84 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -201,6 +201,14 @@ } } +SLEEP_WAKEUP_MODEL_LIST = [ + 'Qwen/Qwen3.5-35B-A3B', + 'Qwen/Qwen3.5-35B-A3B-FP8', + 'Qwen/Qwen3.5-122B-A10B', +] + +SLEEP_WAKEUP_BACKENDS = ['pytorch', 'turbomind'] + BACKEND_LIST = ['turbomind', 'pytorch'] RESTFUL_MODEL_LIST_LATEST = [ @@ -260,3 +268,19 @@ 'cache-max-entry-count': 0.7 } } + +# Qwen3-VL tomb demo (REST ``mm_processor`` + pipeline video): MCQ JSON without a labelled answer field. +MM_DEMO_TOMB_MCQ_JSON_BLOCK = """{ + "question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?", + "options": [ + "A. 4.", + "B. 9.", + "C. 5.", + "D. 13." + ] +}""" +MM_DEMO_TOMB_USER_PROMPT = ( + 'You are given a multiple-choice problem as JSON (question and options only; there is no answer field). ' + 'Watch the entire video, pick the best option from what you see, then reply briefly with the letter ' + '(A, B, C, or D) first and at most one short sentence. 
Do not output long step-by-step reasoning; ' + 'keep the final reply concise.\n\n' + MM_DEMO_TOMB_MCQ_JSON_BLOCK) diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 1240227af4..470dca1f3d 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -8,6 +8,7 @@ from utils.common_utils import execute_command_with_logging from utils.config_utils import get_case_str_by_config, get_cuda_prefix_by_workerid, get_workerid, resolve_extra_params from utils.rule_condition_assert import assert_result +from utils.run_restful_chat import _mm_demo_thinking_wrapper_shape_assert, _mm_demo_tomb_answer_assert def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str = '', is_smoke: bool = False): @@ -337,6 +338,51 @@ def Qwen_vl_testcase(output_text, file): file.writelines(f'qwen-performance-images2 result: {case_result}, reason: buildings should in {response} \n') with assume: assert case_result, f'reason: performance images2: buildings should in {response}' + with allure.step('qwen3-demo-video'): + response = get_response_from_output(output_text, 'qwen3-demo-video') + rl = response.lower() + if 'skipped_no_demo_mp4' in rl: + file.writelines('qwen3-demo-video result: skipped (N1cdUjctpG8.mp4 not in resource_path)\n') + elif 'pipeline_video_error:' in rl: + file.writelines(f'qwen3-demo-video result: false, pipeline video error in {response} \n') + with assume: + assert False, f'qwen3-demo-video pipeline error: {response}' + else: + tomb_assert = _mm_demo_tomb_answer_assert(response) + shape_assert = _mm_demo_thinking_wrapper_shape_assert(response) + case_result = tomb_assert and shape_assert + reason = 'tomb/jar + bounded public tail' + file.writelines(f'qwen3-demo-video result: {case_result}, reason: {reason}: {response} \n') + with assume: + msg = 'reason: qwen3 demo video: expected tomb/jar-related bounded answer' + assert case_result, f'{msg}: {response}' + if '[caseresult qwen-mixed-image-text-video start]' in output_text: + with allure.step('qwen-mixed-image-text-video'): + response = get_response_from_output(output_text, 'qwen-mixed-image-text-video') + rl = response.lower() + if 'skipped_no_red_panda_mp4' in rl: + file.writelines( + 'qwen-mixed-image-text-video result: skipped (red-panda.mp4 not in resource_path)\n') + elif 'pipeline_mixed_mm_error:' in rl: + file.writelines(f'qwen-mixed-image-text-video result: false, mixed mm error in {response} \n') + with assume: + assert False, f'qwen-mixed-image-text-video pipeline error: {response}' + else: + img = ( + any(w in rl for w in ('tiger', 'ski')) + or '虎' in response + or '滑雪' in response + ) + vid = ( + any(w in rl for w in ('panda', 'red panda', 'lesser panda', 'ailurus')) + or any(w in response for w in ('小熊猫', '红熊猫')) + ) + case_result = bool(response.strip()) and img and vid + file.writelines( + f'qwen-mixed-image-text-video result: {case_result}, reason: image+tiger + video+panda cues\n') + with assume: + msg = 'reason: mixed image+video reply should mention tiger/ski and panda' + assert case_result, f'{msg}: {response}' def save_pipeline_common_log(config, log_name, result, content, msg: str = '', write_type: str = 'w'): diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py index b425b809da..537e5acac6 100644 --- a/autotest/utils/restful_return_check.py +++ b/autotest/utils/restful_return_check.py @@ -14,8 +14,15 @@ def assert_chat_completions_batch_return(output, model_name, check_logprobs: boo assert 
len(message.get('message').get('content')) > 0
        assert message.get('message').get('role') == 'assistant'
        if check_logprobs:
-            len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
-            for logprob in message.get('logprobs').get('content'):
+            lp = message.get('logprobs')
+            assert lp is not None, output
+            content_lp = lp.get('content')
+            assert content_lp is not None, output
+            n_tok = output.get('usage', {}).get('completion_tokens')
+            assert len(content_lp) == n_tok, (
+                f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
+            )
+            for logprob in content_lp:
                 assert_logprobs(logprob, logprobs_num)
@@ -31,8 +38,15 @@ def assert_completions_batch_return(output, model_name, check_logprobs: bool = F
         assert message.get('index') == 0
         assert len(message.get('text')) > 0
         if check_logprobs:
-            len(message.get('logprobs').get('content')) == output.get('usage').get('completion_tokens')
-            for logprob in message.get('logprobs').get('content'):
+            lp = message.get('logprobs')
+            assert lp is not None, output
+            content_lp = lp.get('content')
+            assert content_lp is not None, output
+            n_tok = output.get('usage', {}).get('completion_tokens')
+            assert len(content_lp) == n_tok, (
+                f'logprobs.content len {len(content_lp)} != completion_tokens {n_tok!r}'
+            )
+            for logprob in content_lp:
                 assert_logprobs(logprob, logprobs_num)
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py
index 6e186c9328..de1e6d2689 100644
--- a/autotest/utils/run_restful_chat.py
+++ b/autotest/utils/run_restful_chat.py
@@ -1,12 +1,13 @@
 import json
 import os
+import re
 import subprocess
 import time
 
 import allure
 import psutil
 import requests
-from openai import OpenAI
+from openai import APIStatusError, BadRequestError, OpenAI
 from pytest_assume.plugin import assume
 from utils.config_utils import (
     get_case_str_by_config,
@@ -15,7 +16,7 @@
     get_workerid,
     resolve_extra_params,
 )
-from utils.constant import DEFAULT_PORT, DEFAULT_SERVER
+from utils.constant import DEFAULT_PORT, DEFAULT_SERVER, MM_DEMO_TOMB_USER_PROMPT
 from utils.restful_return_check import assert_chat_completions_batch_return
 from utils.rule_condition_assert import assert_result
@@ -244,6 +245,143 @@ def _run_logprobs_test(port: int = DEFAULT_PORT):
 PIC = 'tiger.jpeg'  # noqa E501
 PIC2 = 'human-pose.jpg'  # noqa E501
+VIDEO = 'red-panda.mp4'  # noqa E501
+VIDEO_QWEN3_DEMO = 'N1cdUjctpG8.mp4'  # noqa E501
+MM_DEMO_MAX_TOKENS = 24576
+MM_DEMO_MAX_TOKENS_STREAM = 24576
+VIDEO_SINGLE_FRAME_MAX_TOKENS = 512
+VIDEO_REDPANDA_STREAM_MAX_TOKENS = 2048
+
+
+def _vl_video_stream_finish_assert(finish: str | None, text: str) -> bool:
+    """Red-panda video must finish with ``stop`` or ``length`` and name the
+    species; a ``length`` finish additionally needs enough text."""
+    if finish not in ('stop', 'length'):
+        return False
+    t = (text or '').lower()
+    raw = text or ''
+    species_match = (
+        any(p in t for p in ('red panda', 'lesser panda'))
+        or 'ailurus' in t
+        or any(s in raw for s in ('小熊猫', '红熊猫'))
+    )
+    if not species_match:
+        return False
+    if finish == 'length':
+        return len(raw.strip()) >= 300
+    return True
+
+
+def _vl_openai_http_error_skippable(exc: BaseException) -> bool:
+    if isinstance(exc, BadRequestError):
+        return True
+    if isinstance(exc, APIStatusError):
+        code = getattr(exc, 'status_code', None)
+        return isinstance(code, int) and code < 500
+    return False
+
+
+# Closing tag of the model's thinking wrapper. '</think>' is assumed here
+# (Qwen-style reasoning block); adjust if the served model uses another tag.
+_REDACTED_THINKING_END = '</think>'
+
+
+def _mm_demo_public_answer_text(text: str) -> str:
+    """Optional JSON-string decode (pipeline logs); then tail after
+    ``</think>`` when present."""
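+    # Illustrative example (assuming the '</think>' wrapper above): a pipeline
+    # log line '"<think>why B</think>B. 9."' JSON-decodes to
+    # '<think>why B</think>B. 9.', and the visible tail returned here is 'B. 9.'.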
+
+
+_REDACTED_THINKING_END = '</think>'
+
+
+def _mm_demo_public_answer_text(text: str) -> str:
+    """Optionally decode a JSON string (pipeline logs), then return the tail
+    after ``</think>`` when present."""
+    s = (text or '').strip()
+    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
+        try:
+            s = str(json.loads(s))
+        except (json.JSONDecodeError, TypeError, ValueError):
+            pass
+    s = s.strip()
+    key = _REDACTED_THINKING_END
+    i = s.lower().rfind(key.lower())
+    if i == -1:
+        return s
+    return s[i + len(key):].strip()
+
+
+def _mm_demo_tomb_answer_assert(text: str) -> bool:
+    """Tomb/MCQ: the visible tail mentions the scene, a digit, or an
+    MCQ-style letter (A–D)."""
+    raw = _mm_demo_public_answer_text(text).strip()
+    if not raw:
+        return False
+    rl = raw.lower()
+    if any(w in rl for w in ('jar', 'porcelain', 'tomb', 'niche', 'chamber', '罐', '瓷', '墓', '龛')):
+        return True
+    if any(c.isdigit() for c in raw):
+        return True
+    s = raw.strip()
+    if re.search(r'(?i)\b(?:answer|choice|option|correct)\b\s*[::]?\s*[abcd]\b', s):
+        return True
+    if re.fullmatch(r'(?is)[`"\(\[]*[abcd][`"\)\]]*\.?\s*', s):
+        return True
+    if len(s) <= 120 and re.match(r'(?is)[`"\(\[]*[abcd][`"\)\]]*[\s\.\):,\-]', s):
+        return True
+    return False
+
+
+def _mm_demo_thinking_wrapper_shape_assert(text: str) -> bool:
+    """Bound the user-visible tail after ``</think>``, or the total size if
+    the wrapper never closes."""
+    s = (text or '').strip()
+    if not s:
+        return False
+    if _REDACTED_THINKING_END.lower() in s.lower():
+        public = _mm_demo_public_answer_text(s).strip()
+        return 0 < len(public) <= 2000
+    return len(s) <= 3200
+
+
+def _mm_demo_tomb_run_assert(finish: str | None, text: str) -> bool:
+    """Tomb + ``mm_processor`` runs: ``stop`` needs the thinking-wrapper
+    shape; ``length`` needs a closed thinking wrapper plus shape, or a long
+    jar/scene tail."""
+    t = (text or '').strip()
+    if not t or not _mm_demo_tomb_answer_assert(t):
+        return False
+    if finish == 'stop':
+        return _mm_demo_thinking_wrapper_shape_assert(t)
+    if finish == 'length':
+        if _REDACTED_THINKING_END.lower() in t.lower():
+            return _mm_demo_thinking_wrapper_shape_assert(t)
+        if len(t) < 1500:
+            return False
+        head_l = t[:8000].lower()
+        if 'jar' not in head_l:
+            return False
+        return any(w in head_l for w in ('niche', 'chamber', 'tomb', 'porcelain', 'primary', '罐', '墓', '龛', '瓷'))
+    return False
+
+
+def _mm_demo_single_frame_scene_assert(text: str) -> bool:
+    """Single-frame: a short visible tail plus chamber / niche / vessel hints."""
+    raw = _mm_demo_public_answer_text(text).strip()
+    if not raw or len(raw) < 20:
+        return False
+    if sum(1 for c in raw if c.isalpha()) < 12:
+        return False
+    rl = raw.lower()
+    if any(w in rl for w in ('chamber', 'niche', 'jar', 'porcelain', 'artifact', 'coffin', '墓室', '龛', '罐')):
+        return True
+    return False
+
+
+def _consume_chat_completion_stream(stream_iter) -> tuple[str | None, str]:
+    """Drain a chat-completion stream: ``(finish_reason, joined delta content)``."""
+    chunks: list[str] = []
+    last_fr: str | None = None
+    for ev in stream_iter:
+        if not getattr(ev, 'choices', None):
+            continue
+        choice = ev.choices[0]
+        fr = getattr(choice, 'finish_reason', None)
+        if fr:
+            last_fr = fr
+        delta = getattr(choice, 'delta', None)
+        if delta and getattr(delta, 'content', None):
+            chunks.append(delta.content)
+    return last_fr, ''.join(chunks)
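+
+
+# Editor's sketch: how the </think> helpers above compose on a hypothetical
+# doubly-encoded answer, as it may appear in pipeline logs. Not part of the
+# test flow.
+def _sketch_public_answer_examples() -> None:
+    wrapped = '"<think>hidden reasoning about the frames</think> B. porcelain jars"'
+    assert _mm_demo_public_answer_text(wrapped) == 'B. porcelain jars'
+    assert _mm_demo_tomb_answer_assert(wrapped)
+    assert _mm_demo_thinking_wrapper_shape_assert(wrapped)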
 
 
 def run_vl_testcase(log_path, resource_path, port: int = DEFAULT_PORT):
@@ -289,6 +427,355 @@ def run_vl_testcase(log_path, resource_path, port: int = DEFAULT_PORT):
     for item in api_client.chat_completions_v1(model=model_name, messages=prompt_messages):
         continue
     file.writelines(str(item) + '\n')
+
+    video_path = os.path.join(resource_path, VIDEO)
+    video_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': ('What animal appears in the clip? Give the common species name in one or two '
+                         'short sentences (avoid long step-by-step reasoning).'),
+            },
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_path,
+                },
+            },
+        ],
+    }]
+    video_messages_one_frame = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'text',
+                'text': ('The server decodes this clip to a single video frame only. What animal appears? '
+                         'Answer in one or two short sentences.'),
+            },
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_path,
+                },
+            },
+        ],
+    }]
+
+    if not os.path.isfile(video_path):
+        file.writelines(f'[video testcase skipped] missing file: {video_path}\n')
+    else:
+        try:
+            v_resp = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.2,
+                max_tokens=512,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 8,
+                        },
+                    },
+                },
+            )
+        except (BadRequestError, APIStatusError) as exc:
+            if not _vl_openai_http_error_skippable(exc):
+                raise
+            file.writelines(f'[video testcase skipped] model/server rejected video_url: {exc!r}\n')
+        else:
+            file.writelines('[video non-stream] ' + str(v_resp).lower() + '\n')
+            content = (v_resp.choices[0].message.content or '')
+            assert _vl_video_stream_finish_assert(getattr(v_resp.choices[0], 'finish_reason', None), content), v_resp
+
+            v_more = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.0,
+                max_tokens=1,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 16,
+                        },
+                    },
+                },
+            )
+            v_few = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.0,
+                max_tokens=1,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 4,
+                        },
+                    },
+                },
+            )
+            u_more = getattr(v_more, 'usage', None)
+            u_few = getattr(v_few, 'usage', None)
+            if u_more and u_few and getattr(u_few, 'prompt_tokens', None) and getattr(u_more, 'prompt_tokens', None):
+                if u_few.prompt_tokens < u_more.prompt_tokens:
+                    file.writelines('[video] fewer frames => fewer prompt_tokens (as expected)\n')
+                else:
+                    few_t, many_t = u_few.prompt_tokens, u_more.prompt_tokens
+                    file.writelines(
+                        f'[video] prompt_tokens not compared (few={few_t}, many={many_t})\n',
+                    )
+
+            stream = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages,
+                temperature=0.2,
+                max_tokens=VIDEO_REDPANDA_STREAM_MAX_TOKENS,
+                stream=True,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 8,
+                        },
+                    },
+                },
+            )
+            stream_fr, joined = _consume_chat_completion_stream(stream)
+            file.writelines('[video stream] ' + joined.lower() + '\n')
+            assert _vl_video_stream_finish_assert(stream_fr, joined), (stream_fr, joined[:1200])
+
+            video_payload = {
+                'model': model_name,
+                'messages': video_messages,
+                'temperature': 0.2,
+                'max_tokens': VIDEO_REDPANDA_STREAM_MAX_TOKENS,
+                'media_io_kwargs': {
+                    'video': {
+                        'num_frames': 8,
+                    },
+                },
+            }
+            raw = requests.post(f'{http_url}/v1/chat/completions',
+                                headers={'content-type': 'application/json'},
+                                json=video_payload,
+                                timeout=600)
+            file.writelines(f'[video raw http] status={raw.status_code}\n')
+            assert raw.ok, raw.text
+            raw_json = raw.json()
+            raw_ch0 = (raw_json.get('choices') or [{}])[0]
+            raw_text = raw_ch0.get('message', {}).get('content') or ''
+            file.writelines(raw_text.lower() + '\n')
+            raw_fr = raw_ch0.get('finish_reason')
+            assert _vl_video_stream_finish_assert(raw_fr, raw_text), (raw_fr, raw_text[:1200], raw_json)
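+            # Editor's note: with the OpenAI SDK, lmdeploy-specific fields such
+            # as media_io_kwargs must ride in extra_body, while in the raw HTTP
+            # payload above they sit at the top level of the JSON body. Record
+            # the fields the assertion just consumed:
+            file.writelines(f'[video raw http] finish={raw_fr!r} content_len={len(raw_text)}\n')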
+
+            v_one = client.chat.completions.create(
+                model=model_name,
+                messages=video_messages_one_frame,
+                temperature=0.2,
+                max_tokens=VIDEO_SINGLE_FRAME_MAX_TOKENS,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 1,
+                        },
+                    },
+                },
+            )
+            file.writelines('[video single-frame] ' + str(v_one).lower() + '\n')
+            one_content = (v_one.choices[0].message.content or '')
+            assert _vl_video_stream_finish_assert(getattr(v_one.choices[0], 'finish_reason', None), one_content), v_one
+
+    # Qwen3-VL style: local demo mp4 + mm_processor_kwargs (fps / do_sample_frames), OpenAI-compatible body.
+    demo_video_path = os.path.join(resource_path, VIDEO_QWEN3_DEMO)
+    demo_question = MM_DEMO_TOMB_USER_PROMPT
+    # Single-frame sampling often lands on an aerial or intro shot, not the jar niche scene.
+    mm_one_question = (
+        'This is one frame from a short news-style clip about an ancient tomb. '
+        'If you see interior details, focus on chamber, niches, pottery or porcelain jars, '
+        'coffin, or furnishings, in one or two short sentences. '
+        'If the frame is only exterior or aerial, say that in one short sentence. '
+        'No long step-by-step reasoning.')
+    mm_demo_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': demo_video_path,
+                },
+            },
+            {
+                'type': 'text',
+                'text': demo_question,
+            },
+        ],
+    }]
+    mm_one_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': demo_video_path,
+                },
+            },
+            {
+                'type': 'text',
+                'text': mm_one_question,
+            },
+        ],
+    }]
+    if not os.path.isfile(demo_video_path):
+        file.writelines(f'[video mm_processor demo skipped] missing file: {demo_video_path}\n')
+    else:
+        try:
+            mm_resp = client.chat.completions.create(
+                model=model_name,
+                messages=mm_demo_messages,
+                max_tokens=MM_DEMO_MAX_TOKENS,
+                temperature=0.3,
+                top_p=0.95,
+                extra_body={
+                    'top_k': 20,
+                    'mm_processor_kwargs': {
+                        'fps': 2,
+                        'do_sample_frames': True,
+                    },
+                },
+            )
+        except (BadRequestError, APIStatusError) as exc:
+            if not _vl_openai_http_error_skippable(exc):
+                raise
+            file.writelines(f'[video mm_processor demo skipped] {exc!r}\n')
+        else:
+            file.writelines('[video mm_processor non-stream] ' + str(mm_resp).lower() + '\n')
+            mm_text = (mm_resp.choices[0].message.content or '').strip()
+            mm_fr = getattr(mm_resp.choices[0], 'finish_reason', None)
+            assert _mm_demo_tomb_run_assert(mm_fr, mm_text), (mm_fr, mm_text[:2000])
+
+            mm_stream = client.chat.completions.create(
+                model=model_name,
+                messages=mm_demo_messages,
+                max_tokens=MM_DEMO_MAX_TOKENS_STREAM,
+                temperature=0.2,
+                stream=True,
+                extra_body={
+                    'top_k': 20,
+                    'mm_processor_kwargs': {
+                        'fps': 2,
+                        'do_sample_frames': True,
+                    },
+                },
+            )
+            mm_finish, mm_joined = _consume_chat_completion_stream(mm_stream)
+            mm_joined = mm_joined.strip()
+            file.writelines('[video mm_processor stream] ' + mm_joined.lower() + '\n')
+            assert _mm_demo_tomb_run_assert(mm_finish, mm_joined), (mm_finish, mm_joined[:2000])
+
+            mm_raw_payload = {
+                'model': model_name,
+                'messages': mm_demo_messages,
+                'temperature': 0.3,
+                'max_tokens': MM_DEMO_MAX_TOKENS,
+                'top_k': 20,
+                'mm_processor_kwargs': {
+                    'fps': 2,
+                    'do_sample_frames': True,
+                },
+            }
+            mm_raw = requests.post(f'{http_url}/v1/chat/completions',
+                                   headers={'content-type': 'application/json'},
+                                   json=mm_raw_payload,
+                                   timeout=600)
+            file.writelines(f'[video mm_processor raw http] status={mm_raw.status_code}\n')
+            assert mm_raw.ok, mm_raw.text
+            mm_raw_json = mm_raw.json()
+            mm_raw_choice0 = (mm_raw_json.get('choices') or [{}])[0]
+            mm_raw_text = mm_raw_choice0.get('message', {}).get('content') or ''
+            file.writelines(mm_raw_text.lower() + '\n')
+            mm_raw_fr = mm_raw_choice0.get('finish_reason')
+            assert _mm_demo_tomb_run_assert(mm_raw_fr, mm_raw_text), (mm_raw_fr, mm_raw_text[:2000], mm_raw_json)
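+            # Editor's note (hedged): mm_processor_kwargs (fps /
+            # do_sample_frames) appear to be forwarded to the HF multimodal
+            # processor, while media_io_kwargs (num_frames) steers server-side
+            # video decoding; the two knobs are independent, which is why this
+            # demo exercises both request shapes.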
+
+            mm_one = client.chat.completions.create(
+                model=model_name,
+                messages=mm_one_messages,
+                max_tokens=2048,
+                temperature=0.3,
+                top_p=0.95,
+                extra_body={
+                    'top_k': 20,
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 1,
+                        },
+                    },
+                },
+            )
+            file.writelines('[video mm_processor single-frame] ' + str(mm_one).lower() + '\n')
+            assert getattr(mm_one.choices[0], 'finish_reason', None) == 'stop', mm_one
+            mm_one_text = (mm_one.choices[0].message.content or '').strip()
+            assert mm_one_text and _mm_demo_single_frame_scene_assert(mm_one_text), mm_one_text
+            assert _mm_demo_thinking_wrapper_shape_assert(mm_one_text), mm_one_text
+
+    mixed_messages = [{
+        'role':
+        'user',
+        'content': [
+            {
+                'type':
+                'text',
+                'text': (
+                    'You receive one still image and one video clip in this message. In 2-4 short sentences: '
+                    '(1) name one clear subject from the image; '
+                    '(2) name the animal or main scene in the video.'),
+            },
+            {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f'{resource_path}/{PIC}',
+                },
+            },
+            {
+                'type': 'video_url',
+                'video_url': {
+                    'url': video_path,
+                },
+            },
+        ],
+    }]
+    if not os.path.isfile(video_path):
+        file.writelines('[mixed image+text+video skipped] missing video file (same as video testcase)\n')
+    else:
+        try:
+            mix_resp = client.chat.completions.create(
+                model=model_name,
+                messages=mixed_messages,
+                temperature=0.3,
+                max_tokens=512,
+                extra_body={
+                    'media_io_kwargs': {
+                        'video': {
+                            'num_frames': 6,
+                        },
+                    },
+                },
+            )
+        except (BadRequestError, APIStatusError) as exc:
+            if not _vl_openai_http_error_skippable(exc):
+                raise
+            file.writelines(f'[mixed image+text+video skipped] server rejected: {exc!r}\n')
+        else:
+            file.writelines('[mixed image+text+video] ' + str(mix_resp).lower() + '\n')
+            mix_content = (mix_resp.choices[0].message.content or '').strip()
+            assert mix_content, mix_resp
+            assert ('tiger' in mix_content.lower() or '虎' in mix_content or 'ski' in mix_content.lower()
+                    or '滑雪' in mix_content), mix_resp
+            assert _vl_video_stream_finish_assert(
+                getattr(mix_resp.choices[0], 'finish_reason', None), mix_content), mix_resp
+
     file.close()
 
     allure.attach.file(restful_log, name=restful_log, attachment_type=allure.attachment_type.TEXT)
diff --git a/autotest/utils/sleep_utils.py b/autotest/utils/sleep_utils.py
new file mode 100644
index 0000000000..f289f3d558
--- /dev/null
+++ b/autotest/utils/sleep_utils.py
@@ -0,0 +1,176 @@
+from __future__ import annotations
+
+import json
+import os
+from collections import Counter
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+
+import torch
+from safetensors.torch import safe_open
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
+
+from lmdeploy.utils import serialize_state_dict
+
+UPDATE_WEIGHTS_CUDA_DEVICE_ENV = 'LMDEPLOY_UPDATE_WEIGHTS_CUDA_DEVICE'
+
+LEVEL2_GREEDY_MESSAGES = [{'role': 'user', 'content': '424242'}]
+LEVEL2_MAX_TOKENS = 64
+LEVEL2_BASELINE_RUNS = 3
+MAX_SINGLE_CHAR_FRACTION = 0.75
+
+
+def resolve_update_weights_cuda_device_index() -> int:
+    raw = os.environ.get(UPDATE_WEIGHTS_CUDA_DEVICE_ENV, '').strip()
+    if not raw:
+        return torch.cuda.current_device()
+    try:
+        idx = int(raw)
+    except ValueError as e:
+        raise AssertionError(
+            f'{UPDATE_WEIGHTS_CUDA_DEVICE_ENV} must be an int, got {raw!r}') from e
+    n = torch.cuda.device_count()
+    assert 0 <= idx < n, (
+        f'{UPDATE_WEIGHTS_CUDA_DEVICE_ENV}={idx} out of range for cuda.device_count()={n}')
+    return idx
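+
+
+# Editor's sketch (not used by the suite): pinning the update-weights device
+# via the env knob above. Assumes at least two visible CUDA devices and
+# mutates the process environment.
+def _sketch_resolve_device() -> int:
+    os.environ[UPDATE_WEIGHTS_CUDA_DEVICE_ENV] = '1'
+    idx = resolve_update_weights_cuda_device_index()
+    assert idx == 1
+    return idx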
+
+
+def resolve_hf_checkpoint_dir(config: dict, model_case: str) -> Path:
+    if os.environ.get('LMDEPLOY_USE_MODELSCOPE', 'False') == 'True':
+        return Path(model_case)
+    return Path(config['model_path']) / model_case
+
+
+def shard_paths(model_dir: Path) -> tuple[str, list[Path]]:
+    if (model_dir / SAFE_WEIGHTS_NAME).is_file():
+        return 'safetensors', [model_dir / SAFE_WEIGHTS_NAME]
+    if (model_dir / SAFE_WEIGHTS_INDEX_NAME).is_file():
+        with open(model_dir / SAFE_WEIGHTS_INDEX_NAME, encoding='utf-8') as f:
+            index = json.load(f)
+        paths = sorted(set(index['weight_map'].values()))
+        return 'safetensors', [model_dir / p for p in paths]
+    if (model_dir / WEIGHTS_NAME).is_file():
+        return 'pytorch', [model_dir / WEIGHTS_NAME]
+    if (model_dir / WEIGHTS_INDEX_NAME).is_file():
+        with open(model_dir / WEIGHTS_INDEX_NAME, encoding='utf-8') as f:
+            index = json.load(f)
+        paths = sorted(set(index['weight_map'].values()))
+        return 'pytorch', [model_dir / p for p in paths]
+    raise FileNotFoundError(f'No HF weights under {model_dir}')
+
+
+def load_shard_tensors(kind: str, path: Path) -> dict[str, torch.Tensor]:
+    out: dict[str, torch.Tensor] = {}
+    if kind == 'safetensors':
+        with safe_open(str(path), framework='pt') as f:
+            for key in f.keys():
+                out[key] = f.get_tensor(key)
+    else:
+        state = torch.load(str(path), weights_only=True, map_location='cpu')
+        try:
+            out.update(state)
+        finally:
+            del state
+    return out
+
+
+def assistant_content_from_openai_completion_dict(output: dict) -> str:
+    choices = output.get('choices') or []
+    assert len(choices) == 1, f'expected 1 choice, got {len(choices)}'
+    msg = choices[0].get('message') or {}
+    return (msg.get('content') or '').strip()
+
+
+def assert_assistant_not_degenerate(content: str, *, label: str) -> None:
+    assert content, f'{label}: empty assistant content'
+    compact = content.replace('\n', ' ').strip()
+    assert len(set(compact)) >= 4, (
+        f'{label}: degenerate assistant text (too few distinct chars): {content!r}')
+    top_cnt = Counter(compact).most_common(1)[0][1]
+    assert top_cnt / len(compact) <= MAX_SINGLE_CHAR_FRACTION, (
+        f'{label}: one token/char dominates assistant text: {content!r}')
+
+
+def level2_update_weights_request_dict(serialized_data: object, finished: bool) -> dict[str, Any]:
+    return {
+        'serialized_named_tensors': serialized_data,
+        'finished': finished,
+    }
+
+
+def assert_chat_decode_unchanged(ref: dict, cur: dict, *, label: str) -> None:
+    a, b = assistant_content_from_openai_completion_dict(ref), assistant_content_from_openai_completion_dict(cur)
+    assert a == b, f'{label}: assistant content changed\n before={a!r}\n after={b!r}'
+    rt = ref.get('usage', {}).get('completion_tokens')
+    ct = cur.get('usage', {}).get('completion_tokens')
+    assert rt == ct, f'{label}: completion_tokens changed {rt} -> {ct}'
+    rfr = ref['choices'][0].get('finish_reason')
+    cfr = cur['choices'][0].get('finish_reason')
+    if rfr is not None and cfr is not None:
+        assert rfr == cfr, f'{label}: finish_reason changed {rfr!r} -> {cfr!r}'
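+
+
+# Editor's sketch: the degeneracy guard above in action on hypothetical
+# assistant outputs (healthy text passes; a single repeated character fails).
+def _sketch_degenerate_examples() -> None:
+    assert_assistant_not_degenerate('The answer is 42, because 6 * 7 = 42.', label='sketch')
+    try:
+        assert_assistant_not_degenerate('!!!!!!!!!!!!', label='sketch')
+    except AssertionError:
+        pass  # expected: too few distinct characters
+    else:
+        raise RuntimeError('degeneracy guard unexpectedly passed')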
+
+
+def apply_serialized_hf_segments_for_level2_weights(
+    model_dir: Path,
+    emit_segment: Callable[[Any, bool], None],
+) -> None:
+    kind, shards = shard_paths(model_dir)
+    num_segment = len(shards)
+    dev_idx = resolve_update_weights_cuda_device_index()
+    device = torch.device('cuda', dev_idx)
+    with torch.cuda.device(dev_idx):
+        for seg_idx in range(num_segment):
+            cpu_dict = load_shard_tensors(kind, shards[seg_idx])
+            seg_gpu = {k: v.to(device, non_blocking=True) for k, v in cpu_dict.items()}
+            del cpu_dict
+            serialized_data = serialize_state_dict(seg_gpu)
+            del seg_gpu
+            torch.cuda.empty_cache()
+            emit_segment(serialized_data, seg_idx == num_segment - 1)
+
+
+def apply_serialized_hf_segments_for_turbomind_level2_weights(
+    model_dir: Path,
+    emit_segment: Callable[[Any, bool], None],
+) -> None:
+    from lmdeploy.turbomind.deploy.converter import get_input_model_registered_name
+    from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
+
+    root = str(model_dir.resolve())
+    try:
+        input_model_name = get_input_model_registered_name(root, 'hf')
+        if input_model_name == 'qwen3_5-moe':
+            raise RuntimeError(
+                'turbomind update_weights is unsupported for qwen3_5-moe in the current server build: '
+                'server-side StateDictLoader has no `index`, but Qwen3_5MoeModel.readers() accesses loader.index')
+        input_model_cls = INPUT_MODELS.get(input_model_name)
+        input_model = input_model_cls(model_path=root, tokenizer_path=root)
+    except Exception as e:
+        raise RuntimeError(
+            f'turbomind update_weights: failed to build input_model readers for {model_dir}: {e}') from e
+
+    dev_idx = resolve_update_weights_cuda_device_index()
+    device = torch.device('cuda', dev_idx)
+    with torch.cuda.device(dev_idx):
+        it = iter(dict(reader.params) for _, reader in input_model.readers())
+        try:
+            chunk = next(it)
+        except StopIteration:
+            raise RuntimeError(f'no turbomind weight chunks to emit under {model_dir}') from None
+
+        for cpu_dict_next in it:
+            seg_gpu = {k: v.to(device, non_blocking=True) for k, v in chunk.items()}
+            try:
+                emit_segment(serialize_state_dict(seg_gpu), False)
+            finally:
+                del seg_gpu
+                torch.cuda.empty_cache()
+            chunk = cpu_dict_next
+
+        seg_gpu = {k: v.to(device, non_blocking=True) for k, v in chunk.items()}
+        try:
+            emit_segment(serialize_state_dict(seg_gpu), True)
+        finally:
+            del seg_gpu
+            torch.cuda.empty_cache()
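+
+
+# Editor's sketch: wiring an emitter to an api_server weight-update endpoint.
+# The payload shape matches level2_update_weights_request_dict above; the
+# '/update_params' path, port, and requests usage are assumptions for
+# illustration, not part of this suite.
+def _sketch_emit_segments_over_http(model_dir: Path, base_url: str = 'http://localhost:23333') -> None:
+    import requests  # local import to keep the sketch self-contained
+
+    def emit_segment(serialized_data: Any, finished: bool) -> None:
+        payload = level2_update_weights_request_dict(serialized_data, finished)
+        resp = requests.post(f'{base_url}/update_params', json=payload, timeout=600)
+        assert resp.ok, resp.text
+
+    apply_serialized_hf_segments_for_level2_weights(model_dir, emit_segment)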