Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/en/llm/api_server.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ for item in api_client.completions_v1(model=model_name, prompt='hi'):

May refer to [api_server_tools](./api_server_tools.md).

### Anthropic-Compatible Endpoints

May refer to [api_server_anthropic](./api_server_anthropic.md).

### Integrate with Java/Golang/Rust

May use [openapi-generator-cli](https://github.com/OpenAPITools/openapi-generator-cli) to convert `http://{server_ip}:{server_port}/openapi.json` to java/rust/golang client.
Expand Down
48 changes: 48 additions & 0 deletions docs/en/llm/api_server_anthropic.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Anthropic-Compatible Endpoints

LMDeploy provides a lightweight Anthropic-compatible surface for easier integration with Anthropic-style clients and gateways.

## Supported Endpoints

- `POST /v1/messages`
- `POST /v1/messages/count_tokens`
- `GET /anthropic/v1/models`

## Required Headers

For Anthropic `POST` endpoints, include:

- `content-type: application/json`
- `anthropic-version: 2023-06-01` (or another accepted version string)

## Notes and Current Limits

- The tool-call fields `tools` and `tool_choice` are **temporarily unsupported**.
- If tool fields are provided, LMDeploy returns an Anthropic-style error response.
- `count_tokens` estimates the token count from the model's tokenizer and chat template; treat the result as an approximation rather than an exact billing-grade count.

## Example: `/v1/messages`

```bash
curl http://{server_ip}:{server_port}/v1/messages \
-H "content-type: application/json" \
-H "anthropic-version: 2023-06-01" \
-d '{
"model": "internlm-chat-7b",
"max_tokens": 128,
"messages": [{"role": "user", "content": "Hello from Anthropic client"}]
}'
```

## Example: `/v1/messages/count_tokens`

```bash
curl http://{server_ip}:{server_port}/v1/messages/count_tokens \
-H "content-type: application/json" \
-H "anthropic-version: 2023-06-01" \
-d '{
"model": "internlm-chat-7b",
"system": "You are a helpful assistant.",
"messages": [{"role": "user", "content": "Count these tokens"}]
}'
```
6 changes: 6 additions & 0 deletions lmdeploy/serve/anthropic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Anthropic-compatible serving endpoints."""

from .router import create_anthropic_router

__all__ = ['create_anthropic_router']
105 changes: 105 additions & 0 deletions lmdeploy/serve/anthropic/adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Adapters between Anthropic requests and LMDeploy internals."""

from __future__ import annotations

from typing import Any

from lmdeploy.messages import GenerationConfig

from .protocol import CountTokensRequest, MessagesRequest, TextContentBlockParam


def get_model_list(server_context) -> list[str]:
    """List the model names this server can serve.

    The base engine model comes first, followed by any adapter names
    exposed on the backend config (if present).
    """
    engine = server_context.async_engine
    adapters = getattr(engine.backend_config, 'adapters', None)
    return [engine.model_name, *(adapters or [])]


def ensure_tools_not_requested(request: MessagesRequest | CountTokensRequest) -> None:
    """Raise ``NotImplementedError`` when the request carries tool fields.

    Tool calling is not wired up yet; endpoint handlers convert this
    exception into an Anthropic-style 400 error response.
    """
    tools = getattr(request, 'tools', None)
    tool_choice = getattr(request, 'tool_choice', None)
    if tools:
        raise NotImplementedError('Anthropic tool fields are temporarily unsupported.')
    if tool_choice is not None:
        raise NotImplementedError('Anthropic tool_choice is temporarily unsupported.')


def _text_from_blocks(blocks: list[TextContentBlockParam | dict[str, Any]], field_name: str) -> str:
    """Concatenate the ``text`` of a sequence of text content blocks.

    Accepts either raw dicts or parsed block objects; any block that is
    not ``type == 'text'``, or lacks its text, raises ``ValueError``.
    """
    pieces: list[str] = []
    for position, item in enumerate(blocks):
        is_raw = isinstance(item, dict)
        kind = item.get('type') if is_raw else item.type
        value = item.get('text') if is_raw else item.text
        if kind != 'text':
            raise ValueError(
                f'Only text content blocks are supported in `{field_name}`. '
                f'Got: {kind!r} at index {position}.')
        if value is None:
            raise ValueError(f'Missing `text` in `{field_name}` content block at index {position}.')
        pieces.append(value)
    return ''.join(pieces)


def text_from_content(content: str | list[TextContentBlockParam], field_name: str) -> str:
    """Normalize an Anthropic content field to a single plain-text string."""
    if not isinstance(content, str):
        return _text_from_blocks(content, field_name=field_name)
    return content


def to_lmdeploy_messages(request: MessagesRequest | CountTokensRequest) -> list[dict[str, str]]:
    """Convert an Anthropic request into LMDeploy chat messages.

    The optional Anthropic ``system`` field becomes a leading system
    message; each entry in ``messages`` is flattened to plain text.
    """
    converted: list[dict[str, str]] = []
    if request.system is not None:
        system_text = text_from_content(request.system, field_name='system')
        converted.append(dict(role='system', content=system_text))
    converted.extend(
        dict(role=msg.role,
             content=text_from_content(msg.content, field_name=f'messages[{i}].content'))
        for i, msg in enumerate(request.messages))
    return converted


def to_generation_config(request: MessagesRequest) -> GenerationConfig:
    """Build an LMDeploy ``GenerationConfig`` from a messages request.

    Unset sampling fields fall back to fixed defaults
    (top_k=40, top_p=1.0, temperature=1.0); sampling is always enabled.
    """
    sampling = {
        'top_k': request.top_k if request.top_k is not None else 40,
        'top_p': request.top_p if request.top_p is not None else 1.0,
        'temperature': request.temperature if request.temperature is not None else 1.0,
    }
    return GenerationConfig(
        max_new_tokens=request.max_tokens,
        do_sample=True,
        stop_words=request.stop_sequences,
        skip_special_tokens=True,
        spaces_between_special_tokens=True,
        **sampling,
    )


def count_input_tokens(async_engine, messages: list[dict[str, str]]) -> int:
    """Estimate the prompt token count for *messages*.

    Renders the messages through the engine's chat template and counts
    the encoded tokens (BOS included), so the number is an approximation
    tied to this engine's tokenizer.
    """
    rendered = async_engine.chat_template.messages2prompt(messages, sequence_start=True)
    return len(async_engine.tokenizer.encode(rendered, add_bos=True))


def map_finish_reason(reason: str | None) -> str:
    """Map an LMDeploy/OpenAI-style finish reason to an Anthropic stop reason.

    Args:
        reason: finish reason reported by the engine; may be ``None``.

    Returns:
        One of Anthropic's ``stop_reason`` strings. Unknown or missing
        reasons default to ``'end_turn'``.
    """
    mapping = {
        'stop': 'end_turn',
        'length': 'max_tokens',
        # Anthropic reports tool invocation with its own stop reason
        # ('tool_use'), not as a stop-sequence hit. Unreachable while tool
        # fields are rejected upstream, but correct once tools are enabled.
        'tool_calls': 'tool_use',
        # NOTE(review): Anthropic has no aborted/errored stop reason; these
        # are folded into 'stop_sequence' as a terminal placeholder.
        'abort': 'stop_sequence',
        'error': 'stop_sequence',
    }
    return mapping.get(reason, 'end_turn')
2 changes: 2 additions & 0 deletions lmdeploy/serve/anthropic/endpoints/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Anthropic endpoint modules."""
101 changes: 101 additions & 0 deletions lmdeploy/serve/anthropic/endpoints/messages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Endpoint for ``POST /v1/messages``."""

from __future__ import annotations

from http import HTTPStatus

import shortuuid
from fastapi import APIRouter, Depends, Request
from fastapi.responses import StreamingResponse

from lmdeploy.serve.utils.server_utils import validate_json_request

from ..adapter import (
ensure_tools_not_requested,
get_model_list,
map_finish_reason,
to_generation_config,
to_lmdeploy_messages,
)
from ..errors import create_error_response
from ..protocol import MessagesRequest, MessagesResponse, MessageTextBlock, MessageUsage
from ..streaming import stream_messages_response


def _validate_headers(raw_request: Request):
anthropic_version = raw_request.headers.get('anthropic-version')
if not anthropic_version:
return create_error_response(HTTPStatus.BAD_REQUEST, 'Missing required header: anthropic-version')
return None


def register(router: APIRouter, server_context) -> None:
    """Attach the ``POST /v1/messages`` handler to *router*."""

    @router.post('/v1/messages', dependencies=[Depends(validate_json_request)])
    async def create_message(request: MessagesRequest, raw_request: Request):
        # Anthropic clients must send the `anthropic-version` header.
        bad_headers = _validate_headers(raw_request)
        if bad_headers is not None:
            return bad_headers

        known_models = get_model_list(server_context)
        if request.model not in known_models:
            return create_error_response(
                HTTPStatus.NOT_FOUND,
                f'The model {request.model!r} does not exist.',
                error_type='not_found_error',
            )

        # Tool fields and non-text content are rejected up front with an
        # Anthropic-style 400 error.
        try:
            ensure_tools_not_requested(request)
            messages = to_lmdeploy_messages(request)
        except (NotImplementedError, ValueError) as err:
            return create_error_response(HTTPStatus.BAD_REQUEST, str(err))

        session = server_context.get_session(-1)
        is_base_model = request.model == server_context.async_engine.model_name
        result_generator = server_context.async_engine.generate(
            messages,
            session,
            gen_config=to_generation_config(request),
            stream_response=True,
            sequence_start=True,
            sequence_end=True,
            do_preprocess=True,
            # A non-base model name selects the adapter of the same name.
            adapter_name=None if is_base_model else request.model,
        )

        message_id = f'msg_{shortuuid.random()}'

        if request.stream:
            return StreamingResponse(
                stream_messages_response(result_generator, request_id=message_id, model=request.model),
                media_type='text/event-stream',
            )

        # Non-streaming: drain the generator, accumulating text deltas.
        pieces: list[str] = []
        last_output = None
        async for output in result_generator:
            if await raw_request.is_disconnected():
                # Stop generating for clients that went away.
                await session.async_abort()
                return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected')
            last_output = output
            pieces.append(output.response or '')

        if last_output is None:
            return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, 'No generation output from engine.')

        return MessagesResponse(
            id=message_id,
            model=request.model,
            content=[MessageTextBlock(text=''.join(pieces))],
            stop_reason=map_finish_reason(last_output.finish_reason),
            stop_sequence=None,
            usage=MessageUsage(
                input_tokens=last_output.input_token_len,
                output_tokens=last_output.generate_token_len,
            ),
        ).model_dump()
49 changes: 49 additions & 0 deletions lmdeploy/serve/anthropic/endpoints/messages_count_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Endpoint for ``POST /v1/messages/count_tokens``."""

from __future__ import annotations

from http import HTTPStatus

from fastapi import APIRouter, Depends, Request

from lmdeploy.serve.utils.server_utils import validate_json_request

from ..adapter import count_input_tokens, ensure_tools_not_requested, get_model_list, to_lmdeploy_messages
from ..errors import create_error_response
from ..protocol import CountTokensRequest, CountTokensResponse


def _validate_headers(raw_request: Request):
anthropic_version = raw_request.headers.get('anthropic-version')
if not anthropic_version:
return create_error_response(HTTPStatus.BAD_REQUEST, 'Missing required header: anthropic-version')
return None


def register(router: APIRouter, server_context) -> None:
    """Attach the ``POST /v1/messages/count_tokens`` handler to *router*."""

    @router.post('/v1/messages/count_tokens', dependencies=[Depends(validate_json_request)])
    async def count_tokens(request: CountTokensRequest, raw_request: Request):
        # Anthropic clients must send the `anthropic-version` header.
        bad_headers = _validate_headers(raw_request)
        if bad_headers is not None:
            return bad_headers

        if request.model not in get_model_list(server_context):
            return create_error_response(
                HTTPStatus.NOT_FOUND,
                f'The model {request.model!r} does not exist.',
                error_type='not_found_error',
            )

        # Reject tool fields / non-text content, then count tokens via
        # the engine tokenizer and chat template.
        try:
            ensure_tools_not_requested(request)
            converted = to_lmdeploy_messages(request)
            total = count_input_tokens(server_context.async_engine, converted)
        except (NotImplementedError, ValueError) as err:
            return create_error_response(HTTPStatus.BAD_REQUEST, str(err))

        return CountTokensResponse(input_tokens=total).model_dump()
20 changes: 20 additions & 0 deletions lmdeploy/serve/anthropic/endpoints/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Endpoint for Anthropic-scoped model listing."""

from __future__ import annotations

from fastapi import APIRouter

from ..adapter import get_model_list
from ..protocol import AnthropicModel, AnthropicModelList


def register(router: APIRouter, server_context) -> None:
    """Attach the ``GET /anthropic/v1/models`` handler to *router*."""

    @router.get('/anthropic/v1/models')
    async def list_models():
        entries = [AnthropicModel(id=name, display_name=name) for name in get_model_list(server_context)]
        # Anthropic's list envelope carries first/last ids (pagination cursors).
        page = AnthropicModelList(
            data=entries,
            first_id=entries[0].id if entries else None,
            last_id=entries[-1].id if entries else None,
        )
        return page.model_dump()
17 changes: 17 additions & 0 deletions lmdeploy/serve/anthropic/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Error helpers for Anthropic-compatible endpoints."""

from __future__ import annotations

from http import HTTPStatus

from fastapi.responses import JSONResponse

from .protocol import AnthropicError, AnthropicErrorResponse


def create_error_response(status: HTTPStatus, message: str, error_type: str = 'invalid_request_error') -> JSONResponse:
    """Build a JSON response in Anthropic's error envelope.

    Args:
        status: HTTP status to send.
        message: human-readable error description.
        error_type: Anthropic error type string.
    """
    error = AnthropicError(type=error_type, message=message)
    body = AnthropicErrorResponse(error=error).model_dump()
    return JSONResponse(body, status_code=status.value)
Loading
Loading