From 4c3275dfb53b7aa7057e31d0657a3e406f5f88fd Mon Sep 17 00:00:00 2001 From: Hyeonki Hong Date: Wed, 13 May 2026 14:49:52 +0900 Subject: [PATCH 1/2] NO-ISSUE: chore(preset): remove unused quickstart presets Drop deepseek-r1-distill-llama-8b, ibm-granite-3.3-8b-instruct, qwen2-0.5b-instruct, and qwen2.5-1.5b-instruct preset templates that are no longer maintained. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...1-distill-llama-8b-amd-mi250-tp2.helm.yaml | 42 ------------------ ...-distill-llama-8b-amd-mi300x-tp2.helm.yaml | 42 ------------------ ...ll-llama-8b-decode-amd-mi250-tp2.helm.yaml | 43 ------------------- ...l-llama-8b-decode-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...l-llama-8b-prefill-amd-mi250-tp2.helm.yaml | 43 ------------------- ...-llama-8b-prefill-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...te-3.3-8b-instruct-amd-mi250-tp2.helm.yaml | 42 ------------------ ...e-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml | 42 ------------------ ...8b-instruct-decode-amd-mi250-tp2.helm.yaml | 43 ------------------- ...b-instruct-decode-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...b-instruct-prefill-amd-mi250-tp2.helm.yaml | 43 ------------------- ...-instruct-prefill-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...wen2-0.5b-instruct-amd-mi250-tp2.helm.yaml | 42 ------------------ ...en2-0.5b-instruct-amd-mi300x-tp2.helm.yaml | 42 ------------------ ...5b-instruct-decode-amd-mi250-tp2.helm.yaml | 43 ------------------- ...b-instruct-decode-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...b-instruct-prefill-amd-mi250-tp2.helm.yaml | 43 ------------------- ...-instruct-prefill-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...n2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml | 42 ------------------ ...2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml | 42 ------------------ ...5b-instruct-decode-amd-mi250-tp2.helm.yaml | 43 ------------------- ...b-instruct-decode-amd-mi300x-tp2.helm.yaml | 43 ------------------- ...b-instruct-prefill-amd-mi250-tp2.helm.yaml | 43 ------------------- ...-instruct-prefill-amd-mi300x-tp2.helm.yaml | 43 ------------------- 24 files changed, 1024 deletions(-) delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml deleted file mode 100644 index e980cd9..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: deepseek-ai - mif.moreh.io/model.name: deepseek-r1-distill-llama-8b - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index fa59ded..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: deepseek-ai - mif.moreh.io/model.name: deepseek-r1-distill-llama-8b - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml deleted file mode 100644 index b54b06d..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: deepseek-ai - mif.moreh.io/model.name: deepseek-r1-distill-llama-8b - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 5670cd4..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: deepseek-ai - mif.moreh.io/model.name: deepseek-r1-distill-llama-8b - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml deleted file mode 100644 index d7b362f..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: deepseek-ai - mif.moreh.io/model.name: deepseek-r1-distill-llama-8b - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 78cb265..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: deepseek-ai - mif.moreh.io/model.name: deepseek-r1-distill-llama-8b - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 71ba807..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: ibm-granite - mif.moreh.io/model.name: granite-3.3-8b-instruct - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: ibm-granite/granite-3.3-8b-instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 9942628..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: ibm-granite - mif.moreh.io/model.name: granite-3.3-8b-instruct - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: ibm-granite/granite-3.3-8b-instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml deleted file mode 100644 index d7ce013..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: ibm-granite - mif.moreh.io/model.name: granite-3.3-8b-instruct - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: ibm-granite/granite-3.3-8b-instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index d62bf54..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: ibm-granite - mif.moreh.io/model.name: granite-3.3-8b-instruct - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: ibm-granite/granite-3.3-8b-instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml deleted file mode 100644 index d68a414..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: ibm-granite - mif.moreh.io/model.name: granite-3.3-8b-instruct - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: ibm-granite/granite-3.3-8b-instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 84fc496..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: ibm-granite - mif.moreh.io/model.name: granite-3.3-8b-instruct - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: ibm-granite/granite-3.3-8b-instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 2111533..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2-0.5b-instruct - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2-0.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 9ae24fd..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2-0.5b-instruct - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2-0.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 4f3eb5d..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2-0.5b-instruct - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2-0.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 9b03f6d..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2-0.5b-instruct - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2-0.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 5a3c89b..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2-0.5b-instruct - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2-0.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 647d030..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2-0.5b-instruct - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2-0.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 8163c2f..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2.5-1.5b-instruct - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2.5-1.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 215e2a1..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2.5-1.5b-instruct - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2.5-1.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 15080bf..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2.5-1.5b-instruct - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2.5-1.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 2e2546d..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2.5-1.5b-instruct - mif.moreh.io/role: decode - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2.5-1.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml deleted file mode 100644 index 4bb5d85..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2.5-1.5b-instruct - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi250 - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2.5-1.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi250 - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml deleted file mode 100644 index 7dee76b..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen2.5-1.5b-instruct - mif.moreh.io/role: prefill - mif.moreh.io/accelerator.vendor: amd - mif.moreh.io/accelerator.model: mi300x - mif.moreh.io/parallelism: tp2 -spec: - framework: vllm - model: - name: Qwen/Qwen2.5-1.5B-Instruct - parallelism: - tensor: 2 - template: - spec: - containers: - - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 - env: - - name: ISVC_EXTRA_ARGS - value: >- - --disable-uvicorn-access-log - --no-enable-log-requests - --max-model-len -1 - --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' - resources: - requests: - amd.com/gpu: 2 - limits: - amd.com/gpu: 2 - nodeSelector: - moai.moreh.io/accelerator.vendor: amd - moai.moreh.io/accelerator.model: mi300x - tolerations: - - key: amd.com/gpu - operator: Exists - effect: NoSchedule From 6240cb1e4d44de6430ee037ef00c7b13154e7cb2 Mon Sep 17 00:00:00 2001 From: Hyeonki Hong Date: Wed, 13 May 2026 14:50:55 +0900 Subject: [PATCH 2/2] NO-ISSUE: feat(preset): refresh quickstart presets and add Ministral-3-8B Refresh quickstart preset templates across maintained models: - Bump moreh-vllm image to v0.19.1.1 - Enable prefix caching by default - Add reasoning and tool-call parsers for gemma-4-31b-it (gemma4) - Add MTP speculative decoding to qwen3.6-27b - Add PD-disaggregated prefill/decode templates for qwen3.6-27b - Replace mistral-7b-instruct-v0.3 with Ministral-3-8B-Reasoning-2512, including mistral tokenizer/config/load format and reasoning parser Co-Authored-By: Claude Opus 4.7 (1M context) --- ...gle-gemma-4-31b-it-amd-mi250-tp2.helm.yaml | 6 ++- ...le-gemma-4-31b-it-amd-mi300x-tp2.helm.yaml | 6 ++- ...ma-4-31b-it-decode-amd-mi250-tp2.helm.yaml | 6 ++- ...a-4-31b-it-decode-amd-mi300x-tp2.helm.yaml | 6 ++- ...a-4-31b-it-prefill-amd-mi250-tp2.helm.yaml | 6 ++- ...-4-31b-it-prefill-amd-mi300x-tp2.helm.yaml | 6 ++- ...ma-3.2-1b-instruct-amd-mi250-tp2.helm.yaml | 3 +- ...a-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml | 3 +- ...1b-instruct-decode-amd-mi250-tp2.helm.yaml | 3 +- ...b-instruct-decode-amd-mi300x-tp2.helm.yaml | 3 +- ...b-instruct-prefill-amd-mi250-tp2.helm.yaml | 3 +- ...-instruct-prefill-amd-mi300x-tp2.helm.yaml | 3 +- ...e-instruct-amd-mi250-dp2-moe-tp2.helm.yaml | 3 +- ...uct-decode-amd-mi250-dp2-moe-tp2.helm.yaml | 3 +- ...ct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml | 3 +- ...8b-reasoning-2512-amd-mi250-tp2.helm.yaml} | 15 ++++-- ...b-reasoning-2512-amd-mi300x-tp2.helm.yaml} | 15 ++++-- ...oning-2512-decode-amd-mi250-tp2.helm.yaml} | 15 ++++-- ...ning-2512-decode-amd-mi300x-tp2.helm.yaml} | 15 ++++-- ...ning-2512-prefill-amd-mi250-tp2.helm.yaml} | 15 ++++-- ...ing-2512-prefill-amd-mi300x-tp2.helm.yaml} | 15 ++++-- ...openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml | 3 +- ...penai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml | 3 +- ...gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml | 3 +- ...pt-oss-20b-decode-amd-mi300x-tp2.helm.yaml | 3 +- ...pt-oss-20b-prefill-amd-mi250-tp2.helm.yaml | 3 +- ...t-oss-20b-prefill-amd-mi300x-tp2.helm.yaml | 3 +- ...lm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml | 3 +- ...m-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml | 3 +- ...-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml | 3 +- ...qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml | 3 +- ...qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml | 3 +- ...wen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml | 3 +- ...llm-qwen-qwen3-32b-amd-mi250-tp2.helm.yaml | 3 +- ...lm-qwen-qwen3-32b-amd-mi300x-tp2.helm.yaml | 3 +- ...n-qwen3-32b-decode-amd-mi250-tp2.helm.yaml | 3 +- ...-qwen3-32b-decode-amd-mi300x-tp2.helm.yaml | 3 +- ...-qwen3-32b-prefill-amd-mi250-tp2.helm.yaml | 3 +- ...qwen3-32b-prefill-amd-mi300x-tp2.helm.yaml | 3 +- ...en3-vl-8b-instruct-amd-mi250-tp2.helm.yaml | 3 +- ...n3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml | 3 +- ...8b-instruct-decode-amd-mi250-tp2.helm.yaml | 3 +- ...b-instruct-decode-amd-mi300x-tp2.helm.yaml | 3 +- ...b-instruct-prefill-amd-mi250-tp2.helm.yaml | 3 +- ...-instruct-prefill-amd-mi300x-tp2.helm.yaml | 3 +- ...m-qwen-qwen3.6-27b-amd-mi250-tp2.helm.yaml | 4 +- ...-qwen-qwen3.6-27b-amd-mi300x-tp2.helm.yaml | 4 +- ...qwen3.6-27b-decode-amd-mi250-tp2.helm.yaml | 50 +++++++++++++++++++ ...wen3.6-27b-decode-amd-mi300x-tp2.helm.yaml | 50 +++++++++++++++++++ ...wen3.6-27b-prefill-amd-mi250-tp2.helm.yaml | 50 +++++++++++++++++++ ...en3.6-27b-prefill-amd-mi300x-tp2.helm.yaml | 50 +++++++++++++++++++ 51 files changed, 368 insertions(+), 65 deletions(-) rename deploy/helm/moai-inference-preset/templates/presets/quickstart/{quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml => quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi250-tp2.helm.yaml} (68%) rename deploy/helm/moai-inference-preset/templates/presets/quickstart/{quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml => quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi300x-tp2.helm.yaml} (68%) rename deploy/helm/moai-inference-preset/templates/presets/quickstart/{quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml => quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi250-tp2.helm.yaml} (69%) rename deploy/helm/moai-inference-preset/templates/presets/quickstart/{quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml => quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi300x-tp2.helm.yaml} (69%) rename deploy/helm/moai-inference-preset/templates/presets/quickstart/{quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml => quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi250-tp2.helm.yaml} (69%) rename deploy/helm/moai-inference-preset/templates/presets/quickstart/{quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml => quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi300x-tp2.helm.yaml} (69%) create mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi250-tp2.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi300x-tp2.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi250-tp2.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi300x-tp2.helm.yaml diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi250-tp2.helm.yaml index 244e945..b3722be 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi250-tp2.helm.yaml @@ -21,13 +21,17 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --enable-auto-tool-choice + --reasoning-parser gemma4 + --tool-call-parser gemma4 resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi300x-tp2.helm.yaml index 9b43311..db9d35b 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,17 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --enable-auto-tool-choice + --reasoning-parser gemma4 + --tool-call-parser gemma4 resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi250-tp2.helm.yaml index 39604d6..6429cc7 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi250-tp2.helm.yaml @@ -21,13 +21,17 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --enable-auto-tool-choice + --reasoning-parser gemma4 + --tool-call-parser gemma4 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi300x-tp2.helm.yaml index 2295bc9..de4967a 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-decode-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,17 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --enable-auto-tool-choice + --reasoning-parser gemma4 + --tool-call-parser gemma4 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi250-tp2.helm.yaml index c1087cc..c689dcf 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi250-tp2.helm.yaml @@ -21,13 +21,17 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --enable-auto-tool-choice + --reasoning-parser gemma4 + --tool-call-parser gemma4 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi300x-tp2.helm.yaml index 5018b2a..354525e 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-google-gemma-4-31b-it-prefill-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,17 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --enable-auto-tool-choice + --reasoning-parser gemma4 + --tool-call-parser gemma4 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml index e1171dc..5844c21 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml index bb93deb..3fcc4d7 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml index a725710..4bb5f89 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml index 3c0e9c5..1958a51 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml index 537041f..ca6ee68 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml index 64bc449..84a5331 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml index 14cc19f..e8176bf 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -29,6 +29,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml index 685d2d8..dd5848e 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -29,6 +29,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml index 441c354..69f7ebb 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -29,6 +29,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi250-tp2.helm.yaml similarity index 68% rename from deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi250-tp2.helm.yaml index b51b52d..4afc2b4 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi250-tp2.helm.yaml @@ -1,12 +1,12 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2 + name: quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi250-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} mif.moreh.io/model.org: mistralai - mif.moreh.io/model.name: mistral-7b-instruct-v0.3 + mif.moreh.io/model.name: ministral-3-8b-reasoning-2512 mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: amd mif.moreh.io/accelerator.model: mi250 @@ -14,20 +14,27 @@ metadata: spec: framework: vllm model: - name: mistralai/Mistral-7B-Instruct-v0.3 + name: mistralai/Ministral-3-8B-Reasoning-2512 parallelism: tensor: 2 template: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --tokenizer_mode mistral + --config_format mistral + --load_format mistral + --enable-auto-tool-choice + --tool-call-parser mistral + --reasoning-parser mistral resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi300x-tp2.helm.yaml similarity index 68% rename from deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi300x-tp2.helm.yaml index b3fe931..91a4b56 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi300x-tp2.helm.yaml @@ -1,12 +1,12 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2 + name: quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-amd-mi300x-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} mif.moreh.io/model.org: mistralai - mif.moreh.io/model.name: mistral-7b-instruct-v0.3 + mif.moreh.io/model.name: ministral-3-8b-reasoning-2512 mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: amd mif.moreh.io/accelerator.model: mi300x @@ -14,20 +14,27 @@ metadata: spec: framework: vllm model: - name: mistralai/Mistral-7B-Instruct-v0.3 + name: mistralai/Ministral-3-8B-Reasoning-2512 parallelism: tensor: 2 template: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --tokenizer_mode mistral + --config_format mistral + --load_format mistral + --enable-auto-tool-choice + --tool-call-parser mistral + --reasoning-parser mistral resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi250-tp2.helm.yaml similarity index 69% rename from deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi250-tp2.helm.yaml index 2b4ee9f..95eec95 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi250-tp2.helm.yaml @@ -1,12 +1,12 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2 + name: quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi250-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} mif.moreh.io/model.org: mistralai - mif.moreh.io/model.name: mistral-7b-instruct-v0.3 + mif.moreh.io/model.name: ministral-3-8b-reasoning-2512 mif.moreh.io/role: decode mif.moreh.io/accelerator.vendor: amd mif.moreh.io/accelerator.model: mi250 @@ -14,20 +14,27 @@ metadata: spec: framework: vllm model: - name: mistralai/Mistral-7B-Instruct-v0.3 + name: mistralai/Ministral-3-8B-Reasoning-2512 parallelism: tensor: 2 template: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --tokenizer_mode mistral + --config_format mistral + --load_format mistral + --enable-auto-tool-choice + --tool-call-parser mistral + --reasoning-parser mistral --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi300x-tp2.helm.yaml similarity index 69% rename from deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi300x-tp2.helm.yaml index 361f3f6..2eda992 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi300x-tp2.helm.yaml @@ -1,12 +1,12 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2 + name: quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-decode-amd-mi300x-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} mif.moreh.io/model.org: mistralai - mif.moreh.io/model.name: mistral-7b-instruct-v0.3 + mif.moreh.io/model.name: ministral-3-8b-reasoning-2512 mif.moreh.io/role: decode mif.moreh.io/accelerator.vendor: amd mif.moreh.io/accelerator.model: mi300x @@ -14,20 +14,27 @@ metadata: spec: framework: vllm model: - name: mistralai/Mistral-7B-Instruct-v0.3 + name: mistralai/Ministral-3-8B-Reasoning-2512 parallelism: tensor: 2 template: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --tokenizer_mode mistral + --config_format mistral + --load_format mistral + --enable-auto-tool-choice + --tool-call-parser mistral + --reasoning-parser mistral --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi250-tp2.helm.yaml similarity index 69% rename from deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi250-tp2.helm.yaml index 6a2d50c..ed7873b 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi250-tp2.helm.yaml @@ -1,12 +1,12 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2 + name: quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi250-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} mif.moreh.io/model.org: mistralai - mif.moreh.io/model.name: mistral-7b-instruct-v0.3 + mif.moreh.io/model.name: ministral-3-8b-reasoning-2512 mif.moreh.io/role: prefill mif.moreh.io/accelerator.vendor: amd mif.moreh.io/accelerator.model: mi250 @@ -14,20 +14,27 @@ metadata: spec: framework: vllm model: - name: mistralai/Mistral-7B-Instruct-v0.3 + name: mistralai/Ministral-3-8B-Reasoning-2512 parallelism: tensor: 2 template: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --tokenizer_mode mistral + --config_format mistral + --load_format mistral + --enable-auto-tool-choice + --tool-call-parser mistral + --reasoning-parser mistral --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi300x-tp2.helm.yaml similarity index 69% rename from deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi300x-tp2.helm.yaml index 59dd294..8b30b15 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi300x-tp2.helm.yaml @@ -1,12 +1,12 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2 + name: quickstart-vllm-mistralai-ministral-3-8b-reasoning-2512-prefill-amd-mi300x-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} mif.moreh.io/model.org: mistralai - mif.moreh.io/model.name: mistral-7b-instruct-v0.3 + mif.moreh.io/model.name: ministral-3-8b-reasoning-2512 mif.moreh.io/role: prefill mif.moreh.io/accelerator.vendor: amd mif.moreh.io/accelerator.model: mi300x @@ -14,20 +14,27 @@ metadata: spec: framework: vllm model: - name: mistralai/Mistral-7B-Instruct-v0.3 + name: mistralai/Ministral-3-8B-Reasoning-2512 parallelism: tensor: 2 template: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching + --tokenizer_mode mistral + --config_format mistral + --load_format mistral + --enable-auto-tool-choice + --tool-call-parser mistral + --reasoning-parser mistral --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml index c11c17f..8b42985 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -32,6 +32,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml index d3e9d1a..efa3ff0 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -32,6 +32,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml index 06d3457..5ed169a 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -32,6 +32,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml index f334ce4..69d3cef 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -32,6 +32,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml index 9dd9a55..7703586 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -32,6 +32,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml index e7b78cb..978ad13 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- @@ -32,6 +32,7 @@ spec: --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml index f6f4bbb..1cb267e 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml index 913bb31..04bedc9 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml index e1d90af..6006e59 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml index fcecf49..819bf89 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml index 2c44525..ca9e388 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml index 61cc4e2..119c7fa 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi250-tp2.helm.yaml index a5b604e..fd838f6 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi300x-tp2.helm.yaml index 06b11ac..c50bc58 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi250-tp2.helm.yaml index 2fa31ad..b571b14 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi300x-tp2.helm.yaml index eb823a9..59821ad 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-decode-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi250-tp2.helm.yaml index be3e983..f4e7db2 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi300x-tp2.helm.yaml index 679be4e..45c8c02 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-32b-prefill-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml index b568204..1d9869a 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml index d1131be..e32460d 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml index b09238f..18e8de2 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml index d655b58..7a690f7 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml index 626ccc0..69be305 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml index b1a678b..2a09f0e 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml @@ -21,13 +21,14 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.17.1.1 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' resources: requests: diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi250-tp2.helm.yaml index c273a39..0f2e44b 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi250-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi250-tp2.helm.yaml @@ -21,18 +21,20 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.0 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --trust-remote-code --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3 --mm-encoder-tp-mode data + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi300x-tp2.helm.yaml index c9cc601..74b1a81 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi300x-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-amd-mi300x-tp2.helm.yaml @@ -21,18 +21,20 @@ spec: spec: containers: - name: main - image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.0 + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 env: - name: ISVC_EXTRA_ARGS value: >- --disable-uvicorn-access-log --no-enable-log-requests --max-model-len -1 + --enable-prefix-caching --trust-remote-code --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3 --mm-encoder-tp-mode data + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' resources: requests: amd.com/gpu: 2 diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi250-tp2.helm.yaml new file mode 100644 index 0000000..31486ca --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi250-tp2.helm.yaml @@ -0,0 +1,50 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi250-tp2 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . | nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.6-27b + mif.moreh.io/role: decode + mif.moreh.io/accelerator.vendor: amd + mif.moreh.io/accelerator.model: mi250 + mif.moreh.io/parallelism: tp2 +spec: + framework: vllm + model: + name: Qwen/Qwen3.6-27B + parallelism: + tensor: 2 + template: + spec: + containers: + - name: main + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 + env: + - name: ISVC_EXTRA_ARGS + value: >- + --disable-uvicorn-access-log + --no-enable-log-requests + --max-model-len -1 + --enable-prefix-caching + --trust-remote-code + --enable-auto-tool-choice + --tool-call-parser qwen3_coder + --reasoning-parser qwen3 + --mm-encoder-tp-mode data + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' + resources: + requests: + amd.com/gpu: 2 + limits: + amd.com/gpu: 2 + nodeSelector: + moai.moreh.io/accelerator.vendor: amd + moai.moreh.io/accelerator.model: mi250 + tolerations: + - key: amd.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi300x-tp2.helm.yaml new file mode 100644 index 0000000..a411a10 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi300x-tp2.helm.yaml @@ -0,0 +1,50 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: quickstart-vllm-qwen-qwen3.6-27b-decode-amd-mi300x-tp2 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . | nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.6-27b + mif.moreh.io/role: decode + mif.moreh.io/accelerator.vendor: amd + mif.moreh.io/accelerator.model: mi300x + mif.moreh.io/parallelism: tp2 +spec: + framework: vllm + model: + name: Qwen/Qwen3.6-27B + parallelism: + tensor: 2 + template: + spec: + containers: + - name: main + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 + env: + - name: ISVC_EXTRA_ARGS + value: >- + --disable-uvicorn-access-log + --no-enable-log-requests + --max-model-len -1 + --enable-prefix-caching + --trust-remote-code + --enable-auto-tool-choice + --tool-call-parser qwen3_coder + --reasoning-parser qwen3 + --mm-encoder-tp-mode data + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' + resources: + requests: + amd.com/gpu: 2 + limits: + amd.com/gpu: 2 + nodeSelector: + moai.moreh.io/accelerator.vendor: amd + moai.moreh.io/accelerator.model: mi300x + tolerations: + - key: amd.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi250-tp2.helm.yaml new file mode 100644 index 0000000..02b5311 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi250-tp2.helm.yaml @@ -0,0 +1,50 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi250-tp2 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . | nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.6-27b + mif.moreh.io/role: prefill + mif.moreh.io/accelerator.vendor: amd + mif.moreh.io/accelerator.model: mi250 + mif.moreh.io/parallelism: tp2 +spec: + framework: vllm + model: + name: Qwen/Qwen3.6-27B + parallelism: + tensor: 2 + template: + spec: + containers: + - name: main + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 + env: + - name: ISVC_EXTRA_ARGS + value: >- + --disable-uvicorn-access-log + --no-enable-log-requests + --max-model-len -1 + --enable-prefix-caching + --trust-remote-code + --enable-auto-tool-choice + --tool-call-parser qwen3_coder + --reasoning-parser qwen3 + --mm-encoder-tp-mode data + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' + resources: + requests: + amd.com/gpu: 2 + limits: + amd.com/gpu: 2 + nodeSelector: + moai.moreh.io/accelerator.vendor: amd + moai.moreh.io/accelerator.model: mi250 + tolerations: + - key: amd.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi300x-tp2.helm.yaml new file mode 100644 index 0000000..188b053 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi300x-tp2.helm.yaml @@ -0,0 +1,50 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: quickstart-vllm-qwen-qwen3.6-27b-prefill-amd-mi300x-tp2 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . | nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.6-27b + mif.moreh.io/role: prefill + mif.moreh.io/accelerator.vendor: amd + mif.moreh.io/accelerator.model: mi300x + mif.moreh.io/parallelism: tp2 +spec: + framework: vllm + model: + name: Qwen/Qwen3.6-27B + parallelism: + tensor: 2 + template: + spec: + containers: + - name: main + image: 255250787067.dkr.ecr.ap-northeast-2.amazonaws.com/quickstart/moreh-vllm:v0.19.1.1 + env: + - name: ISVC_EXTRA_ARGS + value: >- + --disable-uvicorn-access-log + --no-enable-log-requests + --max-model-len -1 + --enable-prefix-caching + --trust-remote-code + --enable-auto-tool-choice + --tool-call-parser qwen3_coder + --reasoning-parser qwen3 + --mm-encoder-tp-mode data + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' + resources: + requests: + amd.com/gpu: 2 + limits: + amd.com/gpu: 2 + nodeSelector: + moai.moreh.io/accelerator.vendor: amd + moai.moreh.io/accelerator.model: mi300x + tolerations: + - key: amd.com/gpu + operator: Exists + effect: NoSchedule