From 3ddb397a96aabab2945287c678a37334082a2f2c Mon Sep 17 00:00:00 2001
From: guylei-code
Date: Thu, 25 Dec 2025 16:02:52 +0200
Subject: [PATCH 1/9] Change the vllm-module.ipynb output error, vllm_app.

---
 modules/src/vllm_module/item.yaml           |  16 ++
 modules/src/vllm_module/test_vllm_module.py |  35 +++
 modules/src/vllm_module/vllm-module.ipynb   | 234 ++++++++++++++++++++
 modules/src/vllm_module/vllm_module.py      | 138 ++++++++++++
 4 files changed, 423 insertions(+)
 create mode 100644 modules/src/vllm_module/item.yaml
 create mode 100644 modules/src/vllm_module/test_vllm_module.py
 create mode 100644 modules/src/vllm_module/vllm-module.ipynb
 create mode 100644 modules/src/vllm_module/vllm_module.py

diff --git a/modules/src/vllm_module/item.yaml b/modules/src/vllm_module/item.yaml
new file mode 100644
index 00000000..d7b54021
--- /dev/null
+++ b/modules/src/vllm_module/item.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+categories:
+- genai
+description: Deploys a vLLM OpenAI-compatible LLM server as an MLRun application runtime, with configurable GPU usage, node selection, tensor parallelism, and runtime flags.
+example: vllm_module.ipynb
+generationDate: 2025-12-17:12-25
+hidden: false
+labels:
+  author: Iguazio
+mlrunVersion: 1.10.0
+name: vllm_module
+spec:
+  filename: vllm_module.py
+  image: mlrun/mlrun
+  kind: generic
+version: 1.0.0
\ No newline at end of file
diff --git a/modules/src/vllm_module/test_vllm_module.py b/modules/src/vllm_module/test_vllm_module.py
new file mode 100644
index 00000000..3a5f422a
--- /dev/null
+++ b/modules/src/vllm_module/test_vllm_module.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from vllm_module import VLLMModule
+import mlrun
+
+
+class TestVllmModule:
+    """Test suite for the VLLMModule class."""
+
+    def setup_method(self):
+        project = mlrun.new_project("vllm", save=False)
+
+        # node_selector is keyword-only in VLLMModule, so pass it by name
+        self.vllm_module = VLLMModule(
+            project,
+            node_selector={"alpha.eksctl.io/nodegroup-name": "added-gpu"},
+        )
+
+    def test_vllm_module(self):
+        assert isinstance(
+            self.vllm_module.vllm_app,
+            mlrun.runtimes.nuclio.application.application.ApplicationRuntime,
+        )
diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb
new file mode 100644
index 00000000..05b584e4
--- /dev/null
+++ b/modules/src/vllm_module/vllm-module.ipynb
@@ -0,0 +1,234 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7d551647-dfc2-47da-bc8a-3792af622073",
+   "metadata": {},
+   "source": [
+    "# vLLM Module with MLRun\n",
+    "\n",
+    "This notebook shows how to configure and deploy a vLLM OpenAI-compatible server as an MLRun application runtime, then shows how to send a chat request to the deployed vLLM server."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7707b270-30cc-448a-a828-cb93aa28030d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlrun\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5cff681-bfdf-4468-a1d1-2aeadb56065e",
+   "metadata": {},
+   "source": [
+    "## Prerequisite\n",
+    "* At least one GPU is required for running this notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5c84798-289f-4b4f-8c1b-f4dd12a3bda5",
+   "metadata": {},
+   "source": [
+    "## What this notebook does\n",
+    "\n",
+    "In this notebook we will:\n",
+    "\n",
+    "- Create or load an **MLRun project**\n",
+    "- Import a custom **vLLM module** from the MLRun Hub\n",
+    "- Deploy a **vLLM OpenAI-compatible server** as an MLRun application runtime\n",
+    "- Configure deployment parameters such as model, GPU count, memory, node selector, port, and log level\n",
+    "- Invoke the deployed service using the `/v1/chat/completions` endpoint\n",
+    "- Parse the response and extract only the assistant’s generated text\n",
+    "\n",
+    "By the end of this notebook, you will have a working vLLM deployment that can be queried directly from a Jupyter notebook using OpenAI-style APIs.\n",
+    "\n",
+    "For more information, see the [vLLM OpenAI-compatible server documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "879ca641-ee35-4682-9995-4eb319d89090",
+   "metadata": {},
+   "source": [
+    "## 1. Create an MLRun project\n",
+    "\n",
+    "In this section we create or load an MLRun project that will own the deployed vLLM application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6eac263a-17d1-4454-9e19-459dfbe2f231",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project = mlrun.get_or_create_project(name=\"vllm-module\", context=\"\", user_project=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da49d335-b704-4fb6-801f-4d07b64f9be6",
+   "metadata": {},
+   "source": [
+    "## 2. Import the vLLM module from the MLRun Hub\n",
+    "\n",
+    "In this section we import the vLLM module from the MLRun Hub so we can instantiate `VLLMModule` and deploy it as an application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6d89dee-db58-4c0c-8009-b37020c9599a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vllm = mlrun.import_module(\"hub://vllm-module\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1202ddd5-0ce7-4769-be29-8fc264c1f80e",
+   "metadata": {},
+   "source": [
+    "## 3. Deploy the vLLM application runtime\n",
+    "\n",
+    "Configure the vLLM deployment parameters and deploy the application.\n",
+    "\n",
+    "The returned address is the service URL for the application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e433123a-e64b-4a7a-8c7f-8165bcdcc6d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the vLLM app\n",
+    "vllm_module = vllm.VLLMModule(\n",
+    "    project=project,\n",
+    "    node_selector={\"alpha.eksctl.io/nodegroup-name\": \"added-gpu\"},\n",
+    "    name=\"qwen-vllm\",\n",
+    "    image=\"vllm/vllm-openai:latest\",\n",
+    "    model=\"Qwen/Qwen2.5-Omni-3B\",\n",
+    "    gpus=1,\n",
+    "    mem=\"10G\",\n",
+    "    port=8000,\n",
+    "    dtype=\"auto\",\n",
+    "    uvicorn_log_level=\"info\",\n",
+    "    max_tokens=501,\n",
+    ")\n",
+    "\n",
+    "# Deploy the vLLM app\n",
+    "addr = vllm_module.vllm_app.deploy(with_mlrun=True)\n",
+    "addr"
+   ]
+  },
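+  {
+   "cell_type": "markdown",
+   "id": "9c1f2e3a-4b5d-4f6a-8e7b-1a2b3c4d5e6f",
+   "metadata": {},
+   "source": [
+    "A multi-GPU variant of the same call, as a sketch (it assumes two GPUs are available on the selected node; when `tensor_parallel_size` is omitted it defaults to the GPU count):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical two-GPU deployment; adjust gpus/mem to your cluster\n",
+    "vllm_module = vllm.VLLMModule(\n",
+    "    project=project,\n",
+    "    name=\"qwen-vllm-2gpu\",\n",
+    "    gpus=2,\n",
+    "    tensor_parallel_size=2,\n",
+    "    mem=\"20G\",\n",
+    ")\n",
+    "```\n",
+    "\n",
+    "With `gpus > 1` the module also mounts an in-memory `/dev/shm` volume, which vLLM needs for tensor-parallel communication."
+   ]
+  },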
+  {
+   "cell_type": "markdown",
+   "id": "06832de3-5c31-43bf-b07b-0e71fb2d072d",
+   "metadata": {},
+   "source": [
+    "## 4. Get the runtime handle\n",
+    "\n",
+    "Fetch the runtime object and invoke the service using `app.invoke(...)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "102d3fd0-1ee6-49b8-8c86-df742ac1c559",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: use get_runtime() to get the MLRun application runtime\n",
+    "app = vllm_module.get_runtime()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "925730c1-0ac5-454b-8fb2-ab8cebb3f3ac",
+   "metadata": {},
+   "source": [
+    "## 5. Send a chat request for testing\n",
+    "\n",
+    "Call the OpenAI-compatible endpoint `/v1/chat/completions`, parse the JSON response, and print only the assistant message text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "31bc78d4-1c6f-439c-b894-1522e3a6d3e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "body = {\n",
+    "    \"model\": vllm_module.model,\n",
+    "    \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n",
+    "    \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n",
+    "}\n",
+    "\n",
+    "resp = app.invoke(path=\"/v1/chat/completions\", body=body)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "assistant:\n",
+      "\n",
+      "As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = resp\n",
+    "assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"\\nassistant:\\n\")\n",
+    "print(assistant_text.strip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "957b5d21-7ade-4131-9100-878652c477fc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mlrun-base",
+   "language": "python",
+   "name": "conda-env-mlrun-base-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.22"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/modules/src/vllm_module/vllm_module.py b/modules/src/vllm_module/vllm_module.py
new file mode 100644
index 00000000..ce6307cc
--- /dev/null
+++ b/modules/src/vllm_module/vllm_module.py
@@ -0,0 +1,138 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This module deploys a vLLM server that exposes an OpenAI-compatible API as an MLRun application runtime.
+# Once deployed, you can send chat prompts, create embeddings, or get model responses without worrying about container, GPU, or endpoint wiring.
+# It simplifies serving so you can test, analyze, or integrate LLM features directly into your projects or notebooks with minimal setup.
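+#
+# Example (sketch, assuming `project` is an MLRun project object):
+#     module = VLLMModule(project, gpus=1)
+#     addr = module.vllm_app.deploy(with_mlrun=True)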
+
+
+from typing import Dict, Optional, List
+
+
+class VLLMModule:
+    """
+    VLLMModule
+
+    This module provides a lightweight wrapper for deploying a vLLM
+    (OpenAI-compatible) large language model server as an MLRun application runtime.
+
+    The VLLMModule is responsible for:
+    - Creating an MLRun application runtime based on a vLLM container image
+    - Configuring GPU resources, memory limits, and Kubernetes node selection
+    - Launching the model using `vllm serve` with configurable runtime flags
+    - Supporting multi-GPU inference via tensor parallelism
+    - Automatically configuring shared memory (/dev/shm) when using multiple GPUs
+    - Exposing an OpenAI-compatible API (e.g. /v1/chat/completions) for inference
+    - Providing a simple Python interface for deployment and invocation from Jupyter notebooks
+
+    The module is designed to be used in Jupyter notebooks and MLRun pipelines,
+    allowing users to deploy and test large language models on Kubernetes
+    with minimal configuration.
+    """
+
+    def __init__(
+        self,
+        project,  # an MLRun project object (mlrun.projects.MlrunProject)
+        *,
+        node_selector: Optional[Dict[str, str]] = None,
+        name: str = "vllm",
+        image: str = "vllm/vllm-openai:latest",
+        model: str = "Qwen/Qwen2.5-Omni-3B",
+        gpus: int = 1,
+        mem: str = "10G",
+        port: int = 8000,
+        dtype: str = "auto",
+        tensor_parallel_size: Optional[int] = None,
+        uvicorn_log_level: str = "info",
+        max_tokens: int = 500,
+    ):
+        if gpus < 1:
+            raise ValueError("gpus must be >= 1")
+
+        if tensor_parallel_size is not None:
+            if tensor_parallel_size < 1:
+                raise ValueError("tensor_parallel_size must be >= 1")
+            if tensor_parallel_size > gpus:
+                raise ValueError(
+                    f"tensor_parallel_size ({tensor_parallel_size}) cannot be greater than gpus ({gpus})"
+                )
+
+        if node_selector is None:
+            node_selector = {"alpha.eksctl.io/nodegroup-name": "added-gpu"}
+
+        if not isinstance(max_tokens, int):
+            raise TypeError("max_tokens must be an integer")
+
+        if max_tokens < 1:
+            raise ValueError("max_tokens must be >= 1")
+
+        self.project = project
+        self.name = name
+        self.image = image
+        self.model = model
+        self.gpus = gpus
+        self.mem = mem
+        self.node_selector = node_selector
+        self.port = port
+        self.dtype = dtype
+        self.tensor_parallel_size = tensor_parallel_size
+        self.uvicorn_log_level = uvicorn_log_level
+        self.max_tokens = max_tokens
+
+        self.vllm_app = self.project.set_function(
+            name=self.name,
+            kind="application",
+            image=self.image,
+        )
+
+        self.vllm_app.with_limits(gpus=self.gpus, mem=self.mem)
+
+        if self.node_selector:
+            self.vllm_app.with_node_selection(node_selector=self.node_selector)
+
+        self.vllm_app.set_internal_application_port(self.port)
+
+        args: List[str] = [
+            "serve",
+            self.model,
+            "--dtype",
+            self.dtype,
+            "--port",
+            str(self.port),
+        ]
+
+        if self.uvicorn_log_level:
+            args += ["--uvicorn-log-level", self.uvicorn_log_level]
+
+        if self.gpus > 1:
+            tps = self.tensor_parallel_size or self.gpus
+            args += ["--tensor-parallel-size", str(tps)]
+
+            # With more than one GPU, vLLM needs a shared-memory volume (/dev/shm)
+            # for tensor-parallel communication between the GPUs
+            self.vllm_app.spec.volumes = [{"name": "dshm", "emptyDir": {"medium": "Memory"}}]
+            self.vllm_app.spec.volume_mounts = [{"name": "dshm", "mountPath": "/dev/shm"}]
+
+        self.vllm_app.spec.command = "vllm"
+        self.vllm_app.spec.args = args
+
+        self.vllm_app.spec.min_replicas = 1
+        self.vllm_app.spec.max_replicas = 1
+
+    def get_runtime(self):
+        return self.vllm_app
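+
+    # Example (sketch): extra flags are appended verbatim to the `vllm serve`
+    # command line, e.g.
+    #     module.add_args(["--max-model-len", "4096"])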
ValueError("extra_args must be a list of strings") + self.vllm_app.spec.args += extra_args \ No newline at end of file From 4b1e44839a380b4df20b4e86a2e4a1fa06328a04 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Thu, 25 Dec 2025 16:05:24 +0200 Subject: [PATCH 2/9] Chane the vllm-module.ipynb output error ,vllm_app. --- modules/src/vllm_module/item.yaml | 2 +- modules/src/vllm_module/test_vllm_module.py | 2 +- modules/src/vllm_module/vllm-module.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/src/vllm_module/item.yaml b/modules/src/vllm_module/item.yaml index d7b54021..edc66f4d 100644 --- a/modules/src/vllm_module/item.yaml +++ b/modules/src/vllm_module/item.yaml @@ -1,7 +1,7 @@ apiVersion: v1 categories: - genai -description: Deploys a vLLM OpenAI-compatible LLM server as an MLRun application runtime, with configurable GPU usage, node selection, tensor parallelism, and runtime flags. +description: Deploys a vLLM OpenAI-compatible LLM server as an MLRun application runtime, with configurable GPU usage, node selection, tensor parallelism and runtime flags. example: vllm_module.ipynb generationDate: 2025-12-17:12-25 hidden: false diff --git a/modules/src/vllm_module/test_vllm_module.py b/modules/src/vllm_module/test_vllm_module.py index 3a5f422a..f2162900 100644 --- a/modules/src/vllm_module/test_vllm_module.py +++ b/modules/src/vllm_module/test_vllm_module.py @@ -18,7 +18,7 @@ class TestVllmModule: - """Test suite for VLLMModule class.""" + """Test suite for VLLMModule class""" def setup_method(self): project = mlrun.new_project("vllm", save=False) diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb index 05b584e4..8dda8054 100644 --- a/modules/src/vllm_module/vllm-module.ipynb +++ b/modules/src/vllm_module/vllm-module.ipynb @@ -170,7 +170,7 @@ "body = {\n", " \"model\": vllm_module.model,\n", " \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n", - " \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n", + " \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n", "}\n", "\n", "resp = app.invoke(path=\"/v1/chat/completions\", body=body)" From bc38f7ad942139e51fc7c3c3e2a0cc58973935d0 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 27 Apr 2026 14:58:01 +0300 Subject: [PATCH 3/9] toxicity guardrail first commit --- steps/src/toxicity_guardrail/item.yaml | 23 ++ steps/src/toxicity_guardrail/requirements.txt | 2 + .../test_toxicity_guardrail.py | 42 +++ .../toxicity_guardrail.ipynb | 320 ++++++++++++++++++ .../toxicity_guardrail/toxicity_guardrail.py | 61 ++++ 5 files changed, 448 insertions(+) create mode 100644 steps/src/toxicity_guardrail/item.yaml create mode 100644 steps/src/toxicity_guardrail/requirements.txt create mode 100644 steps/src/toxicity_guardrail/test_toxicity_guardrail.py create mode 100644 steps/src/toxicity_guardrail/toxicity_guardrail.ipynb create mode 100644 steps/src/toxicity_guardrail/toxicity_guardrail.py diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml new file mode 100644 index 00000000..95fe820e --- /dev/null +++ b/steps/src/toxicity_guardrail/item.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +categories: + - data-preparation + - model-serving + - genai +description: Filters toxic requests using a pre-trained text classifier before they reach the LLM +example: toxicity_guardrail.ipynb +generationDate: 2026-04-27:12-00 +hidden: false +labels: + 
+  author: Iguazio
+mlrunVersion: 1.10.0
+name: toxicity_guardrail
+className: ToxicityGuardrailStep
+defaultHandler:
+spec:
+  filename: toxicity_guardrail.py
+  image: mlrun/mlrun
+  requirements:
+  - transformers
+  - torch
+  kind: generic
+version: 1.0.0
\ No newline at end of file
diff --git a/steps/src/toxicity_guardrail/requirements.txt b/steps/src/toxicity_guardrail/requirements.txt
new file mode 100644
index 00000000..5061402e
--- /dev/null
+++ b/steps/src/toxicity_guardrail/requirements.txt
@@ -0,0 +1,2 @@
+transformers==4.47.0
+torch==2.6.0
diff --git a/steps/src/toxicity_guardrail/test_toxicity_guardrail.py b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
new file mode 100644
index 00000000..42e85fd3
--- /dev/null
+++ b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
@@ -0,0 +1,42 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+from toxicity_guardrail import ToxicityGuardrailStep
+
+
+class TestToxicityGuardrailStep:
+    """Test suite for the ToxicityGuardrailStep class."""
+
+    def setup_method(self):
+        """Set up test fixtures before each test method."""
+        project = mlrun.new_project("toxicity-guardrail", save=False)
+        self.fn = project.set_function(
+            "toxicity_guardrail.py",
+            name="guardrail-fn",
+            kind="serving",
+            image="mlrun/mlrun",
+        )
+        graph = self.fn.set_topology("flow", engine="async")
+        graph.to(
+            class_name="ToxicityGuardrailStep",
+            name="toxicity_guardrail",
+            threshold=0.5,
+        ).respond()
+
+    def test_toxicity_guardrail_step(self):
+        """Test that the serving function is correctly configured with ToxicityGuardrailStep."""
+        assert isinstance(self.fn, mlrun.runtimes.ServingRuntime)
diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
new file mode 100644
index 00000000..0085a3ba
--- /dev/null
+++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
@@ -0,0 +1,320 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "93c9feca-c120-443e-bbd3-731f70d49682",
+   "metadata": {},
+   "source": [
+    "## Pipeline: Toxicity Guardrail (Hub Step) → LLM Model Runner\n",
+    "\n",
+    "A unified serving graph that:\n",
+    "1. Routes the user's question through a toxicity guardrail hub step\n",
+    "2. If safe → calls a `ModelRunnerStep` (LLM) and returns the answer\n",
+    "3. If toxic → blocks the request with a clear rejection response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67435be350de0cea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlrun"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000001",
+   "metadata": {},
+   "source": [
+    "Create or load the MLRun project that will hold the serving function and its secrets."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd9fd3609223be6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2026-04-27 10:59:47,707 [info] Loading project from path: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"user_project\":false}\n", + "> 2026-04-27 11:00:02,102 [info] Project loaded successfully: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"stored_in_db\":true}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\"hubstep-guardrail-toxicity\", user_project=False, context=\"./\", allow_cross_project=True)" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000002", + "metadata": {}, + "source": [ + "Load credentials from a local `.env` file. The file should define `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `OPENAI_MODEL`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bc0d94-4939-46c1-ac0d-2e90fd465c9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(\"cred.env\", override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000003", + "metadata": {}, + "source": [ + "Store the credentials as Kubernetes secrets so the deployed Nuclio function can access them securely at runtime — no environment variables need to be injected manually." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "884a67ca-d548-4d7d-bab1-ca8868fbe875", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "project.set_secrets(\n", + " secrets={\n", + " \"OPENAI_API_KEY\": os.getenv(\"OPENAI_API_KEY\"),\n", + " \"OPENAI_BASE_URL\": os.getenv(\"OPENAI_BASE_URL\"),\n", + " \"OPENAI_MODEL\": os.getenv(\"OPENAI_MODEL\"),\n", + " },\n", + " provider=\"kubernetes\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000004", + "metadata": {}, + "source": [ + "## Build the serving graph\n", + "\n", + "`LLMModel` wraps an OpenAI-compatible API and reads credentials from the Kubernetes secrets set above.\n", + "`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict\n", + "(`{\"llm_model\": {\"answer\": ...}}`) into a simple `{\"answer\": ...}` response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "505c77e2-6875-499d-ae05-c6de3efa0622", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting serving_graph.py\n" + ] + } + ], + "source": [ + "%%writefile serving_graph.py\n", + "from typing import Dict, Any\n", + "from mlrun.serving import Model\n", + "\n", + "class LLMModel(Model):\n", + " \"\"\"OpenAI-compatible LLM. 
Credentials and model are read from env vars:\n", + " OPENAI_API_KEY, OPENAI_BASE_URL (optional), OPENAI_MODEL (optional, falls back to default_model_name).\n", + " \"\"\"\n", + "\n", + " def __init__(self, default_model_name: str = \"gpt-4o-mini\", **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.default_model_name = default_model_name\n", + "\n", + " def load(self):\n", + " import openai, os\n", + " self.model_name = os.environ.get(\"OPENAI_MODEL\", self.default_model_name)\n", + " client_kwargs = {\"api_key\": os.environ[\"OPENAI_API_KEY\"]}\n", + " base_url = os.environ.get(\"OPENAI_BASE_URL\")\n", + " if base_url:\n", + " client_kwargs[\"base_url\"] = base_url\n", + " self._client = openai.OpenAI(**client_kwargs)\n", + "\n", + " def predict(self, body: Dict[str, Any]) -> Dict[str, Any]:\n", + " question = body.get(\"question\", \"\")\n", + " response = self._client.chat.completions.create(\n", + " model=self.model_name,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": question},\n", + " ],\n", + " )\n", + " return {\"answer\": response.choices[0].message.content, \"model\": self.model_name}\n", + "\n", + "\n", + "def format_answer(event: Dict[str, Any]) -> Dict[str, Any]:\n", + " \"\"\"Flatten ModelRunnerStep output: {\"llm_model\": {\"answer\": ...}} → {\"answer\": ...}\"\"\"\n", + " if isinstance(event, dict):\n", + " for _, model_output in event.items():\n", + " if isinstance(model_output, dict):\n", + " return model_output\n", + " return event" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000005", + "metadata": {}, + "source": [ + "Wire the three-step async flow graph:\n", + "1. **`toxicity_guardrail`** — loaded directly from `hub://toxicity_guardrail`; blocks requests with a toxicity score ≥ `threshold`\n", + "2. **`llm_runner`** — a `ModelRunnerStep` that runs `LLMModel` against the OpenAI-compatible API\n", + "3. **`format_answer`** — flattens the runner output and sends the response back to the caller" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d0435a5-4e65-4a33-a146-8c6abb382b37", + "metadata": {}, + "outputs": [], + "source": [ + "from mlrun.serving import ModelRunnerStep\n", + "\n", + "fn_pipeline = project.set_function(\n", + " name=\"toxicity-llm-pipeline\",\n", + " func=\"serving_graph.py\",\n", + " kind=\"serving\",\n", + " image=\"mlrun/mlrun\",\n", + " requirements=[\"transformers\", \"torch\", \"openai\"],\n", + ")\n", + "# Credentials come from Kubernetes secrets set above — no set_envs() needed for them.\n", + "\n", + "graph = fn_pipeline.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "graph.add_step(\n", + " class_name=\"hub://toxicity_guardrail\",\n", + " name=\"toxicity_guardrail\",\n", + " threshold=0.5,\n", + ")\n", + "\n", + "model_runner = ModelRunnerStep(name=\"llm_runner\")\n", + "model_runner.add_model(endpoint_name=\"llm_model\", model_class=\"LLMModel\")\n", + "graph.add_step(model_runner, after=\"toxicity_guardrail\")\n", + "\n", + "graph.add_step(name=\"format_answer\", handler=\"format_answer\", after=\"llm_runner\").respond()\n", + "\n", + "graph.plot(rankdir=\"LR\")" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000006", + "metadata": {}, + "source": [ + "Deploy the function to Nuclio. This builds a container image with the required packages (`transformers`, `torch`, `openai`) and starts the serving endpoint." 
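+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7e4c1a0-9f2d-4c3e-8a5b-6d7e8f9a0b1c",
+   "metadata": {},
+   "source": [
+    "Optionally, smoke-test the graph in-process with MLRun's mock server before deploying (a sketch; it assumes `transformers`, `torch`, and `openai` are installed locally, and it downloads the toxicity classifier on first use):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical local test - no Nuclio deployment involved\n",
+    "server = fn_pipeline.to_mock_server()\n",
+    "print(server.test(\"/\", body={\"question\": \"What is MLRun?\"}))\n",
+    "```"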
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "973ceab8-48b5-4689-a39e-b83ac3e75ddc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2026-04-27 11:08:33,830 [info] Starting remote function deploy\n", + "2026-04-27 11:08:34 (info) Deploying function\n", + "2026-04-27 11:08:34 (info) Building\n", + "2026-04-27 11:08:34 (info) Staging files and preparing base images\n", + "2026-04-27 11:08:34 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2026-04-27 11:08:34 (info) Building processor image\n", + "2026-04-27 11:17:59 (info) Build complete\n", + "2026-04-27 11:20:19 (info) Function deploy complete\n", + "> 2026-04-27 11:20:27,592 [info] Model endpoint creation task completed with state succeeded\n", + "> 2026-04-27 11:20:27,592 [info] Successfully deployed function: {\"external_invocation_urls\":[\"hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/\"],\"internal_invocation_urls\":[\"nuclio-hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.svc.cluster.local:8080\"]}\n", + "Pipeline deployed: DeployStatus(state=ready, outputs={'endpoint': 'http://hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/', 'name': 'hubstep-guardrail-toxicity-toxicity-llm-pipeline'})\n" + ] + } + ], + "source": [ + "addr = project.deploy_function(fn_pipeline)\n", + "print(\"Pipeline deployed:\", addr)" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000007", + "metadata": {}, + "source": [ + "Test the pipeline with a safe question and a toxic one.\n", + "Safe requests pass through to the LLM and return an answer; toxic ones are blocked by the guardrail before reaching the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd5cd2c9-3abe-401f-bb2e-65d216b9951d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Safe input ===\n", + "Response: {'answer': 'Washington, D.C. 
It\u2019s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", + "\n", + "=== Toxic input ===\n", + "Blocked (expected): bad function response 500: Exception caught in handler - \"Request blocked: toxicity score 0.953 >= 0.5\"\n" + ] + } + ], + "source": [ + "# --- Safe input: should return an LLM answer ---\n", + "print(\"=== Safe input ===\")\n", + "result = fn_pipeline.invoke(\"/\", {\"question\": \"What is the capital of the USA?\"})\n", + "print(\"Response:\", result)\n", + "\n", + "# --- Toxic input: should be blocked ---\n", + "print(\"\\n=== Toxic input ===\")\n", + "try:\n", + " result = fn_pipeline.invoke(\"/\", {\"question\": \"You're completely useless and everything you do is a failure\"})\n", + " print(\"Response:\", result)\n", + "except Exception as e:\n", + " print(f\"Blocked (expected): {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.py b/steps/src/toxicity_guardrail/toxicity_guardrail.py new file mode 100644 index 00000000..def0616b --- /dev/null +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.py @@ -0,0 +1,61 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Dict + + +class ToxicityGuardrailStep: + """ + A serving graph step that filters out toxic requests using a pre-trained + text classification model. + + If the toxicity score of the input text meets or exceeds the threshold, + the request is blocked with a ValueError. Safe requests are passed through + unchanged. + + The classifier label "toxic" maps directly to the toxicity score; any + other label (e.g. "non-toxic") inverts the model's confidence score. 
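+
+    Example (a sketch, running the step directly outside a serving graph):
+
+        step = ToxicityGuardrailStep(threshold=0.5)
+        step.post_init()  # downloads and loads the classifier
+        step.do({"question": "What is MLRun?"})  # returns the event when safe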
+ """ + + def __init__( + self, + context=None, + name=None, + threshold: float = 0.5, + model_name: str = "unitary/toxic-bert", + **kwargs, + ): + self.threshold = threshold + self.model_name = model_name + self._classifier = None + + def post_init(self, mode="sync", **kwargs): + from transformers import pipeline + + self._classifier = pipeline("text-classification", model=self.model_name) + + def do(self, event: Dict[str, Any]) -> Dict[str, Any]: + question = event.get("question", "") + result = self._classifier(question)[0] + score = ( + result["score"] + if result["label"] == "toxic" + else 1 - result["score"] + ) + if score >= self.threshold: + raise ValueError( + f"Request blocked: toxicity score {score:.3f} >= {self.threshold}" + ) + return event From 6cbffc5be932c3ec60b153705cc6233f857f979d Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 27 Apr 2026 15:04:08 +0300 Subject: [PATCH 4/9] delete vllm-module.ipynb --- modules/src/vllm_module/vllm-module.ipynb | 234 ---------------------- 1 file changed, 234 deletions(-) delete mode 100644 modules/src/vllm_module/vllm-module.ipynb diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb deleted file mode 100644 index 8dda8054..00000000 --- a/modules/src/vllm_module/vllm-module.ipynb +++ /dev/null @@ -1,234 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7d551647-dfc2-47da-bc8a-3792af622073", - "metadata": {}, - "source": [ - "# vLLM Module with MLRun\n", - "\n", - "This notebook shows how to configure and deploy a vLLM OpenAI compatible server as an MLRun application runtime, then showcases how to send a chat request to it to the vLLM server." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "7707b270-30cc-448a-a828-cb93aa28030d", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n" - ] - }, - { - "cell_type": "markdown", - "id": "d5cff681-bfdf-4468-a1d1-2aeadb56065e", - "metadata": {}, - "source": [ - "## Prerequisite\n", - "* At lease one GPU is required for running this notebook." - ] - }, - { - "cell_type": "markdown", - "id": "d5c84798-289f-4b4f-8c1b-f4dd12a3bda5", - "metadata": {}, - "source": [ - "## What this notebook does\n", - "\n", - "In this notebook we will:\n", - "\n", - "- Create or load an **MLRun project**\n", - "- Import a custom **vLLM module** from the MLRun Hub\n", - "- Deploy a **vLLM OpenAI-compatible server** as an MLRun application runtime\n", - "- Configure deployment parameters such as model, GPU count, memory, node selector, port, and log level\n", - "- Invoke the deployed service using the `/v1/chat/completions` endpoint\n", - "- Parse the response and extract only the assistant’s generated text\n", - "\n", - "By the end of this notebook, you will have a working vLLM deployment that can be queried directly from a Jupyter notebook using OpenAI-style APIs.\n", - "\n", - "For more information about [vLLM documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server/)" - ] - }, - { - "cell_type": "markdown", - "id": "879ca641-ee35-4682-9995-4eb319d89090", - "metadata": {}, - "source": [ - "## 1. Create an MLRun project\n", - "\n", - "In this section we create or load an MLRun project that will own the deployed vLLM application runtime." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6eac263a-17d1-4454-9e19-459dfbe2f231", - "metadata": {}, - "outputs": [], - "source": [ - "project = mlrun.get_or_create_project(name=\"vllm-module\", context=\"\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "id": "da49d335-b704-4fb6-801f-4d07b64f9be6", - "metadata": {}, - "source": [ - "## 2. Import the vLLM module from the MLRun Hub\n", - "\n", - "In this section we import the vLLM module from the MLRun Hub so we can instantiate `VLLMModule` and deploy it as an application runtime." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6d89dee-db58-4c0c-8009-b37020c9599a", - "metadata": {}, - "outputs": [], - "source": [ - "vllm = mlrun.import_module(\"hub://vllm-module\")" - ] - }, - { - "cell_type": "markdown", - "id": "1202ddd5-0ce7-4769-be29-8fc264c1f80e", - "metadata": {}, - "source": [ - "## 3. Deploy the vLLM application runtime\n", - "\n", - "Configure the vLLM deployment parameters and deploy the application.\n", - "\n", - "The returned address is the service URL for the application runtime." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e433123a-e64b-4a7a-8c7f-8165bcdcc6d1", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the vLLM app\n", - "vllm_module = vllm.VLLMModule(\n", - " project=project,\n", - " node_selector={\"alpha.eksctl.io/nodegroup-name\": \"added-gpu\"},\n", - " name=\"qwen-vllm\",\n", - " image=\"vllm/vllm-openai:latest\",\n", - " model=\"Qwen/Qwen2.5-Omni-3B\",\n", - " gpus=1,\n", - " mem=\"10G\",\n", - " port=8000,\n", - " dtype=\"auto\",\n", - " uvicorn_log_level=\"info\",\n", - " max_tokens = 501,\n", - ")\n", - "\n", - "# Deploy the vLLM app\n", - "addr = vllm_module.vllm_app.deploy(with_mlrun=True)\n", - "addr" - ] - }, - { - "cell_type": "markdown", - "id": "06832de3-5c31-43bf-b07b-0e71fb2d072d", - "metadata": {}, - "source": [ - "## 4. Get the runtime handle\n", - "\n", - "Fetch the runtime object and invoke the service using `app.invoke(...)`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "102d3fd0-1ee6-49b8-8c86-df742ac1c559", - "metadata": {}, - "outputs": [], - "source": [ - "# Optional: get_runtime() method uses to get the MLRun application runtime\n", - "app = vllm_module.get_runtime()" - ] - }, - { - "cell_type": "markdown", - "id": "925730c1-0ac5-454b-8fb2-ab8cebb3f3ac", - "metadata": {}, - "source": [ - "## 5. Send a chat request for testing\n", - "\n", - "Call the OpenAI compatible endpoint `/v1/chat/completions`, parse the JSON response, and print only the assistant message text." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "31bc78d4-1c6f-439c-b894-1522e3a6d3e6", - "metadata": {}, - "outputs": [], - "source": [ - "body = {\n", - " \"model\": vllm_module.model,\n", - " \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n", - " \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n", - "}\n", - "\n", - "resp = app.invoke(path=\"/v1/chat/completions\", body=body)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "assistant:\n", - "\n", - "As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n" - ] - } - ], - "source": [ - "data = resp\n", - "assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n", - "\n", - "print(\"\\nassistant:\\n\")\n", - "print(assistant_text.strip())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "957b5d21-7ade-4131-9100-878652c477fc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mlrun-base", - "language": "python", - "name": "conda-env-mlrun-base-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.22" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From b1acc880e55b10cbced1a5176bae9e31a06925d4 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 27 Apr 2026 16:18:39 +0300 Subject: [PATCH 5/9] second commit, update changes requested in ipynb --- .../toxicity_guardrail.ipynb | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb index 0085a3ba..973ab392 100644 --- a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb @@ -55,7 +55,14 @@ "id": "m001-0000-0000-0000-000000000002", "metadata": {}, "source": [ - "Load credentials from a local `.env` file. The file should define `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `OPENAI_MODEL`." + "### Load credentials from a local `.env` file.\n", + "\n", + "For example:\n", + "```\n", + "OPENAI_API_KEY=\"...\"\n", + "OPENAI_BASE_URL=\"...\"\n", + "OPENAI_MODEL=\"...\"\n", + "```" ] }, { @@ -85,16 +92,12 @@ "cell_type": "markdown", "id": "m001-0000-0000-0000-000000000003", "metadata": {}, - "source": [ - "Store the credentials as Kubernetes secrets so the deployed Nuclio function can access them securely at runtime — no environment variables need to be injected manually." 
- ] + "source": "Store the credentials as project secrets - see also [working with secrets](http://docs.mlrun.org/en/stable/secrets.html).\n" }, { "cell_type": "code", - "execution_count": null, "id": "884a67ca-d548-4d7d-bab1-ca8868fbe875", "metadata": {}, - "outputs": [], "source": [ "import os\n", "project.set_secrets(\n", @@ -103,10 +106,11 @@ " \"OPENAI_BASE_URL\": os.getenv(\"OPENAI_BASE_URL\"),\n", " \"OPENAI_MODEL\": os.getenv(\"OPENAI_MODEL\"),\n", " },\n", - " provider=\"kubernetes\",\n", ")\n", "project.save()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -115,7 +119,7 @@ "source": [ "## Build the serving graph\n", "\n", - "`LLMModel` wraps an OpenAI-compatible API and reads credentials from the Kubernetes secrets set above.\n", + "`LLMModel` wraps an OpenAI-compatible API and reads credentials to the Kubernetes secrets set above.\n", "`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict\n", "(`{\"llm_model\": {\"answer\": ...}}`) into a simple `{\"answer\": ...}` response." ] @@ -228,9 +232,7 @@ "cell_type": "markdown", "id": "m001-0000-0000-0000-000000000006", "metadata": {}, - "source": [ - "Deploy the function to Nuclio. This builds a container image with the required packages (`transformers`, `torch`, `openai`) and starts the serving endpoint." - ] + "source": "Deploy the Serving function, with the required packages (`transformers`, `torch`, `openai`)." }, { "cell_type": "code", @@ -256,10 +258,7 @@ ] } ], - "source": [ - "addr = project.deploy_function(fn_pipeline)\n", - "print(\"Pipeline deployed:\", addr)" - ] + "source": "addr = project.deploy_function(fn_pipeline)" }, { "cell_type": "markdown", @@ -281,7 +280,7 @@ "output_type": "stream", "text": [ "=== Safe input ===\n", - "Response: {'answer': 'Washington, D.C. It\u2019s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", + "Response: {'answer': 'Washington, D.C. It’s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", "\n", "=== Toxic input ===\n", "Blocked (expected): bad function response 500: Exception caught in handler - \"Request blocked: toxicity score 0.953 >= 0.5\"\n" From b50b48cc3a4900e77a3e362a6dde8278b06be8b6 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Tue, 28 Apr 2026 11:23:10 +0300 Subject: [PATCH 6/9] third commit, update changes requested in ipynb --- .../toxicity_guardrail.ipynb | 54 +++---------------- 1 file changed, 8 insertions(+), 46 deletions(-) diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb index 973ab392..ddc3fe99 100644 --- a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb @@ -13,16 +13,6 @@ "3. 
     "3. If toxic → blocks the request with a clear rejection response"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "67435be350de0cea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import mlrun"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "m001-0000-0000-0000-000000000001",
@@ -47,6 +37,7 @@
    }
   ],
   "source": [
+    "import mlrun\n",
    "project = mlrun.get_or_create_project(\"hubstep-guardrail-toxicity\", user_project=False, context=\"./\", allow_cross_project=True)"
   ]
  },
@@ -67,26 +58,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "61bc0d94-4939-46c1-ac0d-2e90fd465c9c",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv(\"cred.env\", override=True)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
  },
@@ -236,29 +216,11 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
   "id": "973ceab8-48b5-4689-a39e-b83ac3e75ddc",
   "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "> 2026-04-27 11:08:33,830 [info] Starting remote function deploy\n",
-     "2026-04-27 11:08:34 (info) Deploying function\n",
-     "2026-04-27 11:08:34 (info) Building\n",
-     "2026-04-27 11:08:34 (info) Staging files and preparing base images\n",
-     "2026-04-27 11:08:34 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n",
-     "2026-04-27 11:08:34 (info) Building processor image\n",
-     "2026-04-27 11:17:59 (info) Build complete\n",
-     "2026-04-27 11:20:19 (info) Function deploy complete\n",
-     "> 2026-04-27 11:20:27,592 [info] Model endpoint creation task completed with state succeeded\n",
-     "> 2026-04-27 11:20:27,592 [info] Successfully deployed function: {\"external_invocation_urls\":[\"hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/\"],\"internal_invocation_urls\":[\"nuclio-hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.svc.cluster.local:8080\"]}\n",
-     "Pipeline deployed: DeployStatus(state=ready, outputs={'endpoint': 'http://hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/', 'name': 'hubstep-guardrail-toxicity-toxicity-llm-pipeline'})\n"
-    ]
-   }
-  ],
-  "source": "addr = project.deploy_function(fn_pipeline)"
+  "source": "addr = project.deploy_function(fn_pipeline)",
+  "outputs": [],
+  "execution_count": null
  },

From 42c464fe18a46041f8e36d2652a3c14a32e290a3 Mon Sep 17 00:00:00 2001
From: guylei-code
Date: Tue, 28 Apr 2026 12:18:55 +0300
Subject: [PATCH 7/9] Update steps/src/toxicity_guardrail/item.yaml

Co-authored-by: Eyal Danieli
---
 steps/src/toxicity_guardrail/item.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml
index 95fe820e..9162d11e 100644
--- a/steps/src/toxicity_guardrail/item.yaml
+++ 
b/steps/src/toxicity_guardrail/item.yaml @@ -9,7 +9,7 @@ generationDate: 2026-04-27:12-00 hidden: false labels: author: Iguazio -mlrunVersion: 1.10.0 +mlrunVersion: 1.11.0-rc48 name: toxicity_guardrail className: ToxicityGuardrailStep defaultHandler: From e3c26cbcae04ef6e58b499979baf7ddec0faa75c Mon Sep 17 00:00:00 2001 From: guylei-code Date: Tue, 28 Apr 2026 13:49:49 +0300 Subject: [PATCH 8/9] Specify versions for transformers and torch --- steps/src/toxicity_guardrail/item.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml index 9162d11e..ebe23b4a 100644 --- a/steps/src/toxicity_guardrail/item.yaml +++ b/steps/src/toxicity_guardrail/item.yaml @@ -17,7 +17,7 @@ spec: filename: toxicity_guardrail.py image: mlrun/mlrun requirements: - - transformers - - torch + - transformers==4.46.3 + - torch==2.11.0 kind: generic -version: 1.0.0 \ No newline at end of file +version: 1.0.0 From b16f27bde49ae59c42b403881f0557d8f6f0c0cf Mon Sep 17 00:00:00 2001 From: guylei-code Date: Tue, 28 Apr 2026 14:03:09 +0300 Subject: [PATCH 9/9] third commit, update changes requested in ipynb --- steps/src/toxicity_guardrail/item.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml index ebe23b4a..baefca0f 100644 --- a/steps/src/toxicity_guardrail/item.yaml +++ b/steps/src/toxicity_guardrail/item.yaml @@ -9,7 +9,7 @@ generationDate: 2026-04-27:12-00 hidden: false labels: author: Iguazio -mlrunVersion: 1.11.0-rc48 +mlrunVersion: 1.10.0 name: toxicity_guardrail className: ToxicityGuardrailStep defaultHandler: