From 3ddb397a96aabab2945287c678a37334082a2f2c Mon Sep 17 00:00:00 2001
From: guylei-code
Date: Thu, 25 Dec 2025 16:02:52 +0200
Subject: [PATCH 1/9] Change the vllm-module.ipynb output error, vllm_app.

---
 modules/src/vllm_module/item.yaml           |  16 ++
 modules/src/vllm_module/test_vllm_module.py |  35 +++
 modules/src/vllm_module/vllm-module.ipynb   | 234 ++++++++++++++++++++
 modules/src/vllm_module/vllm_module.py      | 138 ++++++++++++
 4 files changed, 423 insertions(+)
 create mode 100644 modules/src/vllm_module/item.yaml
 create mode 100644 modules/src/vllm_module/test_vllm_module.py
 create mode 100644 modules/src/vllm_module/vllm-module.ipynb
 create mode 100644 modules/src/vllm_module/vllm_module.py

diff --git a/modules/src/vllm_module/item.yaml b/modules/src/vllm_module/item.yaml
new file mode 100644
index 00000000..d7b54021
--- /dev/null
+++ b/modules/src/vllm_module/item.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+categories:
+- genai
+description: Deploys a vLLM OpenAI-compatible LLM server as an MLRun application runtime, with configurable GPU usage, node selection, tensor parallelism, and runtime flags.
+example: vllm_module.ipynb
+generationDate: 2025-12-17:12-25
+hidden: false
+labels:
+  author: Iguazio
+mlrunVersion: 1.10.0
+name: vllm_module
+spec:
+  filename: vllm_module.py
+  image: mlrun/mlrun
+  kind: generic
+version: 1.0.0
\ No newline at end of file
diff --git a/modules/src/vllm_module/test_vllm_module.py b/modules/src/vllm_module/test_vllm_module.py
new file mode 100644
index 00000000..3a5f422a
--- /dev/null
+++ b/modules/src/vllm_module/test_vllm_module.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from vllm_module import VLLMModule
+import mlrun
+
+
+class TestVllmModule:
+    """Test suite for the VLLMModule class."""
+
+    def setup_method(self):
+        project = mlrun.new_project("vllm", save=False)
+
+        # node_selector is keyword-only in VLLMModule, so pass it by name
+        self.vllm_module = VLLMModule(
+            project,
+            node_selector={"alpha.eksctl.io/nodegroup-name": "added-gpu"},
+        )
+
+    def test_vllm_module(self):
+        assert isinstance(
+            self.vllm_module.vllm_app,
+            mlrun.runtimes.nuclio.application.application.ApplicationRuntime,
+        )
diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb
new file mode 100644
index 00000000..05b584e4
--- /dev/null
+++ b/modules/src/vllm_module/vllm-module.ipynb
@@ -0,0 +1,234 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7d551647-dfc2-47da-bc8a-3792af622073",
+   "metadata": {},
+   "source": [
+    "# vLLM Module with MLRun\n",
+    "\n",
+    "This notebook shows how to configure and deploy a vLLM OpenAI-compatible server as an MLRun application runtime, then shows how to send a chat request to the deployed vLLM server."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7707b270-30cc-448a-a828-cb93aa28030d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlrun\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5cff681-bfdf-4468-a1d1-2aeadb56065e",
+   "metadata": {},
+   "source": [
+    "## Prerequisite\n",
+    "* At least one GPU is required for running this notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5c84798-289f-4b4f-8c1b-f4dd12a3bda5",
+   "metadata": {},
+   "source": [
+    "## What this notebook does\n",
+    "\n",
+    "In this notebook we will:\n",
+    "\n",
+    "- Create or load an **MLRun project**\n",
+    "- Import a custom **vLLM module** from the MLRun Hub\n",
+    "- Deploy a **vLLM OpenAI-compatible server** as an MLRun application runtime\n",
+    "- Configure deployment parameters such as model, GPU count, memory, node selector, port, and log level\n",
+    "- Invoke the deployed service using the `/v1/chat/completions` endpoint\n",
+    "- Parse the response and extract only the assistant’s generated text\n",
+    "\n",
+    "By the end of this notebook, you will have a working vLLM deployment that can be queried directly from a Jupyter notebook using OpenAI-style APIs.\n",
+    "\n",
+    "For more information, see the [vLLM OpenAI-compatible server documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "879ca641-ee35-4682-9995-4eb319d89090",
+   "metadata": {},
+   "source": [
+    "## 1. Create an MLRun project\n",
+    "\n",
+    "In this section we create or load an MLRun project that will own the deployed vLLM application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6eac263a-17d1-4454-9e19-459dfbe2f231",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project = mlrun.get_or_create_project(name=\"vllm-module\", context=\"\", user_project=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da49d335-b704-4fb6-801f-4d07b64f9be6",
+   "metadata": {},
+   "source": [
+    "## 2. Import the vLLM module from the MLRun Hub\n",
+    "\n",
+    "In this section we import the vLLM module from the MLRun Hub so we can instantiate `VLLMModule` and deploy it as an application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6d89dee-db58-4c0c-8009-b37020c9599a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vllm = mlrun.import_module(\"hub://vllm-module\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1202ddd5-0ce7-4769-be29-8fc264c1f80e",
+   "metadata": {},
+   "source": [
+    "## 3. Deploy the vLLM application runtime\n",
+    "\n",
+    "Configure the vLLM deployment parameters and deploy the application.\n",
+    "\n",
+    "The returned address is the service URL for the application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e433123a-e64b-4a7a-8c7f-8165bcdcc6d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the vLLM app\n",
+    "vllm_module = vllm.VLLMModule(\n",
+    "    project=project,\n",
+    "    node_selector={\"alpha.eksctl.io/nodegroup-name\": \"added-gpu\"},\n",
+    "    name=\"qwen-vllm\",\n",
+    "    image=\"vllm/vllm-openai:latest\",\n",
+    "    model=\"Qwen/Qwen2.5-Omni-3B\",\n",
+    "    gpus=1,\n",
+    "    mem=\"10G\",\n",
+    "    port=8000,\n",
+    "    dtype=\"auto\",\n",
+    "    uvicorn_log_level=\"info\",\n",
+    "    max_tokens=501,\n",
+    ")\n",
+    "\n",
+    "# Deploy the vLLM app\n",
+    "addr = vllm_module.vllm_app.deploy(with_mlrun=True)\n",
+    "addr"
+   ]
+  },
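+  {
+   "cell_type": "markdown",
+   "id": "9c1f2e3a-4b5d-4f6a-8e7b-1a2b3c4d5e6f",
+   "metadata": {},
+   "source": [
+    "A multi-GPU variant of the same call, as a sketch (it assumes two GPUs are available on the selected node; when `tensor_parallel_size` is omitted it defaults to the GPU count):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical two-GPU deployment; adjust gpus/mem to your cluster\n",
+    "vllm_module = vllm.VLLMModule(\n",
+    "    project=project,\n",
+    "    name=\"qwen-vllm-2gpu\",\n",
+    "    gpus=2,\n",
+    "    tensor_parallel_size=2,\n",
+    "    mem=\"20G\",\n",
+    ")\n",
+    "```\n",
+    "\n",
+    "With `gpus > 1` the module also mounts an in-memory `/dev/shm` volume, which vLLM needs for tensor-parallel communication."
+   ]
+  },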
+  {
+   "cell_type": "markdown",
+   "id": "06832de3-5c31-43bf-b07b-0e71fb2d072d",
+   "metadata": {},
+   "source": [
+    "## 4. Get the runtime handle\n",
+    "\n",
+    "Fetch the runtime object and invoke the service using `app.invoke(...)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "102d3fd0-1ee6-49b8-8c86-df742ac1c559",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: use get_runtime() to get the MLRun application runtime\n",
+    "app = vllm_module.get_runtime()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "925730c1-0ac5-454b-8fb2-ab8cebb3f3ac",
+   "metadata": {},
+   "source": [
+    "## 5. Send a chat request for testing\n",
+    "\n",
+    "Call the OpenAI-compatible endpoint `/v1/chat/completions`, parse the JSON response, and print only the assistant message text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "31bc78d4-1c6f-439c-b894-1522e3a6d3e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "body = {\n",
+    "    \"model\": vllm_module.model,\n",
+    "    \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n",
+    "    \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n",
+    "}\n",
+    "\n",
+    "resp = app.invoke(path=\"/v1/chat/completions\", body=body)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "assistant:\n",
+      "\n",
+      "As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = resp\n",
+    "assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"\\nassistant:\\n\")\n",
+    "print(assistant_text.strip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "957b5d21-7ade-4131-9100-878652c477fc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mlrun-base",
+   "language": "python",
+   "name": "conda-env-mlrun-base-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.22"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/modules/src/vllm_module/vllm_module.py b/modules/src/vllm_module/vllm_module.py
new file mode 100644
index 00000000..ce6307cc
--- /dev/null
+++ b/modules/src/vllm_module/vllm_module.py
@@ -0,0 +1,138 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This module deploys a vLLM server that exposes an OpenAI-compatible API as an MLRun application runtime.
+# Once deployed, you can send chat prompts, create embeddings, or get model responses without worrying about container, GPU, or endpoint wiring.
+# It simplifies serving so you can test, analyze, or integrate LLM features directly into your projects or notebooks with minimal setup.
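+#
+# Example (sketch, assuming `project` is an MLRun project object):
+#     module = VLLMModule(project, gpus=1)
+#     addr = module.vllm_app.deploy(with_mlrun=True)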
+
+
+from typing import Dict, Optional, List
+
+
+class VLLMModule:
+    """
+    VLLMModule
+
+    This module provides a lightweight wrapper for deploying a vLLM
+    (OpenAI-compatible) large language model server as an MLRun application runtime.
+
+    The VLLMModule is responsible for:
+    - Creating an MLRun application runtime based on a vLLM container image
+    - Configuring GPU resources, memory limits, and Kubernetes node selection
+    - Launching the model using `vllm serve` with configurable runtime flags
+    - Supporting multi-GPU inference via tensor parallelism
+    - Automatically configuring shared memory (/dev/shm) when using multiple GPUs
+    - Exposing an OpenAI-compatible API (e.g. /v1/chat/completions) for inference
+    - Providing a simple Python interface for deployment and invocation from Jupyter notebooks
+
+    The module is designed to be used in Jupyter notebooks and MLRun pipelines,
+    allowing users to deploy and test large language models on Kubernetes
+    with minimal configuration.
+    """
+
+    def __init__(
+        self,
+        project,  # an MLRun project object (mlrun.projects.MlrunProject)
+        *,
+        node_selector: Optional[Dict[str, str]] = None,
+        name: str = "vllm",
+        image: str = "vllm/vllm-openai:latest",
+        model: str = "Qwen/Qwen2.5-Omni-3B",
+        gpus: int = 1,
+        mem: str = "10G",
+        port: int = 8000,
+        dtype: str = "auto",
+        tensor_parallel_size: Optional[int] = None,
+        uvicorn_log_level: str = "info",
+        max_tokens: int = 500,
+    ):
+        if gpus < 1:
+            raise ValueError("gpus must be >= 1")
+
+        if tensor_parallel_size is not None:
+            if tensor_parallel_size < 1:
+                raise ValueError("tensor_parallel_size must be >= 1")
+            if tensor_parallel_size > gpus:
+                raise ValueError(
+                    f"tensor_parallel_size ({tensor_parallel_size}) cannot be greater than gpus ({gpus})"
+                )
+
+        if node_selector is None:
+            node_selector = {"alpha.eksctl.io/nodegroup-name": "added-gpu"}
+
+        if not isinstance(max_tokens, int):
+            raise TypeError("max_tokens must be an integer")
+
+        if max_tokens < 1:
+            raise ValueError("max_tokens must be >= 1")
+
+        self.project = project
+        self.name = name
+        self.image = image
+        self.model = model
+        self.gpus = gpus
+        self.mem = mem
+        self.node_selector = node_selector
+        self.port = port
+        self.dtype = dtype
+        self.tensor_parallel_size = tensor_parallel_size
+        self.uvicorn_log_level = uvicorn_log_level
+        self.max_tokens = max_tokens
+
+        self.vllm_app = self.project.set_function(
+            name=self.name,
+            kind="application",
+            image=self.image,
+        )
+
+        self.vllm_app.with_limits(gpus=self.gpus, mem=self.mem)
+
+        if self.node_selector:
+            self.vllm_app.with_node_selection(node_selector=self.node_selector)
+
+        self.vllm_app.set_internal_application_port(self.port)
+
+        args: List[str] = [
+            "serve",
+            self.model,
+            "--dtype",
+            self.dtype,
+            "--port",
+            str(self.port),
+        ]
+
+        if self.uvicorn_log_level:
+            args += ["--uvicorn-log-level", self.uvicorn_log_level]
+
+        if self.gpus > 1:
+            tps = self.tensor_parallel_size or self.gpus
+            args += ["--tensor-parallel-size", str(tps)]
+
+            # With more than one GPU, vLLM needs a shared-memory volume (/dev/shm)
+            # for tensor-parallel communication between the GPUs
+            self.vllm_app.spec.volumes = [{"name": "dshm", "emptyDir": {"medium": "Memory"}}]
+            self.vllm_app.spec.volume_mounts = [{"name": "dshm", "mountPath": "/dev/shm"}]
+
+        self.vllm_app.spec.command = "vllm"
+        self.vllm_app.spec.args = args
+
+        self.vllm_app.spec.min_replicas = 1
+        self.vllm_app.spec.max_replicas = 1
+
+    def get_runtime(self):
+        return self.vllm_app
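+
+    # Example (sketch): extra flags are appended verbatim to the `vllm serve`
+    # command line, e.g.
+    #     module.add_args(["--max-model-len", "4096"])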
ValueError("extra_args must be a list of strings") + self.vllm_app.spec.args += extra_args \ No newline at end of file From 4b1e44839a380b4df20b4e86a2e4a1fa06328a04 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Thu, 25 Dec 2025 16:05:24 +0200 Subject: [PATCH 2/9] Chane the vllm-module.ipynb output error ,vllm_app. --- modules/src/vllm_module/item.yaml | 2 +- modules/src/vllm_module/test_vllm_module.py | 2 +- modules/src/vllm_module/vllm-module.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/src/vllm_module/item.yaml b/modules/src/vllm_module/item.yaml index d7b54021..edc66f4d 100644 --- a/modules/src/vllm_module/item.yaml +++ b/modules/src/vllm_module/item.yaml @@ -1,7 +1,7 @@ apiVersion: v1 categories: - genai -description: Deploys a vLLM OpenAI-compatible LLM server as an MLRun application runtime, with configurable GPU usage, node selection, tensor parallelism, and runtime flags. +description: Deploys a vLLM OpenAI-compatible LLM server as an MLRun application runtime, with configurable GPU usage, node selection, tensor parallelism and runtime flags. example: vllm_module.ipynb generationDate: 2025-12-17:12-25 hidden: false diff --git a/modules/src/vllm_module/test_vllm_module.py b/modules/src/vllm_module/test_vllm_module.py index 3a5f422a..f2162900 100644 --- a/modules/src/vllm_module/test_vllm_module.py +++ b/modules/src/vllm_module/test_vllm_module.py @@ -18,7 +18,7 @@ class TestVllmModule: - """Test suite for VLLMModule class.""" + """Test suite for VLLMModule class""" def setup_method(self): project = mlrun.new_project("vllm", save=False) diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb index 05b584e4..8dda8054 100644 --- a/modules/src/vllm_module/vllm-module.ipynb +++ b/modules/src/vllm_module/vllm-module.ipynb @@ -170,7 +170,7 @@ "body = {\n", " \"model\": vllm_module.model,\n", " \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n", - " \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n", + " \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n", "}\n", "\n", "resp = app.invoke(path=\"/v1/chat/completions\", body=body)" From bc38f7ad942139e51fc7c3c3e2a0cc58973935d0 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 27 Apr 2026 14:58:01 +0300 Subject: [PATCH 3/9] toxicity guardrail first commit --- steps/src/toxicity_guardrail/item.yaml | 23 ++ steps/src/toxicity_guardrail/requirements.txt | 2 + .../test_toxicity_guardrail.py | 42 +++ .../toxicity_guardrail.ipynb | 320 ++++++++++++++++++ .../toxicity_guardrail/toxicity_guardrail.py | 61 ++++ 5 files changed, 448 insertions(+) create mode 100644 steps/src/toxicity_guardrail/item.yaml create mode 100644 steps/src/toxicity_guardrail/requirements.txt create mode 100644 steps/src/toxicity_guardrail/test_toxicity_guardrail.py create mode 100644 steps/src/toxicity_guardrail/toxicity_guardrail.ipynb create mode 100644 steps/src/toxicity_guardrail/toxicity_guardrail.py diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml new file mode 100644 index 00000000..95fe820e --- /dev/null +++ b/steps/src/toxicity_guardrail/item.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +categories: + - data-preparation + - model-serving + - genai +description: Filters toxic requests using a pre-trained text classifier before they reach the LLM +example: toxicity_guardrail.ipynb +generationDate: 2026-04-27:12-00 +hidden: false +labels: + 
+  author: Iguazio
+mlrunVersion: 1.10.0
+name: toxicity_guardrail
+className: ToxicityGuardrailStep
+defaultHandler:
+spec:
+  filename: toxicity_guardrail.py
+  image: mlrun/mlrun
+  requirements:
+  - transformers
+  - torch
+  kind: generic
+version: 1.0.0
\ No newline at end of file
diff --git a/steps/src/toxicity_guardrail/requirements.txt b/steps/src/toxicity_guardrail/requirements.txt
new file mode 100644
index 00000000..5061402e
--- /dev/null
+++ b/steps/src/toxicity_guardrail/requirements.txt
@@ -0,0 +1,2 @@
+transformers==4.47.0
+torch==2.6.0
diff --git a/steps/src/toxicity_guardrail/test_toxicity_guardrail.py b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
new file mode 100644
index 00000000..42e85fd3
--- /dev/null
+++ b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
@@ -0,0 +1,42 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+from toxicity_guardrail import ToxicityGuardrailStep
+
+
+class TestToxicityGuardrailStep:
+    """Test suite for the ToxicityGuardrailStep class."""
+
+    def setup_method(self):
+        """Set up test fixtures before each test method."""
+        project = mlrun.new_project("toxicity-guardrail", save=False)
+        self.fn = project.set_function(
+            "toxicity_guardrail.py",
+            name="guardrail-fn",
+            kind="serving",
+            image="mlrun/mlrun",
+        )
+        graph = self.fn.set_topology("flow", engine="async")
+        graph.to(
+            class_name="ToxicityGuardrailStep",
+            name="toxicity_guardrail",
+            threshold=0.5,
+        ).respond()
+
+    def test_toxicity_guardrail_step(self):
+        """Test that the serving function is correctly configured with ToxicityGuardrailStep."""
+        assert isinstance(self.fn, mlrun.runtimes.ServingRuntime)
diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
new file mode 100644
index 00000000..0085a3ba
--- /dev/null
+++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
@@ -0,0 +1,320 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "93c9feca-c120-443e-bbd3-731f70d49682",
+   "metadata": {},
+   "source": [
+    "## Pipeline: Toxicity Guardrail (Hub Step) → LLM Model Runner\n",
+    "\n",
+    "A unified serving graph that:\n",
+    "1. Routes the user's question through a toxicity guardrail hub step\n",
+    "2. If safe → calls a `ModelRunnerStep` (LLM) and returns the answer\n",
+    "3. If toxic → blocks the request with a clear rejection response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67435be350de0cea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlrun"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000001",
+   "metadata": {},
+   "source": [
+    "Create or load the MLRun project that will hold the serving function and its secrets."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd9fd3609223be6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2026-04-27 10:59:47,707 [info] Loading project from path: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"user_project\":false}\n", + "> 2026-04-27 11:00:02,102 [info] Project loaded successfully: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"stored_in_db\":true}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\"hubstep-guardrail-toxicity\", user_project=False, context=\"./\", allow_cross_project=True)" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000002", + "metadata": {}, + "source": [ + "Load credentials from a local `.env` file. The file should define `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `OPENAI_MODEL`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bc0d94-4939-46c1-ac0d-2e90fd465c9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(\"cred.env\", override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000003", + "metadata": {}, + "source": [ + "Store the credentials as Kubernetes secrets so the deployed Nuclio function can access them securely at runtime — no environment variables need to be injected manually." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "884a67ca-d548-4d7d-bab1-ca8868fbe875", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "project.set_secrets(\n", + " secrets={\n", + " \"OPENAI_API_KEY\": os.getenv(\"OPENAI_API_KEY\"),\n", + " \"OPENAI_BASE_URL\": os.getenv(\"OPENAI_BASE_URL\"),\n", + " \"OPENAI_MODEL\": os.getenv(\"OPENAI_MODEL\"),\n", + " },\n", + " provider=\"kubernetes\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000004", + "metadata": {}, + "source": [ + "## Build the serving graph\n", + "\n", + "`LLMModel` wraps an OpenAI-compatible API and reads credentials from the Kubernetes secrets set above.\n", + "`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict\n", + "(`{\"llm_model\": {\"answer\": ...}}`) into a simple `{\"answer\": ...}` response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "505c77e2-6875-499d-ae05-c6de3efa0622", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting serving_graph.py\n" + ] + } + ], + "source": [ + "%%writefile serving_graph.py\n", + "from typing import Dict, Any\n", + "from mlrun.serving import Model\n", + "\n", + "class LLMModel(Model):\n", + " \"\"\"OpenAI-compatible LLM. 
Credentials and model are read from env vars:\n", + " OPENAI_API_KEY, OPENAI_BASE_URL (optional), OPENAI_MODEL (optional, falls back to default_model_name).\n", + " \"\"\"\n", + "\n", + " def __init__(self, default_model_name: str = \"gpt-4o-mini\", **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.default_model_name = default_model_name\n", + "\n", + " def load(self):\n", + " import openai, os\n", + " self.model_name = os.environ.get(\"OPENAI_MODEL\", self.default_model_name)\n", + " client_kwargs = {\"api_key\": os.environ[\"OPENAI_API_KEY\"]}\n", + " base_url = os.environ.get(\"OPENAI_BASE_URL\")\n", + " if base_url:\n", + " client_kwargs[\"base_url\"] = base_url\n", + " self._client = openai.OpenAI(**client_kwargs)\n", + "\n", + " def predict(self, body: Dict[str, Any]) -> Dict[str, Any]:\n", + " question = body.get(\"question\", \"\")\n", + " response = self._client.chat.completions.create(\n", + " model=self.model_name,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": question},\n", + " ],\n", + " )\n", + " return {\"answer\": response.choices[0].message.content, \"model\": self.model_name}\n", + "\n", + "\n", + "def format_answer(event: Dict[str, Any]) -> Dict[str, Any]:\n", + " \"\"\"Flatten ModelRunnerStep output: {\"llm_model\": {\"answer\": ...}} → {\"answer\": ...}\"\"\"\n", + " if isinstance(event, dict):\n", + " for _, model_output in event.items():\n", + " if isinstance(model_output, dict):\n", + " return model_output\n", + " return event" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000005", + "metadata": {}, + "source": [ + "Wire the three-step async flow graph:\n", + "1. **`toxicity_guardrail`** — loaded directly from `hub://toxicity_guardrail`; blocks requests with a toxicity score ≥ `threshold`\n", + "2. **`llm_runner`** — a `ModelRunnerStep` that runs `LLMModel` against the OpenAI-compatible API\n", + "3. **`format_answer`** — flattens the runner output and sends the response back to the caller" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d0435a5-4e65-4a33-a146-8c6abb382b37", + "metadata": {}, + "outputs": [], + "source": [ + "from mlrun.serving import ModelRunnerStep\n", + "\n", + "fn_pipeline = project.set_function(\n", + " name=\"toxicity-llm-pipeline\",\n", + " func=\"serving_graph.py\",\n", + " kind=\"serving\",\n", + " image=\"mlrun/mlrun\",\n", + " requirements=[\"transformers\", \"torch\", \"openai\"],\n", + ")\n", + "# Credentials come from Kubernetes secrets set above — no set_envs() needed for them.\n", + "\n", + "graph = fn_pipeline.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "graph.add_step(\n", + " class_name=\"hub://toxicity_guardrail\",\n", + " name=\"toxicity_guardrail\",\n", + " threshold=0.5,\n", + ")\n", + "\n", + "model_runner = ModelRunnerStep(name=\"llm_runner\")\n", + "model_runner.add_model(endpoint_name=\"llm_model\", model_class=\"LLMModel\")\n", + "graph.add_step(model_runner, after=\"toxicity_guardrail\")\n", + "\n", + "graph.add_step(name=\"format_answer\", handler=\"format_answer\", after=\"llm_runner\").respond()\n", + "\n", + "graph.plot(rankdir=\"LR\")" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000006", + "metadata": {}, + "source": [ + "Deploy the function to Nuclio. This builds a container image with the required packages (`transformers`, `torch`, `openai`) and starts the serving endpoint." 
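+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7e4c1a0-9f2d-4c3e-8a5b-6d7e8f9a0b1c",
+   "metadata": {},
+   "source": [
+    "Optionally, smoke-test the graph in-process with MLRun's mock server before deploying (a sketch; it assumes `transformers`, `torch`, and `openai` are installed locally, and it downloads the toxicity classifier on first use):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical local test - no Nuclio deployment involved\n",
+    "server = fn_pipeline.to_mock_server()\n",
+    "print(server.test(\"/\", body={\"question\": \"What is MLRun?\"}))\n",
+    "```"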
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "973ceab8-48b5-4689-a39e-b83ac3e75ddc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2026-04-27 11:08:33,830 [info] Starting remote function deploy\n", + "2026-04-27 11:08:34 (info) Deploying function\n", + "2026-04-27 11:08:34 (info) Building\n", + "2026-04-27 11:08:34 (info) Staging files and preparing base images\n", + "2026-04-27 11:08:34 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n", + "2026-04-27 11:08:34 (info) Building processor image\n", + "2026-04-27 11:17:59 (info) Build complete\n", + "2026-04-27 11:20:19 (info) Function deploy complete\n", + "> 2026-04-27 11:20:27,592 [info] Model endpoint creation task completed with state succeeded\n", + "> 2026-04-27 11:20:27,592 [info] Successfully deployed function: {\"external_invocation_urls\":[\"hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/\"],\"internal_invocation_urls\":[\"nuclio-hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.svc.cluster.local:8080\"]}\n", + "Pipeline deployed: DeployStatus(state=ready, outputs={'endpoint': 'http://hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/', 'name': 'hubstep-guardrail-toxicity-toxicity-llm-pipeline'})\n" + ] + } + ], + "source": [ + "addr = project.deploy_function(fn_pipeline)\n", + "print(\"Pipeline deployed:\", addr)" + ] + }, + { + "cell_type": "markdown", + "id": "m001-0000-0000-0000-000000000007", + "metadata": {}, + "source": [ + "Test the pipeline with a safe question and a toxic one.\n", + "Safe requests pass through to the LLM and return an answer; toxic ones are blocked by the guardrail before reaching the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd5cd2c9-3abe-401f-bb2e-65d216b9951d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Safe input ===\n", + "Response: {'answer': 'Washington, D.C. 
It\u2019s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", + "\n", + "=== Toxic input ===\n", + "Blocked (expected): bad function response 500: Exception caught in handler - \"Request blocked: toxicity score 0.953 >= 0.5\"\n" + ] + } + ], + "source": [ + "# --- Safe input: should return an LLM answer ---\n", + "print(\"=== Safe input ===\")\n", + "result = fn_pipeline.invoke(\"/\", {\"question\": \"What is the capital of the USA?\"})\n", + "print(\"Response:\", result)\n", + "\n", + "# --- Toxic input: should be blocked ---\n", + "print(\"\\n=== Toxic input ===\")\n", + "try:\n", + " result = fn_pipeline.invoke(\"/\", {\"question\": \"You're completely useless and everything you do is a failure\"})\n", + " print(\"Response:\", result)\n", + "except Exception as e:\n", + " print(f\"Blocked (expected): {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.py b/steps/src/toxicity_guardrail/toxicity_guardrail.py new file mode 100644 index 00000000..def0616b --- /dev/null +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.py @@ -0,0 +1,61 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Dict + + +class ToxicityGuardrailStep: + """ + A serving graph step that filters out toxic requests using a pre-trained + text classification model. + + If the toxicity score of the input text meets or exceeds the threshold, + the request is blocked with a ValueError. Safe requests are passed through + unchanged. + + The classifier label "toxic" maps directly to the toxicity score; any + other label (e.g. "non-toxic") inverts the model's confidence score. 
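+
+    Example (a sketch, running the step directly outside a serving graph):
+
+        step = ToxicityGuardrailStep(threshold=0.5)
+        step.post_init()  # downloads and loads the classifier
+        step.do({"question": "What is MLRun?"})  # returns the event when safe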
+ """ + + def __init__( + self, + context=None, + name=None, + threshold: float = 0.5, + model_name: str = "unitary/toxic-bert", + **kwargs, + ): + self.threshold = threshold + self.model_name = model_name + self._classifier = None + + def post_init(self, mode="sync", **kwargs): + from transformers import pipeline + + self._classifier = pipeline("text-classification", model=self.model_name) + + def do(self, event: Dict[str, Any]) -> Dict[str, Any]: + question = event.get("question", "") + result = self._classifier(question)[0] + score = ( + result["score"] + if result["label"] == "toxic" + else 1 - result["score"] + ) + if score >= self.threshold: + raise ValueError( + f"Request blocked: toxicity score {score:.3f} >= {self.threshold}" + ) + return event From 6cbffc5be932c3ec60b153705cc6233f857f979d Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 27 Apr 2026 15:04:08 +0300 Subject: [PATCH 4/9] delete vllm-module.ipynb --- modules/src/vllm_module/vllm-module.ipynb | 234 ---------------------- 1 file changed, 234 deletions(-) delete mode 100644 modules/src/vllm_module/vllm-module.ipynb diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb deleted file mode 100644 index 8dda8054..00000000 --- a/modules/src/vllm_module/vllm-module.ipynb +++ /dev/null @@ -1,234 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7d551647-dfc2-47da-bc8a-3792af622073", - "metadata": {}, - "source": [ - "# vLLM Module with MLRun\n", - "\n", - "This notebook shows how to configure and deploy a vLLM OpenAI compatible server as an MLRun application runtime, then showcases how to send a chat request to it to the vLLM server." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "7707b270-30cc-448a-a828-cb93aa28030d", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun\n" - ] - }, - { - "cell_type": "markdown", - "id": "d5cff681-bfdf-4468-a1d1-2aeadb56065e", - "metadata": {}, - "source": [ - "## Prerequisite\n", - "* At lease one GPU is required for running this notebook." - ] - }, - { - "cell_type": "markdown", - "id": "d5c84798-289f-4b4f-8c1b-f4dd12a3bda5", - "metadata": {}, - "source": [ - "## What this notebook does\n", - "\n", - "In this notebook we will:\n", - "\n", - "- Create or load an **MLRun project**\n", - "- Import a custom **vLLM module** from the MLRun Hub\n", - "- Deploy a **vLLM OpenAI-compatible server** as an MLRun application runtime\n", - "- Configure deployment parameters such as model, GPU count, memory, node selector, port, and log level\n", - "- Invoke the deployed service using the `/v1/chat/completions` endpoint\n", - "- Parse the response and extract only the assistant’s generated text\n", - "\n", - "By the end of this notebook, you will have a working vLLM deployment that can be queried directly from a Jupyter notebook using OpenAI-style APIs.\n", - "\n", - "For more information about [vLLM documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server/)" - ] - }, - { - "cell_type": "markdown", - "id": "879ca641-ee35-4682-9995-4eb319d89090", - "metadata": {}, - "source": [ - "## 1. Create an MLRun project\n", - "\n", - "In this section we create or load an MLRun project that will own the deployed vLLM application runtime." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6eac263a-17d1-4454-9e19-459dfbe2f231", - "metadata": {}, - "outputs": [], - "source": [ - "project = mlrun.get_or_create_project(name=\"vllm-module\", context=\"\", user_project=True)" - ] - }, - { - "cell_type": "markdown", - "id": "da49d335-b704-4fb6-801f-4d07b64f9be6", - "metadata": {}, - "source": [ - "## 2. Import the vLLM module from the MLRun Hub\n", - "\n", - "In this section we import the vLLM module from the MLRun Hub so we can instantiate `VLLMModule` and deploy it as an application runtime." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6d89dee-db58-4c0c-8009-b37020c9599a", - "metadata": {}, - "outputs": [], - "source": [ - "vllm = mlrun.import_module(\"hub://vllm-module\")" - ] - }, - { - "cell_type": "markdown", - "id": "1202ddd5-0ce7-4769-be29-8fc264c1f80e", - "metadata": {}, - "source": [ - "## 3. Deploy the vLLM application runtime\n", - "\n", - "Configure the vLLM deployment parameters and deploy the application.\n", - "\n", - "The returned address is the service URL for the application runtime." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e433123a-e64b-4a7a-8c7f-8165bcdcc6d1", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the vLLM app\n", - "vllm_module = vllm.VLLMModule(\n", - " project=project,\n", - " node_selector={\"alpha.eksctl.io/nodegroup-name\": \"added-gpu\"},\n", - " name=\"qwen-vllm\",\n", - " image=\"vllm/vllm-openai:latest\",\n", - " model=\"Qwen/Qwen2.5-Omni-3B\",\n", - " gpus=1,\n", - " mem=\"10G\",\n", - " port=8000,\n", - " dtype=\"auto\",\n", - " uvicorn_log_level=\"info\",\n", - " max_tokens = 501,\n", - ")\n", - "\n", - "# Deploy the vLLM app\n", - "addr = vllm_module.vllm_app.deploy(with_mlrun=True)\n", - "addr" - ] - }, - { - "cell_type": "markdown", - "id": "06832de3-5c31-43bf-b07b-0e71fb2d072d", - "metadata": {}, - "source": [ - "## 4. Get the runtime handle\n", - "\n", - "Fetch the runtime object and invoke the service using `app.invoke(...)`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "102d3fd0-1ee6-49b8-8c86-df742ac1c559", - "metadata": {}, - "outputs": [], - "source": [ - "# Optional: get_runtime() method uses to get the MLRun application runtime\n", - "app = vllm_module.get_runtime()" - ] - }, - { - "cell_type": "markdown", - "id": "925730c1-0ac5-454b-8fb2-ab8cebb3f3ac", - "metadata": {}, - "source": [ - "## 5. Send a chat request for testing\n", - "\n", - "Call the OpenAI compatible endpoint `/v1/chat/completions`, parse the JSON response, and print only the assistant message text." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "31bc78d4-1c6f-439c-b894-1522e3a6d3e6", - "metadata": {}, - "outputs": [], - "source": [ - "body = {\n", - " \"model\": vllm_module.model,\n", - " \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n", - " \"max_tokens\": vllm_module.max_tokens, # start smaller for testing\n", - "}\n", - "\n", - "resp = app.invoke(path=\"/v1/chat/completions\", body=body)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "assistant:\n", - "\n", - "As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n" - ] - } - ], - "source": [ - "data = resp\n", - "assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n", - "\n", - "print(\"\\nassistant:\\n\")\n", - "print(assistant_text.strip())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "957b5d21-7ade-4131-9100-878652c477fc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mlrun-base", - "language": "python", - "name": "conda-env-mlrun-base-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.22" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From b1acc880e55b10cbced1a5176bae9e31a06925d4 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Mon, 27 Apr 2026 16:18:39 +0300 Subject: [PATCH 5/9] second commit, update changes requested in ipynb --- .../toxicity_guardrail.ipynb | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb index 0085a3ba..973ab392 100644 --- a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb @@ -55,7 +55,14 @@ "id": "m001-0000-0000-0000-000000000002", "metadata": {}, "source": [ - "Load credentials from a local `.env` file. The file should define `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `OPENAI_MODEL`." + "### Load credentials from a local `.env` file.\n", + "\n", + "For example:\n", + "```\n", + "OPENAI_API_KEY=\"...\"\n", + "OPENAI_BASE_URL=\"...\"\n", + "OPENAI_MODEL=\"...\"\n", + "```" ] }, { @@ -85,16 +92,12 @@ "cell_type": "markdown", "id": "m001-0000-0000-0000-000000000003", "metadata": {}, - "source": [ - "Store the credentials as Kubernetes secrets so the deployed Nuclio function can access them securely at runtime — no environment variables need to be injected manually." 
- ] + "source": "Store the credentials as project secrets - see also [working with secrets](http://docs.mlrun.org/en/stable/secrets.html).\n" }, { "cell_type": "code", - "execution_count": null, "id": "884a67ca-d548-4d7d-bab1-ca8868fbe875", "metadata": {}, - "outputs": [], "source": [ "import os\n", "project.set_secrets(\n", @@ -103,10 +106,11 @@ " \"OPENAI_BASE_URL\": os.getenv(\"OPENAI_BASE_URL\"),\n", " \"OPENAI_MODEL\": os.getenv(\"OPENAI_MODEL\"),\n", " },\n", - " provider=\"kubernetes\",\n", ")\n", "project.save()" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -115,7 +119,7 @@ "source": [ "## Build the serving graph\n", "\n", - "`LLMModel` wraps an OpenAI-compatible API and reads credentials from the Kubernetes secrets set above.\n", + "`LLMModel` wraps an OpenAI-compatible API and reads credentials to the Kubernetes secrets set above.\n", "`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict\n", "(`{\"llm_model\": {\"answer\": ...}}`) into a simple `{\"answer\": ...}` response." ] @@ -228,9 +232,7 @@ "cell_type": "markdown", "id": "m001-0000-0000-0000-000000000006", "metadata": {}, - "source": [ - "Deploy the function to Nuclio. This builds a container image with the required packages (`transformers`, `torch`, `openai`) and starts the serving endpoint." - ] + "source": "Deploy the Serving function, with the required packages (`transformers`, `torch`, `openai`)." }, { "cell_type": "code", @@ -256,10 +258,7 @@ ] } ], - "source": [ - "addr = project.deploy_function(fn_pipeline)\n", - "print(\"Pipeline deployed:\", addr)" - ] + "source": "addr = project.deploy_function(fn_pipeline)" }, { "cell_type": "markdown", @@ -281,7 +280,7 @@ "output_type": "stream", "text": [ "=== Safe input ===\n", - "Response: {'answer': 'Washington, D.C. It\u2019s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", + "Response: {'answer': 'Washington, D.C. It’s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", "\n", "=== Toxic input ===\n", "Blocked (expected): bad function response 500: Exception caught in handler - \"Request blocked: toxicity score 0.953 >= 0.5\"\n" From b50b48cc3a4900e77a3e362a6dde8278b06be8b6 Mon Sep 17 00:00:00 2001 From: guylei-code Date: Tue, 28 Apr 2026 11:23:10 +0300 Subject: [PATCH 6/9] third commit, update changes requested in ipynb --- .../toxicity_guardrail.ipynb | 54 +++---------------- 1 file changed, 8 insertions(+), 46 deletions(-) diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb index 973ab392..ddc3fe99 100644 --- a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb @@ -13,16 +13,6 @@ "3. 
     "3. If toxic → blocks the request with a clear rejection response"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "67435be350de0cea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import mlrun"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "m001-0000-0000-0000-000000000001",
@@ -47,6 +37,7 @@
    }
   ],
   "source": [
+    "import mlrun\n",
    "project = mlrun.get_or_create_project(\"hubstep-guardrail-toxicity\", user_project=False, context=\"./\", allow_cross_project=True)"
   ]
  },
@@ -67,26 +58,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "61bc0d94-4939-46c1-ac0d-2e90fd465c9c",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv(\"cred.env\", override=True)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": null
  },
@@ -236,29 +216,11 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
   "id": "973ceab8-48b5-4689-a39e-b83ac3e75ddc",
   "metadata": {},
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "> 2026-04-27 11:08:33,830 [info] Starting remote function deploy\n",
-     "2026-04-27 11:08:34 (info) Deploying function\n",
-     "2026-04-27 11:08:34 (info) Building\n",
-     "2026-04-27 11:08:34 (info) Staging files and preparing base images\n",
-     "2026-04-27 11:08:34 (warn) Using user provided base image, runtime interpreter version is provided by the base image\n",
-     "2026-04-27 11:08:34 (info) Building processor image\n",
-     "2026-04-27 11:17:59 (info) Build complete\n",
-     "2026-04-27 11:20:19 (info) Function deploy complete\n",
-     "> 2026-04-27 11:20:27,592 [info] Model endpoint creation task completed with state succeeded\n",
-     "> 2026-04-27 11:20:27,592 [info] Successfully deployed function: {\"external_invocation_urls\":[\"hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/\"],\"internal_invocation_urls\":[\"nuclio-hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.svc.cluster.local:8080\"]}\n",
-     "Pipeline deployed: DeployStatus(state=ready, outputs={'endpoint': 'http://hubstep-guardrail-toxicity-toxicity-llm-pipeline.default-tenant.app.cust-cs.iguazio-cd1.com/', 'name': 'hubstep-guardrail-toxicity-toxicity-llm-pipeline'})\n"
-    ]
-   }
-  ],
-  "source": "addr = project.deploy_function(fn_pipeline)"
+  "source": "addr = project.deploy_function(fn_pipeline)",
+  "outputs": [],
+  "execution_count": null
  },

From 42c464fe18a46041f8e36d2652a3c14a32e290a3 Mon Sep 17 00:00:00 2001
From: guylei-code
Date: Tue, 28 Apr 2026 12:18:55 +0300
Subject: [PATCH 7/9] Update steps/src/toxicity_guardrail/item.yaml

Co-authored-by: Eyal Danieli
---
 steps/src/toxicity_guardrail/item.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml
index 95fe820e..9162d11e 100644
--- a/steps/src/toxicity_guardrail/item.yaml
+++ 
b/steps/src/toxicity_guardrail/item.yaml @@ -9,7 +9,7 @@ generationDate: 2026-04-27:12-00 hidden: false labels: author: Iguazio -mlrunVersion: 1.10.0 +mlrunVersion: 1.11.0-rc48 name: toxicity_guardrail className: ToxicityGuardrailStep defaultHandler: From e3c26cbcae04ef6e58b499979baf7ddec0faa75c Mon Sep 17 00:00:00 2001 From: guylei-code Date: Tue, 28 Apr 2026 13:49:49 +0300 Subject: [PATCH 8/9] Specify versions for transformers and torch --- steps/src/toxicity_guardrail/item.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml index 9162d11e..ebe23b4a 100644 --- a/steps/src/toxicity_guardrail/item.yaml +++ b/steps/src/toxicity_guardrail/item.yaml @@ -17,7 +17,7 @@ spec: filename: toxicity_guardrail.py image: mlrun/mlrun requirements: - - transformers - - torch + - transformers==4.46.3 + - torch==2.11.0 kind: generic -version: 1.0.0 \ No newline at end of file +version: 1.0.0 From b16f27bde49ae59c42b403881f0557d8f6f0c0cf Mon Sep 17 00:00:00 2001 From: guylei-code Date: Tue, 28 Apr 2026 14:03:09 +0300 Subject: [PATCH 9/9] third commit, update changes requested in ipynb --- steps/src/toxicity_guardrail/item.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml index ebe23b4a..baefca0f 100644 --- a/steps/src/toxicity_guardrail/item.yaml +++ b/steps/src/toxicity_guardrail/item.yaml @@ -9,7 +9,7 @@ generationDate: 2026-04-27:12-00 hidden: false labels: author: Iguazio -mlrunVersion: 1.11.0-rc48 +mlrunVersion: 1.10.0 name: toxicity_guardrail className: ToxicityGuardrailStep defaultHandler: