From 44bf2ec2697714f6e059e24c8d706ebbeea2721d Mon Sep 17 00:00:00 2001
From: Roy Schossberger <85231212+royischoss@users.noreply.github.com>
Date: Sun, 19 Apr 2026 15:04:05 +0300
Subject: [PATCH 1/2] fix adding input schema for agent deployer (#974)

---
 modules/src/agent_deployer/agent_deployer.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/src/agent_deployer/agent_deployer.py b/modules/src/agent_deployer/agent_deployer.py
index 58638eaa..07165070 100644
--- a/modules/src/agent_deployer/agent_deployer.py
+++ b/modules/src/agent_deployer/agent_deployer.py
@@ -45,6 +45,7 @@ def __init__(
         result_path: Optional[str] = None,
         inputs_path: Optional[str] = None,
         outputs: Optional[list[str]] = None,
+        inputs: Optional[list[str]] = None,
         requirements: Optional[list[str]] = None,
         image: str = "mlrun/mlrun",
         set_model_monitoring: bool = False,
@@ -67,6 +68,9 @@ def __init__(
         :param outputs:             list of the model outputs (e.g. labels) ,if provided will override the outputs
                                     that been configured in the model artifact, please note that those outputs need to
                                     be equal to the model_class predict method outputs (length, and order).
+        :param inputs:              list of the model inputs (e.g. features), if provided will override the inputs
+                                    that have been configured in the model artifact; please note that those inputs
+                                    need to be equal to the model_class predict method inputs (length, and order).
         :param requirements:        List of additional requirements for the function
         :param image:               Docker image to be used for the function
         :param set_model_monitoring: Whether to configure model monitoring
@@ -84,6 +88,7 @@ def __init__(
         self.result_path = result_path
         self.inputs_path = inputs_path
         self.output_schema = outputs
+        self.input_schema = inputs
         self.image = image
         if set_model_monitoring:
             self.configure_model_monitoring()
@@ -206,6 +211,7 @@ def _load_function(
             result_path=self.result_path,
             input_path=self.inputs_path,
             outputs=self.output_schema,
+            inputs=self.input_schema,
             execution_mechanism="naive",
             **self.model_params,
         )
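A minimal sketch of how the new `inputs` argument might be used, assuming the deployer class in the patched module is importable as `AgentDeployer` (the class name is not visible in the hunks) and using made-up feature and label names. Per the docstring above, `inputs` must match the model class's predict method inputs in length and order, just as `outputs` must match its outputs:

    # Hypothetical call; `AgentDeployer` and all values below are illustrative only,
    # and any required arguments not shown in the diff are skipped.
    from agent_deployer import AgentDeployer

    deployer = AgentDeployer(
        inputs=["age", "income", "credit_score"],  # overrides the input schema stored in the model artifact
        outputs=["approval_probability"],          # overrides the stored output schema
        requirements=["scikit-learn"],
        image="mlrun/mlrun",
    )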
From a5641fa024a91598744584aa24fe269bb8fc9b57 Mon Sep 17 00:00:00 2001
From: guylei-code
Date: Wed, 29 Apr 2026 10:32:40 +0300
Subject: [PATCH 2/2] Toxicity guardrail (#975)

* Change the vllm-module.ipynb output error, vllm_app.

* Change the vllm-module.ipynb output error, vllm_app.

* toxicity guardrail first commit

* delete vllm-module.ipynb

* second commit, update changes requested in ipynb

* third commit, update changes requested in ipynb

* Update steps/src/toxicity_guardrail/item.yaml

Co-authored-by: Eyal Danieli

* Specify versions for transformers and torch

* third commit, update changes requested in ipynb

---------

Co-authored-by: Eyal Danieli
---
 steps/src/toxicity_guardrail/item.yaml        |  23 ++
 steps/src/toxicity_guardrail/requirements.txt |   2 +
 .../test_toxicity_guardrail.py                |  42 +++
 .../toxicity_guardrail.ipynb                  | 281 ++++++++++++++++++
 .../toxicity_guardrail/toxicity_guardrail.py  |  61 ++++
 5 files changed, 409 insertions(+)
 create mode 100644 steps/src/toxicity_guardrail/item.yaml
 create mode 100644 steps/src/toxicity_guardrail/requirements.txt
 create mode 100644 steps/src/toxicity_guardrail/test_toxicity_guardrail.py
 create mode 100644 steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
 create mode 100644 steps/src/toxicity_guardrail/toxicity_guardrail.py

diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml
new file mode 100644
index 00000000..baefca0f
--- /dev/null
+++ b/steps/src/toxicity_guardrail/item.yaml
@@ -0,0 +1,23 @@
+apiVersion: v1
+categories:
+  - data-preparation
+  - model-serving
+  - genai
+description: Filters toxic requests using a pre-trained text classifier before they reach the LLM
+example: toxicity_guardrail.ipynb
+generationDate: 2026-04-27:12-00
+hidden: false
+labels:
+  author: Iguazio
+mlrunVersion: 1.10.0
+name: toxicity_guardrail
+className: ToxicityGuardrailStep
+defaultHandler:
+spec:
+  filename: toxicity_guardrail.py
+  image: mlrun/mlrun
+  requirements:
+    - transformers==4.47.0
+    - torch==2.6.0
+  kind: generic
+version: 1.0.0
diff --git a/steps/src/toxicity_guardrail/requirements.txt b/steps/src/toxicity_guardrail/requirements.txt
new file mode 100644
index 00000000..5061402e
--- /dev/null
+++ b/steps/src/toxicity_guardrail/requirements.txt
@@ -0,0 +1,2 @@
+transformers==4.47.0
+torch==2.6.0
diff --git a/steps/src/toxicity_guardrail/test_toxicity_guardrail.py b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
new file mode 100644
index 00000000..42e85fd3
--- /dev/null
+++ b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
@@ -0,0 +1,42 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+from toxicity_guardrail import ToxicityGuardrailStep
+
+
+class TestToxicityGuardrailStep:
+    """Test suite for ToxicityGuardrailStep class."""
+
+    def setup_method(self):
+        """Set up test fixtures before each test method."""
+        project = mlrun.new_project("toxicity-guardrail", save=False)
+        self.fn = project.set_function(
+            "toxicity_guardrail.py",
+            name="guardrail-fn",
+            kind="serving",
+            image="mlrun/mlrun",
+        )
+        graph = self.fn.set_topology("flow", engine="async")
+        graph.to(
+            class_name="ToxicityGuardrailStep",
+            name="toxicity_guardrail",
+            threshold=0.5,
+        ).respond()
+
+    def test_toxicity_guardrail_step(self):
+        """Test that the serving function is correctly configured with ToxicityGuardrailStep."""
+        assert isinstance(self.fn, mlrun.runtimes.ServingRuntime)
diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
new file mode 100644
index 00000000..ddc3fe99
--- /dev/null
+++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "93c9feca-c120-443e-bbd3-731f70d49682",
+   "metadata": {},
+   "source": [
+    "## Pipeline: Toxicity Guardrail (Hub Step) → LLM Model Runner\n",
+    "\n",
+    "A unified serving graph that:\n",
+    "1. Routes the user's question through a toxicity guardrail hub step\n",
+    "2. If safe → calls a `ModelRunnerStep` (LLM) and returns the answer\n",
+    "3. If toxic → blocks the request with a clear rejection response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000001",
+   "metadata": {},
+   "source": [
+    "Create or load the MLRun project that will hold the serving function and its secrets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fd9fd3609223be6a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "> 2026-04-27 10:59:47,707 [info] Loading project from path: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"user_project\":false}\n",
+      "> 2026-04-27 11:00:02,102 [info] Project loaded successfully: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"stored_in_db\":true}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mlrun\n",
+    "project = mlrun.get_or_create_project(\"hubstep-guardrail-toxicity\", user_project=False, context=\"./\", allow_cross_project=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000002",
+   "metadata": {},
+   "source": [
+    "### Load credentials from a local `.env` file\n",
+    "\n",
+    "For example:\n",
+    "```\n",
+    "OPENAI_API_KEY=\"...\"\n",
+    "OPENAI_BASE_URL=\"...\"\n",
+    "OPENAI_MODEL=\"...\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "61bc0d94-4939-46c1-ac0d-2e90fd465c9c",
+   "metadata": {},
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv(\"cred.env\", override=True)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000003",
+   "metadata": {},
+   "source": "Store the credentials as project secrets - see also [working with secrets](http://docs.mlrun.org/en/stable/secrets.html).\n"
+  },
+  {
+   "cell_type": "code",
+   "id": "884a67ca-d548-4d7d-bab1-ca8868fbe875",
+   "metadata": {},
+   "source": [
+    "import os\n",
+    "project.set_secrets(\n",
+    "    secrets={\n",
+    "        \"OPENAI_API_KEY\": os.getenv(\"OPENAI_API_KEY\"),\n",
+    "        \"OPENAI_BASE_URL\": os.getenv(\"OPENAI_BASE_URL\"),\n",
+    "        \"OPENAI_MODEL\": os.getenv(\"OPENAI_MODEL\"),\n",
+    "    },\n",
+    ")\n",
+    "project.save()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000004",
+   "metadata": {},
+   "source": [
+    "## Build the serving graph\n",
+    "\n",
+    "`LLMModel` wraps an OpenAI-compatible API and reads credentials from the project secrets set above.\n",
+    "`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict\n",
+    "(`{\"llm_model\": {\"answer\": ...}}`) into a simple `{\"answer\": ...}` response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "505c77e2-6875-499d-ae05-c6de3efa0622",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting serving_graph.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile serving_graph.py\n",
+    "from typing import Dict, Any\n",
+    "from mlrun.serving import Model\n",
+    "\n",
+    "class LLMModel(Model):\n",
+    "    \"\"\"OpenAI-compatible LLM. Credentials and model are read from env vars:\n",
+    "    OPENAI_API_KEY, OPENAI_BASE_URL (optional), OPENAI_MODEL (optional, falls back to default_model_name).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, default_model_name: str = \"gpt-4o-mini\", **kwargs):\n",
+    "        super().__init__(**kwargs)\n",
+    "        self.default_model_name = default_model_name\n",
+    "\n",
+    "    def load(self):\n",
+    "        import os\n",
+    "        import openai\n",
+    "        self.model_name = os.environ.get(\"OPENAI_MODEL\", self.default_model_name)\n",
+    "        client_kwargs = {\"api_key\": os.environ[\"OPENAI_API_KEY\"]}\n",
+    "        base_url = os.environ.get(\"OPENAI_BASE_URL\")\n",
+    "        if base_url:\n",
+    "            client_kwargs[\"base_url\"] = base_url\n",
+    "        self._client = openai.OpenAI(**client_kwargs)\n",
+    "\n",
+    "    def predict(self, body: Dict[str, Any]) -> Dict[str, Any]:\n",
+    "        question = body.get(\"question\", \"\")\n",
+    "        response = self._client.chat.completions.create(\n",
+    "            model=self.model_name,\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "                {\"role\": \"user\", \"content\": question},\n",
+    "            ],\n",
+    "        )\n",
+    "        return {\"answer\": response.choices[0].message.content, \"model\": self.model_name}\n",
+    "\n",
+    "\n",
+    "def format_answer(event: Dict[str, Any]) -> Dict[str, Any]:\n",
+    "    \"\"\"Flatten ModelRunnerStep output: {\"llm_model\": {\"answer\": ...}} → {\"answer\": ...}\"\"\"\n",
+    "    if isinstance(event, dict):\n",
+    "        for _, model_output in event.items():\n",
+    "            if isinstance(model_output, dict):\n",
+    "                return model_output\n",
+    "    return event"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000005",
+   "metadata": {},
+   "source": [
+    "Wire the three-step async flow graph:\n",
+    "1. **`toxicity_guardrail`** — loaded directly from `hub://toxicity_guardrail`; blocks requests with a toxicity score ≥ `threshold`\n",
+    "2. **`llm_runner`** — a `ModelRunnerStep` that runs `LLMModel` against the OpenAI-compatible API\n",
+    "3. **`format_answer`** — flattens the runner output and sends the response back to the caller"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d0435a5-4e65-4a33-a146-8c6abb382b37",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlrun.serving import ModelRunnerStep\n",
+    "\n",
+    "fn_pipeline = project.set_function(\n",
+    "    name=\"toxicity-llm-pipeline\",\n",
+    "    func=\"serving_graph.py\",\n",
+    "    kind=\"serving\",\n",
+    "    image=\"mlrun/mlrun\",\n",
+    "    requirements=[\"transformers\", \"torch\", \"openai\"],\n",
+    ")\n",
+    "# Credentials come from Kubernetes secrets set above — no set_envs() needed for them.\n",
+    "\n",
+    "graph = fn_pipeline.set_topology(\"flow\", engine=\"async\")\n",
+    "\n",
+    "graph.add_step(\n",
+    "    class_name=\"hub://toxicity_guardrail\",\n",
+    "    name=\"toxicity_guardrail\",\n",
+    "    threshold=0.5,\n",
+    ")\n",
+    "\n",
+    "model_runner = ModelRunnerStep(name=\"llm_runner\")\n",
+    "model_runner.add_model(endpoint_name=\"llm_model\", model_class=\"LLMModel\")\n",
+    "graph.add_step(model_runner, after=\"toxicity_guardrail\")\n",
+    "\n",
+    "graph.add_step(name=\"format_answer\", handler=\"format_answer\", after=\"llm_runner\").respond()\n",
+    "\n",
+    "graph.plot(rankdir=\"LR\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000006",
+   "metadata": {},
+   "source": "Deploy the serving function with the required packages (`transformers`, `torch`, `openai`)."
+  },
+  {
+   "cell_type": "code",
+   "id": "973ceab8-48b5-4689-a39e-b83ac3e75ddc",
+   "metadata": {},
+   "source": "addr = project.deploy_function(fn_pipeline)",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000007",
+   "metadata": {},
+   "source": [
+    "Test the pipeline with a safe question and a toxic one.\n",
+    "Safe requests pass through to the LLM and return an answer; toxic ones are blocked by the guardrail before reaching the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fd5cd2c9-3abe-401f-bb2e-65d216b9951d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Safe input ===\n",
+      "Response: {'answer': 'Washington, D.C.
It’s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n", + "\n", + "=== Toxic input ===\n", + "Blocked (expected): bad function response 500: Exception caught in handler - \"Request blocked: toxicity score 0.953 >= 0.5\"\n" + ] + } + ], + "source": [ + "# --- Safe input: should return an LLM answer ---\n", + "print(\"=== Safe input ===\")\n", + "result = fn_pipeline.invoke(\"/\", {\"question\": \"What is the capital of the USA?\"})\n", + "print(\"Response:\", result)\n", + "\n", + "# --- Toxic input: should be blocked ---\n", + "print(\"\\n=== Toxic input ===\")\n", + "try:\n", + " result = fn_pipeline.invoke(\"/\", {\"question\": \"You're completely useless and everything you do is a failure\"})\n", + " print(\"Response:\", result)\n", + "except Exception as e:\n", + " print(f\"Blocked (expected): {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.py b/steps/src/toxicity_guardrail/toxicity_guardrail.py new file mode 100644 index 00000000..def0616b --- /dev/null +++ b/steps/src/toxicity_guardrail/toxicity_guardrail.py @@ -0,0 +1,61 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Dict + + +class ToxicityGuardrailStep: + """ + A serving graph step that filters out toxic requests using a pre-trained + text classification model. + + If the toxicity score of the input text meets or exceeds the threshold, + the request is blocked with a ValueError. Safe requests are passed through + unchanged. + + The classifier label "toxic" maps directly to the toxicity score; any + other label (e.g. "non-toxic") inverts the model's confidence score. + """ + + def __init__( + self, + context=None, + name=None, + threshold: float = 0.5, + model_name: str = "unitary/toxic-bert", + **kwargs, + ): + self.threshold = threshold + self.model_name = model_name + self._classifier = None + + def post_init(self, mode="sync", **kwargs): + from transformers import pipeline + + self._classifier = pipeline("text-classification", model=self.model_name) + + def do(self, event: Dict[str, Any]) -> Dict[str, Any]: + question = event.get("question", "") + result = self._classifier(question)[0] + score = ( + result["score"] + if result["label"] == "toxic" + else 1 - result["score"] + ) + if score >= self.threshold: + raise ValueError( + f"Request blocked: toxicity score {score:.3f} >= {self.threshold}" + ) + return event
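As a quick smoke test outside a serving graph, the step can also be exercised directly. A minimal sketch grounded only in the class above (requires `transformers` and `torch`; the printed score is illustrative, taken from the notebook output):

    from toxicity_guardrail import ToxicityGuardrailStep

    step = ToxicityGuardrailStep(threshold=0.5)
    step.post_init()  # loads the unitary/toxic-bert text-classification pipeline

    # Safe input passes through unchanged.
    print(step.do({"question": "What is the capital of the USA?"}))

    # Toxic input raises ValueError with the blocking message.
    try:
        step.do({"question": "You're completely useless and everything you do is a failure"})
    except ValueError as err:
        print(err)  # e.g. Request blocked: toxicity score 0.953 >= 0.5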