Toxicity guardrail #975
Merged: Eyal-Danieli merged 11 commits into mlrun:development from guylei-code:toxicity-guardrail on Apr 29, 2026.
Changes shown from 8 of 11 commits.
Commits:
- 3ddb397: Chane the vllm-module.ipynb output error ,vllm_app. (guylei-code)
- 4b1e448: Chane the vllm-module.ipynb output error ,vllm_app. (guylei-code)
- ba90958: Merge branch 'development' of https://github.com/guylei-code/function… (guylei-code)
- 878b2dd: Merge branch 'development' of https://github.com/guylei-code/function… (guylei-code)
- bc38f7a: toxicity guardrail first commit (guylei-code)
- 6cbffc5: delete vllm-module.ipynb (guylei-code)
- b1acc88: second commit, update changes requested in ipynb (guylei-code)
- b50b48c: third commit, update changes requested in ipynb (guylei-code)
- 42c464f: Update steps/src/toxicity_guardrail/item.yaml (guylei-code)
- e3c26cb: Specify versions for transformers and torch (guylei-code)
- b16f27b: third commit, update changes requested in ipynb (guylei-code)
item.yaml (new file):

```yaml
apiVersion: v1
categories:
- data-preparation
- model-serving
- genai
description: Filters toxic requests using a pre-trained text classifier before they reach the LLM
example: toxicity_guardrail.ipynb
generationDate: 2026-04-27:12-00
hidden: false
labels:
  author: Iguazio
mlrunVersion: 1.10.0
name: toxicity_guardrail
className: ToxicityGuardrailStep
defaultHandler:
spec:
  filename: toxicity_guardrail.py
  image: mlrun/mlrun
  requirements:
  - transformers
  - torch
kind: generic
version: 1.0.0
```
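The step's source file (`toxicity_guardrail.py`) is not part of this excerpt, so as a rough, dependency-free sketch of the decision logic the item.yaml description implies: a classifier assigns a toxicity score and the step blocks at score >= threshold (matching the "toxicity score 0.953 >= 0.5" rejection seen in the notebook output below). `ToxicityGuardrailSketch` and its injected `scorer` callable are hypothetical names; the real step presumably wraps a `transformers` text-classification pipeline instead.

```python
from typing import Any, Callable, Dict


class ToxicityGuardrailSketch:
    """Hypothetical stand-in for ToxicityGuardrailStep's block/pass decision."""

    def __init__(self, scorer: Callable[[str], float], threshold: float = 0.5):
        self.scorer = scorer          # maps text -> toxicity score in [0, 1]
        self.threshold = threshold    # block at score >= threshold

    def do_event(self, event: Dict[str, Any]) -> Dict[str, Any]:
        score = self.scorer(event.get("question", ""))
        if score >= self.threshold:
            # Mirrors the rejection message format seen in the notebook output
            raise ValueError(
                f"Request blocked: toxicity score {score:.3f} >= {self.threshold}"
            )
        return event


# Toy scorer for illustration only, not a real classifier
guard = ToxicityGuardrailSketch(
    scorer=lambda t: 0.953 if "useless" in t else 0.01
)
print(guard.do_event({"question": "What is the capital of the USA?"}))
```

Safe events pass through unchanged; a toxic one raises before any downstream (LLM) step runs.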
Pinned requirements file (new, 2 lines):

```text
transformers==4.47.0
torch==2.6.0
```
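The `name==version` pin format above splits mechanically; a small stdlib-only helper (hypothetical, not part of the PR) makes the structure explicit:

```python
def parse_pins(text: str) -> dict:
    """Parse 'name==version' lines into a {package: version} mapping."""
    pins = {}
    for line in text.splitlines():
        line = line.strip()
        if line and not line.startswith("#"):  # skip blanks and comments
            name, _, version = line.partition("==")
            pins[name] = version
    return pins

print(parse_pins("transformers==4.47.0\ntorch==2.6.0"))
# → {'transformers': '4.47.0', 'torch': '2.6.0'}
```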
Unit test (new file, Python):

```python
# Copyright 2025 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import mlrun

from toxicity_guardrail import ToxicityGuardrailStep


class TestToxicityGuardrailStep:
    """Test suite for ToxicityGuardrailStep class."""

    def setup_method(self):
        """Set up test fixtures before each test method."""
        project = mlrun.new_project("toxicity-guardrail", save=False)
        self.fn = project.set_function(
            "toxicity_guardrail.py",
            name="guardrail-fn",
            kind="serving",
            image="mlrun/mlrun",
        )
        graph = self.fn.set_topology("flow", engine="async")
        graph.to(
            class_name="ToxicityGuardrailStep",
            name="toxicity_guardrail",
            threshold=0.5,
        ).respond()

    def test_toxicity_guardrail_step(self):
        """Test that the serving function is correctly configured with ToxicityGuardrailStep."""
        assert type(self.fn) == mlrun.runtimes.ServingRuntime
```
toxicity_guardrail.ipynb (new notebook):

## Pipeline: Toxicity Guardrail (Hub Step) → LLM Model Runner

A unified serving graph that:
1. Routes the user's question through a toxicity guardrail hub step
2. If safe → calls a `ModelRunnerStep` (LLM) and returns the answer
3. If toxic → blocks the request with a clear rejection response

Create or load the MLRun project that will hold the serving function and its secrets.

```python
import mlrun

project = mlrun.get_or_create_project(
    "hubstep-guardrail-toxicity", user_project=False, context="./", allow_cross_project=True
)
```

Output:

```
> 2026-04-27 10:59:47,707 [info] Loading project from path: {"path":"./","project_name":"hubstep-guardrail-toxicity","user_project":false}
> 2026-04-27 11:00:02,102 [info] Project loaded successfully: {"path":"./","project_name":"hubstep-guardrail-toxicity","stored_in_db":true}
```

### Load credentials from a local `.env` file

For example:

```
OPENAI_API_KEY="..."
OPENAI_BASE_URL="..."
OPENAI_MODEL="..."
```

```python
from dotenv import load_dotenv

load_dotenv("cred.env", override=True)
```

Store the credentials as project secrets; see also [working with secrets](http://docs.mlrun.org/en/stable/secrets.html).

```python
import os

project.set_secrets(
    secrets={
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        "OPENAI_BASE_URL": os.getenv("OPENAI_BASE_URL"),
        "OPENAI_MODEL": os.getenv("OPENAI_MODEL"),
    },
)
project.save()
```

## Build the serving graph

`LLMModel` wraps an OpenAI-compatible API and reads credentials from the project secrets set above.
`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict
(`{"llm_model": {"answer": ...}}`) into a simple `{"answer": ...}` response.

```python
%%writefile serving_graph.py
from typing import Dict, Any

from mlrun.serving import Model


class LLMModel(Model):
    """OpenAI-compatible LLM. Credentials and model are read from env vars:
    OPENAI_API_KEY, OPENAI_BASE_URL (optional), OPENAI_MODEL (optional, falls
    back to default_model_name).
    """

    def __init__(self, default_model_name: str = "gpt-4o-mini", **kwargs):
        super().__init__(**kwargs)
        self.default_model_name = default_model_name

    def load(self):
        import os

        import openai

        self.model_name = os.environ.get("OPENAI_MODEL", self.default_model_name)
        client_kwargs = {"api_key": os.environ["OPENAI_API_KEY"]}
        base_url = os.environ.get("OPENAI_BASE_URL")
        if base_url:
            client_kwargs["base_url"] = base_url
        self._client = openai.OpenAI(**client_kwargs)

    def predict(self, body: Dict[str, Any]) -> Dict[str, Any]:
        question = body.get("question", "")
        response = self._client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": question},
            ],
        )
        return {"answer": response.choices[0].message.content, "model": self.model_name}


def format_answer(event: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten ModelRunnerStep output: {"llm_model": {"answer": ...}} -> {"answer": ...}"""
    if isinstance(event, dict):
        for _, model_output in event.items():
            if isinstance(model_output, dict):
                return model_output
    return event
```

Output:

```
Overwriting serving_graph.py
```

Wire the three-step async flow graph:
1. **`toxicity_guardrail`** — loaded directly from `hub://toxicity_guardrail`; blocks requests with a toxicity score ≥ `threshold`
2. **`llm_runner`** — a `ModelRunnerStep` that runs `LLMModel` against the OpenAI-compatible API
3. **`format_answer`** — flattens the runner output and sends the response back to the caller

```python
from mlrun.serving import ModelRunnerStep

fn_pipeline = project.set_function(
    name="toxicity-llm-pipeline",
    func="serving_graph.py",
    kind="serving",
    image="mlrun/mlrun",
    requirements=["transformers", "torch", "openai"],
)
# Credentials come from Kubernetes secrets set above — no set_envs() needed for them.

graph = fn_pipeline.set_topology("flow", engine="async")

graph.add_step(
    class_name="hub://toxicity_guardrail",
    name="toxicity_guardrail",
    threshold=0.5,
)

model_runner = ModelRunnerStep(name="llm_runner")
model_runner.add_model(endpoint_name="llm_model", model_class="LLMModel")
graph.add_step(model_runner, after="toxicity_guardrail")

graph.add_step(name="format_answer", handler="format_answer", after="llm_runner").respond()

graph.plot(rankdir="LR")
```

Deploy the serving function with the required packages (`transformers`, `torch`, `openai`).

```python
addr = project.deploy_function(fn_pipeline)
```

Test the pipeline with a safe question and a toxic one.
Safe requests pass through to the LLM and return an answer; toxic ones are blocked by the guardrail before reaching the model.

```python
# --- Safe input: should return an LLM answer ---
print("=== Safe input ===")
result = fn_pipeline.invoke("/", {"question": "What is the capital of the USA?"})
print("Response:", result)

# --- Toxic input: should be blocked ---
print("\n=== Toxic input ===")
try:
    result = fn_pipeline.invoke(
        "/", {"question": "You're completely useless and everything you do is a failure"}
    )
    print("Response:", result)
except Exception as e:
    print(f"Blocked (expected): {e}")
```

Output:

```
=== Safe input ===
Response: {'answer': 'Washington, D.C. It’s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}

=== Toxic input ===
Blocked (expected): bad function response 500: Exception caught in handler - "Request blocked: toxicity score 0.953 >= 0.5"
```