23 changes: 23 additions & 0 deletions steps/master/toxicity_guardrail/1.0.0/src/item.yaml
@@ -0,0 +1,23 @@
apiVersion: v1
categories:
- data-preparation
- model-serving
- genai
description: Filters toxic requests using a pre-trained text classifier before they reach the LLM
example: toxicity_guardrail.ipynb
generationDate: 2026-04-27:12-00
hidden: false
labels:
  author: Iguazio
mlrunVersion: 1.10.0
name: toxicity_guardrail
className: ToxicityGuardrailStep
defaultHandler:
spec:
  filename: toxicity_guardrail.py
  image: mlrun/mlrun
  requirements:
  - transformers
  - torch
kind: generic
version: 1.0.0
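The requirements pin transformers and torch because the step wraps a Hugging Face text-classification pipeline. For orientation, a sketch of the output shape such a pipeline produces, which is what the step's threshold is compared against (the checkpoint name is hypothetical; this diff does not show which model the step actually loads):

from transformers import pipeline

# Hypothetical checkpoint, for illustration only; the step's actual model is not shown in this diff.
classifier = pipeline("text-classification", model="s-nlp/roberta_toxicity_classifier")

print(classifier("What is the capital of France?"))
# e.g. [{'label': 'neutral', 'score': 0.998}] -- label names depend on the chosen model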
@@ -0,0 +1,84 @@
# Copyright 2025 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
from unittest.mock import MagicMock

from toxicity_guardrail import ToxicityGuardrailStep


class TestToxicityGuardrailStep:
    def _make_step(self, threshold=0.5):
        step = ToxicityGuardrailStep(threshold=threshold)
        step._classifier = MagicMock()
        return step

    def test_safe_input_passes(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.999}]
        event = {"question": "What is the capital of France?"}
        result = step.do(event)
        assert result == event

    def test_toxic_input_blocked(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "toxic", "score": 0.998}]
        event = {"question": "some clearly toxic text"}
        with pytest.raises(ValueError, match="Request blocked") as exc_info:
            step.do(event)
        assert "0.998" in str(exc_info.value)

    def test_custom_threshold_passes_below(self):
        step = self._make_step(threshold=0.9)
        # Score 0.85 < threshold 0.9 — should pass through
        step._classifier.return_value = [{"label": "toxic", "score": 0.85}]
        event = {"question": "borderline content"}
        result = step.do(event)
        assert result == event

    def test_score_at_threshold_is_blocked(self):
        step = self._make_step(threshold=0.5)
        # Score exactly equal to the threshold — should be blocked
        step._classifier.return_value = [{"label": "toxic", "score": 0.5}]
        event = {"question": "borderline content"}
        with pytest.raises(ValueError) as exc_info:
            step.do(event)
        assert "0.500" in str(exc_info.value)

    def test_non_toxic_label_inverts_score(self):
        step = self._make_step(threshold=0.5)
        # label="non-toxic", score=0.99 → toxicity score = 1 - 0.99 = 0.01 → safe
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.99}]
        event = {"question": "a perfectly safe question"}
        result = step.do(event)
        assert result == event

    def test_empty_question_is_safe(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.999}]
        event = {"question": ""}
        result = step.do(event)
        assert result == event

    def test_event_passthrough_unchanged(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.99}]
        event = {"question": "Hello world", "extra_field": 42}
        result = step.do(event)
        assert result["extra_field"] == 42
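The toxicity_guardrail.py module itself is not shown in this view, but the tests above pin down its contract: a configurable threshold, a _classifier attribute holding a text-classification pipeline, score inversion when the model reports a non-toxic label, pass-through of safe events, and a ValueError on block. A minimal sketch consistent with that contract (the lazy loading and the default checkpoint are assumptions, not the confirmed implementation):

from transformers import pipeline


class ToxicityGuardrailStep:
    """Serving-graph step that rejects toxic requests before they reach the LLM."""

    def __init__(self, threshold=0.5, model="s-nlp/roberta_toxicity_classifier"):
        # Requests scoring at or above the threshold are blocked.
        self.threshold = threshold
        self.model = model  # hypothetical default checkpoint
        self._classifier = None  # loaded lazily so tests can inject a mock

    def _get_classifier(self):
        if self._classifier is None:
            self._classifier = pipeline("text-classification", model=self.model)
        return self._classifier

    def do(self, event):
        text = event.get("question", "")
        result = self._get_classifier()(text)[0]
        score = result["score"]
        # Normalize to a toxicity score: invert when the model reports the safe label.
        if result["label"] != "toxic":
            score = 1.0 - score
        if score >= self.threshold:
            raise ValueError(
                f"Request blocked: toxicity score {score:.3f} >= {self.threshold}"
            )
        # Safe request: pass the event through unchanged.
        return event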
229 changes: 229 additions & 0 deletions steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.ipynb
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000001",
"metadata": {},
"source": [
"# Toxicity Guardrail Step Demo\n",
"\n",
"This notebook walks through a simple example of using the toxicity-guardrail step in serving functions,\n",
"by first downloading the step from the hub and inspecting it, then including the step in a serving graph."
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000002",
"metadata": {},
"source": [
"## Get the step from the hub"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000003",
"metadata": {},
"outputs": [],
"source": [
"import mlrun"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000004",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'filename': 'toxicity_guardrail.py',\n",
" 'example': 'toxicity_guardrail.ipynb',\n",
" 'local_path': PosixPath('/User'),\n",
" 'url': 'hub://toxicity_guardrail',\n",
" 'class_name': 'ToxicityGuardrailStep',\n",
" 'name': 'toxicity_guardrail',\n",
" 'version': '1.0.0',\n",
" 'categories': ['data-preparation', 'model-serving', 'genai'],\n",
" 'description': 'Filters toxic requests using a pre-trained text classifier before they reach the LLM'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"hub_step = mlrun.get_hub_step(\"hub://toxicity_guardrail\")\n",
"hub_step.to_dict()"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000005",
"metadata": {},
"source": [
"## Use the step in a serving function\n",
"\n",
"Add `ToxicityGuardrailStep` as the first step in an async serving graph.\n",
"Any request whose toxicity score meets or exceeds the threshold will be rejected\n",
"before reaching downstream steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000006",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"> 2026-04-27 12:00:00,000 [info] Project loaded successfully: {\"project_name\":\"toxicity-guardrail-demo\"}\n"
]
}
],
"source": [
"project = mlrun.get_or_create_project(\"toxicity-guardrail-demo\", \"./toxicity-guardrail-demo\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000007",
"metadata": {},
"outputs": [],
"source": [
"fn = project.set_function(\n",
" hub_step.get_src_file_path(),\n",
" name=\"guardrail-fn\",\n",
" kind=\"serving\",\n",
" image=\"mlrun/mlrun\",\n",
" requirements=[\"transformers\", \"torch\"],\n",
")\n",
"graph = fn.set_topology(\"flow\", engine=\"async\")\n",
"graph.to(\n",
" class_name=\"ToxicityGuardrailStep\",\n",
" name=\"toxicity_guardrail\",\n",
" threshold=0.5,\n",
").respond()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000008",
"metadata": {},
"outputs": [],
"source": [
"project.deploy_function(fn)"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000009",
"metadata": {},
"source": [
"### Test with a safe input"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-00000000000a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Response: {'question': 'What is the capital of France?'}\n"
]
}
],
"source": [
"serving_fn = project.get_function(\"guardrail-fn\")\n",
"event = {\"question\": \"What is the capital of France?\"}\n",
"result = serving_fn.invoke(\"/\", body=event)\n",
"print(\"Response:\", result)"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-00000000000b",
"metadata": {},
"source": [
"### Test with a toxic input (expect a block)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-00000000000c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Blocked (expected): bad function response 500: ValueError: Request blocked: toxicity score 0.998 >= 0.5\n"
]
}
],
"source": [
"try:\n",
" result = serving_fn.invoke(\"/\", body={\"question\": \"some toxic text\"})\n",
" print(\"Response:\", result)\n",
"except Exception as e:\n",
" print(f\"Blocked (expected): {e}\")"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-00000000000d",
"metadata": {},
"source": [
"## Add the step directly from the hub\n",
"\n",
"If no customisation is needed, the step can be referenced directly from the hub\n",
"without downloading the source file first."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-00000000000e",
"metadata": {},
"outputs": [],
"source": [
"fn2 = project.set_function(\n",
" name=\"guardrail-fn-2\",\n",
" kind=\"serving\",\n",
" image=\"mlrun/mlrun\",\n",
" requirements=[\"transformers\", \"torch\"],\n",
")\n",
"graph2 = fn2.set_topology(\"flow\", engine=\"async\")\n",
"graph2.add_step(\n",
" class_name=\"hub://toxicity_guardrail\",\n",
" name=\"toxicity_guardrail\",\n",
" threshold=0.5,\n",
").respond()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
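Deployment aside, the graph can also be exercised in-process with MLRun's mock server, which is handy for quick checks; a sketch using the fn object built in the notebook above (assuming transformers and torch are installed locally so the classifier can load):

# In-process smoke test of the serving graph, no deployment needed.
server = fn.to_mock_server()

resp = server.test(path="/", body={"question": "What is the capital of France?"})
print(resp)  # a safe request should pass through unchanged

server.wait_for_completion()  # flush the async flow before exiting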