diff --git a/steps/src/toxicity_guardrail/item.yaml b/steps/src/toxicity_guardrail/item.yaml
new file mode 100644
index 00000000..baefca0f
--- /dev/null
+++ b/steps/src/toxicity_guardrail/item.yaml
@@ -0,0 +1,23 @@
+apiVersion: v1
+categories:
+  - data-preparation
+  - model-serving
+  - genai
+description: Filters toxic requests using a pre-trained text classifier before they reach the LLM
+example: toxicity_guardrail.ipynb
+generationDate: 2026-04-27:12-00
+hidden: false
+labels:
+  author: Iguazio
+mlrunVersion: 1.10.0
+name: toxicity_guardrail
+className: ToxicityGuardrailStep
+defaultHandler:
+spec:
+  filename: toxicity_guardrail.py
+  image: mlrun/mlrun
+  requirements:
+    - transformers==4.47.0
+    - torch==2.6.0
+  kind: generic
+version: 1.0.0
diff --git a/steps/src/toxicity_guardrail/requirements.txt b/steps/src/toxicity_guardrail/requirements.txt
new file mode 100644
index 00000000..5061402e
--- /dev/null
+++ b/steps/src/toxicity_guardrail/requirements.txt
@@ -0,0 +1,2 @@
+transformers==4.47.0
+torch==2.6.0
diff --git a/steps/src/toxicity_guardrail/test_toxicity_guardrail.py b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
new file mode 100644
index 00000000..42e85fd3
--- /dev/null
+++ b/steps/src/toxicity_guardrail/test_toxicity_guardrail.py
@@ -0,0 +1,42 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+from toxicity_guardrail import ToxicityGuardrailStep
+
+
+class TestToxicityGuardrailStep:
+    """Test suite for ToxicityGuardrailStep class."""
+
+    def setup_method(self):
+        """Set up test fixtures before each test method."""
+        project = mlrun.new_project("toxicity-guardrail", save=False)
+        self.fn = project.set_function(
+            "toxicity_guardrail.py",
+            name="guardrail-fn",
+            kind="serving",
+            image="mlrun/mlrun",
+        )
+        graph = self.fn.set_topology("flow", engine="async")
+        graph.to(
+            class_name="ToxicityGuardrailStep",
+            name="toxicity_guardrail",
+            threshold=0.5,
+        ).respond()
+
+    def test_toxicity_guardrail_step(self):
+        """Test that the serving function is correctly configured with ToxicityGuardrailStep."""
+        assert isinstance(self.fn, mlrun.runtimes.ServingRuntime)
diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
new file mode 100644
index 00000000..ddc3fe99
--- /dev/null
+++ b/steps/src/toxicity_guardrail/toxicity_guardrail.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "93c9feca-c120-443e-bbd3-731f70d49682",
+   "metadata": {},
+   "source": [
+    "## Pipeline: Toxicity Guardrail (Hub Step) → LLM Model Runner\n",
+    "\n",
+    "A unified serving graph that:\n",
+    "1. Routes the user's question through a toxicity guardrail hub step\n",
+    "2. If safe → calls a `ModelRunnerStep` (LLM) and returns the answer\n",
+    "3. If toxic → blocks the request with a clear rejection response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000001",
+   "metadata": {},
+   "source": [
+    "Create or load the MLRun project that will hold the serving function and its secrets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fd9fd3609223be6a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "> 2026-04-27 10:59:47,707 [info] Loading project from path: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"user_project\":false}\n",
+      "> 2026-04-27 11:00:02,102 [info] Project loaded successfully: {\"path\":\"./\",\"project_name\":\"hubstep-guardrail-toxicity\",\"stored_in_db\":true}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mlrun\n",
+    "project = mlrun.get_or_create_project(\"hubstep-guardrail-toxicity\", user_project=False, context=\"./\", allow_cross_project=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000002",
+   "metadata": {},
+   "source": [
+    "### Load credentials from a local `.env` file\n",
+    "\n",
+    "For example:\n",
+    "```\n",
+    "OPENAI_API_KEY=\"...\"\n",
+    "OPENAI_BASE_URL=\"...\"\n",
+    "OPENAI_MODEL=\"...\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "61bc0d94-4939-46c1-ac0d-2e90fd465c9c",
+   "metadata": {},
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv(\"cred.env\", override=True)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000003",
+   "metadata": {},
+   "source": "Store the credentials as project secrets - see also [working with secrets](http://docs.mlrun.org/en/stable/secrets.html).\n"
+  },
+  {
+   "cell_type": "code",
+   "id": "884a67ca-d548-4d7d-bab1-ca8868fbe875",
+   "metadata": {},
+   "source": [
+    "import os\n",
+    "project.set_secrets(\n",
+    "    secrets={\n",
+    "        \"OPENAI_API_KEY\": os.getenv(\"OPENAI_API_KEY\"),\n",
+    "        \"OPENAI_BASE_URL\": os.getenv(\"OPENAI_BASE_URL\"),\n",
+    "        \"OPENAI_MODEL\": os.getenv(\"OPENAI_MODEL\"),\n",
+    "    },\n",
+    ")\n",
+    "project.save()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000004",
+   "metadata": {},
+   "source": [
+    "## Build the serving graph\n",
+    "\n",
+    "`LLMModel` wraps an OpenAI-compatible API and reads credentials from the project secrets set above.\n",
+    "`format_answer` is a plain function that flattens the `ModelRunnerStep` output dict\n",
+    "(`{\"llm_model\": {\"answer\": ...}}`) into a simple `{\"answer\": ...}` response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "505c77e2-6875-499d-ae05-c6de3efa0622",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting serving_graph.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile serving_graph.py\n",
+    "from typing import Dict, Any\n",
+    "from mlrun.serving import Model\n",
+    "\n",
+    "class LLMModel(Model):\n",
+    "    \"\"\"OpenAI-compatible LLM. Credentials and model are read from env vars:\n",
+    "    OPENAI_API_KEY, OPENAI_BASE_URL (optional), OPENAI_MODEL (optional, falls back to default_model_name).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, default_model_name: str = \"gpt-4o-mini\", **kwargs):\n",
+    "        super().__init__(**kwargs)\n",
+    "        self.default_model_name = default_model_name\n",
+    "\n",
+    "    def load(self):\n",
+    "        import openai, os\n",
+    "        self.model_name = os.environ.get(\"OPENAI_MODEL\", self.default_model_name)\n",
+    "        client_kwargs = {\"api_key\": os.environ[\"OPENAI_API_KEY\"]}\n",
+    "        base_url = os.environ.get(\"OPENAI_BASE_URL\")\n",
+    "        if base_url:\n",
+    "            client_kwargs[\"base_url\"] = base_url\n",
+    "        self._client = openai.OpenAI(**client_kwargs)\n",
+    "\n",
+    "    def predict(self, body: Dict[str, Any]) -> Dict[str, Any]:\n",
+    "        question = body.get(\"question\", \"\")\n",
+    "        response = self._client.chat.completions.create(\n",
+    "            model=self.model_name,\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "                {\"role\": \"user\", \"content\": question},\n",
+    "            ],\n",
+    "        )\n",
+    "        return {\"answer\": response.choices[0].message.content, \"model\": self.model_name}\n",
+    "\n",
+    "\n",
+    "def format_answer(event: Dict[str, Any]) -> Dict[str, Any]:\n",
+    "    \"\"\"Flatten ModelRunnerStep output: {\"llm_model\": {\"answer\": ...}} → {\"answer\": ...}\"\"\"\n",
+    "    if isinstance(event, dict):\n",
+    "        for _, model_output in event.items():\n",
+    "            if isinstance(model_output, dict):\n",
+    "                return model_output\n",
+    "    return event"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000005",
+   "metadata": {},
+   "source": [
+    "Wire the three-step async flow graph:\n",
+    "1. **`toxicity_guardrail`** — loaded directly from `hub://toxicity_guardrail`; blocks requests with a toxicity score ≥ `threshold`\n",
+    "2. **`llm_runner`** — a `ModelRunnerStep` that runs `LLMModel` against the OpenAI-compatible API\n",
+    "3. **`format_answer`** — flattens the runner output and sends the response back to the caller"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d0435a5-4e65-4a33-a146-8c6abb382b37",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlrun.serving import ModelRunnerStep\n",
+    "\n",
+    "fn_pipeline = project.set_function(\n",
+    "    name=\"toxicity-llm-pipeline\",\n",
+    "    func=\"serving_graph.py\",\n",
+    "    kind=\"serving\",\n",
+    "    image=\"mlrun/mlrun\",\n",
+    "    requirements=[\"transformers\", \"torch\", \"openai\"],\n",
+    ")\n",
+    "# Credentials come from the project secrets set above — no set_envs() needed for them.\n",
+    "\n",
+    "graph = fn_pipeline.set_topology(\"flow\", engine=\"async\")\n",
+    "\n",
+    "graph.add_step(\n",
+    "    class_name=\"hub://toxicity_guardrail\",\n",
+    "    name=\"toxicity_guardrail\",\n",
+    "    threshold=0.5,\n",
+    ")\n",
+    "\n",
+    "model_runner = ModelRunnerStep(name=\"llm_runner\")\n",
+    "model_runner.add_model(endpoint_name=\"llm_model\", model_class=\"LLMModel\")\n",
+    "graph.add_step(model_runner, after=\"toxicity_guardrail\")\n",
+    "\n",
+    "graph.add_step(name=\"format_answer\", handler=\"format_answer\", after=\"llm_runner\").respond()\n",
+    "\n",
+    "graph.plot(rankdir=\"LR\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000006",
+   "metadata": {},
+   "source": "Deploy the serving function with the required packages (`transformers`, `torch`, `openai`)."
+  },
+  {
+   "cell_type": "code",
+   "id": "973ceab8-48b5-4689-a39e-b83ac3e75ddc",
+   "metadata": {},
+   "source": "addr = project.deploy_function(fn_pipeline)",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "m001-0000-0000-0000-000000000007",
+   "metadata": {},
+   "source": [
+    "Test the pipeline with a safe question and a toxic one.\n",
+    "Safe requests pass through to the LLM and return an answer; toxic ones are blocked by the guardrail before reaching the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fd5cd2c9-3abe-401f-bb2e-65d216b9951d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Safe input ===\n",
+      "Response: {'answer': 'Washington, D.C. It’s a federal district, not part of any state.', 'model': 'gpt-5-nano-2025-08-07'}\n",
+      "\n",
+      "=== Toxic input ===\n",
+      "Blocked (expected): bad function response 500: Exception caught in handler - \"Request blocked: toxicity score 0.953 >= 0.5\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "# --- Safe input: should return an LLM answer ---\n",
+    "print(\"=== Safe input ===\")\n",
+    "result = fn_pipeline.invoke(\"/\", {\"question\": \"What is the capital of the USA?\"})\n",
+    "print(\"Response:\", result)\n",
+    "\n",
+    "# --- Toxic input: should be blocked ---\n",
+    "print(\"\\n=== Toxic input ===\")\n",
+    "try:\n",
+    "    result = fn_pipeline.invoke(\"/\", {\"question\": \"You're completely useless and everything you do is a failure\"})\n",
+    "    print(\"Response:\", result)\n",
+    "except Exception as e:\n",
+    "    print(f\"Blocked (expected): {e}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/steps/src/toxicity_guardrail/toxicity_guardrail.py b/steps/src/toxicity_guardrail/toxicity_guardrail.py
new file mode 100644
index 00000000..def0616b
--- /dev/null
+++ b/steps/src/toxicity_guardrail/toxicity_guardrail.py
@@ -0,0 +1,61 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Any, Dict
+
+
+class ToxicityGuardrailStep:
+    """
+    A serving graph step that filters out toxic requests using a pre-trained
+    text classification model.
+
+    If the toxicity score of the input text meets or exceeds the threshold,
+    the request is blocked with a ValueError. Safe requests are passed through
+    unchanged.
+
+    The classifier label "toxic" maps directly to the toxicity score; any
+    other label (e.g. "non-toxic") inverts the model's confidence score.
+    """
+
+    def __init__(
+        self,
+        context=None,
+        name=None,
+        threshold: float = 0.5,
+        model_name: str = "unitary/toxic-bert",
+        **kwargs,
+    ):
+        self.threshold = threshold
+        self.model_name = model_name
+        self._classifier = None
+
+    def post_init(self, mode="sync", **kwargs):
+        from transformers import pipeline
+
+        self._classifier = pipeline("text-classification", model=self.model_name)
+
+    def do(self, event: Dict[str, Any]) -> Dict[str, Any]:
+        question = event.get("question", "")
+        result = self._classifier(question)[0]
+        score = (
+            result["score"]
+            if result["label"] == "toxic"
+            else 1 - result["score"]
+        )
+        if score >= self.threshold:
+            raise ValueError(
+                f"Request blocked: toxicity score {score:.3f} >= {self.threshold}"
+            )
+        return event
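The unit test in this change only verifies the serving-function configuration. Below is a minimal sketch, not part of the diff, of how the guardrail's thresholding logic could be exercised directly, assuming `toxicity_guardrail.py` is importable and with the transformers pipeline replaced by a stub so no model download is needed (the stubbed labels and sample inputs are illustrative only):

```python
# Sketch: exercise ToxicityGuardrailStep.do() directly with a stubbed classifier.
# The stub stands in for the transformers pipeline that post_init() would load.
from toxicity_guardrail import ToxicityGuardrailStep

step = ToxicityGuardrailStep(threshold=0.5)
# Mimic the pipeline's output shape: a list with one {"label", "score"} dict.
step._classifier = lambda text: (
    [{"label": "toxic", "score": 0.95}]
    if "useless" in text
    else [{"label": "non-toxic", "score": 0.99}]
)

# Safe input passes through unchanged.
assert step.do({"question": "What is the capital of the USA?"}) == {
    "question": "What is the capital of the USA?"
}

# Toxic input is rejected with a ValueError before it can reach the LLM.
try:
    step.do({"question": "You're completely useless"})
except ValueError as err:
    print(err)  # Request blocked: toxicity score 0.950 >= 0.5
```

Because `do()` only relies on a callable that returns `[{"label": ..., "score": ...}]`, any classifier with that output shape can stand in during tests.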