diff --git a/steps/master/toxicity_guardrail/1.0.0/src/item.yaml b/steps/master/toxicity_guardrail/1.0.0/src/item.yaml new file mode 100644 index 00000000..95fe820e --- /dev/null +++ b/steps/master/toxicity_guardrail/1.0.0/src/item.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +categories: + - data-preparation + - model-serving + - genai +description: Filters toxic requests using a pre-trained text classifier before they reach the LLM +example: toxicity_guardrail.ipynb +generationDate: 2026-04-27:12-00 +hidden: false +labels: + author: Iguazio +mlrunVersion: 1.10.0 +name: toxicity_guardrail +className: ToxicityGuardrailStep +defaultHandler: +spec: + filename: toxicity_guardrail.py + image: mlrun/mlrun + requirements: + - transformers + - torch + kind: generic +version: 1.0.0 \ No newline at end of file diff --git a/steps/master/toxicity_guardrail/1.0.0/src/test_toxicity_guardrail.py b/steps/master/toxicity_guardrail/1.0.0/src/test_toxicity_guardrail.py new file mode 100644 index 00000000..11ee40e3 --- /dev/null +++ b/steps/master/toxicity_guardrail/1.0.0/src/test_toxicity_guardrail.py @@ -0,0 +1,84 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from unittest.mock import MagicMock + +from toxicity_guardrail import ToxicityGuardrailStep + + +class TestToxicityGuardrailStep: + def _make_step(self, threshold=0.5): + step = ToxicityGuardrailStep(threshold=threshold) + step._classifier = MagicMock() + return step + + def test_safe_input_passes(self): + step = self._make_step() + step._classifier.return_value = [{"label": "non-toxic", "score": 0.999}] + event = {"question": "What is the capital of France?"} + result = step.do(event) + assert result == event + + def test_toxic_input_blocked(self): + step = self._make_step() + step._classifier.return_value = [{"label": "toxic", "score": 0.998}] + event = {"question": "some clearly toxic text"} + try: + step.do(event) + assert False, "Expected ValueError to be raised" + except ValueError as e: + assert "Request blocked" in str(e) + assert "0.998" in str(e) + + def test_custom_threshold_passes_below(self): + step = self._make_step(threshold=0.9) + # Score 0.85 < threshold 0.9 — should pass through + step._classifier.return_value = [{"label": "toxic", "score": 0.85}] + event = {"question": "borderline content"} + result = step.do(event) + assert result == event + + def test_score_at_threshold_is_blocked(self): + step = self._make_step(threshold=0.5) + # Score exactly equal to threshold — should be blocked + step._classifier.return_value = [{"label": "toxic", "score": 0.5}] + event = {"question": "borderline content"} + try: + step.do(event) + assert False, "Expected ValueError to be raised" + except ValueError as e: + assert "0.500" in str(e) + + def test_non_toxic_label_inverts_score(self): + step = self._make_step(threshold=0.5) + # label="non-toxic", score=0.99 → toxicity score = 1 - 0.99 = 0.01 → safe + step._classifier.return_value = [{"label": "non-toxic", "score": 0.99}] + event = {"question": "a perfectly safe 
question"} + result = step.do(event) + assert result == event + + def test_empty_question_is_safe(self): + step = self._make_step() + step._classifier.return_value = [{"label": "non-toxic", "score": 0.999}] + event = {"question": ""} + result = step.do(event) + assert result == event + + def test_event_passthrough_unchanged(self): + step = self._make_step() + step._classifier.return_value = [{"label": "non-toxic", "score": 0.99}] + event = {"question": "Hello world", "extra_field": 42} + result = step.do(event) + assert result["extra_field"] == 42 diff --git a/steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.ipynb b/steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.ipynb new file mode 100644 index 00000000..a9ef3ba4 --- /dev/null +++ b/steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-4000-8000-000000000001", + "metadata": {}, + "source": [ + "# Toxicity Guardrail Step Demo\n", + "\n", + "This notebook walks through a simple example of using the toxicity-guardrail step in serving functions,\n", + "by first downloading the step from the hub and inspecting it, then including the step in a serving graph." + ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-4000-8000-000000000002", + "metadata": {}, + "source": [ + "## Get the step from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-000000000003", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-000000000004", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'filename': 'toxicity_guardrail.py',\n", + " 'example': 'toxicity_guardrail.ipynb',\n", + " 'local_path': PosixPath('/User'),\n", + " 'url': 'hub://toxicity_guardrail',\n", + " 'class_name': 'ToxicityGuardrailStep',\n", + " 'name': 'toxicity_guardrail',\n", + " 'version': '1.0.0',\n", + " 'categories': ['data-preparation', 'model-serving', 'genai'],\n", + " 'description': 'Filters toxic requests using a pre-trained text classifier before they reach the LLM'}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "hub_step = mlrun.get_hub_step(\"hub://toxicity_guardrail\")\n", + "hub_step.to_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-4000-8000-000000000005", + "metadata": {}, + "source": [ + "## Use the step in a serving function\n", + "\n", + "Add `ToxicityGuardrailStep` as the first step in an async serving graph.\n", + "Any request whose toxicity score meets or exceeds the threshold will be rejected\n", + "before reaching downstream steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-000000000006", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2026-04-27 12:00:00,000 [info] Project loaded successfully: {\"project_name\":\"toxicity-guardrail-demo\"}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\"toxicity-guardrail-demo\", \"./toxicity-guardrail-demo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-000000000007", + "metadata": {}, + "outputs": [], + "source": [ + "fn = project.set_function(\n", + " hub_step.get_src_file_path(),\n", + " name=\"guardrail-fn\",\n", + " kind=\"serving\",\n", + " image=\"mlrun/mlrun\",\n", + " requirements=[\"transformers\", \"torch\"],\n", + ")\n", + "graph = fn.set_topology(\"flow\", engine=\"async\")\n", + "graph.to(\n", + " class_name=\"ToxicityGuardrailStep\",\n", + " name=\"toxicity_guardrail\",\n", + " threshold=0.5,\n", + ").respond()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-000000000008", + "metadata": {}, + "outputs": [], + "source": [ + "project.deploy_function(fn)" + ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-4000-8000-000000000009", + "metadata": {}, + "source": [ + "### Test with a safe input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-00000000000a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Response: {'question': 'What is the capital of France?'}\n" + ] + } + ], + "source": [ + "serving_fn = project.get_function(\"guardrail-fn\")\n", + "event = {\"question\": \"What is the capital of France?\"}\n", + "result = serving_fn.invoke(\"/\", body=event)\n", + "print(\"Response:\", result)" + ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-4000-8000-00000000000b", + "metadata": {}, + "source": [ + "### Test with a toxic input (expect a block)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-00000000000c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Blocked (expected): bad function response 500: ValueError: Request blocked: toxicity score 0.998 >= 0.5\n" + ] + } + ], + "source": [ + "try:\n", + " result = serving_fn.invoke(\"/\", body={\"question\": \"some toxic text\"})\n", + " print(\"Response:\", result)\n", + "except Exception as e:\n", + " print(f\"Blocked (expected): {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-4000-8000-00000000000d", + "metadata": {}, + "source": [ + "## Add the step directly from the hub\n", + "\n", + "If no customisation is needed, the step can be referenced directly from the hub\n", + "without downloading the source file first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-4000-8000-00000000000e", + "metadata": {}, + "outputs": [], + "source": [ + "fn2 = project.set_function(\n", + " name=\"guardrail-fn-2\",\n", + " kind=\"serving\",\n", + " image=\"mlrun/mlrun\",\n", + " requirements=[\"transformers\", \"torch\"],\n", + ")\n", + "graph2 = fn2.set_topology(\"flow\", engine=\"async\")\n", + "graph2.add_step(\n", + " class_name=\"hub://toxicity_guardrail\",\n", + " name=\"toxicity_guardrail\",\n", + " threshold=0.5,\n", + ").respond()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.py b/steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.py new file mode 100644 index 00000000..def0616b --- /dev/null +++ b/steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.py @@ -0,0 +1,61 @@ +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Dict + + +class ToxicityGuardrailStep: + """ + A serving graph step that filters out toxic requests using a pre-trained + text classification model. + + If the toxicity score of the input text meets or exceeds the threshold, + the request is blocked with a ValueError. Safe requests are passed through + unchanged. + + The classifier label "toxic" maps directly to the toxicity score; any + other label (e.g. "non-toxic") inverts the model's confidence score. + """ + + def __init__( + self, + context=None, + name=None, + threshold: float = 0.5, + model_name: str = "unitary/toxic-bert", + **kwargs, + ): + self.threshold = threshold + self.model_name = model_name + self._classifier = None + + def post_init(self, mode="sync", **kwargs): + from transformers import pipeline + + self._classifier = pipeline("text-classification", model=self.model_name) + + def do(self, event: Dict[str, Any]) -> Dict[str, Any]: + question = event.get("question", "") + result = self._classifier(question)[0] + score = ( + result["score"] + if result["label"] == "toxic" + else 1 - result["score"] + ) + if score >= self.threshold: + raise ValueError( + f"Request blocked: toxicity score {score:.3f} >= {self.threshold}" + ) + return event
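A minimal local sketch of exercising the new step outside a serving graph (not part of the change itself; it assumes transformers and torch are installed and that the default "unitary/toxic-bert" weights can be downloaded from the Hugging Face hub):

from toxicity_guardrail import ToxicityGuardrailStep

# Instantiate with the default threshold used throughout the notebook and tests
step = ToxicityGuardrailStep(threshold=0.5)
step.post_init()  # loads the text-classification pipeline for "unitary/toxic-bert"

# A safe input is returned unchanged
print(step.do({"question": "What is the capital of France?"}))

# A toxic input is rejected with a ValueError along the lines of
# "Request blocked: toxicity score 0.998 >= 0.5"
try:
    step.do({"question": "some clearly toxic text"})
except ValueError as err:
    print(err)

The same behavior is what the mocked unit tests assert: a label of "toxic" uses the classifier score directly, any other label inverts it (1 - score), and scores at or above the threshold raise.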