23 changes: 23 additions & 0 deletions steps/master/toxicity_guardrail/1.0.0/src/item.yaml
@@ -0,0 +1,23 @@
apiVersion: v1
categories:
- data-preparation
- model-serving
- genai
description: Filters toxic requests using a pre-trained text classifier before they reach the LLM
example: toxicity_guardrail.ipynb
generationDate: 2026-04-27:12-00
hidden: false
labels:
  author: Iguazio
mlrunVersion: 1.10.0
name: toxicity_guardrail
className: ToxicityGuardrailStep
defaultHandler:
spec:
  filename: toxicity_guardrail.py
  image: mlrun/mlrun
  requirements:
  - transformers
  - torch
kind: generic
version: 1.0.0
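The requirements pin transformers and torch because the step wraps a Hugging Face text-classification pipeline. For orientation, a sketch of the output shape such a pipeline produces, which is what the step's threshold is compared against (the checkpoint name is hypothetical; this diff does not show which model the step actually loads):

from transformers import pipeline

# Hypothetical checkpoint, for illustration only; the step's actual model is not shown in this diff.
classifier = pipeline("text-classification", model="s-nlp/roberta_toxicity_classifier")

print(classifier("What is the capital of France?"))
# e.g. [{'label': 'neutral', 'score': 0.998}] -- label names depend on the chosen model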
@@ -0,0 +1,84 @@
# Copyright 2025 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
from unittest.mock import MagicMock

from toxicity_guardrail import ToxicityGuardrailStep


class TestToxicityGuardrailStep:
    def _make_step(self, threshold=0.5):
        step = ToxicityGuardrailStep(threshold=threshold)
        step._classifier = MagicMock()
        return step

    def test_safe_input_passes(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.999}]
        event = {"question": "What is the capital of France?"}
        result = step.do(event)
        assert result == event

    def test_toxic_input_blocked(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "toxic", "score": 0.998}]
        event = {"question": "some clearly toxic text"}
        with pytest.raises(ValueError, match="Request blocked") as exc_info:
            step.do(event)
        assert "0.998" in str(exc_info.value)

    def test_custom_threshold_passes_below(self):
        step = self._make_step(threshold=0.9)
        # Score 0.85 < threshold 0.9 — should pass through
        step._classifier.return_value = [{"label": "toxic", "score": 0.85}]
        event = {"question": "borderline content"}
        result = step.do(event)
        assert result == event

    def test_score_at_threshold_is_blocked(self):
        step = self._make_step(threshold=0.5)
        # Score exactly equal to the threshold — should be blocked
        step._classifier.return_value = [{"label": "toxic", "score": 0.5}]
        event = {"question": "borderline content"}
        with pytest.raises(ValueError) as exc_info:
            step.do(event)
        assert "0.500" in str(exc_info.value)

    def test_non_toxic_label_inverts_score(self):
        step = self._make_step(threshold=0.5)
        # label="non-toxic", score=0.99 → toxicity score = 1 - 0.99 = 0.01 → safe
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.99}]
        event = {"question": "a perfectly safe question"}
        result = step.do(event)
        assert result == event

    def test_empty_question_is_safe(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.999}]
        event = {"question": ""}
        result = step.do(event)
        assert result == event

    def test_event_passthrough_unchanged(self):
        step = self._make_step()
        step._classifier.return_value = [{"label": "non-toxic", "score": 0.99}]
        event = {"question": "Hello world", "extra_field": 42}
        result = step.do(event)
        assert result["extra_field"] == 42
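The toxicity_guardrail.py module itself is not shown in this view, but the tests above pin down its contract: a configurable threshold, a _classifier attribute holding a text-classification pipeline, score inversion when the model reports a non-toxic label, pass-through of safe events, and a ValueError on block. A minimal sketch consistent with that contract (the lazy loading and the default checkpoint are assumptions, not the confirmed implementation):

from transformers import pipeline


class ToxicityGuardrailStep:
    """Serving-graph step that rejects toxic requests before they reach the LLM."""

    def __init__(self, threshold=0.5, model="s-nlp/roberta_toxicity_classifier"):
        # Requests scoring at or above the threshold are blocked.
        self.threshold = threshold
        self.model = model  # hypothetical default checkpoint
        self._classifier = None  # loaded lazily so tests can inject a mock

    def _get_classifier(self):
        if self._classifier is None:
            self._classifier = pipeline("text-classification", model=self.model)
        return self._classifier

    def do(self, event):
        text = event.get("question", "")
        result = self._get_classifier()(text)[0]
        score = result["score"]
        # Normalize to a toxicity score: invert when the model reports the safe label.
        if result["label"] != "toxic":
            score = 1.0 - score
        if score >= self.threshold:
            raise ValueError(
                f"Request blocked: toxicity score {score:.3f} >= {self.threshold}"
            )
        # Safe request: pass the event through unchanged.
        return event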
229 changes: 229 additions & 0 deletions steps/master/toxicity_guardrail/1.0.0/src/toxicity_guardrail.ipynb
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000001",
"metadata": {},
"source": [
"# Toxicity Guardrail Step Demo\n",
"\n",
"This notebook walks through a simple example of using the toxicity-guardrail step in serving functions,\n",
"by first downloading the step from the hub and inspecting it, then including the step in a serving graph."
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000002",
"metadata": {},
"source": [
"## Get the step from the hub"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000003",
"metadata": {},
"outputs": [],
"source": [
"import mlrun"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000004",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'filename': 'toxicity_guardrail.py',\n",
" 'example': 'toxicity_guardrail.ipynb',\n",
" 'local_path': PosixPath('/User'),\n",
" 'url': 'hub://toxicity_guardrail',\n",
" 'class_name': 'ToxicityGuardrailStep',\n",
" 'name': 'toxicity_guardrail',\n",
" 'version': '1.0.0',\n",
" 'categories': ['data-preparation', 'model-serving', 'genai'],\n",
" 'description': 'Filters toxic requests using a pre-trained text classifier before they reach the LLM'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"hub_step = mlrun.get_hub_step(\"hub://toxicity_guardrail\")\n",
"hub_step.to_dict()"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000005",
"metadata": {},
"source": [
"## Use the step in a serving function\n",
"\n",
"Add `ToxicityGuardrailStep` as the first step in an async serving graph.\n",
"Any request whose toxicity score meets or exceeds the threshold will be rejected\n",
"before reaching downstream steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000006",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"> 2026-04-27 12:00:00,000 [info] Project loaded successfully: {\"project_name\":\"toxicity-guardrail-demo\"}\n"
]
}
],
"source": [
"project = mlrun.get_or_create_project(\"toxicity-guardrail-demo\", \"./toxicity-guardrail-demo\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000007",
"metadata": {},
"outputs": [],
"source": [
"fn = project.set_function(\n",
" hub_step.get_src_file_path(),\n",
" name=\"guardrail-fn\",\n",
" kind=\"serving\",\n",
" image=\"mlrun/mlrun\",\n",
" requirements=[\"transformers\", \"torch\"],\n",
")\n",
"graph = fn.set_topology(\"flow\", engine=\"async\")\n",
"graph.to(\n",
" class_name=\"ToxicityGuardrailStep\",\n",
" name=\"toxicity_guardrail\",\n",
" threshold=0.5,\n",
").respond()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-000000000008",
"metadata": {},
"outputs": [],
"source": [
"project.deploy_function(fn)"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-000000000009",
"metadata": {},
"source": [
"### Test with a safe input"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-00000000000a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Response: {'question': 'What is the capital of France?'}\n"
]
}
],
"source": [
"serving_fn = project.get_function(\"guardrail-fn\")\n",
"event = {\"question\": \"What is the capital of France?\"}\n",
"result = serving_fn.invoke(\"/\", body=event)\n",
"print(\"Response:\", result)"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-00000000000b",
"metadata": {},
"source": [
"### Test with a toxic input (expect a block)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-00000000000c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Blocked (expected): bad function response 500: ValueError: Request blocked: toxicity score 0.998 >= 0.5\n"
]
}
],
"source": [
"try:\n",
" result = serving_fn.invoke(\"/\", body={\"question\": \"some toxic text\"})\n",
" print(\"Response:\", result)\n",
"except Exception as e:\n",
" print(f\"Blocked (expected): {e}\")"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-0001-4000-8000-00000000000d",
"metadata": {},
"source": [
"## Add the step directly from the hub\n",
"\n",
"If no customisation is needed, the step can be referenced directly from the hub\n",
"without downloading the source file first."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-0001-4000-8000-00000000000e",
"metadata": {},
"outputs": [],
"source": [
"fn2 = project.set_function(\n",
" name=\"guardrail-fn-2\",\n",
" kind=\"serving\",\n",
" image=\"mlrun/mlrun\",\n",
" requirements=[\"transformers\", \"torch\"],\n",
")\n",
"graph2 = fn2.set_topology(\"flow\", engine=\"async\")\n",
"graph2.add_step(\n",
" class_name=\"hub://toxicity_guardrail\",\n",
" name=\"toxicity_guardrail\",\n",
" threshold=0.5,\n",
").respond()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
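Deployment aside, the graph can also be exercised in-process with MLRun's mock server, which is handy for quick checks; a sketch using the fn object built in the notebook above (assuming transformers and torch are installed locally so the classifier can load):

# In-process smoke test of the serving graph, no deployment needed.
server = fn.to_mock_server()

resp = server.test(path="/", body={"question": "What is the capital of France?"})
print(resp)  # a safe request should pass through unchanged

server.wait_for_completion()  # flush the async flow before exiting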