Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nemoguardrails/library/jailbreak_detection/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y git gcc g++ python3-dev wget && apt-get

# Predownload embedding-based jailbreak detection models, set environment variable for path
WORKDIR /models
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx
ENV EMBEDDING_CLASSIFIER_PATH=/models

# Set working directory
Expand Down
2 changes: 1 addition & 1 deletion nemoguardrails/library/jailbreak_detection/Dockerfile-GPU
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ ENV JAILBREAK_CHECK_DEVICE=cuda:0

# Predownload embedding-based jailbreak detection models, set environment variable for path
WORKDIR /models
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx
ENV EMBEDDING_CLASSIFIER_PATH=/models

# Set working directory
Expand Down
17 changes: 14 additions & 3 deletions nemoguardrails/library/jailbreak_detection/model_based/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,25 @@ def initialize_model() -> Union[None, "JailbreakClassifier"]:

if classifier_path is None:
# Log a warning, but do not throw an exception
logger.warning("No embedding classifier path set. Server /model endpoint will not work.")
logger.warning(
"No embedding classifier path set. Server /model endpoint will not work."
)
return None

from nemoguardrails.library.jailbreak_detection.model_based.models import (
# check if model is present. If not, download it.
if not Path(classifier_path).joinpath("snowflake.onnx").is_file():
from huggingface_hub import hf_hub_download

hf_hub_download(repo_id="nvidia/NemoGuard-JailbreakDetect", filename="snowflake.onnx", cache_dir=classifier_path)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

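Shouldn't this use `local_dir` instead of `cache_dir`? With `cache_dir`, `hf_hub_download` stores the file inside the Hub cache layout (`<cache_dir>/models--nvidia--NemoGuard-JailbreakDetect/snapshots/<revision>/snowflake.onnx`), so `Path(classifier_path).joinpath("snowflake.onnx")` still will not exist after the download and the classifier will fail to load. Passing `local_dir=classifier_path` places the file directly at the path the code checks and loads.
Also, the test fails here if `classifier_path` cannot be created for some reason.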



from model_based.models import (
JailbreakClassifier,
)

jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.pkl")))
jailbreak_classifier = JailbreakClassifier(
str(Path(classifier_path).joinpath("snowflake.onnx"))
)

return jailbreak_classifier

Expand Down
30 changes: 19 additions & 11 deletions nemoguardrails/library/jailbreak_detection/model_based/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,17 @@

from typing import Tuple

import numpy as np


class SnowflakeEmbed:
def __init__(self):
import torch
from transformers import AutoModel, AutoTokenizer

self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m-long")
self.device = "cuda" if torch.cuda.is_available() else "cpu"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

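Shouldn't we also honor the `JAILBREAK_CHECK_DEVICE` environment variable when it is set? Dockerfile-GPU sets it to `cuda:0`, but this line ignores it and chooses the device solely from `torch.cuda.is_available()`.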

self.tokenizer = AutoTokenizer.from_pretrained(
"Snowflake/snowflake-arctic-embed-m-long",
trust_remote_code=True,
)
Comment thread
erickgalinkin marked this conversation as resolved.
self.model = AutoModel.from_pretrained(
"Snowflake/snowflake-arctic-embed-m-long",
trust_remote_code=True,
Expand All @@ -35,24 +36,31 @@ def __init__(self):
self.model.eval()

def __call__(self, text: str):
tokens = self.tokenizer([text], padding=True, truncation=True, return_tensors="pt", max_length=2048)
tokens = self.tokenizer(
[text], padding=True, truncation=True, return_tensors="pt", max_length=2048
)
tokens = tokens.to(self.device)
embeddings = self.model(**tokens)[0][:, 0]
return embeddings.detach().cpu().squeeze(0).numpy()


class JailbreakClassifier:
def __init__(self, random_forest_path: str):
import pickle
from onnxruntime import InferenceSession

self.embed = SnowflakeEmbed()
with open(random_forest_path, "rb") as fd:
self.classifier = pickle.load(fd)
# See https://onnx.ai/sklearn-onnx/auto_examples/plot_convert_decision_function.html
self.classifier = InferenceSession(
random_forest_path, providers=["CPUExecutionProvider"]
)

def __call__(self, text: str) -> Tuple[bool, float]:
e = self.embed(text)
probs = self.classifier.predict_proba([e])
classification = np.argmax(probs)
prob = np.max(probs)
res = self.classifier.run(None, {"X": [e]})
# InferenceSession returns a result where the first item is equivalent to argmax over probabilities
classification = res[0].item()
# The second is a list of dicts of probabilities -- the slice res[1][:2] should have only one element.
# We access the dict entry for the class.
prob = res[1][:2][0][classification]
Comment thread
erickgalinkin marked this conversation as resolved.
score = -prob if classification == 0 else prob
return bool(classification), float(score)
13 changes: 7 additions & 6 deletions nemoguardrails/library/jailbreak_detection/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ fastapi>=0.103.1
starlette>=0.50.0
typer>=0.7.0
uvicorn>=0.23.2
transformers>=4.57.6
torch>=2.9.1
nemoguardrails>=0.14.0
numpy==1.23.5
scikit-learn==1.2.2
einops>=0.7.0
transformers>=5.3.0
torch>=2.9.0
torchvision>=0.25.0
Comment thread
erickgalinkin marked this conversation as resolved.
numpy==1.26.4
einops>=0.8.2
onnxruntime>=1.24.3
huggingface_hub>=1.0
22 changes: 14 additions & 8 deletions tests/test_jailbreak_model_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ def test_lazy_import_does_not_require_heavy_deps():
"""
Importing the checks module should not require torch, transformers, or sklearn unless model-based classifier is used.
"""
with mock.patch.dict(sys.modules, {"torch": None, "transformers": None, "sklearn": None}):
with mock.patch.dict(
sys.modules, {"torch": None, "transformers": None, "sklearn": None}
):
import nemoguardrails.library.jailbreak_detection.model_based.checks as checks

# Just importing and calling unrelated functions should not raise ImportError
Expand All @@ -38,20 +40,22 @@ def test_lazy_import_does_not_require_heavy_deps():

def test_model_based_classifier_imports(monkeypatch):
"""
Instantiating JailbreakClassifier should require sklearn and pickle, and use SnowflakeEmbed which requires torch/transformers.
Instantiating JailbreakClassifier should require sklearn and onnxruntime, and use SnowflakeEmbed which requires torch/transformers.
"""
# Mock dependencies
fake_rf = mock.MagicMock()
fake_embed = mock.MagicMock(return_value=[0.0])
fake_pickle = types.SimpleNamespace(load=mock.MagicMock(return_value=fake_rf))
fake_onnx = types.SimpleNamespace(
InferenceSession=mock.MagicMock(return_value=fake_rf)
)
fake_snowflake = mock.MagicMock(return_value=fake_embed)

monkeypatch.setitem(
sys.modules,
"sklearn.ensemble",
types.SimpleNamespace(RandomForestClassifier=mock.MagicMock()),
)
monkeypatch.setitem(sys.modules, "pickle", fake_pickle)
monkeypatch.setitem(sys.modules, "onnxruntime", fake_onnx)
monkeypatch.setitem(sys.modules, "torch", mock.MagicMock())
monkeypatch.setitem(sys.modules, "transformers", mock.MagicMock())

Expand All @@ -64,7 +68,7 @@ def test_model_based_classifier_imports(monkeypatch):
mock_open = mock.mock_open()
with mock.patch("builtins.open", mock_open):
# Should not raise
classifier = models.JailbreakClassifier("fake_model_path.pkl")
classifier = models.JailbreakClassifier("fake_model_path.onnx")
assert classifier is not None
# Should be callable
result = classifier("test")
Expand All @@ -86,7 +90,7 @@ def test_model_based_classifier_missing_deps(monkeypatch):
mock_open = mock.mock_open()
with mock.patch("builtins.open", mock_open):
with pytest.raises(ImportError):
models.JailbreakClassifier("fake_model_path.pkl")
models.JailbreakClassifier("fake_model_path.onnx")


# Test 4: Return None when EMBEDDING_CLASSIFIER_PATH is not set
Expand Down Expand Up @@ -145,7 +149,9 @@ def test_snowflake_embed_torch_imports(monkeypatch):
# the code does self.model(**tokens)[0][:, 0]
# so we need to mock this properly
mock_tensor_output = mock.MagicMock()
mock_tensor_output.detach.return_value.cpu.return_value.squeeze.return_value.numpy.return_value = fake_embedding
mock_tensor_output.detach.return_value.cpu.return_value.squeeze.return_value.numpy.return_value = (
fake_embedding
)

mock_first_index = mock.MagicMock()
mock_first_index.__getitem__.return_value = mock_tensor_output # for [:, 0]
Expand Down Expand Up @@ -253,7 +259,7 @@ def test_initialize_model_with_valid_path(monkeypatch):

assert result == mock_classifier

expected_path = str(Path(test_path).joinpath("snowflake.pkl"))
expected_path = str(Path(test_path).joinpath("snowflake.onnx"))
mock_jailbreak_classifier_class.assert_called_once_with(expected_path)


Expand Down
Loading