diff --git a/nemoguardrails/library/jailbreak_detection/Dockerfile b/nemoguardrails/library/jailbreak_detection/Dockerfile index 9367436f82..de444e5dc1 100644 --- a/nemoguardrails/library/jailbreak_detection/Dockerfile +++ b/nemoguardrails/library/jailbreak_detection/Dockerfile @@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y git gcc g++ python3-dev wget && apt-get # Predownload embedding-based jailbreak detection models, set environment variable for path WORKDIR /models -RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl +RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx ENV EMBEDDING_CLASSIFIER_PATH=/models # Set working directory diff --git a/nemoguardrails/library/jailbreak_detection/Dockerfile-GPU b/nemoguardrails/library/jailbreak_detection/Dockerfile-GPU index b75ad3023c..392568a958 100644 --- a/nemoguardrails/library/jailbreak_detection/Dockerfile-GPU +++ b/nemoguardrails/library/jailbreak_detection/Dockerfile-GPU @@ -9,7 +9,7 @@ ENV JAILBREAK_CHECK_DEVICE=cuda:0 # Predownload embedding-based jailbreak detection models, set environment variable for path WORKDIR /models -RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl +RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx ENV EMBEDDING_CLASSIFIER_PATH=/models # Set working directory diff --git a/nemoguardrails/library/jailbreak_detection/model_based/checks.py b/nemoguardrails/library/jailbreak_detection/model_based/checks.py index 9cb5d39d19..ee62daf4dc 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/checks.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/checks.py @@ -36,14 +36,25 @@ def initialize_model() -> Union[None, "JailbreakClassifier"]: if classifier_path is None: # Log a warning, but do not throw an exception - logger.warning("No embedding classifier path set. 
Server /model endpoint will not work.") + logger.warning( + "No embedding classifier path set. Server /model endpoint will not work." + ) return None - from nemoguardrails.library.jailbreak_detection.model_based.models import ( + # check if model is present. If not, download it. + if not Path(classifier_path).joinpath("snowflake.onnx").is_file(): + from huggingface_hub import hf_hub_download + + hf_hub_download(repo_id="nvidia/NemoGuard-JailbreakDetect", filename="snowflake.onnx", local_dir=classifier_path) + + + from nemoguardrails.library.jailbreak_detection.model_based.models import ( JailbreakClassifier, ) - jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.pkl"))) + jailbreak_classifier = JailbreakClassifier( + str(Path(classifier_path).joinpath("snowflake.onnx")) + ) return jailbreak_classifier diff --git a/nemoguardrails/library/jailbreak_detection/model_based/models.py b/nemoguardrails/library/jailbreak_detection/model_based/models.py index d5986f72e1..90ac6d542d 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/models.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/models.py @@ -15,16 +15,17 @@ from typing import Tuple -import numpy as np - class SnowflakeEmbed: def __init__(self): import torch from transformers import AutoModel, AutoTokenizer - self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m-long") + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.tokenizer = AutoTokenizer.from_pretrained( + "Snowflake/snowflake-arctic-embed-m-long", + trust_remote_code=True, + ) self.model = AutoModel.from_pretrained( "Snowflake/snowflake-arctic-embed-m-long", trust_remote_code=True, @@ -35,7 +36,9 @@ def __init__(self): self.model.eval() def __call__(self, text: str): - tokens = self.tokenizer([text], padding=True, truncation=True, return_tensors="pt", max_length=2048) + tokens = self.tokenizer( + [text], padding=True, 
truncation=True, return_tensors="pt", max_length=2048 + ) tokens = tokens.to(self.device) embeddings = self.model(**tokens)[0][:, 0] return embeddings.detach().cpu().squeeze(0).numpy() @@ -43,16 +46,21 @@ def __call__(self, text: str): class JailbreakClassifier: def __init__(self, random_forest_path: str): - import pickle + from onnxruntime import InferenceSession self.embed = SnowflakeEmbed() - with open(random_forest_path, "rb") as fd: - self.classifier = pickle.load(fd) + # See https://onnx.ai/sklearn-onnx/auto_examples/plot_convert_decision_function.html + self.classifier = InferenceSession( + random_forest_path, providers=["CPUExecutionProvider"] + ) def __call__(self, text: str) -> Tuple[bool, float]: e = self.embed(text) - probs = self.classifier.predict_proba([e]) - classification = np.argmax(probs) - prob = np.max(probs) + res = self.classifier.run(None, {"X": [e]}) + # InferenceSession returns a result where the first item is equivalent to argmax over probabilities + classification = res[0].item() + # The second is a list of dicts of probabilities -- the slice res[1][:2] should have only one element. + # We access the dict entry for the class. 
+ prob = res[1][:2][0][classification] score = -prob if classification == 0 else prob return bool(classification), float(score) diff --git a/nemoguardrails/library/jailbreak_detection/requirements.txt b/nemoguardrails/library/jailbreak_detection/requirements.txt index d9a2999484..f98352b1b9 100644 --- a/nemoguardrails/library/jailbreak_detection/requirements.txt +++ b/nemoguardrails/library/jailbreak_detection/requirements.txt @@ -4,9 +4,10 @@ fastapi>=0.103.1 starlette>=0.50.0 typer>=0.7.0 uvicorn>=0.23.2 -transformers>=4.57.6 -torch>=2.9.1 -nemoguardrails>=0.14.0 -numpy==1.23.5 -scikit-learn==1.2.2 -einops>=0.7.0 +transformers>=5.3.0 +torch>=2.9.0 +torchvision>=0.25.0 +numpy==1.26.4 +einops>=0.8.2 +onnxruntime>=1.24.3 +huggingface_hub>=1.0 diff --git a/tests/test_jailbreak_model_based.py b/tests/test_jailbreak_model_based.py index 4c392610e2..36d6fae1ac 100644 --- a/tests/test_jailbreak_model_based.py +++ b/tests/test_jailbreak_model_based.py @@ -26,7 +26,9 @@ def test_lazy_import_does_not_require_heavy_deps(): """ Importing the checks module should not require torch, transformers, or sklearn unless model-based classifier is used. """ - with mock.patch.dict(sys.modules, {"torch": None, "transformers": None, "sklearn": None}): + with mock.patch.dict( + sys.modules, {"torch": None, "transformers": None, "sklearn": None} + ): import nemoguardrails.library.jailbreak_detection.model_based.checks as checks # Just importing and calling unrelated functions should not raise ImportError @@ -38,12 +40,14 @@ def test_lazy_import_does_not_require_heavy_deps(): def test_model_based_classifier_imports(monkeypatch): """ - Instantiating JailbreakClassifier should require sklearn and pickle, and use SnowflakeEmbed which requires torch/transformers. + Instantiating JailbreakClassifier should require sklearn and onnxruntime, and use SnowflakeEmbed which requires torch/transformers. 
""" # Mock dependencies fake_rf = mock.MagicMock() fake_embed = mock.MagicMock(return_value=[0.0]) - fake_pickle = types.SimpleNamespace(load=mock.MagicMock(return_value=fake_rf)) + fake_onnx = types.SimpleNamespace( + InferenceSession=mock.MagicMock(return_value=fake_rf) + ) fake_snowflake = mock.MagicMock(return_value=fake_embed) monkeypatch.setitem( @@ -51,7 +55,7 @@ def test_model_based_classifier_imports(monkeypatch): "sklearn.ensemble", types.SimpleNamespace(RandomForestClassifier=mock.MagicMock()), ) - monkeypatch.setitem(sys.modules, "pickle", fake_pickle) + monkeypatch.setitem(sys.modules, "onnxruntime", fake_onnx) monkeypatch.setitem(sys.modules, "torch", mock.MagicMock()) monkeypatch.setitem(sys.modules, "transformers", mock.MagicMock()) @@ -64,7 +68,7 @@ def test_model_based_classifier_imports(monkeypatch): mock_open = mock.mock_open() with mock.patch("builtins.open", mock_open): # Should not raise - classifier = models.JailbreakClassifier("fake_model_path.pkl") + classifier = models.JailbreakClassifier("fake_model_path.onnx") assert classifier is not None # Should be callable result = classifier("test") @@ -86,7 +90,7 @@ def test_model_based_classifier_missing_deps(monkeypatch): mock_open = mock.mock_open() with mock.patch("builtins.open", mock_open): with pytest.raises(ImportError): - models.JailbreakClassifier("fake_model_path.pkl") + models.JailbreakClassifier("fake_model_path.onnx") # Test 4: Return None when EMBEDDING_CLASSIFIER_PATH is not set @@ -145,7 +149,9 @@ def test_snowflake_embed_torch_imports(monkeypatch): # the code does self.model(**tokens)[0][:, 0] # so we need to mock this properly mock_tensor_output = mock.MagicMock() - mock_tensor_output.detach.return_value.cpu.return_value.squeeze.return_value.numpy.return_value = fake_embedding + mock_tensor_output.detach.return_value.cpu.return_value.squeeze.return_value.numpy.return_value = ( + fake_embedding + ) mock_first_index = mock.MagicMock() mock_first_index.__getitem__.return_value = 
mock_tensor_output # for [:, 0] @@ -253,7 +259,7 @@ def test_initialize_model_with_valid_path(monkeypatch): assert result == mock_classifier - expected_path = str(Path(test_path).joinpath("snowflake.pkl")) + expected_path = str(Path(test_path).joinpath("snowflake.onnx")) mock_jailbreak_classifier_class.assert_called_once_with(expected_path)