NVIDIA-NeMo · erickgalinkin · Mar 10, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/nemoguardrails/library/jailbreak_detection/Dockerfile b/nemoguardrails/library/jailbreak_detection/Dockerfile
@@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y git gcc g++ python3-dev wget && apt-get
 
 # Predownload embedding-based jailbreak detection models, set environment variable for path
 WORKDIR /models
-RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl
+RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx
 ENV EMBEDDING_CLASSIFIER_PATH=/models
 
 # Set working directory

diff --git a/nemoguardrails/library/jailbreak_detection/Dockerfile-GPU b/nemoguardrails/library/jailbreak_detection/Dockerfile-GPU
@@ -9,7 +9,7 @@ ENV JAILBREAK_CHECK_DEVICE=cuda:0
 
 # Predownload embedding-based jailbreak detection models, set environment variable for path
 WORKDIR /models
-RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl
+RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx
 ENV EMBEDDING_CLASSIFIER_PATH=/models
 
 # Set working directory

diff --git a/nemoguardrails/library/jailbreak_detection/model_based/checks.py b/nemoguardrails/library/jailbreak_detection/model_based/checks.py
@@ -36,14 +36,25 @@ def initialize_model() -> Union[None, "JailbreakClassifier"]:
 
     if classifier_path is None:
         # Log a warning, but do not throw an exception
-        logger.warning("No embedding classifier path set. Server /model endpoint will not work.")
+        logger.warning(
+            "No embedding classifier path set. Server /model endpoint will not work."
+        )
         return None
 
-    from nemoguardrails.library.jailbreak_detection.model_based.models import (
+    # check if model is present. If not, download it.
+    if not Path(classifier_path).joinpath("snowflake.onnx").is_file():
+        from huggingface_hub import hf_hub_download
+
+        # local_dir (not cache_dir) places the file at <classifier_path>/snowflake.onnx, where it is loaded from below.
+        hf_hub_download(repo_id="nvidia/NemoGuard-JailbreakDetect", filename="snowflake.onnx", local_dir=classifier_path)
+
+    from model_based.models import (
         JailbreakClassifier,
     )
 
-    jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.pkl")))
+    jailbreak_classifier = JailbreakClassifier(
+        str(Path(classifier_path).joinpath("snowflake.onnx"))
+    )
 
     return jailbreak_classifier
 

diff --git a/nemoguardrails/library/jailbreak_detection/model_based/models.py b/nemoguardrails/library/jailbreak_detection/model_based/models.py
@@ -15,16 +15,17 @@
 
 from typing import Tuple
 
-import numpy as np
-
 
 class SnowflakeEmbed:
     def __init__(self):
         import torch
         from transformers import AutoModel, AutoTokenizer
 
-        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        self.tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m-long")
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            "Snowflake/snowflake-arctic-embed-m-long",
+            trust_remote_code=True,
+        )
         self.model = AutoModel.from_pretrained(
             "Snowflake/snowflake-arctic-embed-m-long",
             trust_remote_code=True,
@@ -35,24 +36,31 @@ def __init__(self):
         self.model.eval()
 
     def __call__(self, text: str):
-        tokens = self.tokenizer([text], padding=True, truncation=True, return_tensors="pt", max_length=2048)
+        tokens = self.tokenizer(
+            [text], padding=True, truncation=True, return_tensors="pt", max_length=2048
+        )
         tokens = tokens.to(self.device)
         embeddings = self.model(**tokens)[0][:, 0]
         return embeddings.detach().cpu().squeeze(0).numpy()
 
 
 class JailbreakClassifier:
     def __init__(self, random_forest_path: str):
-        import pickle
+        from onnxruntime import InferenceSession
 
         self.embed = SnowflakeEmbed()
-        with open(random_forest_path, "rb") as fd:
-            self.classifier = pickle.load(fd)
+        # See https://onnx.ai/sklearn-onnx/auto_examples/plot_convert_decision_function.html
+        self.classifier = InferenceSession(
+            random_forest_path, providers=["CPUExecutionProvider"]
+        )
 
     def __call__(self, text: str) -> Tuple[bool, float]:
         e = self.embed(text)
-        probs = self.classifier.predict_proba([e])
-        classification = np.argmax(probs)
-        prob = np.max(probs)
+        res = self.classifier.run(None, {"X": [e]})
+        # InferenceSession returns a result where the first item is equivalent to argmax over probabilities
+        classification = res[0].item()
+        # The second item holds one probability mapping per input sample. A single sample is submitted,
+        # so take its mapping and look up the probability of the predicted class.
+        prob = res[1][0][classification]
         score = -prob if classification == 0 else prob
         return bool(classification), float(score)
diff --git a/nemoguardrails/library/jailbreak_detection/requirements.txt b/nemoguardrails/library/jailbreak_detection/requirements.txt
@@ -4,9 +4,10 @@ fastapi>=0.103.1
 starlette>=0.50.0
 typer>=0.7.0
 uvicorn>=0.23.2
-transformers>=4.57.6
-torch>=2.9.1
-nemoguardrails>=0.14.0
-numpy==1.23.5
-scikit-learn==1.2.2
-einops>=0.7.0
+transformers>=5.3.0
+torch>=2.9.0
+torchvision>=0.25.0
+numpy==1.26.4
+einops>=0.8.2
+onnxruntime>=1.24.3
+huggingface_hub>=1.0
diff --git a/tests/test_jailbreak_model_based.py b/tests/test_jailbreak_model_based.py
@@ -26,7 +26,9 @@ def test_lazy_import_does_not_require_heavy_deps():
     """
     Importing the checks module should not require torch, transformers, or sklearn unless model-based classifier is used.
     """
-    with mock.patch.dict(sys.modules, {"torch": None, "transformers": None, "sklearn": None}):
+    with mock.patch.dict(
+        sys.modules, {"torch": None, "transformers": None, "sklearn": None}
+    ):
         import nemoguardrails.library.jailbreak_detection.model_based.checks as checks
 
         # Just importing and calling unrelated functions should not raise ImportError
@@ -38,20 +40,22 @@ def test_lazy_import_does_not_require_heavy_deps():
 
 def test_model_based_classifier_imports(monkeypatch):
     """
-    Instantiating JailbreakClassifier should require sklearn and pickle, and use SnowflakeEmbed which requires torch/transformers.
+    Instantiating JailbreakClassifier should require onnxruntime, and use SnowflakeEmbed which requires torch/transformers.
     """
     # Mock dependencies
     fake_rf = mock.MagicMock()
     fake_embed = mock.MagicMock(return_value=[0.0])
-    fake_pickle = types.SimpleNamespace(load=mock.MagicMock(return_value=fake_rf))
+    fake_onnx = types.SimpleNamespace(
+        InferenceSession=mock.MagicMock(return_value=fake_rf)
+    )
     fake_snowflake = mock.MagicMock(return_value=fake_embed)
 
     monkeypatch.setitem(
         sys.modules,
         "sklearn.ensemble",
         types.SimpleNamespace(RandomForestClassifier=mock.MagicMock()),
     )
-    monkeypatch.setitem(sys.modules, "pickle", fake_pickle)
+    monkeypatch.setitem(sys.modules, "onnxruntime", fake_onnx)
     monkeypatch.setitem(sys.modules, "torch", mock.MagicMock())
     monkeypatch.setitem(sys.modules, "transformers", mock.MagicMock())
 
@@ -64,7 +68,7 @@ def test_model_based_classifier_imports(monkeypatch):
     mock_open = mock.mock_open()
     with mock.patch("builtins.open", mock_open):
         # Should not raise
-        classifier = models.JailbreakClassifier("fake_model_path.pkl")
+        classifier = models.JailbreakClassifier("fake_model_path.onnx")
         assert classifier is not None
         # Should be callable
         result = classifier("test")
@@ -86,7 +90,7 @@ def test_model_based_classifier_missing_deps(monkeypatch):
     mock_open = mock.mock_open()
     with mock.patch("builtins.open", mock_open):
         with pytest.raises(ImportError):
-            models.JailbreakClassifier("fake_model_path.pkl")
+            models.JailbreakClassifier("fake_model_path.onnx")
 
 
 # Test 4: Return None when EMBEDDING_CLASSIFIER_PATH is not set
@@ -145,7 +149,9 @@ def test_snowflake_embed_torch_imports(monkeypatch):
     # the code does self.model(**tokens)[0][:, 0]
     # so we need to mock this properly
     mock_tensor_output = mock.MagicMock()
-    mock_tensor_output.detach.return_value.cpu.return_value.squeeze.return_value.numpy.return_value = fake_embedding
+    mock_tensor_output.detach.return_value.cpu.return_value.squeeze.return_value.numpy.return_value = (
+        fake_embedding
+    )
 
     mock_first_index = mock.MagicMock()
     mock_first_index.__getitem__.return_value = mock_tensor_output  # for [:, 0]
@@ -253,7 +259,7 @@ def test_initialize_model_with_valid_path(monkeypatch):
 
     assert result == mock_classifier
 
-    expected_path = str(Path(test_path).joinpath("snowflake.pkl"))
+    expected_path = str(Path(test_path).joinpath("snowflake.onnx"))
     mock_jailbreak_classifier_class.assert_called_once_with(expected_path)