Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def initialize_model() -> Union[None, "JailbreakClassifier"]:
JailbreakClassifier,
)

jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.pkl")))
jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.onnx")))

return jailbreak_classifier

Expand Down
24 changes: 14 additions & 10 deletions nemoguardrails/library/jailbreak_detection/model_based/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@

from typing import Tuple

import numpy as np


class SnowflakeEmbed:
def __init__(self):
import torch
from transformers import AutoModel, AutoTokenizer

self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m-long")
self.device = "cuda" if torch.cuda.is_available() else "cpu"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we also honor the JAILBREAK_CHECK_DEVICE environment variable here, if it is set in the Dockerfile, instead of always choosing the device from torch.cuda.is_available()?

self.tokenizer = AutoTokenizer.from_pretrained(
"Snowflake/snowflake-arctic-embed-m-long",
trust_remote_code=True,
safe_serialization=True
)
Comment thread
erickgalinkin marked this conversation as resolved.
self.model = AutoModel.from_pretrained(
"Snowflake/snowflake-arctic-embed-m-long",
trust_remote_code=True,
Expand All @@ -43,16 +45,18 @@ def __call__(self, text: str):

class JailbreakClassifier:
def __init__(self, random_forest_path: str):
import pickle
from onnxruntime import InferenceSession

self.embed = SnowflakeEmbed()
with open(random_forest_path, "rb") as fd:
self.classifier = pickle.load(fd)
# See https://onnx.ai/sklearn-onnx/auto_examples/plot_convert_decision_function.html
self.classifier = InferenceSession(random_forest_path, providers=["CPUExecutionProvider"])

def __call__(self, text: str) -> Tuple[bool, float]:
e = self.embed(text)
probs = self.classifier.predict_proba([e])
classification = np.argmax(probs)
prob = np.max(probs)
res = self.classifier.run(None, {"X": [e]})
# InferenceSession returns a result where the first item is equivalent to argmax over probabilities
classification = res[0].item()
# The second is a list of dicts of probabilities -- the list should have only one element.
prob = res[1][:2][0][classification]
Comment thread
erickgalinkin marked this conversation as resolved.
score = -prob if classification == 0 else prob
return bool(classification), float(score)
14 changes: 8 additions & 6 deletions nemoguardrails/library/jailbreak_detection/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ fastapi>=0.103.1
starlette>=0.50.0
typer>=0.7.0
uvicorn>=0.23.2
transformers>=4.57.6
torch>=2.9.1
nemoguardrails>=0.14.0
numpy==1.23.5
scikit-learn==1.2.2
einops>=0.7.0
transformers>=5.3.0
torch>=2.10.0
torchvision>=0.25.0
Comment thread
erickgalinkin marked this conversation as resolved.
nemoguardrails>=0.20.0
numpy==1.26.4
scikit-learn==1.5.2
einops>=0.8.2
onnxruntime>=1.24.3
Comment thread
erickgalinkin marked this conversation as resolved.
Outdated
Comment thread
erickgalinkin marked this conversation as resolved.
Outdated
2 changes: 1 addition & 1 deletion tests/test_jailbreak_model_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def test_initialize_model_with_valid_path(monkeypatch):

assert result == mock_classifier

expected_path = str(Path(test_path).joinpath("snowflake.pkl"))
expected_path = str(Path(test_path).joinpath("snowflake.onnx"))
mock_jailbreak_classifier_class.assert_called_once_with(expected_path)


Expand Down
Loading