|
@@ -8,15 +8,20 @@ from transformers import (
|
|
|
|
|
|
"""
|
|
|
Utilities for loading the PromptGuard model and evaluating text for jailbreaks and indirect injections.
|
|
|
+
|
|
|
+Note that the underlying model has a maximum recommended input size of 512 tokens as a DeBERTa model.
|
|
|
+The final two functions in this file implement efficient parallel batched evaluation of the model on a list
|
|
|
+of input strings of arbirary length, with the final score for each input being the maximum score across all
|
|
|
+chunks of the input string.
|
|
|
"""
|
|
|
|
|
|
|
|
|
-def load_model_and_tokenizer(model_name='meta-llama/PromptGuard'):
|
|
|
+def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'):
|
|
|
"""
|
|
|
Load the PromptGuard model from Hugging Face or a local model.
|
|
|
|
|
|
Args:
|
|
|
- model_name (str): The name of the model to load. Default is 'meta-llama/PromptGuard'.
|
|
|
+ model_name (str): The name of the model to load. Default is 'meta-llama/Prompt-Guard-86M'.
|
|
|
|
|
|
Returns:
|
|
|
transformers.PreTrainedModel: The loaded model.
|
|
@@ -29,6 +34,7 @@ def load_model_and_tokenizer(model_name='meta-llama/PromptGuard'):
|
|
|
def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu'):
|
|
|
"""
|
|
|
Evaluate the model on the given text with temperature-adjusted softmax.
|
|
|
+ Note, as this is a DeBERTa model, the input text should have a maximum length of 512.
|
|
|
|
|
|
Args:
|
|
|
text (str): The input text to classify.
|
|
@@ -83,3 +89,92 @@ def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device
|
|
|
"""
|
|
|
probabilities = get_class_probabilities(model, tokenizer, text, temperature, device)
|
|
|
return (probabilities[0, 1] + probabilities[0, 2]).item()
|
|
|
+
|
|
|
+
|
|
|
+def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'):
|
|
|
+ """
|
|
|
+ Process a batch of texts and return their class probabilities.
|
|
|
+ Args:
|
|
|
+ model (transformers.PreTrainedModel): The loaded model.
|
|
|
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
|
|
|
+ texts (list[str]): A list of texts to process.
|
|
|
+ temperature (float): The temperature for the softmax function.
|
|
|
+ device (str): The device to evaluate the model on.
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ torch.Tensor: A tensor containing the class probabilities for each text in the batch.
|
|
|
+ """
|
|
|
+ inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
|
|
+ inputs = inputs.to(device)
|
|
|
+ with torch.no_grad():
|
|
|
+ logits = model(**inputs).logits
|
|
|
+ scaled_logits = logits / temperature
|
|
|
+ probabilities = softmax(scaled_logits, dim=-1)
|
|
|
+ return probabilities
|
|
|
+
|
|
|
+
|
|
|
+def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16):
|
|
|
+ """
|
|
|
+ Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
|
|
|
+ Args:
|
|
|
+ model (transformers.PreTrainedModel): The loaded model.
|
|
|
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
|
|
|
+ texts (list[str]): A list of texts to evaluate.
|
|
|
+ score_indices (list[int]): Indices of scores to sum for final score calculation.
|
|
|
+ temperature (float): The temperature for the softmax function.
|
|
|
+ device (str): The device to evaluate the model on.
|
|
|
+ max_batch_size (int): The maximum number of text chunks to process in a single batch.
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ list[float]: A list of scores for each text.
|
|
|
+ """
|
|
|
+ all_chunks = []
|
|
|
+ text_indices = []
|
|
|
+ for index, text in enumerate(texts):
|
|
|
+ chunks = [text[i:i+512] for i in range(0, len(text), 512)]
|
|
|
+ all_chunks.extend(chunks)
|
|
|
+ text_indices.extend([index] * len(chunks))
|
|
|
+ all_scores = [0] * len(texts)
|
|
|
+ for i in range(0, len(all_chunks), max_batch_size):
|
|
|
+ batch_chunks = all_chunks[i:i+max_batch_size]
|
|
|
+ batch_indices = text_indices[i:i+max_batch_size]
|
|
|
+ probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device)
|
|
|
+ scores = probabilities[:, score_indices].sum(dim=1).tolist()
|
|
|
+
|
|
|
+ for idx, score in zip(batch_indices, scores):
|
|
|
+ all_scores[idx] = max(all_scores[idx], score)
|
|
|
+ return all_scores
|
|
|
+
|
|
|
+
|
|
|
+def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
|
|
|
+ """
|
|
|
+ Compute jailbreak scores for a list of texts.
|
|
|
+ Args:
|
|
|
+ model (transformers.PreTrainedModel): The loaded model.
|
|
|
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
|
|
|
+ texts (list[str]): A list of texts to evaluate.
|
|
|
+ temperature (float): The temperature for the softmax function.
|
|
|
+ device (str): The device to evaluate the model on.
|
|
|
+ max_batch_size (int): The maximum number of text chunks to process in a single batch.
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ list[float]: A list of jailbreak scores for each text.
|
|
|
+ """
|
|
|
+ return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size)
|
|
|
+
|
|
|
+
|
|
|
+def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
|
|
|
+ """
|
|
|
+ Compute indirect injection scores for a list of texts.
|
|
|
+ Args:
|
|
|
+ model (transformers.PreTrainedModel): The loaded model.
|
|
|
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
|
|
|
+ texts (list[str]): A list of texts to evaluate.
|
|
|
+ temperature (float): The temperature for the softmax function.
|
|
|
+ device (str): The device to evaluate the model on.
|
|
|
+ max_batch_size (int): The maximum number of text chunks to process in a single batch.
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ list[float]: A list of indirect injection scores for each text.
|
|
|
+ """
|
|
|
+ return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size)
|