Explorar el Código

Prompt Guard Inference for long strings

Cyrus Nikolaidis hace 9 meses
padre
commit
883def17f0
Se han modificado 1 ficheros con 95 adiciones y 0 borrados
  1. 95 0
      recipes/responsible_ai/prompt_guard/inference.py

+ 95 - 0
recipes/responsible_ai/prompt_guard/inference.py

@@ -8,6 +8,11 @@ from transformers import (
 
 """
 Utilities for loading the PromptGuard model and evaluating text for jailbreaks and indirect injections.
+
+Note that the underlying model is a DeBERTa model, so it has a maximum recommended input size of 512 tokens.
+The final two functions in this file implement efficient parallel batched evaluation of the model on a list
+of input strings of arbitrary length, with the final score for each input being the maximum score across all
+chunks of the input string.
 """
 
 
@@ -29,6 +34,7 @@ def load_model_and_tokenizer(model_name='meta-llama/PromptGuard'):
 def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu'):
     """
     Evaluate the model on the given text with temperature-adjusted softmax.
+    Note, as this is a DeBERTa model, the input text should have a maximum length of 512.
     
     Args:
         text (str): The input text to classify.
@@ -83,3 +89,92 @@ def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device
     """
     probabilities = get_class_probabilities(model, tokenizer, text, temperature, device)
     return (probabilities[0, 1] + probabilities[0, 2]).item()
+
+
+def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'):
+    """
+    Process a batch of texts and return their class probabilities.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to process.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        
+    Returns:
+        torch.Tensor: A tensor containing the class probabilities for each text in the batch.
+    """
+    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = inputs.to(device)
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    scaled_logits = logits / temperature
+    probabilities = softmax(scaled_logits, dim=-1)
+    return probabilities
+
+
+def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16):
+    """
+    Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to evaluate.
+        score_indices (list[int]): Indices of scores to sum for final score calculation.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
+        
+    Returns:
+        list[float]: A list of scores for each text.
+    """
+    all_chunks = []
+    text_indices = []
+    for index, text in enumerate(texts):
+        chunks = [text[i:i+512] for i in range(0, len(text), 512)]
+        all_chunks.extend(chunks)
+        text_indices.extend([index] * len(chunks))
+    all_scores = [0] * len(texts)
+    for i in range(0, len(all_chunks), max_batch_size):
+        batch_chunks = all_chunks[i:i+max_batch_size]
+        batch_indices = text_indices[i:i+max_batch_size]
+        probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device)
+        scores = probabilities[:, score_indices].sum(dim=1).tolist()
+        
+        for idx, score in zip(batch_indices, scores):
+            all_scores[idx] = max(all_scores[idx], score)
+    return all_scores
+
+
+def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
+    """
+    Compute jailbreak scores for a list of texts.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to evaluate.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
+        
+    Returns:
+        list[float]: A list of jailbreak scores for each text.
+    """
+    return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size)
+
+
+def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
+    """
+    Compute indirect injection scores for a list of texts.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to evaluate.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
+        
+    Returns:
+        list[float]: A list of indirect injection scores for each text.
+    """
+    return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size)