radu
/
LLamaRecipes
mirror of https://github.com/facebookresearch/llama-recipes.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
							from typing import List, Tuple

import torch
from torch.nn.functional import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

"""
Utilities for loading the PromptGuard model and evaluating text for jailbreaking techniques.

NOTE: this code is for PromptGuard 2. For our older PromptGuard 1 model, see prompt_guard_1_inference.py

Note that the underlying model has a maximum recommended input size of 512 tokens as a DeBERTa model.
The final two functions in this file implement efficient parallel batched evaluation of the model on a list
of input strings of arbitrary length, with the final score for each input being the maximum score across all
chunks of the input string.
"""

MAX_TOKENS = 512
DEFAULT_BATCH_SIZE = 16
DEFAULT_TEMPERATURE = 1.0
DEFAULT_DEVICE = "cpu"
DEFAULT_MODEL_NAME = "meta-llama/Llama-Prompt-Guard-2-86M"


def load_model_and_tokenizer(
    model_name: str = "meta-llama/Llama-Prompt-Guard-2-86M", device: str = DEFAULT_DEVICE
) -> Tuple[AutoModelForSequenceClassification, AutoTokenizer, str]:
    """
    Load the PromptGuard model and tokenizer, and move the model to the specified device.

    Args:
        model_name (str): The name of the model to load.
        device (str): The device to load the model on. If None, it will use CUDA if available, else CPU.

    Returns:
        tuple: The loaded model, tokenizer, and the device used.
    """
    try:
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        model = model.to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        return model, tokenizer, device
    except Exception as e:
        raise RuntimeError(f"Failed to load model and tokenizer: {str(e)}")


def get_class_scores(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    text: str,
    temperature: float = DEFAULT_TEMPERATURE,
) -> torch.Tensor:
    """
    Evaluate the model on the given text with temperature-adjusted softmax.
    Note, as this is a DeBERTa model, the input text should have a maximum length of 512.

    Args:
        model: The loaded model.
        tokenizer: The loaded tokenizer.
        text (str): The input text to classify.
        temperature (float): The temperature for the softmax function. Default is 1.0.

    Returns:
        torch.Tensor: The scores for each class adjusted by the temperature.
    """

    # Get the device from the model
    device = next(model.parameters()).device

    # Encode the text
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=MAX_TOKENS
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Get logits from the model
    with torch.no_grad():
        logits = model(**inputs).logits
    # Apply temperature scaling
    scaled_logits = logits / temperature
    # Apply softmax to get scores
    scores = softmax(scaled_logits, dim=-1)
    return scores


def get_jailbreak_score(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    text: str,
    temperature: float = DEFAULT_TEMPERATURE,
) -> float:
    """
    Evaluate the probability that a given string contains malicious jailbreak or prompt injection.
    Appropriate for filtering dialogue between a user and an LLM.

    Args:
        model: The loaded model.
        tokenizer: The loaded tokenizer.
        text (str): The input text to evaluate.
        temperature (float): The temperature for the softmax function. Default is 1.0.

    Returns:
        float: The probability of the text containing malicious content.
    """
    probabilities = get_class_scores(model, tokenizer, text, temperature)
    return probabilities[0, 1].item()


def process_text_batch(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    texts: List[str],
    temperature: float = DEFAULT_TEMPERATURE,
) -> torch.Tensor:
    """
    Process a batch of texts and return their class probabilities.
    Args:
        model (transformers.PreTrainedModel): The loaded model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
        texts (list[str]): A list of texts to process.
        temperature (float): The temperature for the softmax function.

    Returns:
        torch.Tensor: A tensor containing the class probabilities for each text in the batch.
    """
    # Get the device from the model
    device = next(model.parameters()).device

    # encode the texts
    inputs = tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_TOKENS
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    scaled_logits = logits / temperature
    probabilities = softmax(scaled_logits, dim=-1)
    return probabilities


def get_scores_for_texts(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    texts: List[str],
    score_indices: List[int],
    temperature: float = DEFAULT_TEMPERATURE,
    max_batch_size: int = DEFAULT_BATCH_SIZE,
) -> List[float]:
    """
    Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
    The final score for each text is the maximum score across all chunks of the text.

    Args:
        model (transformers.PreTrainedModel): The loaded model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
        texts (list[str]): A list of texts to evaluate.
        score_indices (list[int]): Indices of scores to sum for final score calculation.
        temperature (float): The temperature for the softmax function.
        max_batch_size (int): The maximum number of text chunks to process in a single batch.

    Returns:
        list[float]: A list of scores for each text.
    """
    all_chunks = []
    text_indices = []
    for index, text in enumerate(texts):
        # Tokenize the text and split into chunks of MAX_TOKENS
        tokens = tokenizer(text, return_tensors="pt", truncation=False)["input_ids"][0]
        chunks = [tokens[i : i + MAX_TOKENS] for i in range(0, len(tokens), MAX_TOKENS)]
        all_chunks.extend(chunks)
        text_indices.extend([index] * len(chunks))
    all_scores = [0.0] * len(texts)
    for i in range(0, len(all_chunks), max_batch_size):
        batch_chunks = all_chunks[i : i + max_batch_size]
        batch_indices = text_indices[i : i + max_batch_size]
        # Decode the token chunks back to text
        batch_texts = [
            tokenizer.decode(chunk, skip_special_tokens=True) for chunk in batch_chunks
        ]
        probabilities = process_text_batch(model, tokenizer, batch_texts, temperature)
        scores = probabilities[:, score_indices].sum(dim=1).tolist()
        for idx, score in zip(batch_indices, scores):
            all_scores[idx] = max(all_scores[idx], score)
    return all_scores


def get_jailbreak_scores_for_texts(
    model: AutoModelForSequenceClassification,
    tokenizer: AutoTokenizer,
    texts: List[str],
    temperature: float = DEFAULT_TEMPERATURE,
    max_batch_size: int = DEFAULT_BATCH_SIZE,
) -> List[float]:
    """
    Compute jailbreak scores for a list of texts.
    Args:
        model (transformers.PreTrainedModel): The loaded model.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
        texts (list[str]): A list of texts to evaluate.
        temperature (float): The temperature for the softmax function.
        max_batch_size (int): The maximum number of text chunks to process in a single batch.

    Returns:
        list[float]: A list of jailbreak scores for each text.
    """
    return get_scores_for_texts(
        model, tokenizer, texts, [1], temperature, max_batch_size
    )