1 year ago · d28400efb5
--- a/recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb
+++ b/recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb
@@ -145,7 +145,7 @@
 
				     "class Args:\n",
			
 
				     "    def __init__(self, \n",
			
 
				     "                 max_examples=100, \n",
			
 
				-    "                 sql_model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\", \n",
			
 
				+    "                 sql_model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", \n",
			
 
				     "                 gold_file_name=\"gold-test-set.jsonl\",\n",
			
 
				     "                 training_file_name=\"generated_queries.jsonl\",\n",
			
 
				     "                 num_to_generate=10):\n",
			
@@ -197,7 +197,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				     "\n",
			
 
				     "question = \"\"\"Who is the highest paid NBA player?\"\"\"\n",
			
 
				     "system = f\"\"\"You are an NBA analyst with 15 years of experience writing complex SQL queries. Consider the nba_roster table with the following schema:\n",
			
@@ -418,7 +418,7 @@
 
				     "class ScoreStage(GenerationNode):\n",
			
 
				     "    def __init__(self):\n",
			
 
				     "        super().__init__(\n",
			
 
				-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
			
 
				+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
			
 
				     "            max_new_tokens=150,\n",
			
 
				     "        )\n",
			
 
				     "\n",
			
@@ -712,7 +712,7 @@
 
				     "class ModelStage(GenerationNode):\n",
			
 
				     "    def __init__(self):\n",
			
 
				     "        super().__init__(\n",
			
 
				-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
			
 
				+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
			
 
				     "            max_new_tokens=300,\n",
			
 
				     "        )\n",
			
 
				     "\n",
			
@@ -808,7 +808,7 @@
 
				     "class QuestionStage(GenerationNode):\n",
			
 
				     "    def __init__(self):\n",
			
 
				     "        super().__init__(\n",
			
 
				-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
			
 
				+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
			
 
				     "            max_new_tokens=150,\n",
			
 
				     "        )\n",
			
 
				     "\n",
			
@@ -1055,7 +1055,7 @@
 
				    ],
			
 
				    "source": [
			
 
				     "args = Args()\n",
			
 
				-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				     "\n",
			
 
				     "dataset = get_dataset(args, make_question)\n",
			
 
				     "finetune_args = get_default_finetune_args()\n",
			
@@ -1601,7 +1601,7 @@
 
				    ],
			
 
				    "source": [
			
 
				     "args = Args(training_file_name=\"archive/generated_queries_large_filtered_cleaned.jsonl\")\n",
			
 
				-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				     "\n",
			
 
				     "dataset = get_dataset(args, make_question)\n",
			
 
				     "finetune_args = get_default_finetune_args()\n",
			
@@ -1798,7 +1798,7 @@
 
				    ],
			
 
				    "source": [
			
 
				     "args = Args(training_file_name=\"generated_queries_v2.jsonl\")\n",
			
 
				-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				     "\n",
			
 
				     "dataset = get_dataset(args, make_question)\n",
			
 
				     "finetune_args = get_default_finetune_args()\n",
			
@@ -1966,7 +1966,7 @@
 
				    ],
			
 
				    "source": [
			
 
				     "args = Args(training_file_name=\"archive/generated_queries_v2_large_filtered_cleaned.jsonl\")\n",
			
 
				-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				     "\n",
			
 
				     "dataset = get_dataset(args, make_question)\n",
			
 
				     "finetune_args = get_default_finetune_args()\n",
			
--- a/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py
+++ b/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py
@@ -16,7 +16,7 @@ def parse_arguments():
 
				     parser.add_argument(
			
 
				         "--sql-model-name",
			
 
				         type=str,
			
 
				-        default="meta-llama/Meta-Llama-3-8B-Instruct",
			
 
				+        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
			
 
				         help="The model to use for text2sql",
			
 
				         required=False,
			
 
				     )
			
--- a/recipes/3p_integrations/llama_on_prem.md
+++ b/recipes/3p_integrations/llama_on_prem.md
@@ -8,7 +8,7 @@ We'll use the Amazon EC2 instance running Ubuntu with an A10G 24GB GPU as an exa
 
				 
			
 
				 The Colab notebook to connect via LangChain with Llama 3 hosted as the vLLM and TGI API services is [here](https://colab.research.google.com/drive/1rYWLdgTGIU1yCHmRpAOB2D-84fPzmOJg), also shown in the sections below.
			
 
				 
			
 
				-This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
			
 
				+This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
			
 
				 
			
 
				 You'll also need your Hugging Face access token which you can get at your Settings page [here](https://huggingface.co/settings/tokens).
			
 
				 
			
@@ -33,7 +33,7 @@ There are two ways to deploy Llama 3 via vLLM, as a general API server or an Ope
 
				 Run the command below to deploy vLLM as a general Llama 3 service:
			
 
				 
			
 
				 ```
			
 
				-python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
			
 
				+python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
			
 
				 ```
			
 
				 
			
 
				 Then on another terminal you can run:
			
@@ -68,13 +68,13 @@ Also, if you have multiple GPUs, you can add the `--tensor-parallel-size` argume
 
				 git clone https://github.com/vllm-project/vllm
			
 
				 cd vllm/vllm/entrypoints
			
 
				 conda activate llama3
			
 
				-python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct --tensor-parallel-size 4
			
 
				+python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 4
			
 
				 ```
			
 
				 
			
 
				 With multiple GPUs, you can also run replica of models as long as your model size can fit into targeted GPU memory. For example, if you have two A10G with 24 GB memory, you can run two Llama 3 8B models at the same time. This can be done by launching two api servers each targeting specific CUDA cores on different ports:
			
 
				-`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000  --model meta-llama/Meta-Llama-3-8B-Instruct`
			
 
				+`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000  --model meta-llama/Meta-Llama-3.1-8B-Instruct`
			
 
				 and
			
 
				-`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001  --model meta-llama/Meta-Llama-3-8B-Instruct`
			
 
				+`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001  --model meta-llama/Meta-Llama-3.1-8B-Instruct`
			
 
				 The benefit would be that you can balance incoming requests to both models, reaching higher batch size processing for a trade-off of generation latency.
			
 
				 
			
 
				 
			
@@ -83,14 +83,14 @@ The benefit would be that you can balance incoming requests to both models, reac
 
				 You can also deploy the vLLM hosted Llama 3 as an OpenAI-Compatible service to easily replace code using OpenAI API. First, run the command below:
			
 
				 
			
 
				 ```
			
 
				-python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
			
 
				+python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
			
 
				 ```
			
 
				 
			
 
				 Then on another terminal, run:
			
 
				 
			
 
				 ```
			
 
				 curl http://localhost:5000/v1/completions -H "Content-Type: application/json" -d '{
			
 
				-        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
			
 
				+        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
			
 
				         "prompt": "Who wrote the book Innovators dilemma?",
			
 
				         "max_tokens": 300,
			
 
				         "temperature": 0
			
@@ -118,7 +118,7 @@ from langchain.llms import VLLMOpenAI
 
				 llm = VLLMOpenAI(
			
 
				     openai_api_key="EMPTY",
			
 
				     openai_api_base="http://<vllm_server_ip_address>:5000/v1",
			
 
				-    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
			
 
				+    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
			
 
				 )
			
 
				 
			
 
				 print(llm("Who wrote the book godfather?"))
			
@@ -136,7 +136,7 @@ You can now use the Llama 3 instance `llm` created this way in any of the demo a
 
				 The easiest way to deploy Llama 3 with TGI is using its official docker image. First, replace `<your_hugging_face_access_token>` and set the three required shell variables (you may replace the `model` value above with another Llama 3 model):
			
 
				 
			
 
				 ```
			
 
				-model=meta-llama/Meta-Llama-3-8B-Instruct
			
 
				+model=meta-llama/Meta-Llama-3.1-8B-Instruct
			
 
				 volume=$PWD/data
			
 
				 token=<your_hugging_face_access_token>
			
 
				 ```
			
--- a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb
+++ b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb
@@ -92,7 +92,7 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
			
 
				+    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3.1-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
			
 
				     "\n",
			
 
				     "1. Accept Terms of Service for Meta Llama 3 on Meta [website](https://llama.meta.com/llama-downloads).\n",
			
 
				     "2. Use the same email address from Step (1) to login into Hugging Face.\n",
			
@@ -125,7 +125,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "model = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
			
 
				+    "model = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
			
 
				     "tokenizer = AutoTokenizer.from_pretrained(model)"
			
 
				    ]
			
 
				   },
			
--- a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
+++ b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
@@ -90,7 +90,7 @@
 
				     "from llama_recipes.configs import train_config as TRAIN_CONFIG\n",
			
 
				     "\n",
			
 
				     "train_config = TRAIN_CONFIG()\n",
			
 
				-    "train_config.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
			
 
				+    "train_config.model_name = \"meta-llama/Meta-Llama-3.1-8B\"\n",
			
 
				     "train_config.num_epochs = 1\n",
			
 
				     "train_config.run_validation = False\n",
			
 
				     "train_config.gradient_accumulation_steps = 4\n",
			
--- a/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb
+++ b/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb
--- a/recipes/responsible_ai/prompt_guard/inference.py
+++ b/recipes/responsible_ai/prompt_guard/inference.py
@@ -8,6 +8,11 @@ from transformers import (
 
				 
			
 
				 """
			
 
				 Utilities for loading the PromptGuard model and evaluating text for jailbreaks and indirect injections.
			
 
				+
			
 
				+Note that the underlying model has a maximum recommended input size of 512 tokens as a DeBERTa model.
			
 
				+The final two functions in this file implement efficient parallel batched evaluation of the model on a list
			
 
				+of input strings of arbirary length, with the final score for each input being the maximum score across all
			
 
				+chunks of the input string.
			
 
				 """
			
 
				 
			
 
				 
			
@@ -29,6 +34,7 @@ def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'):
 
				 def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu'):
			
 
				     """
			
 
				     Evaluate the model on the given text with temperature-adjusted softmax.
			
 
				+    Note, as this is a DeBERTa model, the input text should have a maximum length of 512.
			
 
				     
			
 
				     Args:
			
 
				         text (str): The input text to classify.
			
@@ -83,3 +89,92 @@ def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device
 
				     """
			
 
				     probabilities = get_class_probabilities(model, tokenizer, text, temperature, device)
			
 
				     return (probabilities[0, 1] + probabilities[0, 2]).item()
			
 
				+
			
 
				+
			
 
				+def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'):
			
 
				+    """
			
 
				+    Process a batch of texts and return their class probabilities.
			
 
				+    Args:
			
 
				+        model (transformers.PreTrainedModel): The loaded model.
			
 
				+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
			
 
				+        texts (list[str]): A list of texts to process.
			
 
				+        temperature (float): The temperature for the softmax function.
			
 
				+        device (str): The device to evaluate the model on.
			
 
				+        
			
 
				+    Returns:
			
 
				+        torch.Tensor: A tensor containing the class probabilities for each text in the batch.
			
 
				+    """
			
 
				+    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
			
 
				+    inputs = inputs.to(device)
			
 
				+    with torch.no_grad():
			
 
				+        logits = model(**inputs).logits
			
 
				+    scaled_logits = logits / temperature
			
 
				+    probabilities = softmax(scaled_logits, dim=-1)
			
 
				+    return probabilities
			
 
				+
			
 
				+
			
 
				+def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16):
			
 
				+    """
			
 
				+    Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
			
 
				+    Args:
			
 
				+        model (transformers.PreTrainedModel): The loaded model.
			
 
				+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
			
 
				+        texts (list[str]): A list of texts to evaluate.
			
 
				+        score_indices (list[int]): Indices of scores to sum for final score calculation.
			
 
				+        temperature (float): The temperature for the softmax function.
			
 
				+        device (str): The device to evaluate the model on.
			
 
				+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
			
 
				+        
			
 
				+    Returns:
			
 
				+        list[float]: A list of scores for each text.
			
 
				+    """
			
 
				+    all_chunks = []
			
 
				+    text_indices = []
			
 
				+    for index, text in enumerate(texts):
			
 
				+        chunks = [text[i:i+512] for i in range(0, len(text), 512)]
			
 
				+        all_chunks.extend(chunks)
			
 
				+        text_indices.extend([index] * len(chunks))
			
 
				+    all_scores = [0] * len(texts)
			
 
				+    for i in range(0, len(all_chunks), max_batch_size):
			
 
				+        batch_chunks = all_chunks[i:i+max_batch_size]
			
 
				+        batch_indices = text_indices[i:i+max_batch_size]
			
 
				+        probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device)
			
 
				+        scores = probabilities[:, score_indices].sum(dim=1).tolist()
			
 
				+        
			
 
				+        for idx, score in zip(batch_indices, scores):
			
 
				+            all_scores[idx] = max(all_scores[idx], score)
			
 
				+    return all_scores
			
 
				+
			
 
				+
			
 
				+def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
			
 
				+    """
			
 
				+    Compute jailbreak scores for a list of texts.
			
 
				+    Args:
			
 
				+        model (transformers.PreTrainedModel): The loaded model.
			
 
				+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
			
 
				+        texts (list[str]): A list of texts to evaluate.
			
 
				+        temperature (float): The temperature for the softmax function.
			
 
				+        device (str): The device to evaluate the model on.
			
 
				+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
			
 
				+        
			
 
				+    Returns:
			
 
				+        list[float]: A list of jailbreak scores for each text.
			
 
				+    """
			
 
				+    return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size)
			
 
				+
			
 
				+
			
 
				+def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
			
 
				+    """
			
 
				+    Compute indirect injection scores for a list of texts.
			
 
				+    Args:
			
 
				+        model (transformers.PreTrainedModel): The loaded model.
			
 
				+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
			
 
				+        texts (list[str]): A list of texts to evaluate.
			
 
				+        temperature (float): The temperature for the softmax function.
			
 
				+        device (str): The device to evaluate the model on.
			
 
				+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
			
 
				+        
			
 
				+    Returns:
			
 
				+        list[float]: A list of indirect injection scores for each text.
			
 
				+    """
			
 
				+    return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size)
			
--- a/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
+++ b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
@@ -418,7 +418,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "model=meta-llama/Meta-Llama-3-8B-Instruct\n",
			
 
				+    "model=meta-llama/Meta-Llama-3.1-8B-Instruct\n",
			
 
				     "volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n",
			
 
				     "token=#your-huggingface-token\n",
			
 
				     "docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model"
			
--- a/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
+++ b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
@@ -934,11 +934,11 @@
 
				       "source": [
			
 
				         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
			
 
				         "import torch\n",
			
 
				-        "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+        "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				         "# CPU Enabled uncomment below 👇🏽\n",
			
 
				-        "# model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
			
 
				+        "# model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
			
 
				         "# GPU Enabled use below 👇🏽\n",
			
 
				-        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\", torch_dtype=torch.bfloat16, device_map=\"auto\")"
			
 
				+        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\", torch_dtype=torch.bfloat16, device_map=\"auto\")"
			
 
				       ]
			
 
				     },
			
 
				     {
			
--- a/src/llama_recipes/configs/datasets.py
+++ b/src/llama_recipes/configs/datasets.py
@@ -25,10 +25,16 @@ class alpaca_dataset:
 
				     test_split: str = "val"
			
 
				     data_path: str = "src/llama_recipes/datasets/alpaca_data.json"
			
 
				     
			
 
				-    
			
 
				+
			
 
				 @dataclass
			
 
				 class custom_dataset:
			
 
				     dataset: str = "custom_dataset"
			
 
				     file: str = "recipes/quickstart/finetuning/datasets/custom_dataset.py"
			
 
				     train_split: str = "train"
			
 
				     test_split: str = "validation"
			
 
				+    
			
 
				+@dataclass
			
 
				+class llamaguard_toxicchat_dataset:
			
 
				+    dataset: str = "llamaguard_toxicchat_dataset"
			
 
				+    train_split: str = "train"
			
 
				+    test_split: str = "test"
			
--- a/src/llama_recipes/datasets/__init__.py
+++ b/src/llama_recipes/datasets/__init__.py
@@ -3,4 +3,5 @@
 
				 
			
 
				 from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset
			
 
				 from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset
			
 
				-from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
			
 
				+from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
			
 
				+from llama_recipes.datasets.toxicchat_dataset import get_llamaguard_toxicchat_dataset as get_llamaguard_toxicchat_dataset
			
--- a/src/llama_recipes/datasets/toxicchat_dataset.py
+++ b/src/llama_recipes/datasets/toxicchat_dataset.py
@@ -0,0 +1,131 @@
 
				+# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
			
 
				+
			
 
				+# For dataset details visit: https://huggingface.co/datasets/samsum
			
 
				+
			
 
				+import copy
			
 
				+import datasets
			
 
				+import itertools
			
 
				+from llama_recipes.inference.prompt_format_utils import  LLAMA_GUARD_3_CATEGORY
			
 
				+import ast
			
 
				+import fire
			
 
				+
			
 
				+def tokenize_prompt_and_labels(full_prompt, tokenizer):
			
 
				+        prompt_tokens = tokenizer.encode(full_prompt)
			
 
				+        combined_tokens = {
			
 
				+            "input_ids": list(prompt_tokens),
			
 
				+            "labels": list(prompt_tokens)
			
 
				+        }
			
 
				+        return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
			
 
				+    
			
 
				+
			
 
				+from llama_recipes.data.llama_guard.finetuning_data_formatter import TrainingExample, Guidelines, Category, LlamaGuardPromptConfigs, LlamaGuardGenerationConfigs, ExplanationPosition, AugmentationConfigs, FormatterConfigs, create_formatted_finetuning_examples
			
 
				+from datasets import Dataset, DatasetInfo
			
 
				+
			
 
				+def mapTcCategoriesToLGCategories(TcCategoriesString):
			
 
				+    TcCategories = ast.literal_eval(TcCategoriesString)
			
 
				+    if(len(TcCategories)==0):
			
 
				+         return None
			
 
				+    ranked = sorted(TcCategories, key=lambda x: x[1], reverse=True)
			
 
				+    primary = ranked[0][0] if len(ranked) else None
			
 
				+    TcMapping = {
			
 
				+        "sexual":"012", 
			
 
				+        "violence":"01", 
			
 
				+        "sexual/minors":"04", 
			
 
				+        "self-harm/intent":"011", 
			
 
				+        "hate":"010", 
			
 
				+        "harassment":"010", 
			
 
				+        "self-harm":"011", 
			
 
				+        "self-harm/instructions":"011", 
			
 
				+        "violence/graphic":"01", 
			
 
				+        "harassment/threatening":"010", 
			
 
				+        "hate/threatening":"010"
			
 
				+    }
			
 
				+    return TcMapping[primary]
			
 
				+
			
 
				+def get_llamaguard_toxicchat_dataset(dataset_config, tokenizer, split, return_jsonl=False):
			
 
				+    dataset = datasets.load_dataset("lmsys/toxic-chat", 'toxicchat0124', split=split)
			
 
				+
			
 
				+
			
 
				+    if return_jsonl:
			
 
				+        jsonl_data = [];
			
 
				+        for x in dataset.to_iterable_dataset():
			
 
				+            jsonl_data.append({
			
 
				+                    "prompt":x['user_input'],
			
 
				+                    "generation":x['model_output'],
			
 
				+                    "label":("good" if x['toxicity'] == 0 else "bad"),
			
 
				+                    "unsafe_content": [mapTcCategoriesToLGCategories(x['openai_moderation'])]
			
 
				+                    })
			
 
				+        return jsonl_data
			
 
				+    
			
 
				+    guidelines = Guidelines(
			
 
				+        categories=LLAMA_GUARD_3_CATEGORY,
			
 
				+        category_code_prefix="O")
			
 
				+    
			
 
				+    # Setting up configs
			
 
				+    llama_guard_prompt_configs = LlamaGuardPromptConfigs(
			
 
				+        instructions_format_string="""<|begin_of_text|><|start_header_id|>user<|end_header_id|> Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.
			
 
				+
			
 
				+    <BEGIN UNSAFE CONTENT CATEGORIES>
			
 
				+    {guidelines}
			
 
				+    <END UNSAFE CONTENT CATEGORIES>
			
 
				+
			
 
				+    <BEGIN CONVERSATION>
			
 
				+
			
 
				+    {conversation}
			
 
				+
			
 
				+    <END CONVERSATION>
			
 
				+
			
 
				+    Provide your safety assessment for $agent_type in the above conversation:
			
 
				+    - First line must read 'safe' or 'unsafe'.
			
 
				+    - If unsafe, a second line must include a comma-separated list of violated categories. <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
			
 
				+        should_include_category_descriptions=False,
			
 
				+        should_shuffle_category_codes=False
			
 
				+    )
			
 
				+
			
 
				+    llama_guard_generation_configs = LlamaGuardGenerationConfigs(
			
 
				+        should_list_violated_codes=True,
			
 
				+        explanation_position=None
			
 
				+    )
			
 
				+
			
 
				+    augmentation_configs = AugmentationConfigs(
			
 
				+        should_add_examples_with_dropped_nonviolated_prompt_categories=False,
			
 
				+        should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False,
			
 
				+        explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect."
			
 
				+    )
			
 
				+
			
 
				+    formatter_configs = FormatterConfigs(
			
 
				+        guidelines=guidelines,
			
 
				+        llama_guard_prompt_configs=llama_guard_prompt_configs,
			
 
				+        llama_guard_generation_configs=llama_guard_generation_configs,
			
 
				+        augmentation_configs=augmentation_configs,
			
 
				+        random_seed=42
			
 
				+    )
			
 
				+
			
 
				+    dataset = dataset.map(lambda x: {"full_prompt": create_formatted_finetuning_examples(
			
 
				+        [TrainingExample(
			
 
				+            prompt=x["user_input"],
			
 
				+            response=None,
			
 
				+            violated_category_codes = [] if x["toxicity"]==0 else [mapTcCategoriesToLGCategories(x["openai_moderation"])],
			
 
				+            label="safe" if x["toxicity"]==0 else "unsafe",
			
 
				+            explanation="The response contains violating information."
			
 
				+        )],
			
 
				+        formatter_configs)[0]}, 
			
 
				+        remove_columns=list(dataset.features))
			
 
				+
			
 
				+    dataset = dataset.map(lambda x: tokenize_prompt_and_labels(x["full_prompt"], tokenizer), remove_columns=list(dataset.features))
			
 
				+    return dataset
			
 
				+
			
 
				+def main(return_jsonl = False):
			
 
				+    from transformers import AutoTokenizer
			
 
				+    model_id: str = "/home/ubuntu/LG3-interim-hf-weights"
			
 
				+    tokenizer = AutoTokenizer.from_pretrained(model_id)
			
 
				+    if return_jsonl:
			
 
				+        dataset = get_llamaguard_toxicchat_dataset(None, tokenizer, "train", return_jsonl = True)
			
 
				+        print(dataset[0:50])
			
 
				+    else:
			
 
				+        dataset = get_llamaguard_toxicchat_dataset(None, tokenizer, "train")
			
 
				+        print(dataset[0])
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    fire.Fire(main)
			
--- a/src/llama_recipes/inference/safety_utils.py
+++ b/src/llama_recipes/inference/safety_utils.py
@@ -160,7 +160,7 @@ class LlamaGuardSafetyChecker(object):
 
				         from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
			
 
				         from llama_recipes.inference.prompt_format_utils import build_default_prompt, create_conversation, LlamaGuardVersion
			
 
				 
			
 
				-        model_id = "meta-llama/LlamaGuard-7b"
			
 
				+        model_id = "meta-llama/Llama-Guard-3-8B"
			
 
				 
			
 
				         quantization_config = BitsAndBytesConfig(load_in_8bit=True)
			
 
				 
			
--- a/src/llama_recipes/tools/README.md
+++ b/src/llama_recipes/tools/README.md
@@ -7,7 +7,7 @@ This is the reverse conversion for `convert_llama_weights_to_hf.py` script from
 
				 - Copy file params.json from the official llama download into that directory.
			
 
				 - Run the conversion script. `model-path` can be a Hugging Face hub model or a local hf model directory.
			
 
				 ```
			
 
				-python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Meta-Llama-3-70B-Instruct --output-dir test70B --model-size 70B
			
 
				+python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir test70B --model-size 70B
			
 
				 ```
			
 
				 
			
 
				 ## Step 1: Run inference
			
--- a/src/llama_recipes/utils/dataset_utils.py
+++ b/src/llama_recipes/utils/dataset_utils.py
@@ -11,6 +11,7 @@ from llama_recipes.datasets import (
 
				     get_grammar_dataset,
			
 
				     get_alpaca_dataset,
			
 
				     get_samsum_dataset,
			
 
				+    get_llamaguard_toxicchat_dataset,
			
 
				 )
			
 
				 
			
 
				 
			
@@ -54,6 +55,8 @@ DATASET_PREPROC = {
 
				     "grammar_dataset": get_grammar_dataset,
			
 
				     "samsum_dataset": get_samsum_dataset,
			
 
				     "custom_dataset": get_custom_dataset,
			
 
				+    "llamaguard_toxicchat_dataset": get_llamaguard_toxicchat_dataset,
			
 
				+
			
 
				 }
			
 
				 
			
 
				 
			
--- a/src/tests/conftest.py
+++ b/src/tests/conftest.py
@@ -6,7 +6,7 @@ import pytest
 
				 from transformers import AutoTokenizer
			
 
				 
			
 
				 ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into huggingface hub and provided the correct token?"
			
 
				-LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3-8B"]
			
 
				+LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B"]
			
 
				 
			
 
				 @pytest.fixture(params=LLAMA_VERSIONS)
			
 
				 def llama_version(request):
			
--- a/src/tests/datasets/test_custom_dataset.py
+++ b/src/tests/datasets/test_custom_dataset.py
@@ -11,7 +11,7 @@ EXPECTED_RESULTS={
 
				         "example_1": "[INST] Who made Berlin [/INST] dunno",
			
 
				         "example_2": "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!",
			
 
				     },
			
 
				-    "meta-llama/Meta-Llama-3-8B":{
			
 
				+    "meta-llama/Meta-Llama-3.1-8B":{
			
 
				         "example_1": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWho made Berlin<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndunno<|eot_id|><|end_of_text|>",
			
 
				         "example_2": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow to start learning guitar and become a master at it?",
			
 
				     },
			
--- a/src/tests/datasets/test_grammar_datasets.py
+++ b/src/tests/datasets/test_grammar_datasets.py
@@ -10,7 +10,7 @@ EXPECTED_RESULTS = {
 
				         "label": 1152,
			
 
				         "pos": 31,
			
 
				     },
			
 
				-    "meta-llama/Meta-Llama-3-8B":{
			
 
				+    "meta-llama/Meta-Llama-3.1-8B":{
			
 
				         "label": 40,
			
 
				         "pos": 26,
			
 
				     },
			
--- a/src/tests/datasets/test_samsum_datasets.py
+++ b/src/tests/datasets/test_samsum_datasets.py
@@ -10,7 +10,7 @@ EXPECTED_RESULTS = {
 
				         "label": 8432,
			
 
				         "pos": 242,
			
 
				     },
			
 
				-    "meta-llama/Meta-Llama-3-8B":{
			
 
				+    "meta-llama/Meta-Llama-3.1-8B":{
			
 
				         "label": 2250,
			
 
				         "pos": 211,
			
 
				     },
			
--- a/src/tests/test_batching.py
+++ b/src/tests/test_batching.py
@@ -9,7 +9,7 @@ EXPECTED_SAMPLE_NUMBER ={
 
				         "train": 96,
			
 
				         "eval": 42,
			
 
				     },
			
 
				-    "meta-llama/Meta-Llama-3-8B": {
			
 
				+    "meta-llama/Meta-Llama-3.1-8B": {
			
 
				         "train": 79,
			
 
				         "eval": 34,
			
 
				     }
			
--- a/tools/benchmarks/inference/on_prem/README.md
+++ b/tools/benchmarks/inference/on_prem/README.md
@@ -17,8 +17,8 @@ For example, we have an instance from Azure that has 8xA100 80G GPUs, and we wan
 
				 
			
 
				 Here are examples for deploying 2x70B chat models over 8 GPUs with vLLM.
			
 
				 ```
			
 
				-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
			
 
				-CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
			
 
				+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3.1-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
			
 
				+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3.1-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
			
 
				 ```
			
 
				 Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal.
			
 
				 
			
--- a/tools/benchmarks/llm_eval_harness/README.md
+++ b/tools/benchmarks/llm_eval_harness/README.md