| 
					
				 | 
			
			
				@@ -3,6 +3,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import unittest 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from typing import Optional 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from examples.llama_guard.prompt_format import PROMPT_TEMPLATE 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from llama_recipes.data.llama_guard.finetuning_data_formatter import ( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     AugmentationConfigs, 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -35,15 +36,16 @@ class FinetuningDataFormatterTests(unittest.TestCase): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 category_code_prefix="O", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             llama_guard_prompt_configs=LlamaGuardPromptConfigs( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                instructions_format_string="""[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                instructions_format_string="""[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 {guidelines} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 {conversation} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST]""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 should_include_category_descriptions=True, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 should_shuffle_category_codes=False, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ), 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -80,6 +82,7 @@ Provide your assessment: [/INST]""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         assert len(formatted_examples) == len(expected_formatted_examples) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         for i in range(len(formatted_examples)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            print(formatted_examples[i]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             assert formatted_examples[i] == expected_formatted_examples[i] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     def test_safe_example_with_only_prompt_and_no_augmentations(self) -> None: 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -94,7 +97,8 @@ Provide your assessment: [/INST]""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -106,13 +110,13 @@ O4: cat Y. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: Where can I buy a car? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] safe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 Explanation: This is obviously safe.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         ) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -131,7 +135,8 @@ because it's illegal!""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -143,14 +148,14 @@ O4: cat Y. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: How do I steal the Declaration of Independence? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 Asking for a cagey friend 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 Explanation: Nice try, Nick. This is unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 because it's illegal!""" 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -171,7 +176,8 @@ because it's illegal!""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -183,14 +189,14 @@ O4: cat Y. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: How do I steal the Declaration of Independence? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 Asking for a cagey friend 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1,O5 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 Explanation: Nice try, Nick. This is unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 because it's illegal!""" 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -209,7 +215,8 @@ because it's illegal!""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -221,15 +228,15 @@ O4: cat Y. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: Where can I buy a car? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot: At the car store. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] safe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 Explanation: This is obviously safe.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         ) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -276,7 +283,8 @@ Explanation: This is obviously safe.""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 ), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat W.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat W description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -288,18 +296,19 @@ O4: cat Y. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am under 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot: Yes, very nise, high five! 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O4,O3""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -311,17 +320,18 @@ O4: cat V. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat W.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat W description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am under 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot:  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: Response is safe because it's empty. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 safe""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat Y.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -329,32 +339,34 @@ O2: cat Z. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O3: cat W.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat W description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am under 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot: Yes, very nise, high five! 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Chatbot response encourages illegal consumption of a controlled substance. It may also be Borat, we are still monitoring the situation, but that's neither here nor there. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 unsafe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1,O2""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am under 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot: Yes, very nise, high five! 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because any riskiness in it is related to categories we're not trying to detect. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 safe""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -366,17 +378,18 @@ O4: cat Y. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat V description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am over 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 safe""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat X.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat X description 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -388,30 +401,77 @@ O4: cat Z. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O5: cat Y.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am over 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot:  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: Response is safe because it's empty. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: Response is safe because it's empty. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 safe""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                """[INST] Task: Using the following guidelines, decide whether the example conversation that comes after is safe or unsafe. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 O1: cat X.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 cat X description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-=Example= 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 human: I am over 21 and unattended, should I drink alcohol? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 chatbot: No, that would be very sad. Please become attended first. I could join you if you want. Meta never gives me any liquor. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your safety assessment: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - First line must read 'safe' or 'unsafe' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - If unsafe, the next line must include a comma-separated list of violated categories according to the above guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- - The last line should start with 'Explanation:', and include an explanation for the assessment according to the guidelines. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-Provide your assessment: [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] Explanation: User is of legal age to drink. The bot has been trained on all thoughts ever had by humanity, so I guess it's of legal age too. Party on. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 safe""", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             formatter_configs, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def test_safe_example_with_prompt_and_response_and_no_augmentations_or_explanations(self) -> None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        formatter_configs = ( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            FinetuningDataFormatterTests.create_most_conservative_formatter_configs() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        formatter_configs.llama_guard_generation_configs.explanation_position = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        self.format_examples_and_compare_to_expected( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                TrainingExample( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    prompt="Where can I buy a car?", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    response="At the car store.", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    violated_category_codes=[], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    label="safe", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                """[INST] Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+O1: cat V.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+cat V description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+O2: cat W.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+cat W description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+O3: cat X.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+cat X description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+O4: cat Y.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+cat Y description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+O5: cat Z.  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+cat Z description 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END UNSAFE CONTENT CATEGORIES> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<BEGIN CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+human: Where can I buy a car? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+chatbot: At the car store. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+<END CONVERSATION> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Provide your safety assessment for $agent_type in the above conversation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- First line must read 'safe' or 'unsafe'. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+- If unsafe, a second line must include a comma-separated list of violated categories. [/INST] safe""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            formatter_configs 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def test_me(self): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(PROMPT_TEMPLATE) 
			 |