@@ -1,334 +0,0 @@
-'''
-    Generate prompts for the LLM needle-in-a-haystack test.
-
-    Source code from:
-    https://github.com/gkamradt/LLMTest_NeedleInAHaystack/tree/main
-    https://github.com/THUDM/LongAlign/tree/main/Needle_test
-'''
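-# Typical invocation (hypothetical; the script name and config location are
-# repository-specific):
-#   python prompt.py --model_name <model-or-tokenizer-name>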
-from dotenv import load_dotenv
-import os
-import tiktoken
-import glob
-import json
-import yaml
-import argparse
-from anthropic import Anthropic
-import numpy as np
-import asyncio
-from asyncio import Semaphore
-from transformers import AutoTokenizer
-
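-# Pull environment variables (e.g. OPENAI_API_KEY / ANTHROPIC_API_KEY, if
-# present) from a local .env file before any client is constructed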
-load_dotenv()
-
-class Prompter:
-    """
-    Generates the prompts used for the LLM needle-in-a-haystack test and saves them to file.
-    """
-    def __init__(self,
-                 needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
-                 haystack_dir="PaulGrahamEssays",
-                 retrieval_question="What is the best thing to do in San Francisco?",
-                 context_lengths_min = 1000,
-                 context_lengths_max = 200000,
-                 context_lengths_num_intervals = 35,
-                 context_lengths = None,
-                 document_depth_percent_min = 0,
-                 document_depth_percent_max = 100,
-                 document_depth_percent_intervals = 35,
-                 document_depth_percents = None,
-                 document_depth_percent_interval_type = "linear",
-                 tokenizer_type = "OpenAI",
-                 model_name = "gpt-4-1106-preview",
-                 num_concurrent_requests = 1,
-                 final_context_length_buffer = 200,
-                 save_dir = "prompts",
-                 print_ongoing_status = True):
- """
|
|
|
- :param needle: The needle to be found in the haystack. Default is None.
|
|
|
- :param haystack_dir: The directory of text files to use as background context (or a haystack) in which the needle is to be found. Default is Paul Graham Essays.
|
|
|
- :param retrieval_question: The question which with to prompt the model to do the retrieval.
|
|
|
- :param results_version: In case you would like to try the same combination of model, context length, and depth % multiple times, change the results version other than 1
|
|
|
- :param num_concurrent_requests: Due to volume, this object is set up to run concurrent requests, default = 1. Be careful of rate limits.
|
|
|
- :param save_results: Whether or not you would like to save your contexts to file. Warning: These will get long! Default = True
|
|
|
- :param save_contexts: Whether or not you would like to save your contexts to file. Warning: These will get long! Default is True.
|
|
|
- :param final_context_length_buffer: The amount of cushion you'd like to leave off the input context to allow for the output context. Default 200 tokens
|
|
|
- :param context_lengths_min: The minimum length of the context. Default is 1000.
|
|
|
- :param context_lengths_max: The maximum length of the context. Default is 200000.
|
|
|
- :param context_lengths_num_intervals: The number of intervals for the context length. Default is 35.
|
|
|
- :param context_lengths: The lengths of the context. Default is None.
|
|
|
- :param document_depth_percent_min: The minimum depth percent of the document. Default is 0.
|
|
|
- :param document_depth_percent_max: The maximum depth percent of the document. Default is 100.
|
|
|
- :param document_depth_percent_intervals: The number of intervals for the document depth percent. Default is 35.
|
|
|
- :param document_depth_percents: The depth percentages of the document. Default is None.
|
|
|
- :param document_depth_percent_interval_type: The type of interval for the document depth percent. Must be either 'linear' or 'sigmoid'. Default is 'linear'.
|
|
|
- :param model_provider: The provider of the model. Must be either 'OpenAI' or 'Anthropic'. Default is 'OpenAI'.
|
|
|
- :param openai_api_key: The API key for OpenAI. Default is None.
|
|
|
- :param anthropic_api_key: The API key for Anthropic. Default is None.
|
|
|
- :param model_name: The name of the model. Default is 'gpt-4-1106-preview'.
|
|
|
- :param seconds_to_sleep_between_completions: The number of seconds to sleep between completions. Default is None.
|
|
|
- :param print_ongoing_status: Whether or not to print the ongoing status. Default is True.
|
|
|
- """
|
|
|
-        if not needle or not haystack_dir or not retrieval_question:
-            raise ValueError("needle, haystack_dir, and retrieval_question must all be provided.")
-
-        self.needle = needle
-        self.haystack_dir = haystack_dir
-        self.retrieval_question = retrieval_question
-        self.num_concurrent_requests = num_concurrent_requests
-        self.final_context_length_buffer = final_context_length_buffer
-        self.print_ongoing_status = print_ongoing_status
-        self.tokenizer_type = tokenizer_type
-        self.model_name = model_name
-        self.testing_results = []
-
-        if context_lengths is None:
-            if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None:
-                raise ValueError("Either context_lengths_min, context_lengths_max, and context_lengths_num_intervals need to be filled out OR context_lengths needs to be supplied.")
-            else:
-                self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int)
-        else:
-            self.context_lengths = context_lengths
-
-        if document_depth_percents is None:
-            if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None:
-                raise ValueError("Either document_depth_percent_min, document_depth_percent_max, and document_depth_percent_intervals need to be filled out OR document_depth_percents needs to be supplied.")
-            else:
-                if document_depth_percent_interval_type == 'linear':
-                    self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int)
-                elif document_depth_percent_interval_type == 'sigmoid':
-                    self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)]
-        else:
-            self.document_depth_percents = document_depth_percents
-
-        if document_depth_percent_interval_type not in [None, "linear", "sigmoid"]:
-            raise ValueError("document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution, pass a list of ints via document_depth_percents.")
-
-        if self.tokenizer_type == "OpenAI":
-            assert self.model_name is not None, "If you're using OpenAI, you must provide a model name."
-            self.enc = tiktoken.encoding_for_model(self.model_name)
-        elif self.tokenizer_type == "Anthropic":
-            self.enc = Anthropic().get_tokenizer()
-        elif self.tokenizer_type == "Huggingface":
-            self.enc = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True, use_fast=False)
-        else:
-            raise ValueError("tokenizer_type is not supported. Must be either 'OpenAI', 'Anthropic', or 'Huggingface'.")
-
-        self.save_dir = save_dir
-
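-    # Maps a linear 0-100 depth position onto a logistic (sigmoid) curve
-    # centred at 50, so sampled depths cluster around the middle of the
-    # document; the endpoints are pinned to exactly 0 and 100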
-    def logistic(self, x, L=100, x0=50, k=.1):
-        if x == 0:
-            return 0
-        if x == 100:
-            return 100
-        return np.round(L / (1 + np.exp(-k * (x - x0))), 3)
-
-    async def bound_evaluate_and_log(self, sem, *args):
-        async with sem:
-            await self.evaluate_and_log(*args)
-
-    async def run_test(self):
-        sem = Semaphore(self.num_concurrent_requests)
-
-        # Run through each combination of context_lengths and depths
-        tasks = []
-        for context_length in self.context_lengths:
-            for depth_percent in self.document_depth_percents:
-                task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
-                tasks.append(task)
-
-        # Wait for all tasks to complete
-        await asyncio.gather(*tasks)
-
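-    # Builds an OpenAI-style chat message list: a short system instruction,
-    # then the full haystack context followed by the retrieval question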
-    def generate_prompt(self, context):
-        return [
-            {
-                "role": "system",
-                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
-            },
-            {
-                "role": "user",
-                "content": context + '\n\n' + f"Read the document and answer: {self.retrieval_question}"
-            },
-        ]
-
-    async def evaluate_and_log(self, context_length, depth_percent):
-        # Generate the required-length context with the needle placed at the
-        # requested depth
-        context = await self.generate_context(context_length, depth_percent)
-        print('Generating prompt for context length:', context_length, 'depth percent:', depth_percent)
-
-        # Prepare the message to send to the model you're going to evaluate
-        prompt = self.generate_prompt(context)
-
-        context_file_location = f'{self.tokenizer_type}_len_{context_length}_depth_{int(depth_percent*100)}'
-
-        # Save the prompt to file for retesting
-        if not os.path.exists(self.save_dir):
-            os.makedirs(self.save_dir)
-
-        with open(f'{self.save_dir}/{context_file_location}_prompts.json', 'w') as f:
-            json.dump(prompt, f)
-
-    async def generate_context(self, context_length, depth_percent):
-        # Get the haystack files loaded into a string
-        context = self.read_context_files()
-
-        # Truncate the haystack text to the context length you desire
-        context = self.encode_and_trim(context, context_length)
-
-        # Insert your needle statement according to your depth percent
-        context = self.insert_needle(context, depth_percent, context_length)
-
-        return context
-
-    def encode_text_to_tokens(self, text):
-        if self.tokenizer_type == "OpenAI":
-            return self.enc.encode(text)
-        elif self.tokenizer_type == "Anthropic":
-            # The Anthropic tokenizer returns an Encoding object; take its ids
-            return self.enc.encode(text).ids
-        elif self.tokenizer_type == "Huggingface":
-            return self.enc(text, truncation=False, return_tensors="pt", add_special_tokens=False).input_ids.view(-1).tolist()
-        else:
-            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")
-
-    def insert_needle(self, context, depth_percent, context_length):
-        tokens_needle = self.encode_text_to_tokens(self.needle)
-        tokens_context = self.encode_text_to_tokens(context)
-
-        # Reduce the usable context length by the buffer, to account for the
-        # system message, the user question, and the response
-        context_length -= self.final_context_length_buffer
-
-        # If your context + needle are longer than the target context length
-        # (which they usually are), trim the context by the needle length
-        if len(tokens_context) + len(tokens_needle) > context_length:
-            tokens_context = tokens_context[:context_length - len(tokens_needle)]
-
-        if depth_percent == 100:
-            # If your depth percent is 100 (the needle is the last thing in
-            # the doc), throw it at the end
-            tokens_new_context = tokens_context + tokens_needle
-        else:
-            # Go get the position (in terms of tokens) to insert your needle
-            insertion_point = int(len(tokens_context) * (depth_percent / 100))
-
-            # tokens_new_context represents the tokens before the needle
-            tokens_new_context = tokens_context[:insertion_point]
-
-            # We want to place the needle at a sentence break, so find out
-            # which token(s) the active tokenizer uses for '.' (a hardcoded
-            # list such as [30930] would only be correct for one tokenizer)
-            period_tokens = self.encode_text_to_tokens('.')
-
-            # Then we iterate backwards until we find the first period
-            while tokens_new_context and tokens_new_context[-1] not in period_tokens:
-                insertion_point -= 1
-                tokens_new_context = tokens_context[:insertion_point]
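-                # Note: if no period is ever found, insertion_point walks back
-                # to 0 and the needle lands at the very start of the context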
-
-            # Once we get there, add in your needle, and stick the rest of your context in on the other end.
-            # Now we have a needle in a haystack
-            tokens_new_context += tokens_needle + tokens_context[insertion_point:]
-
-        # Convert back to a string and return it
-        new_context = self.decode_tokens(tokens_new_context)
-        return new_context
-
-    def get_context_length_in_tokens(self, context):
-        if self.tokenizer_type == "OpenAI":
-            return len(self.enc.encode(context))
-        elif self.tokenizer_type == "Anthropic":
-            # Assuming you have a different encoder for Anthropic
-            return len(self.enc.encode(context).ids)
-        elif self.tokenizer_type == "Huggingface":
-            return self.enc(context, truncation=False, return_tensors="pt").input_ids.shape[-1]
-        else:
-            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")
-
-
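-    # Concatenates every .txt file in the haystack directory, cycling through
-    # the files repeatedly until the text covers the largest context length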
-    def read_context_files(self):
-        context = ""
-        max_context_length = max(self.context_lengths)
-        while self.get_context_length_in_tokens(context) < max_context_length:
-            for file in glob.glob(f"{self.haystack_dir}/*.txt"):
-                with open(file, 'r') as f:
-                    context += f.read()
-        return context
-
-    def get_tokens_from_context(self, context):
-        if self.tokenizer_type == "OpenAI":
-            return self.enc.encode(context)
-        elif self.tokenizer_type == "Anthropic":
-            # Assuming you have a different encoder for Anthropic
-            return self.enc.encode(context).ids
-        elif self.tokenizer_type == "Huggingface":
-            return self.enc(context, truncation=False, return_tensors="pt").input_ids.view(-1).tolist()
-        else:
-            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")
-
-    def decode_tokens(self, tokens, context_length=None):
-        if self.tokenizer_type == "OpenAI":
-            return self.enc.decode(tokens[:context_length])
-        elif self.tokenizer_type == "Anthropic":
-            # Assuming you have a different decoder for Anthropic
-            return self.enc.decode(tokens[:context_length])
-        elif self.tokenizer_type == "Huggingface":
-            decoded = self.enc.decode(tokens[:context_length], skip_special_tokens=True)
-            return decoded
-        else:
-            raise ValueError("tokenizer_type must be either 'OpenAI', 'Anthropic', or 'Huggingface'")
-
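-    # Trims the context to at most context_length tokens, decoding the
-    # truncated token sequence back to text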
-    def encode_and_trim(self, context, context_length):
-        tokens = self.get_tokens_from_context(context)
-        if len(tokens) > context_length:
-            context = self.decode_tokens(tokens, context_length)
-        return context
-
-    def get_results(self):
-        return self.testing_results
-
-    def print_start_test_summary(self):
-        print("\n")
-        print("Starting Prompt Generation ...")
-        print(f"- Tokenizer: {self.tokenizer_type}")
-        print(f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
-        print(f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
-        print(f"- Needle: {self.needle.strip()}")
-        print("\n\n")
-
-    def start_test(self):
-        if self.print_ongoing_status:
-            self.print_start_test_summary()
-        asyncio.run(self.run_test())
-
-
-if __name__ == '__main__':
-    with open('utils/needle_test/config-prompt.yaml', 'r') as file:
-        config = yaml.load(file, Loader=yaml.FullLoader)
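-    # config-prompt.yaml is expected to define the keys read below:
-    # prompt.*, context.*, document_depth.*, tokenizer.tokenizer_type, save_dir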
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name', type=str, default='None')
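-    # Note: the default is the literal string 'None', not Python None; pass
-    # --model_name explicitly when tokenizer_type is 'OpenAI' or 'Huggingface'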
-    args = parser.parse_args()
-
-    ht = Prompter(
-        needle=config['prompt']['needle'],
-        haystack_dir=config['prompt']['haystack_dir'],
-        retrieval_question=config['prompt']['retrieval_question'],
-
-        context_lengths_min=config['context']['min_len'],
-        context_lengths_max=config['context']['max_len'],
-        context_lengths_num_intervals=config['context']['interval'],
-        context_lengths=config['context']['manually_select_list'],
-
-        document_depth_percent_min=config['document_depth']['min_percent'],
-        document_depth_percent_max=config['document_depth']['max_percent'],
-        document_depth_percent_intervals=config['document_depth']['interval'],
-        document_depth_percents=config['document_depth']['manually_select_list'],
-        document_depth_percent_interval_type=config['document_depth']['interval_type'],
-
-        tokenizer_type=config['tokenizer']['tokenizer_type'],
-        model_name=args.model_name,
-
-        save_dir=config['save_dir'],
-    )
-
-    ht.start_test()