@@ -0,0 +1,644 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+@dataclass
+class Cache:
+    """
+    Base, abstract class for all caches. The actual data structure is specific to each subclass.
+    """
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
+                cache to be created.
+
+        Return:
+            A tuple containing the updated key and value states.
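+
+        A minimal sketch of the expected call pattern from a decoder attention layer (`past_key_value`,
+        `self.layer_idx` and the RoPE `sin`/`cos` tensors are assumptions about the calling model):
+
+        ```python
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # only needed by caches such as `SinkCache`
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        ```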
+        """
+        raise NotImplementedError("Make sure to implement `update` in a subclass.")
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.")
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states, if there is any."""
+        raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.")
+
+    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
+        """Given the sequence length of the new inputs, returns the usable length of the cache."""
+        # Cache without size limit -> all cache is usable
+        # Cache with size limit -> if the length of the cache plus the length of the new inputs is larger than the
+        # maximum cache length, we will need to evict part of the cache (and thus not all cache is usable)
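+        # e.g. with a max length of 256, 250 cached tokens and 10 new tokens, only 256 - 10 = 246 cached tokens are usable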
+        max_length = self.get_max_length()
+        previous_seq_length = self.get_seq_length(layer_idx)
+        if max_length is not None and previous_seq_length + new_seq_length > max_length:
+            return max_length - new_seq_length
+        return previous_seq_length
+
+    @property
+    def seen_tokens(self):
+        logger.warning_once(
+            "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` "
+            "model input instead."
+        )
+        if hasattr(self, "_seen_tokens"):
+            return self._seen_tokens
+        else:
+            return None
+
+
+class DynamicCache(Cache):
+    """
+    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
+
+    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
+    `[batch_size, num_heads, seq_len, head_dim]`.
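+
+    Example (a minimal usage sketch; `model` and `inputs` are assumed to come from the usual
+    `AutoModelForCausalLM` / `AutoTokenizer` setup):
+
+    ```python
+    past_key_values = DynamicCache()
+    outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
+    past_key_values = outputs.past_key_values  # the updated `DynamicCache` instance
+    ```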
+    """
+
+    def __init__(self) -> None:
+        self.key_cache: List[torch.Tensor] = []
+        self.value_cache: List[torch.Tensor] = []
+        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+
+    def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
+        """
+        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
+        sequence length.
+        """
+        if layer_idx < len(self):
+            return (self.key_cache[layer_idx], self.value_cache[layer_idx])
+        else:
+            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
+
+    def __iter__(self):
+        """
+        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
+        keys and values
+        """
+        for layer_idx in range(len(self)):
+            yield (self.key_cache[layer_idx], self.value_cache[layer_idx])
+
+    def __len__(self):
+        """
+        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
+        to the number of layers in the model.
+        """
+        return len(self.key_cache)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            self._seen_tokens += key_states.shape[-2]
+
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
+        return None
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+        """Converts the `DynamicCache` instance into its equivalent in the legacy cache format."""
+        legacy_cache = ()
+        for layer_idx in range(len(self)):
+            legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),)
+        return legacy_cache
+
+    @classmethod
+    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
+        """Converts a cache in the legacy cache format into an equivalent `DynamicCache`."""
+        cache = cls()
+        if past_key_values is not None:
+            for layer_idx in range(len(past_key_values)):
+                key_states, value_states = past_key_values[layer_idx]
+                cache.update(key_states, value_states, layer_idx)
+        return cache
+
+
+class SinkCache(Cache):
+    """
+    A cache, as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453), that allows the model to
+    generate beyond the length of its context window without losing fluency in the conversation. As it discards past
+    tokens, the model will lose the ability to generate tokens that depend on the context that was discarded.
+
+    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
+    `[batch_size, num_heads, seq_len, head_dim]`.
+
+    Parameters:
+        window_length (`int`):
+            The length of the context window.
+        num_sink_tokens (`int`):
+            The number of sink tokens. See the original paper for more information.
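+
+    Example (a minimal usage sketch; `model` and `inputs` are assumed to come from the usual
+    `AutoModelForCausalLM` / `AutoTokenizer` setup):
+
+    ```python
+    past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
+    outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
+    ```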
+    """
+
+    def __init__(self, window_length: int, num_sink_tokens: int) -> None:
+        self.key_cache: List[torch.Tensor] = []
+        self.value_cache: List[torch.Tensor] = []
+        self.window_length = window_length
+        self.num_sink_tokens = num_sink_tokens
+        self.cos_sin_cache = {}
+        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+
+    @staticmethod
+    def _rotate_half(x):
+        x1 = x[..., : x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2 :]
+        return torch.cat((-x2, x1), dim=-1)
+
+    def _apply_key_rotary_pos_emb(
+        self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+    ) -> torch.Tensor:
+        rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin)
+        return rotated_key_states
+
+    def _get_rerotation_cos_sin(
+        self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if key_states.shape[-2] not in self.cos_sin_cache:
+            # Upcast to float32 temporarily for better accuracy
+            cos = cos.to(torch.float32)
+            sin = sin.to(torch.float32)
+
+            # Compute the cos and sin required for back- and forward-rotating to one position earlier in the sequence
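+            # (these are the cos/sin of the position delta between the shifted and original slots, obtained via the
+            # angle-difference identities cos(a - b) = cos(a)cos(b) + sin(a)sin(b), sin(a - b) = sin(a)cos(b) - cos(a)sin(b))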
+            original_cos = cos[self.num_sink_tokens + key_states.shape[-2] :]
+            shifted_cos = cos[self.num_sink_tokens : -key_states.shape[-2]]
+            original_sin = sin[self.num_sink_tokens + key_states.shape[-2] :]
+            shifted_sin = sin[self.num_sink_tokens : -key_states.shape[-2]]
+            rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin
+            rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin
+
+            self.cos_sin_cache[key_states.shape[-2]] = (
+                rerotation_cos.to(key_states.dtype).unsqueeze(0),
+                rerotation_sin.to(key_states.dtype).unsqueeze(0),
+            )
+        return self.cos_sin_cache[key_states.shape[-2]]
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states."""
+        return self.window_length
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. The following arguments can be used in `SinkCache`: `sin`,
+                `cos` and `partial_rotation_size`. These arguments are used with models using RoPE, to recompute the
+                rotation as the tokens are shifted.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Optional kwargs for `SinkCache` -- needed on models using RoPE. `partial_rotation_size` is used on models
+        # with partially rotated position embeddings, like Phi or Persimmon.
+        sin = cache_kwargs.get("sin")
+        cos = cache_kwargs.get("cos")
+        partial_rotation_size = cache_kwargs.get("partial_rotation_size")
+        using_rope = cos is not None and sin is not None
+
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            self._seen_tokens += key_states.shape[-2]
+
+        # [bsz, num_heads, seq_len, head_dim]
+        if len(self.key_cache) <= layer_idx:
+            # Empty cache
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+
+        elif key_states.shape[-2] + self.get_seq_length(layer_idx) < self.window_length:
+            # Growing cache
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        else:
+            # Shifting cache
+            keys_to_keep = self.key_cache[layer_idx][
+                :, :, -self.window_length + self.num_sink_tokens + key_states.shape[-2] :
+            ]
+
+            # On RoPE models, we need to recompute the Key rotation as the tokens are shifted
+            if using_rope:
+                rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin(
+                    key_states, cos[: self.window_length], sin[: self.window_length]
+                )
+                if partial_rotation_size is not None:
+                    keys_to_keep, keys_pass = (
+                        keys_to_keep[..., :partial_rotation_size],
+                        keys_to_keep[..., partial_rotation_size:],
+                    )
+                keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin)
+                if partial_rotation_size is not None:
+                    keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1)
+
+            # Concatenate sink tokens, shifted & rotated tokens (if needed), and new tokens
+            sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens]
+            self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep, key_states], dim=-2)
+
+            sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens]
+            values_to_keep = self.value_cache[layer_idx][
+                :, :, -self.window_length + self.num_sink_tokens + value_states.shape[-2] :
+            ]
+            self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep, value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+
+class HHCache(Cache):
+    """
+    A cache that applies the heavy-hitter oracle (https://proceedings.neurips.cc/paper_files/paper/2023/file/6ceefa7b15572587b78ecfcebb2827f8-Paper-Conference.pdf).
+    Only the heavy-hitter tokens and the most recent tokens are stored in the cache.
+
+    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
+    `[batch_size, num_heads, seq_len, head_dim]`.
+
+    Parameters:
+        window_length (`int`):
+            The length of the context window.
+        num_hh_tokens (`int`):
+            The number of heavy hitter tokens. See the original paper for more information.
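+
+    Example (a sketch of the intended call pattern from within an attention layer; the attention module and its
+    `layer_idx` / `num_key_value_groups` attributes, as well as the eager attention weights, are assumptions about
+    the calling model):
+
+    ```python
+    past_key_value = HHCache(window_length=256, num_hh_tokens=32)
+    # inside each attention layer's forward pass:
+    key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
+    # ... compute `attn_weights` over the updated keys, then slim the cache in place:
+    past_key_value.update_slimming(attn_weights, self.num_key_value_groups, self.layer_idx)
+    ```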
+    """
+
+    def __init__(self, window_length: int, num_hh_tokens: int) -> None:
+        self.key_cache: List[torch.Tensor] = []
+        self.value_cache: List[torch.Tensor] = []
+        self.window_length = window_length
+        self.num_hh_tokens = num_hh_tokens
+        self.accumulated_attention_scores: List[torch.Tensor] = []
+        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+
+    def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
+        """
+        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
+        sequence length.
+        """
+        if layer_idx < len(self):
+            return (
+                self.key_cache[layer_idx],
+                self.value_cache[layer_idx],
+                self.accumulated_attention_scores[layer_idx],
+            )
+        else:
+            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
+
+    def __iter__(self):
+        """
+        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
+        keys and values
+        """
+        for layer_idx in range(len(self)):
+            yield (
+                self.key_cache[layer_idx],
+                self.value_cache[layer_idx],
+                self.accumulated_attention_scores[layer_idx],
+            )
+
+    def __len__(self):
+        """
+        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
+        to the number of layers in the model.
+        """
+        return len(self.key_cache)
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states."""
+        return self.window_length
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+        accumulated_attention_scores: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `HHCache`.
+            accumulated_attention_scores (`torch.Tensor`, `optional`):
+                Accumulated attention scores for this layer, used when restoring a cache from the legacy format.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        if accumulated_attention_scores is not None:
+            self.accumulated_attention_scores.append(accumulated_attention_scores)
+
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            self._seen_tokens += key_states.shape[-2]
+
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def update_slimming(
+        self,
+        attention_scores: torch.Tensor,
+        num_kv_groups: int,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        Slims the cache based on the accumulated attention scores, keeping only the heavy-hitter and the local tokens.
+
+        Parameters:
+            attention_scores (`torch.Tensor`):
+                The attention scores for the current step.
+            num_kv_groups (`int`):
+                The number of query-head groups per key/value head, as used in `repeat_kv`.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `HHCache`.
+
+        Return:
+            Nothing is returned: the key/value caches and the accumulated attention scores are slimmed in place.
+        """
+        # Update the score metrics (accumulated attention scores)
+        if len(self.accumulated_attention_scores) <= layer_idx:
+            self.accumulated_attention_scores.append(
+                attention_scores.sum(2)[:, ::num_kv_groups, :]  # [bs, num_heads, key_len]
+            )
+        else:
+            num_new_tokens = attention_scores.shape[2]
+            updated_attention_scores = attention_scores.sum(2)[:, ::num_kv_groups, :]  # [bs, num_heads, key_len]
+            updated_attention_scores[:, :, :-num_new_tokens] += self.accumulated_attention_scores[layer_idx]
+            self.accumulated_attention_scores[layer_idx] = updated_attention_scores
+
+        # Update the KV cache once it exceeds the window length
+        if self.get_seq_length(layer_idx) > self.window_length:
+            # Scores of the tokens that are candidates for eviction (everything before the local window)
+            seq_scores = self.accumulated_attention_scores[layer_idx][:, :, : -self.window_length + self.num_hh_tokens]
+            # Keep the `num_hh_tokens` heavy hitters among them, plus all tokens inside the local window
+            _, keep_hh_index = torch.topk(seq_scores, self.num_hh_tokens, dim=-1)
+            keep_hh_index = keep_hh_index.sort().values
+
+            keep_local_index = torch.arange(
+                self.get_seq_length(layer_idx) - self.window_length + self.num_hh_tokens,
+                self.get_seq_length(layer_idx),
+                device=keep_hh_index.device,
+            ).repeat(keep_hh_index.shape[0], keep_hh_index.shape[1], 1)
+            keep_index = torch.cat([keep_hh_index, keep_local_index], dim=-1)
+
+            mask = torch.zeros(self.accumulated_attention_scores[layer_idx].shape, dtype=torch.bool).to(
+                keep_hh_index.device
+            )
+            mask = mask.scatter(-1, keep_index, 1)
+
+            bsz, num_heads, _, head_dim = self.key_cache[layer_idx].shape
+            self.key_cache[layer_idx] = self.key_cache[layer_idx][mask].view(bsz, num_heads, -1, head_dim)
+            self.value_cache[layer_idx] = self.value_cache[layer_idx][mask].view(bsz, num_heads, -1, head_dim)
+            self.accumulated_attention_scores[layer_idx] = self.accumulated_attention_scores[layer_idx][mask].view(
+                bsz, num_heads, -1
+            )
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+        """Converts the `HHCache` instance into its equivalent in the legacy cache format."""
+        legacy_cache = ()
+        for layer_idx in range(len(self)):
+            legacy_cache += (
+                (
+                    self.key_cache[layer_idx],
+                    self.value_cache[layer_idx],
+                    self.accumulated_attention_scores[layer_idx],
+                ),
+            )
+        return legacy_cache
+
+    @classmethod
+    def from_legacy_cache(
+        cls, window_length: int, num_hh_tokens: int, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    ) -> "HHCache":
+        """Converts a cache in the legacy cache format into an equivalent `HHCache`."""
+        cache = cls(window_length, num_hh_tokens)
+        if past_key_values is not None:
+            for layer_idx in range(len(past_key_values) // 3):
+                key_states = past_key_values[layer_idx * 3]
+                value_states = past_key_values[layer_idx * 3 + 1]
+                accumulated_attention_scores = past_key_values[layer_idx * 3 + 2]
+                cache.update(
+                    key_states, value_states, layer_idx, accumulated_attention_scores=accumulated_attention_scores
+                )
+        return cache
+
+    def evict_for_space(self, space_needed: int):
+        num_layers = len(self.key_cache)
+
+        # The accumulated attention scores must already exist for every layer before evicting
+        if len(self.accumulated_attention_scores) < num_layers:
+            raise ValueError("The accumulated_attention_scores should be updated before evicting the cache.")
+
+        for layer_idx in range(num_layers):
+            # Update the KV cache, evicting enough tokens to make room for the incoming prompt
+            if self.get_seq_length(layer_idx) + space_needed > self.window_length:
+                if self.window_length - self.num_hh_tokens <= space_needed:
+                    raise ValueError("The space_needed should be less than the window_length - num_hh_tokens.")
+
+                seq_scores = self.accumulated_attention_scores[layer_idx][
+                    :, :, : -self.window_length + self.num_hh_tokens + space_needed
+                ]
+                _, keep_hh_index = torch.topk(seq_scores, self.num_hh_tokens, dim=-1)
+                keep_hh_index = keep_hh_index.sort().values
+
+                keep_local_index = torch.arange(
+                    self.get_seq_length(layer_idx) - self.window_length + self.num_hh_tokens + space_needed,
+                    self.get_seq_length(layer_idx),
+                    device=keep_hh_index.device,
+                ).repeat(keep_hh_index.shape[0], keep_hh_index.shape[1], 1)
+                keep_index = torch.cat([keep_hh_index, keep_local_index], dim=-1)
+
+                mask = torch.zeros(self.accumulated_attention_scores[layer_idx].shape, dtype=torch.bool).to(
+                    keep_hh_index.device
+                )
+                mask = mask.scatter(-1, keep_index, 1)
+
+                bsz, num_heads, _, head_dim = self.key_cache[layer_idx].shape
+                self.key_cache[layer_idx] = self.key_cache[layer_idx][mask].view(bsz, num_heads, -1, head_dim)
+                self.value_cache[layer_idx] = self.value_cache[layer_idx][mask].view(bsz, num_heads, -1, head_dim)
+                self.accumulated_attention_scores[layer_idx] = self.accumulated_attention_scores[layer_idx][mask].view(
+                    bsz, num_heads, -1
+                )
+
+
+class StaticCache(Cache):
+    """
+    Static Cache class to be used with `torch.compile(model)`.
+
+    Parameters:
+        config (`PretrainedConfig`):
+            The configuration file defining the `max_position_embeddings`, `hidden_size` and `num_attention_heads`
+            required to initialize the static cache.
+        max_batch_size (`int`):
+            The maximum batch size with which the model will be used.
+        max_cache_len (`int`):
+            The maximum sequence length with which the model will be used.
+        device (`torch.device`):
+            The device on which the cache should be initialized. Should be the same as the layer.
+        dtype (*optional*, defaults to `torch.float32`):
+            The default `dtype` to use when initializing the layer.
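+
+    Example (a minimal construction sketch; how the cache is wired into the forward pass depends on the
+    modeling code, so only the instantiation is shown):
+
+    ```python
+    past_key_values = StaticCache(
+        config=model.config, max_batch_size=1, max_cache_len=4096, device=model.device, dtype=model.dtype
+    )
+    ```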
+    """
+
+    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
+        super().__init__()
+        self.max_batch_size = max_batch_size
+        self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
+        # Some models define a custom `head_dim` != config.hidden_size // config.num_attention_heads
+        self.head_dim = (
+            config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
+        )
+
+        self.dtype = dtype if dtype is not None else torch.float32
+        self.num_key_value_heads = (
+            config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
+        )
+
+        cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
+        self.key_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+        self.value_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for. Kept for backward compatibility.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. The `StaticCache` just needs the `cache_position`
+                to know how much of the cache it should overwrite.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        new_cache_positions = cache_kwargs.get("cache_position")
+        k_out = self.key_cache
+        v_out = self.value_cache
+
+        k_out[:, :, new_cache_positions] = key_states
+        v_out[:, :, new_cache_positions] = value_states
+
+        return k_out, v_out
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states that were seen by the model. `layer_idx` kept for BC"""
+        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
+        # limit the check to the first batch member and head dimension.
+        # TODO: This is error prone, a filled cache may be `0.0`. Let's use a stateless integer instead, after
+        # https://github.com/pytorch/pytorch/issues/120248 is fixed
+        return (self.key_cache[0, 0].any(dim=-1)).sum()
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states, which for `StaticCache` is `max_cache_len`."""
+        return self.max_cache_len
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        device = self.key_cache.device
+        self.key_cache = self.key_cache.index_select(0, beam_idx.to(device))
+        device = self.value_cache.device
+        self.value_cache = self.value_cache.index_select(0, beam_idx.to(device))
+
+    def to_legacy_cache(self):
+        """Dummy function for BC. We have to keep it because the call in the models' forward pass would otherwise break."""
+        return None
+